# Retrieve content

In [20]:
from collections import namedtuple, Counter
import numpy as np

# Two-field record; the code in this file stores a list in each field
# (``lemmas`` and ``forms``) and appends one entry to both per corpus row.
Dataset = namedtuple("Dataset", ("lemmas", "forms"))

def get_lemmas(file: str = "./dataset/main/dev.txt"):
    """Collect the lemmas and tokens of a tab-separated file, grouped by tag.

    The first line is read as the header row; its first column name is
    dropped before the remaining names are matched against data columns.
    A line starting with ``#[TAG]`` (and whose column count differs from the
    header's) selects the category that following rows are stored under.
    Any other line whose column count differs from the header's is skipped.

    Args:
        file: Path of the file to read (decoded as UTF-8).

    Returns:
        A dict with the keys ``"positive"`` and ``"negative"``, each mapping
        to a ``Dataset`` whose ``lemmas`` and ``forms`` lists hold the
        ``lemma`` and ``token`` columns of the rows seen under that tag.

    Raises:
        ValueError: If a data row appears before any ``#[TAG]`` line.
        KeyError: If a tag is neither ``positive`` nor ``negative``, or if
            the header has no ``lemma`` / ``token`` column.
    """
    tag_marker = "#[TAG]"
    out = {
        "positive": Dataset([], []),
        "negative": Dataset([], [])
    }
    headers = []
    cat = None  # No category is known until the first tag line is read.

    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(file, encoding="utf-8") as f:
        for idx, orig_line in enumerate(f):
            if idx == 0:
                headers = orig_line.strip().split("\t")[1:]
                continue
            # Drop the line terminator so it does not end up inside the
            # value of the last column.
            line = orig_line.rstrip("\n").split("\t")
            if len(line) != len(headers):
                if orig_line.startswith(tag_marker):
                    cat = orig_line.strip()[len(tag_marker):]
                continue
            if cat is None:
                raise ValueError(
                    f"{file}: data row on line {idx + 1} appears before any "
                    f"{tag_marker} line"
                )
            row = dict(zip(headers, line))
            out[cat].lemmas.append(row["lemma"])
            out[cat].forms.append(row["token"])
    return out

# Accumulator shared by the rest of the notebook: one Dataset per category,
# filled with the contents of the three files listed below.
Global = {label: Dataset([], []) for label in ("positive", "negative")}
for file in ("./dataset/main/train.txt", "./dataset/main/dev.txt", "./dataset/main/test.txt"):
    current_output = get_lemmas(file)
    for key, dataset in current_output.items():
        # A Dataset is a tuple of lists, so index i addresses the same field
        # in both the per-file result and the accumulator.
        for i, values in enumerate(dataset):
            Global[key][i].extend(values)

# MTLD and other metric functions

In [41]:
# from https://github.com/kristopherkyle/lexical_diversity/blob/master/lexical_diversity/lex_div.py


def safe_divide(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is zero."""
    # ``0.0 == 0`` is true, so a single comparison covers ints and floats.
    if denominator == 0:
        return 0
    return numerator / denominator


def ttr(text):
    """Return the type-token ratio of ``text``.

    The ratio is the number of distinct items divided by the total number of
    items; an empty ``text`` yields 0 (see ``safe_divide``).
    """
    return safe_divide(len(set(text)), len(text))

def mtld(input, min = 10): #original MTLD described in Jarvis & McCarthy
    """Return the bidirectional MTLD score of a token sequence.

    The sequence is scanned left to right and cut into "factors": a factor is
    closed as soon as the type-token ratio of the tokens gathered since the
    previous cut drops below 0.72 and at least ``min`` tokens were gathered.
    The tokens left over at the end count as a partial factor weighted by
    ``(1 - ttr) / (1 - 0.72)``. The one-directional score is the number of
    tokens divided by the number of factors (0 if there are no factors).
    The result is the mean of the scores for the sequence and its reverse.

    Args:
        input: Sequence of hashable tokens (must support ``len`` and
            ``reversed``). The name shadows the builtin but is kept for
            backward compatibility.
        min: Minimum number of tokens a factor needs before it can be
            closed. The name shadows the builtin but is kept for backward
            compatibility.

    Returns:
        The mean of the forward and backward scores; 0.0 for empty input.
    """
    ttr_threshold = .72

    def mtlder(text):
        factor = 0
        factor_lengths = 0
        # Types and token count of the segment currently being built. They
        # are updated per token instead of re-slicing ``text`` and rebuilding
        # a set at every position, which gives the same ratio at lower cost.
        segment_types = set()
        segment_length = 0
        last_index = len(text) - 1
        for position, token in enumerate(text):
            segment_types.add(token)
            segment_length += 1
            segment_ttr = len(segment_types) / segment_length
            if position == last_index:
                # The final token always ends the scan: whatever is in the
                # segment is credited as a partial factor.
                factor += (1 - segment_ttr) / (1 - ttr_threshold)
                factor_lengths += segment_length
            elif segment_ttr < ttr_threshold and segment_length >= min:
                factor += 1
                factor_lengths += segment_length
                segment_types.clear()
                segment_length = 0
        # No factor at all (empty text, or a text whose tokens never repeat):
        # report 0 rather than dividing by zero.
        if factor == 0:
            return 0
        return factor_lengths / factor

    forward_score = mtlder(input)
    backward_score = mtlder(list(reversed(input)))
    return (forward_score + backward_score) / 2

# Our own code
def partage(k, j):
    """Return the percentage of distinct items of ``k`` that also occur in ``j``.

    Args:
        k: Iterable of hashable items; its distinct items form the base of
            the percentage.
        j: Iterable of hashable items to compare against.

    Returns:
        A value between 0 and 100. An empty ``k`` yields 0.0 instead of
        raising ``ZeroDivisionError``.
    """
    types_k = set(k)
    if not types_k:
        return 0.0
    shared = types_k & set(j)
    return len(shared) / len(types_k) * 100

def distribution(data):
    """Return the standard deviation of the occurrence counts in ``data``.

    Each distinct item of ``data`` contributes one value: the number of times
    it occurs. ``np.std`` is applied to those counts.
    """
    occurrences = Counter(data)
    return np.std(list(occurrences.values()))

## Déviation standard

In [42]:
def _corpus_metrics(corpus, other):
    """Build the metrics row for ``corpus``, using ``other`` for the overlap.

    ``corpus`` and ``other`` are ``Dataset`` records; the ``Partage`` entries
    give the share of ``corpus`` items that also occur in ``other``.
    """
    return {
        "Taille": len(corpus.forms),
        "Richesse": len(set(corpus.forms)),
        "MTLD(Lemmes)": mtld(corpus.lemmas),
        "MTLD(Formes)": mtld(corpus.forms),
        "Distribution(Lemmes)": distribution(corpus.lemmas),
        "Distribution(Formes)": distribution(corpus.forms),
        "Partage(Lemmes)": partage(corpus.lemmas, other.lemmas),
        "Partage(Formes)": partage(corpus.forms, other.forms),
    }


# Same metrics for both categories; only the direction of the overlap differs.
positives = _corpus_metrics(Global["positive"], Global["negative"])
negatives = _corpus_metrics(Global["negative"], Global["positive"])

In [43]:
from pandas import DataFrame

# One column per category, one row per metric, every number with two decimals.
metrics_table = DataFrame([negatives, positives], index=["Négatifs", "Positifs"]).transpose()
print(metrics_table.to_latex(float_format="{:.2f}".format))

\begin{tabular}{lrr}
\toprule
{} &  Négatifs &  Positifs \\
\midrule
Taille               & 525220.00 &  44964.00 \\
Richesse             &  80153.00 &  13305.00 \\
MTLD(Lemmes)         &    114.70 &    153.90 \\
MTLD(Formes)         &    286.56 &    384.53 \\
Distribution(Lemmes) &    393.70 &     62.46 \\
Distribution(Formes) &    191.22 &     40.69 \\
Partage(Lemmes)      &     22.39 &     78.12 \\
Partage(Formes)      &     10.72 &     64.55 \\
\bottomrule
\end{tabular}

