In [37]:
import pandas as pd
import spacy

# Shared spaCy pipeline; the statistics functions below call it to obtain
# part-of-speech tags (token.pos_). Loaded once because loading is expensive.
nlp = spacy.load("pt_core_news_sm")

In [50]:
def parse_rationales(cell):
    """Convert a stringified list such as "['a', 'b']" into a list of spans.

    The surrounding brackets and all single quotes are removed and the
    remainder is split on ", ".

    An empty cell or an empty list ("[]") yields ``[]``. Without this guard
    ``"".split(", ")`` would return ``['']`` and every row without rationales
    would later be counted as having one span.

    NOTE(review): apostrophes inside a span are dropped and a span that itself
    contains ", " is split in two -- confirm the data never contains either.
    """
    inner = cell.strip('[]').replace("'", "")
    return inner.split(", ") if inner.strip() else []


# Load the annotated corpus; both rationale columns are stored as stringified
# lists in the CSV and are parsed into real Python lists while reading.
df = pd.read_csv(
    '../data/hatebr_and_rationales.csv',
    index_col=0,
    converters={
        "rationales_offensive_1_normalized": parse_rationales,
        "rationales_offensive_2_normalized": parse_rationales,
    },
)

In [51]:
# Split the corpus on the 'label final' column: rows with value 1 go to
# `ofensivos`, rows with value 0 go to `non_ofensivos`.
ofensivos = df.loc[df['label final'].eq(1)]
non_ofensivos = df.loc[df['label final'].eq(0)]

In [56]:
def get_statistics(df):
    """Compute corpus totals and per-comment averages for a set of comments.

    Sentences are approximated by splitting ``comentario`` on '. ', words by
    splitting ``normalized_text`` on single spaces, and part-of-speech counts
    come from running the module-level spaCy pipeline ``nlp`` over
    ``normalized_text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the string columns ``comentario`` and ``normalized_text``.

    Returns
    -------
    dict
        ``'sentences'`` and ``'words'`` are totals over all rows;
        ``'avg sentences'`` and ``'avg words'`` are those totals divided by
        the number of rows; the remaining keys (``'Noun'``, ``'Verb'``,
        ``'Adjective'``, ``'Adverb'``, ``'Pronoun'``,
        ``'Subordinating Conjunction'``, ``'Coordinating Conjunction'``) are
        the number of tokens with that tag divided by the number of rows.
        A tag that never occurs yields 0, and every average is 0.0 when
        ``df`` has no rows.
    """
    # Output label -> spaCy coarse-grained tag, in the order they are reported.
    reported_tags = (
        ('Noun', 'NOUN'),
        ('Verb', 'VERB'),
        ('Adjective', 'ADJ'),
        ('Adverb', 'ADV'),
        ('Pronoun', 'PRON'),
        ('Subordinating Conjunction', 'SCONJ'),
        ('Coordinating Conjunction', 'CCONJ'),
    )

    n_rows = len(df)
    sentences = 0
    words = 0
    pos = {}
    for comment, text in zip(df['comentario'], df['normalized_text']):
        sentences += len(comment.split('. '))
        words += len(text.split(' '))

        for token in nlp(text):
            # dict.get replaces the former bare `except:`, which would also
            # have swallowed unrelated errors (even KeyboardInterrupt).
            pos[token.pos_] = pos.get(token.pos_, 0) + 1

    def per_row(total):
        # Guard against an empty frame instead of raising ZeroDivisionError.
        return total / n_rows if n_rows else 0.0

    to_return = {
        'sentences': sentences,
        'words': words,
        'avg sentences': per_row(sentences),
        'avg words': per_row(words),
    }
    for label, tag in reported_tags:
        # A small subset may contain no token of a given class; report 0
        # rather than failing with KeyError.
        to_return[label] = per_row(pos.get(tag, 0))

    return to_return

# Statistics for the rows where 'label final' == 1 (the `ofensivos` subset).
get_statistics(ofensivos)

{'sentences': 4871,
 'words': 53455,
 'avg sentences': 1.3917142857142857,
 'avg words': 15.272857142857143,
 'Noun': 3.492,
 'Verb': 2.4042857142857144,
 'Adjective': 0.988,
 'Adverb': 1.1262857142857143,
 'Pronoun': 1.1197142857142857,
 'Subordinating Conjunction': 0.5708571428571428,
 'Coordinating Conjunction': 0.4717142857142857}

In [57]:
# Statistics for the rows where 'label final' == 0 (the `non_ofensivos` subset).
get_statistics(non_ofensivos)

{'sentences': 4674,
 'words': 42891,
 'avg sentences': 1.3354285714285714,
 'avg words': 12.25457142857143,
 'Noun': 2.7994285714285714,
 'Verb': 1.806857142857143,
 'Adjective': 0.8774285714285714,
 'Adverb': 0.8888571428571429,
 'Pronoun': 0.89,
 'Subordinating Conjunction': 0.4174285714285714,
 'Coordinating Conjunction': 0.4005714285714286}

In [59]:
def get_rationales_statistics(df, annot):
    """Compute totals and per-row averages for one annotator's rationale spans.

    Each row's rationale column is expected to hold a list of span strings.
    The spans of a row are joined with a single space, words are counted by
    splitting that text on single spaces, and part-of-speech counts come from
    running the module-level spaCy pipeline ``nlp`` over the joined text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``rationales_offensive_1_normalized`` and/or
        ``rationales_offensive_2_normalized`` (whichever ``annot`` selects).
    annot : int
        ``1`` selects the first annotator's column; any other value selects
        the second annotator's column.

    Returns
    -------
    dict
        ``'spans'`` and ``'words'`` are totals over all rows; ``'avg spans'``
        and ``'avg words'`` are those totals divided by the number of rows;
        the remaining keys (``'Noun'``, ``'Verb'``, ``'Adjective'``,
        ``'Adverb'``, ``'Pronoun'``, ``'Subordinating Conjunction'``,
        ``'Coordinating Conjunction'``) are the number of tokens with that tag
        divided by the number of rows. A tag that never occurs yields 0, and
        every average is 0.0 when ``df`` has no rows.
    """
    # Output label -> spaCy coarse-grained tag, in the order they are reported.
    reported_tags = (
        ('Noun', 'NOUN'),
        ('Verb', 'VERB'),
        ('Adjective', 'ADJ'),
        ('Adverb', 'ADV'),
        ('Pronoun', 'PRON'),
        ('Subordinating Conjunction', 'SCONJ'),
        ('Coordinating Conjunction', 'CCONJ'),
    )

    if annot == 1:
        column = 'rationales_offensive_1_normalized'
    else:
        column = 'rationales_offensive_2_normalized'

    n_rows = len(df)
    spans = 0
    words = 0
    pos = {}
    for rationale in df[column]:
        # Drop empty strings: a row without rationales may arrive as [] or as
        # [''] (the result of splitting an empty string) and must count as
        # zero spans and zero words, not one of each.
        kept = [span for span in rationale if span]
        if not kept:
            continue
        spans += len(kept)

        text = ' '.join(kept)
        words += len(text.split(' '))

        for token in nlp(text):
            # dict.get replaces the former bare `except:`, which would also
            # have swallowed unrelated errors (even KeyboardInterrupt).
            pos[token.pos_] = pos.get(token.pos_, 0) + 1

    def per_row(total):
        # Guard against an empty frame instead of raising ZeroDivisionError.
        return total / n_rows if n_rows else 0.0

    to_return = {
        'spans': spans,
        'words': words,
        'avg spans': per_row(spans),
        'avg words': per_row(words),
    }
    for label, tag in reported_tags:
        # A small subset may contain no token of a given class; report 0
        # rather than failing with KeyError.
        to_return[label] = per_row(pos.get(tag, 0))

    return to_return

# Rationale statistics for annotator 1 on the rows where 'label final' == 1;
# `ofensivos` already holds exactly that subset of `df`.
get_rationales_statistics(ofensivos, annot=1)

{'spans': 6601,
 'words': 27038,
 'avg spans': 1.886,
 'avg words': 7.725142857142857,
 'Noun': 2.032857142857143,
 'Verb': 1.3182857142857143,
 'Adjective': 0.6194285714285714,
 'Adverb': 0.49914285714285717,
 'Pronoun': 0.44771428571428573,
 'Subordinating Conjunction': 0.21057142857142858,
 'Coordinating Conjunction': 0.13228571428571428}

In [60]:
# Rationale statistics for annotator 2 on the rows where 'label final' == 1;
# `ofensivos` already holds exactly that subset of `df`.
get_rationales_statistics(ofensivos, annot=2)

{'spans': 5925,
 'words': 17995,
 'avg spans': 1.6928571428571428,
 'avg words': 5.1414285714285715,
 'Noun': 1.5097142857142858,
 'Verb': 0.7988571428571428,
 'Adjective': 0.4754285714285714,
 'Adverb': 0.2874285714285714,
 'Pronoun': 0.26142857142857145,
 'Subordinating Conjunction': 0.11114285714285714,
 'Coordinating Conjunction': 0.06971428571428571}