### **Importing libraries and data**

In [1]:
import pandas as pd

# from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB

# Presumably the random seed for reproducibility.
# NOTE(review): no cell visible in this section references it — confirm it is used further down.
SEED = 3

In [2]:
# Load the binary-classification annotations. Missing `hatespeech_G2` votes are
# filled with 0 and the column is cast to int, all in one chained expression.
binary_df = (
    pd.read_csv('../data/2019-05-28_portuguese_hate_speech_binary_classification.csv')
    .assign(hatespeech_G2=lambda frame: frame['hatespeech_G2'].fillna(0).astype('int'))
)
binary_df.head()

Unnamed: 0,text,hatespeech_comb,hatespeech_G1,annotator_G1,hatespeech_G2,annotator_G2,hatespeech_G3,annotator_G3
0,@__andrea__b \r\nO cara vive em outro mundo\r\...,1,1,A,1,V,0,E
1,@_carmeloneto Estes incompetentes não cuidam n...,0,1,D,0,V,0,C
2,@_carmeloneto \r\nOs 'cumpanhero' quebraram to...,0,1,A,0,B,0,E
3,@_GlitteryKisses é isso não conseguem pensar n...,0,0,C,0,V,0,D
4,@_iglira bom dia macaco branco haha,1,0,A,1,I,1,E


In [3]:
# Load the hierarchical-classification CSV (one `text` column plus one 0/1 column
# per label, as shown in the preview below) and display its first rows.
hierarchical_df = pd.read_csv('../data/2019-05-28_portuguese_hate_speech_hierarchical_classification.csv')
hierarchical_df.head()

Unnamed: 0,text,Hate.speech,Sexism,Body,Racism,Ideology,Homophobia,Origin,Religion,Health,...,Thin.women,Arabic,East.europeans,Africans,South.Americans,Brazilians,Migrants,Homossexuals,Thin.people,Ageing
0,"""não come mel, morde marimbondo""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"não tem pinto, tem orgulho !",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Não vê essa merda de Crepúsculo! Pra isso temo...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"não da tapa na bundinha, da cotovelada nas cos...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,o diminutivo INHO não acompanha a trajetória d...,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Load the class-hierarchy table with `Source` and `Target` columns and preview it.
# NOTE(review): presumably each row is a parent -> child edge of the label graph — confirm.
hierarchical_classes = pd.read_csv('../data/graph_hierarchical_classes.csv')
hierarchical_classes.head()

Unnamed: 0,Source,Target
0,Hate speech,Racism
1,Hate speech,Origin
2,Origin,People based on origin
3,Hate speech,Migrants
4,Hate speech,Religion


### **Exploratory Data Analysis (EDA)**

In [5]:
# Skim the binary dataset's texts in sorted order (original index labels are kept).
binary_df['text'].sort_values()

279     #ALERTA do Jornalista #AlexandreGarcia\r\n Adq...
135     #AecioMaisCItadoQue Bolsonaro em discussão sob...
177     #Agenda 11,12 e13 de Março/SP. #CongressoEvang...
179     #Agenda Março 18 e 19.\r\n Simpósio sobre Ideo...
178     #AgendaMaio  Palestra sobre identidade de gêne...
                              ...                        
2989    “Nós não podemos viver num mundo tão conflituo...
3245    “Orfeu XXI, Música pela vida e dignidade”. 'Nã...
3794    “Rapefugees”: já começa em Portugal _\r\nhttps...
5482    “União Europeia ameaça com sanções países que ...
227     “ai, luigi, mas vc ta militando? q saco”\r\n.\...
Name: text, Length: 5670, dtype: object

In [6]:
# Skim the hierarchical dataset's texts in sorted order (original index labels are kept).
hierarchical_df['text'].sort_values()

2582     #MamãeEuQuero\r\n#MamãeEuQuero\r\nMamãe Eu Qu...
4673     Feliz dia da mulher ....da M-U-L-H-E-R\r\n  #...
2603     GOLEIRO BRUNO E\r\nMACARRÃO\r\nMACARRÃO\r\nMA...
2406     JOANA MARANHÃO\r\nMARANHÃO\r\nMARANHÃO\r\nSÓ ...
4112     Não Tanto Quanto Você Militante @FafaGramacho...
                              ...                        
1876    “Com o PNR, Coimbra tem mais encanto“ @vitormr...
3385    “Nós não podemos viver num mundo tão conflituo...
3673    “Orfeu XXI, Música pela vida e dignidade”. 'Nã...
1229    “Rapefugees”: já começa em Portugal _\r\nhttps...
5458    “ai, luigi, mas vc ta militando? q saco”\r\n.\...
Name: text, Length: 5668, dtype: object

In [7]:
# Inspect two rows picked from the sorted listing (index labels 2603 and 4673).
# Direct `.loc[row, column]` lookups replace `iloc[row][0]` / `iloc[row][1]`:
# positional `[]` on a label-indexed row Series is deprecated in pandas and
# emits a FutureWarning, and it builds a whole row Series just to read one value.
print(hierarchical_df.loc[2603, 'text'], f"Hate.speech: {hierarchical_df.loc[2603, 'Hate.speech']}", sep='\n')
print('\n')
print(hierarchical_df.loc[4673, 'text'], f"Hate.speech: {hierarchical_df.loc[4673, 'Hate.speech']}", sep='\n')

 GOLEIRO BRUNO E
MACARRÃO
MACARRÃO
MACARRÃO
DE DIA VÃO PRO SÍTIO
DE NOITE ALIMENTAM O CÃO  
Hate.speech: 0


 Feliz dia da mulher ....da M-U-L-H-E-R
  #DiaDaMulher #BolsonaroReiDoMundo
FELIZ DIA DAS MULHERES
Dia Internaci _ https://t.co/2KgqXloDpw
Hate.speech: 0


  print(hierarchical_df.iloc[2603][0], f'Hate.speech: {hierarchical_df.iloc[2603][1]}', sep='\n')
  print(hierarchical_df.iloc[4673][0], f'Hate.speech: {hierarchical_df.iloc[4673][1]}', sep='\n')


In [8]:
# Summary statistics of the numeric columns of the binary dataset.
binary_df.describe()

Unnamed: 0,hatespeech_comb,hatespeech_G1,hatespeech_G2,hatespeech_G3
count,5670.0,5670.0,5670.0,5670.0
mean,0.315344,0.583774,0.306878,0.168783
std,0.464694,0.492975,0.461239,0.374593
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,0.0
75%,1.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0


In [9]:
# Summary statistics of the numeric label columns of the hierarchical dataset.
hierarchical_df.describe()

Unnamed: 0,Hate.speech,Sexism,Body,Racism,Ideology,Homophobia,Origin,Religion,Health,OtherLifestyle,...,Thin.women,Arabic,East.europeans,Africans,South.Americans,Brazilians,Migrants,Homossexuals,Thin.people,Ageing
count,5668.0,5668.0,5668.0,5668.0,5668.0,5668.0,5668.0,5668.0,5668.0,5668.0,...,5668.0,5668.0,5668.0,5668.0,5668.0,5668.0,5668.0,5668.0,5668.0,5668.0
mean,0.216655,0.11856,0.028934,0.016584,0.016231,0.05681,0.004587,0.005293,0.001059,0.003529,...,0.000353,0.000353,0.000353,0.000706,0.000882,0.000882,0.014467,0.050812,0.000353,0.000706
std,0.412002,0.323299,0.167637,0.127719,0.126376,0.2315,0.067579,0.072566,0.032521,0.059302,...,0.018783,0.018783,0.018783,0.026558,0.02969,0.02969,0.119417,0.219632,0.018783,0.026558
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Percentage of rows flagged for each label column (every column except `text`).
# The column means are computed once, instead of re-running `describe()` on the
# whole frame for every column inside the comprehension.
label_rates = hierarchical_df.drop(columns='text').mean()
print(*[(col, round(100 * float(rate), 1)) for col, rate in label_rates.items()], sep='\n')

('Hate.speech', 21.7)
('Sexism', 11.9)
('Body', 2.9)
('Racism', 1.7)
('Ideology', 1.6)
('Homophobia', 5.7)
('Origin', 0.5)
('Religion', 0.5)
('Health', 0.1)
('OtherLifestyle', 0.4)
('Aborting.women', 0.1)
('Agnostic', 0.0)
('Argentines', 0.0)
('Asians', 0.1)
('Autists', 0.0)
('Black.Women', 0.1)
('Blond.women', 0.0)
('Brazilians.women', 0.0)
('Chinese', 0.1)
('Criminals', 0.1)
('Egyptians', 0.0)
('Fat.people', 2.8)
('Football.players.women', 0.0)
('Gamers', 0.0)
('Homeless', 0.1)
('Homeless.women', 0.0)
('Indigenous', 0.0)
('Iranians', 0.0)
('Japaneses', 0.0)
('Jews', 0.0)
('Jornalists', 0.0)
('Latins', 0.1)
('Left.wing.ideology', 0.5)
('Men.Feminists', 0.0)
('Mexicans', 0.0)
('Muslims.women', 0.0)
('Nordestines', 0.1)
('Old.people', 0.0)
('Polyamorous', 0.0)
('Poor.people', 0.0)
('Rural.people', 0.1)
('Russians', 0.0)
('Sertanejos', 0.0)
('Street.artist', 0.0)
('Ucranians', 0.0)
('Vegetarians', 0.0)
('White.people', 0.0)
('Young.people', 0.0)
('Old.women', 0.0)
('Ugly.people', 2.3)
('

### **Model training**

#### **Preprocessing**

In [51]:
# Features: the raw text column. Target: the top-level `Hate.speech` label.
X = hierarchical_df['text']
y = hierarchical_df['Hate.speech']

In [52]:
# Report how many missing values the features and the target contain.
print(
    f'X nulls: {X.isna().sum()}',
    f'y nulls: {y.isna().sum()}',
    sep='\n',
)

X nulls: 0
y nulls: 0


In [61]:
def get_pipe(model, params: dict):
    """Build a two-step text-classification pipeline.

    Args:
        model: Estimator class (not an instance); it is called as ``model(**params)``.
        params: Keyword arguments forwarded to the estimator constructor.

    Returns:
        A ``Pipeline`` with a default ``TfidfVectorizer`` step named
        ``'vectorizer'`` followed by the instantiated estimator named ``'model'``.
    """
    vectorizer = TfidfVectorizer()
    estimator = model(**params)
    steps = [
        ('vectorizer', vectorizer),
        ('model', estimator),
    ]
    return Pipeline(steps=steps)

def validate_pipe(pipe, X, y):
    """Cross-validate ``pipe`` and report accuracy, precision, recall and F1.

    All four metrics are collected from a single 5-fold stratified
    cross-validation run via ``cross_validate`` instead of one full
    ``cross_val_score`` run per metric, so each fold is fitted once rather
    than four times. The folds come from an unshuffled ``StratifiedKFold``,
    so the splits are the same on every call.

    Args:
        pipe: Estimator or pipeline to evaluate.
        X: Input samples passed through to ``cross_validate``.
        y: Target labels passed through to ``cross_validate``.

    Returns:
        A DataFrame with one row per fold and the columns ``accuracy``,
        ``precision``, ``recall`` and ``f1``. The mean of each column is also
        printed as a percentage.
    """
    metrics = ['accuracy', 'precision', 'recall', 'f1']
    cv_results = cross_validate(
        pipe, X, y,
        cv=StratifiedKFold(n_splits=5),
        scoring=metrics,
    )

    # cross_validate stores each metric's per-fold scores under 'test_<metric>'.
    result_matrix = pd.DataFrame.from_dict({
        metric: cv_results[f'test_{metric}'] for metric in metrics
    })

    # Per-metric means are computed up front so the f-string does not need to
    # reuse its own quote character inside a replacement field (that form only
    # parses on Python >= 3.12).
    fold_means = result_matrix.mean()
    print(
        *(f'{metric.capitalize()}: {fold_means[metric]:.1%}' for metric in metrics),
        sep='\n',
    )

    return result_matrix

In [62]:
# Multinomial Naive Bayes with default hyperparameters on TF-IDF features,
# evaluated with 5-fold stratified cross-validation.
multinomialnb_pipe = get_pipe(MultinomialNB, {})
validate_pipe(multinomialnb_pipe, X, y)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 78.4%
Precision: 68.3%
Recall: 2.5%
F1: 4.5%


Unnamed: 0,accuracy,precision,recall,f1
0,0.783069,0.0,0.0,0.0
1,0.785714,1.0,0.012195,0.024096
2,0.78836,1.0,0.02439,0.047619
3,0.786408,1.0,0.012245,0.024194
4,0.776699,0.413043,0.077551,0.130584


In [63]:
# Complement Naive Bayes with default hyperparameters on TF-IDF features,
# evaluated with 5-fold stratified cross-validation.
complementnb_pipe = get_pipe(ComplementNB, {})
validate_pipe(complementnb_pipe, X, y)

Accuracy: 79.6%
Precision: 63.4%
Recall: 25.2%
F1: 33.8%


Unnamed: 0,accuracy,precision,recall,f1
0,0.771605,0.43299,0.170732,0.244898
1,0.800705,0.794118,0.109756,0.192857
2,0.823633,0.739583,0.288618,0.415205
3,0.827008,0.769231,0.285714,0.416667
4,0.757282,0.434211,0.404082,0.418605


In [64]:
# Bernoulli Naive Bayes with default hyperparameters on TF-IDF features,
# evaluated with 5-fold stratified cross-validation.
bernoullinb_pipe = get_pipe(BernoulliNB, {})
validate_pipe(bernoullinb_pipe, X, y)

Accuracy: 78.3%
Precision: 64.5%
Recall: 7.3%
F1: 11.9%


Unnamed: 0,accuracy,precision,recall,f1
0,0.779541,0.25,0.00813,0.015748
1,0.785714,1.0,0.012195,0.024096
2,0.799824,0.827586,0.097561,0.174545
3,0.80053,0.827586,0.097959,0.175182
4,0.747573,0.321739,0.15102,0.205556
