In [161]:
#import all necessary libraries
#supervised and unsupervised (not part of proposal) classification. FastText is used for both. For the flat, just using traditional ML models. 

import pandas as pd
import gensim

from sklearn.feature_extraction.text import TfidfVectorizer

# models
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# testing
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# pipeline 
from sklearn.pipeline import Pipeline

# metrics
from sklearn.metrics import classification_report


# pre-processing
import re
import string 

import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')
nltk.download('punkt')
punctuation = list(set(string.punctuation))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liviaclarete/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/liviaclarete/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [162]:
#get hierarchical_data

hierarchical_data = pd.read_csv('https://raw.githubusercontent.com/paulafortuna/Portuguese-Hate-Speech-Dataset/master/2019-05-28_portuguese_hate_speech_hierarchical_classification.csv')
print(hierarchical_data.shape)
hierarchical_data.head(2)

(5668, 80)


Unnamed: 0,text,Hate.speech,Sexism,Body,Racism,Ideology,Homophobia,Origin,Religion,Health,...,Thin.women,Arabic,East.europeans,Africans,South.Americans,Brazilians,Migrants,Homossexuals,Thin.people,Ageing
0,"""não come mel, morde marimbondo""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"não tem pinto, tem orgulho !",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [428]:
# ! pip install -U spacy
# ! python -m spacy download pt_core_news_sm

In [429]:
# ! pip install pt_core_news_lg
# import pt_core_news_lg
# nlp = spacy.load('pt_core_news_lg')

In [430]:
stopwords_manual = ['http?', 'não' , 'mais', 'is?o', 'es[st]?*', 'quan[dt]?', ' ', '\n', '...', 'de o', 'em o', 'rt', 'ter', 'pra', 'a o', 'q', '  ', '..', 'por 0', 'fazer', 'dizer', 'vc']

def preprocessing(text):
    l = []
    split_sentence = text.split()
    for word in split_sentence:
        if len(word) > 2 and word not in stopwords and word not in stopwords_manual and word not in punctuation:
            word = word.lower()
            word = re.sub('@[\w]+','',word) #remove usernames
            word = re.sub('"','',word) #remove quotes
            word = re.sub(',','',word) #remove comma
            word = re.sub('!','',word) #remove dote
            word = re.sub('\.','',word) #remove dote
            word = re.sub('-',' ',word) #remove dote
            word = re.sub(';',' ',word) #remove dote
            word = re.sub('\?',' ',word) #remove dote
            word = re.sub('/',' ',word).strip() #remove dote
            l.append(word)
    return ' '.join(l)

In [431]:
hierarchical_data['text'] = hierarchical_data.text.apply(preprocessing)
hierarchical_data['text'].head()

0                            come mel morde marimbondo
1                                        pinto orgulho
2          merda crepúsculo cinebiografia chuck norris
3                      tapa bundinha cotovelada costas
4    diminutivo inho acompanha trajetória homem ver...
Name: text, dtype: object

In [432]:
# selecting X and y variables
X = hierarchical_data.text
y = hierarchical_data["Hate.speech"]


In [433]:
# 10-fold crossvalidation (Chollet, 2017), combined with holdout validation

In [434]:
def model_evaluate(X, y, model, model_name, n_splits:int=10):
    # kfold with 10 splits split
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # set a pipeline
    pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer()),
        ('classifier', model)
    ])

    # metrics
    scoring = ['accuracy', 'f1', 'f1_micro', 'f1_macro', 'precision', 'recall']

    # cross validation
    scores = cross_validate(pipeline, X, y, scoring=scoring, cv=kf)
    
    # format the results into a dataframe
    results = pd.DataFrame(scores)
    results['model_name'] = model_name
    # save it into the results directory
#     results.to_csv(f'data/results/{model_name}.csv')
    
    return results

In [435]:
models = {
    'ComplementNB':ComplementNB(),
    'LogisticRegression':LogisticRegression(),
    'ForestClassifier':RandomForestClassifier(max_depth=4, n_estimators=150),
    'xgboost': xgb.XGBClassifier()
}

In [436]:
results_list = []
for k, v in models.items():
    results = model_evaluate(X, y, v, k)
    results_list.append(results)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [437]:
total_results = pd.concat(results_list)

In [438]:
total_results.groupby('model_name').mean()

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,test_f1,test_f1_micro,test_f1_macro,test_precision,test_recall
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ComplementNB,0.054601,0.006865,0.845978,0.61411,0.845978,0.758942,0.670826,0.56678
ForestClassifier,0.243105,0.017436,0.783345,0.0,0.783345,0.439256,0.0,0.0
LogisticRegression,0.200961,0.007065,0.868736,0.602591,0.868736,0.761986,0.873751,0.460096
xgboost,0.579769,0.015465,0.870147,0.638321,0.870147,0.779588,0.803925,0.530108


In [None]:
# Next steps
# # grid search

# # https://www.kaggle.com/code/carlosdg/xgboost-with-scikit-learn-pipeline-gridsearchcv/notebook
# model = xgb.XGBClassifier()

# pipeline = Pipeline([
#     ('standard_scaler', StandardScaler()), 
#     ('pca', PCA()), 
#     ('model', model)
# ])

# param_grid = {
#     'pca__n_components': [5, 10, 15, 20, 25, 30],
#     'model__max_depth': [2, 3, 5, 7, 10],
#     'model__n_estimators': [10, 100, 500],
# }

# grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='roc_auc')
# grid.fit(X_train, y_train)


In [None]:
# xgBoost input
# and we save the last layer before the classification to extract 50 dimensions as input to the xgBoost algorithm,7 which is a gradient boosting implementation from the Python library (Chen and Guestrin, 2016).
# or xgBoost, the default parameter setting has been used, except for ‘eta’ and ‘gamma’. In this case, we conducted a grid search combining several values of both (eta: 0, 0.3, 1; and gamma: 0.1,1, 10) in order to obtain the optimal eta and gamma settings.

In [None]:
# neural network

# Methods provided by Keras are then used to map each token in the input to an embedding.
# feature extraction: glove 300 dimentions
# pre-trained Glove word embeddings with 300 dimensions for Portuguese (Hartmann et al., 2017). 

# classification: lstm
# https://github.com/paulafortuna/twitter-hatespeech/blob/master/lstm.py

# The architecture contains an embedding Layer with the weights from the word embeddings 
# extraction procedure, an additional LSTM layer with 50 dimensions, and dropouts at the end of both layers.

# As loss function, we used binary crossentropy and for optimization Adam, 10 epochs and 128 for batch size. 