In [1]:
import pandas as pd
import gensim

# pre-processing
from sklearn.feature_extraction.text import TfidfVectorizer
# Text preprocessing: Portuguese stopword list from NLTK
import nltk
stopwords = nltk.corpus.stopwords.words('portuguese')

# models
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# testing
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# pipeline 
from sklearn.pipeline import Pipeline

# metrics
from sklearn.metrics import classification_report

In [2]:
# Load the Portuguese hate-speech CSV from the local data directory.
df = pd.read_csv("./data/2019-05-28_portuguese_hate_speech_hierarchical_classification.csv")
# Sanity check: report (rows, columns) and preview the first two rows.
print(df.shape)
df.head(2)

(5668, 80)


Unnamed: 0,text,Hate.speech,Sexism,Body,Racism,Ideology,Homophobia,Origin,Religion,Health,...,Thin.women,Arabic,East.europeans,Africans,South.Americans,Brazilians,Migrants,Homossexuals,Thin.people,Ageing
0,"""não come mel, morde marimbondo""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"não tem pinto, tem orgulho !",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# remove punctuation using the default string library  

# transform all tokens in the tweets to lower case.
def preprocessing(text):
    """Normalize a tweet: lower-case it and strip ASCII punctuation.

    Args:
        text: Raw text of a single tweet.

    Returns:
        The lower-cased text with every character listed in
        ``string.punctuation`` removed. Whitespace is left untouched.
    """
    import string  # local import: the notebook's import cell does not load it

    # The earlier version lower-cased the text but never returned it,
    # so every caller received None.
    lowered = text.lower()
    return lowered.translate(str.maketrans('', '', string.punctuation))

In [4]:
# Features are the tweet texts; the target is the "Hate.speech" column.
X, y = df["text"], df["Hate.speech"]

In [5]:
# 10-fold cross-validation (Chollet, 2017), combined with holdout validation

In [6]:
def model_evaluate(X, y, model, model_name, n_splits:int=10):
    """Cross-validate a TF-IDF + classifier pipeline and tabulate the fold scores.

    Args:
        X: Text documents fed to the TF-IDF vectorizer.
        y: Target labels aligned with ``X``.
        model: Estimator used as the pipeline's final ``classifier`` step.
        model_name: Label written to the ``model_name`` column of the result.
        n_splits: Number of stratified folds (default 10).

    Returns:
        A DataFrame with one row per fold holding the ``cross_validate``
        output (timings plus accuracy, F1, micro-F1 and macro-F1 test
        scores) and a ``model_name`` column.
    """
    # Vectorization lives inside the pipeline, so it is fitted together
    # with the classifier on each training split.
    text_clf = Pipeline(steps=[
        ('vectorizer', TfidfVectorizer()),
        ('classifier', model),
    ])

    # Shuffled, stratified folds with a fixed seed.
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    fold_scores = cross_validate(
        text_clf,
        X,
        y,
        scoring=['accuracy', 'f1', 'f1_micro', 'f1_macro'],
        cv=folds,
    )

    # Persisting the table is currently disabled:
    #     results.to_csv(f'data/results/{model_name}.csv')

    # One row per fold, each tagged with the model's name.
    return pd.DataFrame(fold_scores).assign(model_name=model_name)

In [7]:
# Candidate classifiers, keyed by the name used in the results table.
# The random forest gets a fixed seed so that a fresh "Restart & Run All"
# reproduces the same cross-validation scores.
models = {
    'ComplementNB':ComplementNB(),
    'LogisticRegression':LogisticRegression(),
    'ForestClassifier':RandomForestClassifier(max_depth=4, n_estimators=150, random_state=42),
    'xgboost': xgb.XGBClassifier()
}

In [8]:
# Evaluate every candidate model and collect one per-fold score table each.
results_list = [
    model_evaluate(X, y, estimator, name)
    for name, estimator in models.items()
]

In [9]:
# Stack the per-model fold tables into a single frame.
total_results = pd.concat(results_list)

In [10]:
# Average each column over the folds, separately for every model.
total_results.groupby('model_name').mean()

Unnamed: 0_level_0,fit_time,score_time,test_accuracy,test_f1,test_f1_micro,test_f1_macro
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ComplementNB,0.078757,0.008462,0.848625,0.521511,0.848625,0.715797
ForestClassifier,0.270319,0.018754,0.783345,0.0,0.783345,0.439256
LogisticRegression,0.186008,0.008745,0.865031,0.589457,0.865031,0.754347
xgboost,0.850224,0.02023,0.874913,0.657073,0.874913,0.790285


In [11]:
# # grid search

# # https://www.kaggle.com/code/carlosdg/xgboost-with-scikit-learn-pipeline-gridsearchcv/notebook
# model = xgb.XGBClassifier()

# pipeline = Pipeline([
#     ('standard_scaler', StandardScaler()), 
#     ('pca', PCA()), 
#     ('model', model)
# ])

# param_grid = {
#     'pca__n_components': [5, 10, 15, 20, 25, 30],
#     'model__max_depth': [2, 3, 5, 7, 10],
#     'model__n_estimators': [10, 100, 500],
# }

# grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='roc_auc')
# grid.fit(X_train, y_train)


In [70]:
# xgBoost input
# ... and we save the last layer before the classification to extract 50 dimensions as input to the xgBoost algorithm, which is a gradient boosting implementation from the Python library (Chen and Guestrin, 2016).
# For xgBoost, the default parameter setting has been used, except for ‘eta’ and ‘gamma’. In this case, we conducted a grid search combining several values of both (eta: 0, 0.3, 1; and gamma: 0.1, 1, 10) in order to obtain the optimal eta and gamma settings.

In [72]:
# neural network

# Methods provided by Keras are then used to map each token in the input to an embedding.
# feature extraction: GloVe, 300 dimensions
# pre-trained Glove word embeddings with 300 dimensions for Portuguese (Hartmann et al., 2017). 

# classification: lstm
# https://github.com/paulafortuna/twitter-hatespeech/blob/master/lstm.py

# The architecture contains an embedding Layer with the weights from the word embeddings 
# extraction procedure, an additional LSTM layer with 50 dimensions, and dropouts at the end of both layers.

# As loss function, we used binary crossentropy and for optimization Adam, 10 epochs and 128 for batch size. 