In [1]:
import os
import pandas as pd
import numpy as np
from spacy.lang.pt import Portuguese
from spacy.lang.pt import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv("comentarios_tratados.csv")
df.head(10)

Unnamed: 0,text,toxic
0,meu nivel de amizade com isis é ela ter meu in...,1.0
1,o cara adultera dados que foram desmascarados...,1.0
2,o cara só é simplesmente o maior vencedor d...,1.0
3,eu to chorando vei vsf e eu nem staneio izone ...,1.0
4,eleitor do bolsonaro é tão ignorante q não per...,1.0
5,vai responder as outras 75 conversas e para de...,1.0
6,tem um do jack com a msm música e agr não sei ...,0.0
7,mais é ruim pra pedir desafio esse técnico do ...,1.0
8,eu fico vendo isso e penso desvantagens do kp...,1.0
9,frio do caralho parece até q to dentro do teu ...,1.0


In [3]:
# df["toxic"] = df["homophobia"] + df["obscene"] + df["insult"] + df["racism"] + df["misogyny"] + df["xenophobia"]
# df.drop(['obscene','insult','racism','misogyny','xenophobia','homophobia'],axis=1,inplace=True)
# df.head()

In [4]:
# df.loc[df['toxic'] == 0, 'toxic'] = 0
# df.loc[df['toxic'] > 0, 'toxic'] = 1
# df.head()

In [5]:
import unicodedata
df['text'] = df['text'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ASCII','ignore').decode('ASCII'))
df.head()

Unnamed: 0,text,toxic
0,meu nivel de amizade com isis e ela ter meu in...,1.0
1,o cara adultera dados que foram desmascarados...,1.0
2,o cara so e simplesmente o maior vencedor d...,1.0
3,eu to chorando vei vsf e eu nem staneio izone ...,1.0
4,eleitor do bolsonaro e tao ignorante q nao per...,1.0


In [6]:
import re
def remove_special_characters(tweet):
    tweet = re.sub(r'http\S+', '', tweet)       # remove os links do tweet
    tweet = re.sub(r't.co\S+', '', tweet)       # remove os links 
    tweet = re.sub(r'#\S+', '', tweet)          # remove hashtags
    tweet = re.sub(r"[.]\s+", '', tweet)        # remove reticências
    emojis = re.compile("["                     # remove os emojis
                      u"\U0001F600-\U0001F64F"  
                      u"\U0001F300-\U0001F5FF"  
                      u"\U0001F680-\U0001F6FF"  
                      u"\U0001F1E0-\U0001F1FF"  
                      u"\U00002500-\U00002BEF"  
                      u"\U00002702-\U000027B0"
                      u"\U00002702-\U000027B0"
                      u"\U000024C2-\U0001F251"
                      u"\U0001f926-\U0001f937"
                      u"\U00010000-\U0010ffff"
                      u"\u2640-\u2642"
                      u"\u2600-\u2B55"
                      u"\u200d"
                      u"\u23cf"
                      u"\u23e9"
                      u"\u231a"
                      u"\ufe0f"  # dingbats
                      u"\u3030"
                      "]+", re.UNICODE)

    tweet = re.sub(emojis, '', tweet)               # remove os emojis
    tweet = tweet.replace('"', "")                  # remove as aspas
    tweet = re.sub("[-*!,$><:.+?=]", '', tweet)     # remove demais caracteres especiais
    tweet = re.sub(r'  ', ' ', tweet)               # remove espaços duplos
    return tweet.lower()

In [7]:
def write_replies(reply):
        tweet_text = remove_special_characters(reply)
        tweet_text = re.sub(r'@\S+', '', tweet_text)
        # Quando houver retweet
        if (tweet_text[0] == 'r'):
            tweet_text = tweet_text[4:]
        return tweet_text

In [8]:
df['text'] = df['text'].apply(write_replies)
df.head()

Unnamed: 0,text,toxic
0,meu nivel de amizade com isis e ela ter meu in...,1.0
1,o cara adultera dados que foram desmascarados...,1.0
2,o cara so e simplesmente o maior vencedor da...,1.0
3,eu to chorando vei vsf e eu nem staneio izone ...,1.0
4,eleitor do bolsonaro e tao ignorante q nao per...,1.0


In [9]:
# Bruxaria 1
import hunspell

def tokenize(sentence):
    tokens_regex = re.compile(r"([., :;\n()\"!?\/&%+])", flags=re.IGNORECASE)
    tokens = re.split(tokens_regex, sentence)
    postprocess = []
    postprocess_regex = re.compile(r"\b(\w+)-(me|te|se|nos|vos|o|os|a|as|lo|los|la|las|lhe|lhes|lha|lhas|lho|lhos|no|na|nas|mo|ma|mos|mas|to|ta|tos|tas)\b", flags=re.IGNORECASE)
    for token in tokens:
        for token2 in re.split(postprocess_regex, token):
            if token2.strip():
                postprocess.append(token2)

    return postprocess

h = hunspell.Hunspell("pt_PT", hunspell_data_dir="./hunspell-pt_PT-20210608/")

In [10]:
def lemmetization(x):
    tokens = tokenize(x)
    lemmas = []
    for token in tokens:
        lemma = h.stem(token)
        if len(lemma) == 1:
            lemmas.append(lemma[0])
        else:
            lemmas.append(token)

    return " ".join(lemmas)

In [11]:
df['text'] = df['text'].apply(lambda x: lemmetization(x))
df.sample(20)

Unnamed: 0,text,toxic
3712,todo os dia cair pelo menos 10 contas de amigo...,0.0
8944,quando voce este feliz voce apreciar a musicam...,0.0
11679,achar ridiculo pessoa sem maturidade senso e p...,1.0
15404,pqp viu,0.0
17980,eu to muito ansioso pqp terca chega logo caraio,0.0
11266,ontem a noite encontrar alguem do tempo do esc...,0.0
13128,chupar esse picole r800 chupar meu pau diamant...,1.0
15549,a garotinha tem 8ano um puta gosto musical e u...,0.0
9095,tao duro rezar pra chover vai toma no cu pra l...,1.0
17591,ate um dia atras tinha falar que o jk so sair ...,0.0


In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/augusto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Remoção das stopwords
from nltk.corpus import stopwords
stopwords = set(stopwords.words('portuguese'))

text_treated = []

for reply in df.text:
    word_list = reply.split()

    for word in word_list:
        if(word in stopwords):
            word_list.remove(word)
    word_list = " ".join(word_list)
    text_treated.append(word_list)

i = 0
for row in df.iterrows():
    df.iloc[i, 0] = text_treated[i]
    i = i + 1

In [14]:
stopwords

{'a',
 'ao',
 'aos',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aquilo',
 'as',
 'até',
 'com',
 'como',
 'da',
 'das',
 'de',
 'dela',
 'delas',
 'dele',
 'deles',
 'depois',
 'do',
 'dos',
 'e',
 'ela',
 'elas',
 'ele',
 'eles',
 'em',
 'entre',
 'era',
 'eram',
 'essa',
 'essas',
 'esse',
 'esses',
 'esta',
 'estamos',
 'estas',
 'estava',
 'estavam',
 'este',
 'esteja',
 'estejam',
 'estejamos',
 'estes',
 'esteve',
 'estive',
 'estivemos',
 'estiver',
 'estivera',
 'estiveram',
 'estiverem',
 'estivermos',
 'estivesse',
 'estivessem',
 'estivéramos',
 'estivéssemos',
 'estou',
 'está',
 'estávamos',
 'estão',
 'eu',
 'foi',
 'fomos',
 'for',
 'fora',
 'foram',
 'forem',
 'formos',
 'fosse',
 'fossem',
 'fui',
 'fôramos',
 'fôssemos',
 'haja',
 'hajam',
 'hajamos',
 'havemos',
 'hei',
 'houve',
 'houvemos',
 'houver',
 'houvera',
 'houveram',
 'houverei',
 'houverem',
 'houveremos',
 'houveria',
 'houveriam',
 'houvermos',
 'houverá',
 'houverão',
 'houveríamos',
 'houvesse',


In [15]:
# Remove as colunas caso ocorra algum NaN
df.dropna(inplace=True)
df.describe()

Unnamed: 0,toxic
count,21000.0
mean,0.440714
std,0.496485
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [16]:
# Define qual coluna cada variável receberá
x = df.text
y = df.toxic

In [17]:
x.values.shape

(21000,)

In [18]:
from joblib import dump, load

In [19]:
# O CountVectorizer conta quantas vezes cada palavra aparece em cada text
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=3000)
x_vectorized = vectorizer.fit_transform(x.values)

In [20]:
vectorizer_fitted = vectorizer.fit(x.values)
dump(vectorizer_fitted, '../visualization/models/Vectorizer_XGBoost.joblib')

['../visualization/models/Vectorizer_XGBoost.joblib']

In [43]:
model = XGBClassifier(use_label_encoder=False)
targets = y_train.values
scores = cross_validate(model, x_train, y_train, cv=10, 
                        scoring=('f1', 'accuracy','precision'), return_train_score=True);



In [42]:
print(f"F1 = {scores['test_f1'].mean()}")
print(f"acc = {scores['test_accuracy'].mean()}")
print(f"precision = {scores['test_precision'].mean()}")

F1 = 0.6991580877977952
acc = 0.7404761904761905
precision = 0.7149470563051668


In [44]:
model.fit(x_vectorized,y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [46]:
model
dump(model, '../visualization/models/XGBoost.joblib')

['../visualization/models/XGBoost.joblib']

In [None]:
kern = 'rbf'
model = SVC(kernel=kern)
model.fit(x_vectorized, y)
dump(model, f'../visualization/models/SVM_kernel_{kern}.joblib')

In [None]:
kern = 'linear'
model = SVC(kernel=kern)
model.fit(x_vectorized, y)
dump(model, f'../visualization/models/SVM_kernel_{kern}.joblib')

In [None]:
kern = 'sigmoid'
model = SVC(kernel=kern)
model.fit(x_vectorized, y)
dump(model, f'../visualization/models/SVM_kernel_{kern}.joblib')

In [None]:
def retira(sentence):
    new = [palavra for palavra in sentence.split() if palavra not in stopwords]
    return " ".join(new)

In [None]:
df.text = df.text.apply(retira)

In [None]:
x = df.text
y = df.toxic

In [None]:
vectorizer = CountVectorizer()
x_vectorized = vectorizer.fit_transform(x.values)

In [None]:
x_vectorized

In [22]:
x_train,x_test,y_train,y_test = train_test_split(x_vectorized, y, test_size=0.2)

In [None]:
model = SVC(kernel='rbf')
targets = y_train.values
scores = cross_validate(model, x_train, y_train, cv=10, 
                        scoring=('f1', 'accuracy','precision'), return_train_score=True)
print(f"F1 = {scores['test_f1'].mean()}")
print(f"acc = {scores['test_accuracy'].mean()}")
print(f"precision = {scores['test_precision'].mean()}")

In [None]:
model = SVC(kernel='linear')
targets = y_train.values
scores = cross_validate(model, x_train, y_train, cv=10, 
                        scoring=('f1', 'accuracy','precision'), return_train_score=True)
print(f"F1 = {scores['test_f1'].mean()}")
print(f"acc = {scores['test_accuracy'].mean()}")
print(f"precision = {scores['test_precision'].mean()}")

In [None]:
model = SVC(kernel='sigmoid')
targets = y_train.values
scores = cross_validate(model, x_train, y_train, cv=10, 
                        scoring=('f1', 'accuracy','precision'), return_train_score=True)
print(f"F1 = {scores['test_f1'].mean()}")
print(f"acc = {scores['test_accuracy'].mean()}")
print(f"precision = {scores['test_precision'].mean()}")

In [None]:
model = SVC(kernel='poly', degree=2)
targets = y_train.values
scores = cross_validate(model, x_train, y_train, cv=10, 
                        scoring=('f1', 'accuracy','precision'), return_train_score=True)
print(f"F1 = {scores['test_f1'].mean()}")
print(f"acc = {scores['test_accuracy'].mean()}")
print(f"precision = {scores['test_precision'].mean()}")

In [None]:
model = SVC(kernel='poly', degree=3)
targets = y_train.values
scores = cross_validate(model, x_train, y_train, cv=10, 
                        scoring=('f1', 'accuracy','precision'), return_train_score=True)
print(f"F1 = {scores['test_f1'].mean()}")
print(f"acc = {scores['test_accuracy'].mean()}")
print(f"precision = {scores['test_precision'].mean()}")