In [1]:
# ! export MACOSX_DEPLOYMENT_TARGET=10.9
# ! pip install extremetext

In [1]:
# ! pip install scikit-multilearn

In [2]:
# https://github.com/mwydmuch/extremeText
# https://arxiv.org/pdf/1810.11671v1.pdf

In [321]:
import pandas as pd
import extremeText
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.datasets import make_multilabel_classification
import re
import nltk
from nltk.corpus import stopwords
pt_stopwords = stopwords.words('portuguese')

In [322]:
df = pd.read_csv('../data/2019-05-28_portuguese_hate_speech_hierarchical_classification.csv')
print(df.shape)
df.head(2)

(5668, 80)


Unnamed: 0,text,Hate.speech,Sexism,Body,Racism,Ideology,Homophobia,Origin,Religion,Health,...,Thin.women,Arabic,East.europeans,Africans,South.Americans,Brazilians,Migrants,Homossexuals,Thin.people,Ageing
0,"""não come mel, morde marimbondo""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"não tem pinto, tem orgulho !",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Preprocessing

In [328]:
stopwords_manual = ['http?', 'mais', 'is?o', 'es[st]?*', 'quan[dt]?', ' ', '\n', '...', 'de o', 'em o', 'rt', 'ter', 'pra', 'a o', 'q', '  ', '..', 'por 0', 'fazer', 'dizer', 'vc']

def preprocessing(text):
    l = []
#     text = ' '.join([w.lemma_ for w in nlp(text)]) #This lemma also performs tokenization)
    split_sentence = text.split()
    for word in split_sentence:
        if len(word) > 3 and word not in pt_stopwords and word not in stopwords_manual: # and word not in punctuation:
            word = word.lower()
            word = re.sub('@[\w]+','',word) #remove usernames
            word = re.sub('"','',word) #remove quotes
            word = re.sub(',','',word) #remove puntuation
            word = re.sub('!','',word) #remove puntuation
            word = re.sub('\.','',word) #remove puntuation
            word = re.sub('-',' ',word) #remove puntuation
            word = re.sub(';',' ',word) #remove puntuation
            word = re.sub('\?',' ',word) #remove puntuation
            word = re.sub('/',' ',word).strip() #remove puntuation

            l.append(word)
    return ' '.join(l)

# def preprocessing(text):

#     text = re.sub('@[\w]+','',text) #remove usernames
#     text = re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', '', text) #remove links
#     text = [w.lemma_ for w in nlp(text)] #This lemma also performs tokenization
#     text = [word for word in text if word not in punctuation]
#     text = [w.lower() for w in text]
#     text = [word for word in text if word not in stopwords and word not in stopwords_manual]
#     text = ' '.join([str(word) for word in text])

#     return text

In [329]:
df['text'] = df.text.apply(preprocessing)

In [330]:
# Data should follow this format from extreme text

train_data_format = """
__label__mariadb-galera __label__mariadb55-mariadb __label__mysql55-mysql mariadb mariadb mysql solaris vulnerability oracle mysql server users availability vectors keys oracle com technetwork topics security html http secunia com http www oracle com technetwork topics security http lists security announce msg00016 html http www oracle com technetwork topics security html http secunia com http www securityfocus security gentoo glsa xml mariadb-galera mariadb55-mariadb-devel ruby-mysql openshift-origin-cartridge-mysql rh-mariadb100-mariadb mariadb-apb-role query-mysql mariadb55-mariadb-test rh-mysql57-mysql rh-mariadb101-mariadb rh-mysql56-mysql mysql mysql-connector-java mariadb55-mariadb-bench mysql55-mysql mysql-apb-role mysql mariadb55-mariadb-server mysql-binuuid-rails rh-mysql80-mysql com.github.brandtg switchboard-mysql rh-mariadb102-mariadb mariadb mariadb55-mariadb rhn-solaris-bootstrap mariadb55-mariadb-libs
"""
# https://github.com/automated-library/ICPC_2022_Automated-Identification-of-Libraries-from-Vulnerability-Data/tree/main/extremeText/dataset

In [331]:
# replace "." with "-"
df.columns = df.columns.str.replace('.', '-')

cols = df.columns
cols = ['Hate-speech', 'Sexism', 'Body', 'Racism', 'Ideology', 'Homophobia', 'Origin', 'Religion', 'Health', 'OtherLifestyle', 'Aborting-women', 'Agnostic', 'Argentines', 'Asians', 'Autists', 'Black-Women', 'Blond-women', 'Brazilians-women', 'Chinese', 'Criminals', 'Egyptians', 'Fat-people', 'Football-players-women', 'Gamers', 'Homeless', 'Homeless-women', 'Indigenous', 'Iranians', 'Japaneses', 'Jews', 'Jornalists', 'Latins', 'Left-wing-ideology', 'Men-Feminists', 'Mexicans', 'Muslims-women', 'Nordestines', 'Old-people', 'Polyamorous', 'Poor-people', 'Rural-people', 'Russians', 'Sertanejos', 'Street-artist', 'Ucranians', 'Vegetarians', 'White-people', 'Young-people', 'Old-women', 'Ugly-people', 'Venezuelans', 'Angolans', 'Black-people', 'Disabled-people', 'Fat-women', 'Feminists', 'Gays', 'Immigrants', 'Islamists', 'Lesbians', 'Men', 'Muslims', 'Refugees', 'Trans-women', 'Travestis', 'Women', 'Bissexuals', 'Transexuals', 'Ugly-women', 'Thin-women', 'Arabic', 'East-europeans', 'Africans', 'South-Americans', 'Brazilians', 'Migrants', 'Homossexuals', 'Thin-people', 'Ageing']

def label_value(value, col):
    label = f'__label__{col}'
    if value == 1:
        return label
    else:
        return "None"
    
for i in cols:
    df[i] = df[i].apply(label_value, args=(i,))
    
df.Sexism.value_counts()

  df.columns = df.columns.str.replace('.', '-')


None               4996
__label__Sexism     672
Name: Sexism, dtype: int64

In [332]:
df['label_total'] = df[cols].agg(''.join, axis=1)
df['label_total'] = df['label_total'].str.replace("None", "")
df['label_total'] = df['label_total'].str.replace("__label", " __label")
df['label_total'] = df['label_total'].str.strip()

In [333]:
# looks like not all data has a label
# this function identify the one who got a label, and the one who don't
def identify_label(value):
    if "__label" in value:
        return 1
    else:
        return 0

In [334]:
df['label_binary'] = df['label_total'].apply(identify_label)
df['label_binary'].value_counts()

0    4440
1    1228
Name: label_binary, dtype: int64

In [335]:
df['extremeText_label'] = df['label_total'].astype(str) + df['text'].astype(str)

In [336]:
df['extremeText_label']

0                                   come morde marimbondo
1                                           pinto orgulho
2             merda crepúsculo cinebiografia chuck norris
3                         tapa bundinha cotovelada costas
4       __label__Hate-speech __label__Sexism __label__...
                              ...                        
5663    __label__Hate-speech __label__Homophobia __lab...
5664    __label__Hate-speech __label__Homophobia __lab...
5665    __label__Hate-speech __label__Homophobia __lab...
5666    __label__Hate-speech __label__Homophobia __lab...
5667    __label__Hate-speech __label__Homophobia __lab...
Name: extremeText_label, Length: 5668, dtype: object

In [339]:
print(len(labeled_data))
print(len(non_labeled_data))

1228
4440


In [340]:
labeled_data = list(df[df['label_binary'] == 1]['extremeText_label'].values)
non_labeled_data = list(df[df['label_binary'] == 0]['extremeText_label'].values)

train_data = labeled_data[0:1000] +  non_labeled_data[0:3140]
test_data = labeled_data[1000:] +  non_labeled_data[3140:]


with open('./data/train_hs.txt', 'a') as the_file:
    for item in train_data:
        the_file.write(f"{item}\n")
        
with open('./data/test_hs.txt', 'a') as the_file:
    for item in test_data:
        the_file.write(f"{item}\n")

# Split dataset

In [341]:
# SIMPLE train, test split
# create a new one with this: http://scikit.ml/stratification.html
# X = df.copy()
# del X['Hate.speech']
# del X['text']

# y = df['Hate.speech']

# X_train, X_test, y_train, y_test = train_test_split(X, y)

In [342]:
# train test split data
# https://datascience.stackexchange.com/questions/45174/how-to-use-sklearn-train-test-split-to-stratify-data-for-multi-label-classificat

In [343]:
X,Y = make_multilabel_classification(n_samples=300, n_classes=100, n_labels=10)

In [344]:
X.shape

(300, 20)

In [345]:
Y.shape

(300, 100)

In [346]:
X_train, y_train, X_test, y_test = iterative_train_test_split(X,Y,test_size=0.20)

In [347]:
X_train

array([[1., 4., 3., ..., 3., 2., 2.],
       [0., 6., 2., ..., 7., 0., 4.],
       [2., 1., 2., ..., 1., 4., 3.],
       ...,
       [2., 1., 4., ..., 3., 2., 2.],
       [2., 9., 2., ..., 0., 2., 2.],
       [4., 1., 7., ..., 7., 3., 5.]])

# Modeling
https://github.com/automated-library/ICPC_2022_Automated-Identification-of-Libraries-from-Vulnerability-Data/blob/main/extremeText/extremetext_train.py

In [348]:
# https://github.com/automated-library/ICPC_2022_Automated-Identification-of-Libraries-from-Vulnerability-Data/blob/main/extremeText/extremetext_train.py

def model_training(train_data):
    # train_supervised uses the same arguments and defaults as the fastText/extremeText cli

    print("Supervised Training")
    # default supervised training
    # model = extremeText.train_supervised(
    #     input=train_data, epoch=100, lr=1.0, wordNgrams=2, verbose=3, minCount=1,
    # )

    # paper supervised training
    model = extremeText.train_supervised(
        input=train_data, epoch=300, lr=0.05, verbose=3, wordNgrams=2, minCount=1, l2=0.003, arity=2, dim=100, tfidfWeights=True
    )
    model.save_model("./model/xt_supervised.bin")
    return model

    # print("Quantization")
    #
    # model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000)
    #
    # model.save_model("model/xt_quantized.ftz")

In [349]:
# model_training(train_data = "./dataset/train.txt")

In [350]:
model = model_training(train_data = "./data/train_hs.txt")

Supervised Training


Training ...
  Model: sup, loss: softmax
  Features: TF-IDF weights, buckets: 2000000

  Update: SGD, lr: 0.050000, L2: 0.003000, dims: 100, epochs: 300, neg: 5
Reading input file ...
Read 0M words
Number of documents: 45980
Number of words: 23613
Number of labels: 1533
  Input: 2023613 x 100 (771M)
Setting up loss layer ...
  Output: 1533 x 100 (0M)
Starting 12 threads ...
Progress:  89.6% words/sec/thread:   80319 lr:  0.005221 loss:  2.069875 ETA:   0h 0m200 loss:  5.058578 ETA:   0h 2ms/sec/thread:   82421 lr:  0.044019 loss:  4.996305 ETA:   0h 2mlr:  0.043788 loss:  4.890483 ETA:   0h 2m12.6% words/sec/thread:   80965 lr:  0.043681 loss:  4.837677 ETA:   0h 2m.757978 ETA:   0h 2m:   80652 lr:  0.043338 loss:  4.670217 ETA:   0h 2m   0h 2m  80734 lr:  0.041777 loss:  4.266115 ETA:   0h 2mloss:  4.141429 ETA:   0h 2m 0.040655 loss:  4.060303 ETA:   0h 2m   0h 1ms/sec/thread:   80367 lr:  0.013803 loss:  2.229556 ETA:   0h 0mA:   0h 0mlr:  0.013476 loss:  2.219936 ETA:   0h 0ms/sec/

In [351]:
model.predict('O mundo das sapatao é mais ligado')

626 ETA:   0h 0mProgress:  89.6% words/sec/thread:   80316 lr:  0.005204 loss:  2.069346 ETA:   0h 0mProgress:  89.6% words/sec/thread:   80325 lr:  0.005182 loss:  2.068992 ETA:   0h 0mProgress:  89.7% words/sec/thread:   80324 lr:  0.005167 loss:  2.068717 ETA:   0h 0mProgress:  89.7% words/sec/thread:   80309 lr:  0.005159 loss:  2.068431 ETA:   0h 0mProgress:  89.7% words/sec/thread:   80309 lr:  0.005142 loss:  2.068134 ETA:   0h 0mProgress:  89.8% words/sec/thread:   80340 lr:  0.005107 loss:  2.067903 ETA:   0h 0mProgress:  89.8% words/sec/thread:   80337 lr:  0.005093 loss:  2.067703 ETA:   0h 0mProgress:  89.8% words/sec/thread:   80335 lr:  0.005078 loss:  2.067493 ETA:   0h 0mProgress:  89.9% words/sec/thread:   80331 lr:  0.005064 loss:  2.067251 ETA:   0h 0mProgress:  89.9% words/sec/thread:   80331 lr:  0.005047 loss:  2.067005 ETA:   0h 0mProgress:  89.9% words/sec/thread:   80316 lr:  0.005040 loss:  2.066681 ETA:   0h 0mProgress:  90.0% words/sec/thread:   

(('__label__Hate-speech',), array([-0.36828128]))

Progress: 100.0% words/sec/thread:   80270 lr:  0.000000 loss:  1.995195 ETA:   0h 0m
Saving model ...


In [353]:
model.test('./data/test_hs.txt')

Test ...
  Model: sup, loss: softmax
  Features: TF-IDF weights, buckets: 2000000

  Update: SGD, lr: 0.050000, L2: 0.003000, dims: 100, epochs: 300, neg: 5


(3164, 0.9860935524652339, 0.2243797195253506, 0.07110241356816699)

In [355]:
model.test('./data/test_hs.txt', k=3)

Test ...
  Model: sup, loss: softmax
  Features: TF-IDF weights, buckets: 2000000

  Update: SGD, lr: 0.050000, L2: 0.003000, dims: 100, epochs: 300, neg: 5


(3164, 0.9632321955330805, 0.6575332614167566, 0.20939334637964774)

In [356]:
model.test('./data/test_hs.txt', k=5)

Test ...
  Model: sup, loss: softmax
  Features: TF-IDF weights, buckets: 2000000

  Update: SGD, lr: 0.050000, L2: 0.003000, dims: 100, epochs: 300, neg: 5


(3164, 0.7428571428571429, 0.8451636102121539, 0.25701239399869535)

In [293]:
model.predict('bolsonaro')

(('__label__WomenTem',), array([-6.70372248]))

In [294]:
len(model.get_words())

18418

In [295]:
l_4words = []
for word in model.get_words():
    if len(word) > 3:
        l_4words.append(word)

In [296]:
len(l_4words)

17034

In [297]:
l_4words

['</s>',
 'https',
 'mulher',
 'para',
 'mais',
 'como',
 'burra',
 'gorda',
 'pode',
 'homem',
 'isso',
 'está',
 'fufas',
 'feia',
 'essa',
 'quem',
 'gente',
 'http',
 'sapatão',
 'mesmo',
 'minha',
 'muito',
 'contra',
 'aqui',
 'quando',
 '#PNR',
 'refugiados',
 'fazer',
 'mulheres',
 'quer',
 'esse',
 'sobre',
 'tudo',
 'você',
 'coisa',
 'cara',
 'ainda',
 'pelo',
 'agora',
 '@homemdeverdade',
 '#MulherDeVerdade',
 'mundo',
 'nada',
 'gosta',
 'racismo',
 '#Portugal',
 '#RenovarPortugal',
 'pela',
 'eles',
 'sabe',
 'sempre',
 'sapatao',
 'assim',
 'menos',
 'pessoas',
 'acha',
 'porque',
 'toda',
 'hoje',
 '@jpintocoelho60',
 'tenho',
 'nunca',
 'seus',
 'dizer',
 'fala',
 'fica',
 'Trump',
 'todo',
 'feia,',
 '@rpsantos1970',
 'fosse',
 'melhor',
 'falar',
 'vida',
 'nosso',
 'branco',
 'todos',
 'anos',
 'estão',
 'quero',
 'Como',
 'depois',
 'homens',
 'país',
 'feminista',
 'hora',
 'acho',
 'gorda,',
 'esta',
 '@direitafalando',
 '#JoaquinResponde',
 'negro',
 'esquerda',