In [1]:
# ! export MACOSX_DEPLOYMENT_TARGET=10.9
# ! pip install extremetext

In [2]:
# ! pip install scikit-multilearn

In [3]:
# https://github.com/mwydmuch/extremeText
# https://arxiv.org/pdf/1810.11671v1.pdf

In [29]:
import pandas as pd
import extremeText
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.datasets import make_multilabel_classification

In [30]:
data = pd.read_csv('./dataset/2019-05-28_portuguese_hate_speech_hierarchical_classification.csv')
print(data.shape)
data.head(2)

(5668, 80)


Unnamed: 0,text,Hate.speech,Sexism,Body,Racism,Ideology,Homophobia,Origin,Religion,Health,...,Thin.women,Arabic,East.europeans,Africans,South.Americans,Brazilians,Migrants,Homossexuals,Thin.people,Ageing
0,"""não come mel, morde marimbondo""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"não tem pinto, tem orgulho !",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Preprocessing
Data should follow this format

In [185]:
train_data_format = """
__label__mariadb-galera __label__mariadb55-mariadb __label__mysql55-mysql mariadb mariadb mysql solaris vulnerability oracle mysql server users availability vectors keys oracle com technetwork topics security html http secunia com http www oracle com technetwork topics security http lists security announce msg00016 html http www oracle com technetwork topics security html http secunia com http www securityfocus security gentoo glsa xml mariadb-galera mariadb55-mariadb-devel ruby-mysql openshift-origin-cartridge-mysql rh-mariadb100-mariadb mariadb-apb-role query-mysql mariadb55-mariadb-test rh-mysql57-mysql rh-mariadb101-mariadb rh-mysql56-mysql mysql mysql-connector-java mariadb55-mariadb-bench mysql55-mysql mysql-apb-role mysql mariadb55-mariadb-server mysql-binuuid-rails rh-mysql80-mysql com.github.brandtg switchboard-mysql rh-mariadb102-mariadb mariadb mariadb55-mariadb rhn-solaris-bootstrap mariadb55-mariadb-libs
"""
# https://github.com/automated-library/ICPC_2022_Automated-Identification-of-Libraries-from-Vulnerability-Data/tree/main/extremeText/dataset

In [186]:
df = data.copy()

# replace "." with "-"
df.columns = df.columns.str.replace('.', '-')

cols = df.columns
cols = ['Hate-speech', 'Sexism', 'Body', 'Racism', 'Ideology', 'Homophobia', 'Origin', 'Religion', 'Health', 'OtherLifestyle', 'Aborting-women', 'Agnostic', 'Argentines', 'Asians', 'Autists', 'Black-Women', 'Blond-women', 'Brazilians-women', 'Chinese', 'Criminals', 'Egyptians', 'Fat-people', 'Football-players-women', 'Gamers', 'Homeless', 'Homeless-women', 'Indigenous', 'Iranians', 'Japaneses', 'Jews', 'Jornalists', 'Latins', 'Left-wing-ideology', 'Men-Feminists', 'Mexicans', 'Muslims-women', 'Nordestines', 'Old-people', 'Polyamorous', 'Poor-people', 'Rural-people', 'Russians', 'Sertanejos', 'Street-artist', 'Ucranians', 'Vegetarians', 'White-people', 'Young-people', 'Old-women', 'Ugly-people', 'Venezuelans', 'Angolans', 'Black-people', 'Disabled-people', 'Fat-women', 'Feminists', 'Gays', 'Immigrants', 'Islamists', 'Lesbians', 'Men', 'Muslims', 'Refugees', 'Trans-women', 'Travestis', 'Women', 'Bissexuals', 'Transexuals', 'Ugly-women', 'Thin-women', 'Arabic', 'East-europeans', 'Africans', 'South-Americans', 'Brazilians', 'Migrants', 'Homossexuals', 'Thin-people', 'Ageing']

def label_value(value, col):
    label = f'__label__{col}'
    if value == 1:
        return label
    else:
        return "None"
    
for i in cols:
    df[i] = df[i].apply(label_value, args=(i,))
    
df.Sexism.value_counts()

  df.columns = df.columns.str.replace('.', '-')


None               4996
__label__Sexism     672
Name: Sexism, dtype: int64

In [200]:
df['label_total'] = df[cols].agg(''.join, axis=1)
df['label_total'] = df['label_total'].str.replace("None", "")
df['label_total'] = df['label_total'].str.replace("__label", " __label")
df['label_total'] = df['label_total'].str.strip()

In [214]:
# looks like not all data has a label
# this function identify the one who got a label, and the one who don't
def identify_label(value):
    if "__label" in value:
        return 1
    else:
        return 0

In [241]:
df['label_binary'] = df['label_total'].apply(identify_label)
df['label_binary'].value_counts()

0    4440
1    1228
Name: label_binary, dtype: int64

In [245]:
df['extremeText_label'] = df['label_total'].astype(str) + df['text'].astype(str)

In [246]:
df['extremeText_label']

0                        "não come mel, morde marimbondo"
1                            não tem pinto, tem orgulho !
2       Não vê essa merda de Crepúsculo! Pra isso temo...
3       não da tapa na bundinha, da cotovelada nas cos...
4       __label__Hate-speech __label__Sexism __label__...
                              ...                        
5663    __label__Hate-speech __label__Homophobia __lab...
5664    __label__Hate-speech __label__Homophobia __lab...
5665    __label__Hate-speech __label__Homophobia __lab...
5666    __label__Hate-speech __label__Homophobia __lab...
5667    __label__Hate-speech __label__Homophobia __lab...
Name: extremeText_label, Length: 5668, dtype: object

In [265]:
labeled_data = list(df[df['label_binary'] == 1]['extremeText_label'].values)

In [267]:
with open('./dataset/train_hs.txt', 'a') as the_file:
    for item in labeled_data:
        the_file.write(f"{item}\n")

# Split dataset

In [7]:
# SIMPLE train, test split
# create a new one with this: http://scikit.ml/stratification.html
X = df.copy()
del X['Hate.speech']
del X['text']

y = df['Hate.speech']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
# train test split data
# https://datascience.stackexchange.com/questions/45174/how-to-use-sklearn-train-test-split-to-stratify-data-for-multi-label-classificat

In [17]:
X,Y = make_multilabel_classification(n_samples=300, n_classes=100, n_labels=10)


In [18]:
X.shape

(300, 20)

In [19]:
Y.shape

(300, 100)

In [20]:
X_train, y_train, X_test, y_test = iterative_train_test_split(X,Y,test_size=0.20)

In [21]:
X_train

array([[3., 2., 3., ..., 4., 3., 4.],
       [3., 1., 5., ..., 4., 2., 2.],
       [5., 1., 1., ..., 3., 5., 2.],
       ...,
       [2., 3., 3., ..., 5., 2., 2.],
       [2., 1., 2., ..., 3., 1., 3.],
       [1., 3., 4., ..., 4., 3., 4.]])

In [None]:
df

# Modeling
https://github.com/automated-library/ICPC_2022_Automated-Identification-of-Libraries-from-Vulnerability-Data/blob/main/extremeText/extremetext_train.py

In [268]:
# https://github.com/automated-library/ICPC_2022_Automated-Identification-of-Libraries-from-Vulnerability-Data/blob/main/extremeText/extremetext_train.py

def model_training(train_data):
    # train_supervised uses the same arguments and defaults as the fastText/extremeText cli

    print("Supervised Training")
    # default supervised training
    # model = extremeText.train_supervised(
    #     input=train_data, epoch=100, lr=1.0, wordNgrams=2, verbose=3, minCount=1,
    # )

    # paper supervised training
    model = extremeText.train_supervised(
        input=train_data, epoch=100, lr=0.05, verbose=3, wordNgrams=2, minCount=1, l2=0.003, arity=2, dim=100, tfidfWeights=True
    )
    model.save_model("xt_supervised.bin")

    # print("Quantization")
    #
    # model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000)
    #
    # model.save_model("model/xt_quantized.ftz")

In [269]:
# model_training(train_data = "./dataset/train.txt")

In [270]:
model_training(train_data = "./dataset/train_hs.txt")

Supervised Training


Training ...
  Model: sup, loss: softmax
  Features: TF-IDF weights, buckets: 2000000

  Update: SGD, lr: 0.05, L2: 0.003, dims: 100, epochs: 100, neg: 5
Reading input file ...
Read 0M words
Number of documents: 8387229874383523824
Number of words: 5887
Number of labels: 911
  Input: 2005887 x 100 (765M)
Setting up loss layer ...
  Output: 911 x 100 (0M)
Starting 12 threads ...
Progress: 100.0% words/sec/thread:   69283 lr:  0.000000 loss:  5.719331 ETA:   0h 0m
Saving model ...
