# Hate Speech Detector 2.0
---
**Data row-wise (or tweet-wise) binder and data duplicator**
1. Load and adapt all desired data.<br />
    **WARNING 1.: All the data must have the same length (i.e. same number of tweets)!**<br />
    **WARNING 2.: At least one dataset must contain the class assignments (labels)!**
2. Combine the datasets row-wise, tweet-to-tweet.
3. Perform cardinality analysis for tweet class combinations.
4. For those class combinations whose cardinality is lower than the desired threshold (e.g. min. 10 tweets per class combination), perform:
    1. Select tweets relevant for certain class combination.
    2. Randomly select the appropriate number of tweets to fill up to the desired threshold (e.g. if there are 2 examples, randomly take 10-2=8 tweets).
    3. Append selected tweets to combined dataset.
5. Save the duplicated dataset to a .csv file.

In [1]:
import numpy as np
import pandas as pd

import random

In [2]:
# Names of the class-label columns; a "class combination" is one tuple of
# values across all of these columns.
LABELS = ['wyz_label', 'groz_label', 'wyk_label', 'odcz_label', 'pon_label', 'styg_label', 'szan_label']
# Minimum number of tweets every class combination should have after duplication.
THRESHOLD = 10

## Loading desired data with adaptation

In [3]:
# PAC scores per tweet; this file also carries the 'id', 'tweet' and class-label columns.
df_pac_scores = pd.read_csv('data/sady_main/sady_pac_scores.csv')

In [4]:
# Set the class-label columns aside before they are removed from the PAC
# scores frame; they are re-attached as the last columns of the combined data.
df_labels = df_pac_scores[LABELS]
df_labels.head(2)

Unnamed: 0,wyz_label,groz_label,wyk_label,odcz_label,pon_label,styg_label,szan_label
0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0


In [5]:
# Strip the raw tweet text and the class-label columns so that only 'id' and
# the PAC score features remain. The label names come from LABELS rather than
# being typed out again, so there is a single source of truth for them.
df_pac_scores = df_pac_scores.drop(columns=['tweet', *LABELS])
df_pac_scores.head(2)

Unnamed: 0,id,wyz_PAC_min,wyz_PAC_mean,wyz_PAC_max,groz_PAC_min,groz_PAC_mean,groz_PAC_max,wyk_PAC_min,wyk_PAC_mean,wyk_PAC_max,...,odcz_PAC_max,pon_PAC_min,pon_PAC_mean,pon_PAC_max,styg_PAC_min,styg_PAC_mean,styg_PAC_max,szan_PAC_min,szan_PAC_mean,szan_PAC_max
0,0,-0.000129,0.002574,0.005277,-0.002833,-0.002833,-0.002833,0.0,0.0,0.0,...,0.0,0.00044,0.00044,0.00044,-0.000614,0.000698,0.00201,0.0,0.0,0.0
1,1,0.0,0.0,0.0,-0.000527,-0.000527,-0.000527,-0.001766,-0.001766,-0.001766,...,-0.000751,-0.004546,-0.004546,-0.004546,9.7e-05,9.7e-05,9.7e-05,0.0,0.0,0.0


In [6]:
# Load the auxiliary per-tweet scores and keep only their feature columns:
# 'id' and 'tweet' are removed here ('id' is kept in the PAC scores frame).
df_other_scores = (
    pd.read_csv('data/sady_main/sady_other_scores.csv')
    .drop(columns=['id', 'tweet'])
)
df_other_scores.head(2)

Unnamed: 0,s_neg,s_neu,s_pos,n_chars,n_sylls,n_words,nu_words,nl_chars,nl_sylls,nl_words,nlu_words
0,0,18,0,95,36,17,16,98,35,20,19
1,0,9,0,42,17,9,9,68,28,15,14


In [7]:
# Load the topic PAC scores and keep only their feature columns:
# 'id' and 'tweet' are removed here ('id' is kept in the PAC scores frame).
df_topic_pac_scores = (
    pd.read_csv('data/sady_main/sady_topic_pac_scores.csv')
    .drop(columns=['id', 'tweet'])
)
df_topic_pac_scores.head(2)

Unnamed: 0,wyz_min,wyz_mean,wyz_max,groz_min,groz_mean,groz_max,wyk_min,wyk_mean,wyk_max,odcz_min,...,pon_max,styg_min,styg_mean,styg_max,szan_min,szan_mean,szan_max,vulg_min,vulg_mean,vulg_max
0,-0.111111,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.011111,0.111111,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.011111,0.111111,-0.111111,0.0,0.111111,0.0,0.033333,0.222222,-0.111111,...,0.0,-0.111111,-0.022222,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Row-wise data combination

In [8]:
# Glue the frames side by side (axis=1) so that each row holds all features
# of one tweet followed by its class labels. pd.concat aligns rows on the
# index, so every frame must describe the same tweets in the same order
# (all frames here keep the default index produced by read_csv).
df_combined = pd.concat([
    df_pac_scores,
    df_other_scores,
    df_topic_pac_scores,
    df_labels
], axis=1)
df_combined.head(2)

Unnamed: 0,id,wyz_PAC_min,wyz_PAC_mean,wyz_PAC_max,groz_PAC_min,groz_PAC_mean,groz_PAC_max,wyk_PAC_min,wyk_PAC_mean,wyk_PAC_max,...,vulg_min,vulg_mean,vulg_max,wyz_label,groz_label,wyk_label,odcz_label,pon_label,styg_label,szan_label
0,0,-0.000129,0.002574,0.005277,-0.002833,-0.002833,-0.002833,0.0,0.0,0.0,...,0.0,0.0,0.0,1,0,0,0,0,0,0
1,1,0.0,0.0,0.0,-0.000527,-0.000527,-0.000527,-0.001766,-0.001766,-0.001766,...,0.0,0.0,0.0,0,0,0,0,0,0,0


## Cardinality analysis

In [9]:
def cardinality_analysis(df, labels):
    """Count how many rows fall into each combination of label values.

    Args:
        df: DataFrame containing an 'id' column and every column in `labels`.
        labels: List of label column names to group by.

    Returns:
        DataFrame indexed by the label combinations, sorted by descending
        count, with two columns: 'id' (number of non-null ids in the
        combination) and '%' (that count as a percentage of all rows of `df`).
    """
    # Only the grouping columns and the column being counted are needed.
    subset = df[labels + ['id']]
    total_rows = len(subset)

    counts = subset.groupby(labels).count()
    ranked = counts.sort_values(by='id', ascending=False)
    ranked['%'] = ranked['id'] / total_rows * 100

    return ranked

In [10]:
# Count tweets per class combination in the combined (pre-duplication) dataset.
dfagg_cards = cardinality_analysis(df_combined, labels=LABELS)
dfagg_cards

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,id,%
wyz_label,groz_label,wyk_label,odcz_label,pon_label,styg_label,szan_label,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,0,0,0,0,13654,89.817129
0,0,0,0,0,1,0,361,2.374688
0,0,0,0,1,0,0,251,1.651099
0,0,0,0,1,1,0,197,1.295882
0,1,0,0,0,0,0,179,1.177477
0,1,0,0,0,1,0,106,0.697277
1,0,0,0,1,0,0,57,0.374951
1,0,0,0,0,0,0,43,0.282858
1,0,0,1,1,1,0,40,0.263123
0,0,0,1,0,0,0,36,0.236811


## Low-cardinality tweet duplication

In [11]:
def duplicate_under_threshold(df, dfagg, threshold=5):
    """Oversample rare label combinations up to `threshold` rows each.

    For every label combination whose count in `dfagg` is below `threshold`,
    rows of `df` belonging to that combination are drawn at random (with
    replacement, using the `random` module) and appended until the
    combination has `threshold` rows.

    Args:
        df: DataFrame with one column per label.
        dfagg: Cardinality table indexed by label combination with an 'id'
            column holding the row count per combination (as returned by
            `cardinality_analysis`).
        threshold: Minimum number of rows each listed combination should have.

    Returns:
        A new DataFrame: the rows of `df` followed by the duplicated rows
        (which keep their original index labels), with the label columns
        cast to int. `df` itself is not modified.
    """
    # Take the label column names from the aggregate's index so the function
    # does not depend on a global; fall back to LABELS for an unnamed index.
    labels = list(dfagg.index.names)
    if any(name is None for name in labels):
        labels = list(LABELS)

    rare_combinations = dfagg[dfagg['id'] < threshold].index

    # Collect all pieces and concatenate once: DataFrame.append was removed in
    # pandas 2.0, copied the whole frame on every call, and upcast integer
    # columns to float when a single row was appended.
    pieces = [df]
    for combination in rare_combinations:
        # A single-level index yields scalars instead of tuples.
        if not isinstance(combination, tuple):
            combination = (combination,)

        df_relev = df
        for label, value in zip(labels, combination):
            df_relev = df_relev[df_relev[label] == value]

        n_relev = len(df_relev)
        n_missing = threshold - n_relev
        if n_relev == 0 or n_missing <= 0:
            # Nothing to sample from, or the combination is already filled.
            continue

        # With a single candidate there is nothing to randomise.
        positions = [0 if n_relev == 1 else random.randint(0, n_relev - 1)
                     for _ in range(n_missing)]
        pieces.append(df_relev.iloc[positions])

    df_dupl = pd.concat(pieces)

    for label in labels:
        df_dupl[label] = df_dupl[label].astype('int')

    return df_dupl

In [12]:
# Top up every class combination that has fewer than THRESHOLD tweets by
# re-sampling rows of that same combination.
df_duplicated = duplicate_under_threshold(df_combined, dfagg_cards, threshold=THRESHOLD)
df_duplicated

Unnamed: 0,id,wyz_PAC_min,wyz_PAC_mean,wyz_PAC_max,groz_PAC_min,groz_PAC_mean,groz_PAC_max,wyk_PAC_min,wyk_PAC_mean,wyk_PAC_max,...,vulg_min,vulg_mean,vulg_max,wyz_label,groz_label,wyk_label,odcz_label,pon_label,styg_label,szan_label
0,0.0,-0.000129,0.002574,0.005277,-0.002833,-0.002833,-0.002833,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,1,0,0,0,0,0,0
1,1.0,0.000000,0.000000,0.000000,-0.000527,-0.000527,-0.000527,-0.001766,-0.001766,-0.001766,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,2.0,-0.000772,-0.000772,-0.000772,-0.000949,-0.000949,-0.000949,-0.000679,-0.000679,-0.000679,...,0.0,0.0,0.0,0,0,0,0,1,0,0
3,3.0,-0.001673,-0.001673,-0.001673,0.001085,0.001085,0.001085,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,4.0,0.002529,0.002529,0.002529,-0.008458,-0.008458,-0.008458,0.006418,0.006418,0.006418,...,0.0,0.0,0.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2935,2935.0,0.006042,0.006042,0.006042,0.004899,0.004899,0.004899,0.007065,0.007065,0.007065,...,0.0,0.0,0.0,1,1,1,1,1,1,0
2935,2935.0,0.006042,0.006042,0.006042,0.004899,0.004899,0.004899,0.007065,0.007065,0.007065,...,0.0,0.0,0.0,1,1,1,1,1,1,0
2935,2935.0,0.006042,0.006042,0.006042,0.004899,0.004899,0.004899,0.007065,0.007065,0.007065,...,0.0,0.0,0.0,1,1,1,1,1,1,0
2935,2935.0,0.006042,0.006042,0.006042,0.004899,0.004899,0.004899,0.007065,0.007065,0.007065,...,0.0,0.0,0.0,1,1,1,1,1,1,0


In [13]:
# Re-run the cardinality analysis on the duplicated dataset to check the effect.
dfagg_cards = cardinality_analysis(df_duplicated, labels=LABELS)
dfagg_cards

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,id,%
wyz_label,groz_label,wyk_label,odcz_label,pon_label,styg_label,szan_label,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,0,0,0,0,13654,88.472753
0,0,0,0,0,1,0,361,2.339143
0,0,0,0,1,0,0,251,1.626385
0,0,0,0,1,1,0,197,1.276485
0,1,0,0,0,0,0,179,1.159852
0,1,0,0,0,1,0,106,0.68684
1,0,0,0,1,0,0,57,0.369338
1,0,0,0,0,0,0,43,0.278624
1,0,0,1,1,1,0,40,0.259185
0,0,0,1,0,0,0,36,0.233266


## Saving dataset

In [14]:
# Drop the 'id' column and skip the index (duplicated rows repeat index
# labels), so the CSV holds only the feature and label columns.
df_duplicated.drop(['id'], axis=1).to_csv('data/sady_main/sady_simple_ml_classifier.csv', index=False)