# Libraries

In [None]:
import pandas as pd
import numpy  as np
import string 
import re

import pickle
import warnings
warnings.filterwarnings('ignore')

print('Done')



# Help functions

## Optimize dataframe size

In [2]:
def df_optimized(df, verbose=True, **kwargs):
    """
    Reduces size of dataframe by downcasting numerical columns
    :param df: input dataframe
    :param verbose: print size reduction if set to True
    :param kwargs:
    :return:
    """
    in_size = df.memory_usage(index=True).sum()
    for type in ["float", "integer"]:
        l_cols = list(df.select_dtypes(include=type))
        for col in l_cols:
            df[col] = pd.to_numeric(df[col], downcast=type)
            if type == "float":
                df[col] = pd.to_numeric(df[col], downcast="integer")
    out_size = df.memory_usage(index=True).sum()
    ratio = (1 - round(out_size / in_size, 2)) * 100
    GB = out_size / 1000000000
    if verbose:
        print("optimized size by {} % | {} GB".format(ratio, GB))
    return df


## Over and undersample 

In [3]:
# def sample_rows(df, undersample = True, oversample = False, proportion0_1= 0.5, N = 100000):
#     n_ 
#     n_0 = N/
#     if undersample == True:
        
#     return

## Clean and tokenize data

In [4]:
def clean_data(data, remove_special_char_2lower_case = True):
    
    #Removing URLs with a regular expression
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    data = url_pattern.sub(r'', data)

    # Remove Emails
    data = re.sub('\S*@\S*\s?', '', data)
   
    # tokenize + remove scpecial characters + set to lower case
    if remove_special_char_2lower_case == True:
        data = text_to_word_sequence(data) 
    else :
    # tokenize 
        data = data.split() 
    
    
    # Remove stopwords
    stop_words = set(stopwords.words('english')) 
    data = [w for w in data if not w in stop_words]         
    
    # Remove digits
    data = ' '.join(word for word in data if not word.isdigit())
    
    
    return text_to_word_sequence(data)

def apply_data_cleaning(X, text, drop_text = False, remove_special_char_2lower_case = True):
    ln = X.shape[0]
    sentences = []
    for i in range(ln):
        tmp = X.iloc[i][f'{text}']
        tmp_clean = clean_data(tmp,remove_special_char_2lower_case = f"{remove_special_char_2lower_case}")
        sentences.append(tmp_clean)
    X["sentences"] = sentences
    if drop_text == True:
        X.drop(columns = f'{text}', inplace = True)
    return X

## Convert text to matrix

In [5]:
# Function to convert a sentence (list of words) into a matrix representing the words in the embedding space
def embed_sentence_with_TF(word2vec, sentence):
    embedded_sentence = []
    for word in sentence:
        if word in word2vec:
            embedded_sentence.append(word2vec[word])
        
    return np.array(embedded_sentence)

# Function that converts a list of sentences into a list of matrices
def embedding(word2vec, sentences):
    embed = []
    
    for sentence in sentences:
        embedded_sentence = embed_sentence_with_TF(word2vec, sentence)
        embed.append(embedded_sentence)
        
    return embed

## Add numeric features to tensor

In [6]:
def append_features_to_tensor(X_text, X_feature, _max ): #X_train_pad_2.max()

    tmp = np.empty(shape=(X_text.shape[0],X_text.shape[1]+2, X_text.shape[2] ))
    tmp[:,0:X_text.shape[1],:] = X_text

    first_indices = X_text.shape[0] ## # of twitts 
    tmp_punct =  np.zeros(shape = (1,X_text.shape[2]))
    tmp_cap = np.zeros(shape = (1,X_text.shape[2]))

    for i in range(first_indices):
        if X_feature.iloc[i,0] == 1:
            tmp_punct =  tmp_punct*(_max+0.5)

        if X_feature.iloc[i,1] == 1:
            tmp_cap = tmp_cap*(_max+1)   

        tmp[i,200,:] = tmp_punct
        tmp[i,201,:] = tmp_cap

    
    return tmp

## Plot learning curve 
! no need to package

In [7]:
def plot_loss_score(history, title=None):
    fig, ax = plt.subplots(1,2, figsize=(20,7))
    
    # --- LOSS --- 
    
    ax[0].plot(history.history['loss'])
    ax[0].plot(history.history['val_loss'])
    ax[0].set_title('Model loss')
    ax[0].set_ylabel('Loss')
    ax[0].set_xlabel('Epoch')
    ax[0].set_ylim((0,3))
    ax[0].legend(['Train', 'Test'], loc='best')
    ax[0].grid(axis="x",linewidth=0.5)
    ax[0].grid(axis="y",linewidth=0.5)
    
    # --- ACCURACY
    
    ax[1].plot(history.history['recall'])
    ax[1].plot(history.history['val_recall'])
    ax[1].set_title('Model Recall')
    ax[1].set_ylabel('recall')
    ax[1].set_xlabel('Epoch')
    ax[1].legend(['Train', 'Test'], loc='best')
    ax[1].set_ylim((0,1))
    ax[1].grid(axis="x",linewidth=0.5)
    ax[1].grid(axis="y",linewidth=0.5)
    
    if title:
        fig.suptitle(title)

# Data load and prep

## Load datasets

In [8]:
# load raw data
pd.set_option('max_colwidth', None) 
df = pd.read_csv("~/Downloads/banData/MeTooHate.csv")[["text", "category"]]
df = df_optimized(df, verbose=True)
df.head()

optimized size by 43.99999999999999 % | 0.007264694 GB


Unnamed: 0,text,category
0,"Entitled, obnoxious, defensive, lying weasel. This thing is to make decisions that may affect my life? He is a women hater.",0
1,Thank you and for what you did for the women and survivors this week. \n,0
2,"Knitting (s) &amp; getting ready for January 19, 2019.",0
3,"Yep just like triffeling women weaponized their poon!! Wonder if Kamala Harris ever extorted Willy Brown after throwing the poon on him, oh yeh, that how she got her first job me too is a JOKE!",1
4,"No, the President wants to end movement posing as the movement.",0


In [9]:
# load re-labeled data
with open('../raw_data/filter_df', 'rb') as handle:
    filter_df = pickle.load(handle)
filter = df_optimized(df, verbose=True)
filter_df.head()

optimized size by 0.0 % | 0.007264694 GB


Unnamed: 0,text,category,scoring_ponctuation,scoring_capital_word,scoring,relabelling
3,"Yep just like triffeling women weaponized their poon!! Wonder if Kamala Harris ever extorted Willy Brown after throwing the poon on him, oh yeh, that how she got her first job me too is a JOKE!",1,0,3,3,1
7,"Save it for Fox Fake News—watched by racists, bigots &amp; other women-hating sexists like yourself. GOP is the party of Putin &amp; the party of Hate—besides women, GOP hates all POC, immigrants, and all ppl not straight. You’re the party of the entitled lying whiny white men.",1,0,9,9,1
49,"As a white male in this era I'm just scared that if I decide to rape or sexually assault a woman, it can be used against me years later. How am I supposed to ask a girl out?!",1,0,6,6,1
57,I heard that received some Shush/Slush money to settle rape charges by her aide who said she just could not remember where or when. . .,1,0,3,3,1
60,"If she hasn't, it's because of people like you shaming her! How dare you tell her how she is or should be feeling! That is why I HATE the movement. Along with all of the other movements! You rape these people to further a political objective! You are despicable!",1,0,6,6,1


## Prep label 2

In [10]:
# Prep df2 -data with new label 2
df2 = filter_df[filter_df["relabelling"]==2]
df2["label"] = df2["relabelling"]
score_punctuation = [score if score==0 else 1 for score in df2["scoring_ponctuation"]]
score_capital_word = [score if score==0 else 1 for score in df2["scoring_capital_word"]]
df2["score_punctuation"] = score_punctuation
df2["score_capital_word"] = score_capital_word
df2.drop(columns = ["category","scoring_ponctuation","scoring_capital_word","scoring","relabelling"], inplace = True)

df2.columns == ["text","label","score_punctuation","score_capital_word"]
df2.head()

Unnamed: 0,text,label,score_punctuation,score_capital_word
78,"Trust me...Not like I did. I'm 61 &amp; to this day I cringe at some of the shit I did. I never sexually assaulted anyone. But, I did things that were flat out wrong under . If I ran into any ladies from 40 years ago, I'd take a knee to apologize. Kavanaugh needs to do same.",2,0,1
200,BELIEVE HER!!!!!!!!!!!!!!!!!!!!!!!!!!!! !!!!!!!!!!!!!!!! BELIEVE BELIEVE BELIEVE!!!!!!!! \n\nYou fucking hypocrite loser,2,1,1
233,"I used to dream of marrying a nice man and living happily ever after, like the romance books.\nNow I dream of having a fantastic job, a lovely home and children. I don’t need a spouse to make me happy, I can learn how to do it on my own.\n\nThat’s what my own events have done",2,0,1
340,"IT’S AS IF ISN’T REALLY ABOUT FAIRNESS AND EQUALITY AT ALL: “If your friend says she wants to cut off every dick in a five mile radius, let her!”\n\n",2,0,1
538,"Partial checklist for women before leaving the house:\n1. dress 2 tight?\n2. breasts 2 big/2 obvious?\n3. ass 2 curvy?\n4. skirt 2 short/2 sexy?\n5. hair 2 seductive?\n6. make-up 2 sexy?\n7. Will I b harassed?\n8. Are you ready, Girl?\nMen:\n1. Did I pee?\n",2,0,1


## Prep label 0

In [11]:
# Rename category--> label and add score columns
df0 = df[df["category"]==0]
df0["label"] = df["category"]
df0.drop(columns =["category"], inplace = True)
df0["score_punctuation"] = 0
df0["score_capital_word"] = 0
#shuffle rows before sampling
df0 = df0.sample(frac=1)
print(df0.shape)

df0.head()

(711840, 4)


Unnamed: 0,text,label,score_punctuation,score_capital_word
119031,"I know, Linda is nothing but \nCowardly Islamist!!\nHiding behind Terrorists organization Islam1 Brotherhood ! \n\nHey \nWhat about sexual assault \nBy Keith Ellison?\nMinnesota police department officials refused to investigate!! Don't believe Woman!?",0,0,0
175057,"At risk of being accused of mansplaining, please note that 2018 and 2016 are quite different:\n* Nobody has spent the last 20 years vilifying Beto (like they did Hillary)\n* The movement started since then\n* Kav showed that doesn't care about women like nothing else has",0,0,0
677336,Bugbee &amp; Conkle's -,0,0,0
110744,Thanks to I wouldn’t want to hire a woman at all,0,0,0
484801,I don't know when Jason figures out I'm NOT in dating? As he keeps trying to persuade me by saying NOT all are the like the few apples u've dated aka your Sherri &amp; that I should give,0,0,0


## Prep df with labels 0 and 2

## New df

In [12]:
# new df
DF = pd.concat([df0, df2], axis = 0)
# # reshuffle and set index
DF = DF.sample(frac=1)
print(DF.shape)
DF.reset_index(inplace = True)

DF= DF[["text","label","score_punctuation","score_capital_word"]]
assert(DF.shape[0]== df0.shape[0]+df2.shape[0])
assert(DF.shape[1]== 4)

(721931, 4)


In [13]:
DF.head()


Unnamed: 0,text,label,score_punctuation,score_capital_word
0,"Correct . If someone want to know power of HashTag, ask Nana - Alok Nath - Chetan Bhagat - Anu Malllik..",0,0,0
1,"This guy is clearly quite ignorant of the movement. 243 comments, and only myself and 1 other person identified this guy's colossal mistake in misusing the hashtag. !\n",0,0,0
2,It was a grave mistake for you to ignore the pained testimony of Christine Blasey Ford. - we will fund your opponent in 2020.,0,0,0
3,"In another news, the victim refuses to file a formal complaint regarding her allegations. \n\n",0,0,0
4,Headlines Are Here To Stay: It’s Not Just A ‘Women’s Issue’,0,0,0


### New df saved

# Replace emojis by text

In [14]:
# DF = df_optimized(DF, verbose=True)
# DF.dropna(inplace = True)
# with open('../raw_data/DF', 'wb') as f1:
#     pickle.dump(DF, f1)