In [1]:
import os
import re
import string
import numpy as np
import pandas as pd
from itertools import groupby
import nltk
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv('../data/raw/train.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:


# Removing duplicates, if words occured more than  2 times in comment.
def remove_duplicates(text_before):
    my_dict = dict()
    text_after = list()
    for word in text_before.split():
        if word not in my_dict.keys():
            my_dict[word] = 1
        else:
            my_dict[word] = my_dict[word] + 1
    
    for key,value in my_dict.items():
        if value>=2:
            text_after.append(key)
        else:
            text_after.append(key)
    return " ".join(text_after)

In [4]:
df['comment_text_clean'] = df['comment_text'].apply(lambda text : remove_duplicates(text))

In [5]:
df['comment_text_len'] = df['comment_text_clean'].apply(lambda x: len(x.split()))

In [6]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_clean,comment_text_len
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,Explanation Why the edits made under my userna...,41
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,D'aww! He matches this background colour I'm s...,17
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"Hey man, I'm really not trying to edit war. It...",39
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,""" More I can't make any real suggestions on im...",82
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"You, sir, are my hero. Any chance you remember...",13


In [7]:
def clean_text(text):
    """Make text lowercase, remove text in square brackets, remove links,remove punctuation
    and remove stop words containing numbers"""
    text = text.lower()                                            # Converts the text to lowercase using regex 
    text = re.sub(r"\[.*?\]","",text)                              # Replace's the text into 'nothing" if text is present inside squre brackets.
    text = re.sub("https?://\S+|www\.\S+","",text)                 # Removes the links from the comments.
    text = re.sub("<.*?>+","",text)                                # Remove unwanted
    text = re.sub("[%s]" % re.escape(string.punctuation),"",text)  # Remove punctuations
    text = re.sub("\n","",text)                                    # Remove next line symbols '\n'
    text = re.sub("\w*\d\w*","",text)                              # Takes only albhabet and digits.
    return text
df['comment_text_clean'] = df['comment_text_clean'].apply(lambda text : clean_text(text))

In [8]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_clean,comment_text_len
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...,41
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww he matches this background colour im seem...,17
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man im really not trying to edit war its j...,39
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,more i cant make any real suggestions on impr...,82
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember wh...,13


In [9]:
## removing stop words
stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c', 'cu']
stop_words = stop_words + more_stopwords

def remove_stopwords(text):
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    return text

df['comment_text_clean'] = df['comment_text_clean'].apply(remove_stopwords)

In [10]:
# Stemming the texts
stemmer = nltk.SnowballStemmer("english")

def stemm_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

df['comment_text_clean'] = df['comment_text_clean'].apply(stemm_text)

In [11]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_clean,comment_text_len
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explan edit made usernam hardcor metallica fan...,41
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww match background colour seem stuck thank ...,17
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man realli tri edit war guy constant remov...,39
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,cant make real suggest improv wonder section...,82
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero chanc rememb page that,13


In [21]:
df.to_csv('../data/processed/processed.csv', index=False)

In [14]:
from sklearn.model_selection import train_test_split

y = df[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].astype(float)
X = df['comment_text_clean']

X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.33,random_state=32)


In [17]:
print('train_x shape is {}' .format({X_train.shape}))
print('test_x shape is {}' .format({X_val.shape}))
print('train_y shape is {}' .format({y_train.shape}))

train_x shape is {(106912,)}
test_x shape is {(52659,)}
train_y shape is {(106912, 6)}


In [18]:
encoder = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4" 
preprocessing = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

In [19]:
bert_preprocessing_model = hub.KerasLayer(preprocessing) # preprocessing step in bert-base model
bert_model = hub.KerasLayer(encoder) # BERT-base model encoder

NameError: name 'hub' is not defined

In [24]:
df2 = pd.read_csv('../data/processed/cleaned_data.csv')
df2.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,daww he matches this background colour im seem...,0,0,0,0,0,0
2,000113f07ec002fd,hey man im really not trying to edit war its j...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cant make any real suggestions on impr...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
