In [23]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import os.path
from tqdm import tqdm

# Seed NumPy's global random number generator with a fixed value so that
# NumPy-driven randomness is repeatable across runs of this notebook.
np.random.seed(500)

In [24]:
# Read the two CSV files from the working directory: first the training
# split, then the test split.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [25]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [26]:
# Row count of the training frame as loaded, before any cleaning.
len(train)

159571

In [27]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


The code below drops every row of the training set that does not contain a comment.

In [28]:
# Drop rows whose 'comment_text' is missing. The result is reassigned rather
# than applied with inplace=True, so the cell's data flow is explicit.
train = train.dropna(subset=['comment_text'])
# Row count after the drop; compare with the count shown earlier to see how
# many rows were removed.
len(train)

159571

The code below converts all comments to lowercase.

In [29]:
# Lowercase every training comment by applying str.lower element-wise.
train['comment_text'] = train['comment_text'].map(str.lower)
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation\nwhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,d'aww! he matches this background colour i'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"hey man, i'm really not trying to edit war. it...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nmore\ni can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"you, sir, are my hero. any chance you remember...",0,0,0,0,0,0


In [30]:
# Lowercase every test comment by applying str.lower element-wise.
test['comment_text'] = test['comment_text'].map(str.lower)
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule is more succesful then you'll...
1,0000247867823ef7,== from rfc == \n\n the title is fine as it is...
2,00013b17ad220c46,""" \n\n == sources == \n\n * zawe ashton on lap..."
3,00017563c3f7919a,":if you have a look back at the source, the in..."
4,00017695ad8997eb,i don't anonymously edit articles at all.


The code below tokenizes the train and test data.

In [31]:
# Replace each training comment string with its list of tokens.
# tqdm wraps the column only to show a progress bar while iterating.
train['comment_text'] = list(map(word_tokenize, tqdm(train['comment_text'])))
train.head()

100%|██████████| 159571/159571 [01:15<00:00, 2127.03it/s]


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"[explanation, why, the, edits, made, under, my...",0,0,0,0,0,0
1,000103f0d9cfb60f,"[d'aww, !, he, matches, this, background, colo...",0,0,0,0,0,0
2,000113f07ec002fd,"[hey, man, ,, i, 'm, really, not, trying, to, ...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"[``, more, i, ca, n't, make, any, real, sugges...",0,0,0,0,0,0
4,0001d958c54c6e35,"[you, ,, sir, ,, are, my, hero, ., any, chance...",0,0,0,0,0,0


In [32]:
# Replace each test comment string with its list of tokens.
# tqdm wraps the column only to show a progress bar while iterating.
test['comment_text'] = list(map(word_tokenize, tqdm(test['comment_text'])))
test.head()

100%|██████████| 153164/153164 [01:09<00:00, 2204.86it/s]


Unnamed: 0,id,comment_text
0,00001cee341fdb12,"[yo, bitch, ja, rule, is, more, succesful, the..."
1,0000247867823ef7,"[==, from, rfc, ==, the, title, is, fine, as, ..."
2,00013b17ad220c46,"[``, ==, sources, ==, *, zawe, ashton, on, lap..."
3,00017563c3f7919a,"[:, if, you, have, a, look, back, at, the, sou..."
4,00017695ad8997eb,"[i, do, n't, anonymously, edit, articles, at, ..."


The code below lemmatizes the training and test data.

In [38]:
# Lookup from the first letter of a POS tag to the WordNet part-of-speech
# constant handed to the lemmatizer. Letters not listed here fall back to
# wn.NOUN through the default factory.
tag_map = defaultdict(
    lambda: wn.NOUN,
    {
        'J': wn.ADJ,
        'V': wn.VERB,
        'R': wn.ADV,
    },
)

In [39]:
# Lemmatize the tokenized training comments into 'comment_text_final',
# reusing a cached pickle when one already exists on disk.
if os.path.isfile('train_pickle.pkl'):
    # NOTE: the cache is trusted as-is. Delete 'train_pickle.pkl' to force a
    # recompute after changing the input data or an earlier preprocessing
    # step, and only load pickles this notebook produced itself.
    train = pd.read_pickle('train_pickle.pkl')
else:
    # Loop-invariant setup, done once instead of once per token / per comment:
    # the stop-word list is fetched a single time and stored in a set so each
    # membership test is a hash lookup rather than a scan of a fresh list.
    stop_words = set(stopwords.words('english'))
    word_Lemmatized = WordNetLemmatizer()

    final_comment_text = []
    for entry in tqdm(train['comment_text']):
        # pos_tag yields (word, tag) pairs; the first letter of the tag is
        # looked up in tag_map to choose the part of speech for lemmatize().
        # Only purely alphabetic tokens that are not stop words are kept.
        final_comment_text.append([
            word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            for word, tag in pos_tag(entry)
            if word.isalpha() and word not in stop_words
        ])

    # Store the processed token lists in 'comment_text_final' and cache the
    # whole frame so later runs can skip the loop above.
    train['comment_text_final'] = final_comment_text
    train.to_pickle("train_pickle.pkl")
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_final
0,0000997932d777bf,"[explanation, why, the, edits, made, under, my...",0,0,0,0,0,0,"[explanation, edits, make, username, hardcore,..."
1,000103f0d9cfb60f,"[d'aww, !, he, matches, this, background, colo...",0,0,0,0,0,0,"[match, background, colour, seemingly, stick, ..."
2,000113f07ec002fd,"[hey, man, ,, i, 'm, really, not, trying, to, ...",0,0,0,0,0,0,"[hey, man, really, try, edit, war, guy, consta..."
3,0001b41b1c6bb37e,"[``, more, i, ca, n't, make, any, real, sugges...",0,0,0,0,0,0,"[ca, make, real, suggestion, improvement, wond..."
4,0001d958c54c6e35,"[you, ,, sir, ,, are, my, hero, ., any, chance...",0,0,0,0,0,0,"[sir, hero, chance, remember, page]"


In [40]:
# Lemmatize the tokenized test comments into 'comment_text_final',
# reusing a cached pickle when one already exists on disk.
if os.path.isfile('test_pickle.pkl'):
    # NOTE: the cache is trusted as-is. Delete 'test_pickle.pkl' to force a
    # recompute after changing the input data or an earlier preprocessing
    # step, and only load pickles this notebook produced itself.
    test = pd.read_pickle('test_pickle.pkl')
else:
    # Loop-invariant setup, done once instead of once per token / per comment:
    # the stop-word list is fetched a single time and stored in a set so each
    # membership test is a hash lookup rather than a scan of a fresh list.
    stop_words = set(stopwords.words('english'))
    word_Lemmatized = WordNetLemmatizer()

    final_comment_text = []
    for entry in tqdm(test['comment_text']):
        # pos_tag yields (word, tag) pairs; the first letter of the tag is
        # looked up in tag_map to choose the part of speech for lemmatize().
        # Only purely alphabetic tokens that are not stop words are kept.
        final_comment_text.append([
            word_Lemmatized.lemmatize(word, tag_map[tag[0]])
            for word, tag in pos_tag(entry)
            if word.isalpha() and word not in stop_words
        ])

    # Store the processed token lists in 'comment_text_final' and cache the
    # whole frame so later runs can skip the loop above.
    test['comment_text_final'] = final_comment_text
    test.to_pickle("test_pickle.pkl")
test.head()

Unnamed: 0,id,comment_text,comment_text_final
0,00001cee341fdb12,"[yo, bitch, ja, rule, is, more, succesful, the...","[yo, bitch, ja, rule, succesful, ever, whats, ..."
1,0000247867823ef7,"[==, from, rfc, ==, the, title, is, fine, as, ...","[rfc, title, fine, imo]"
2,00013b17ad220c46,"[``, ==, sources, ==, *, zawe, ashton, on, lap...","[source, zawe, ashton, lapland]"
3,00017563c3f7919a,"[:, if, you, have, a, look, back, at, the, sou...","[look, back, source, information, update, corr..."
4,00017695ad8997eb,"[i, do, n't, anonymously, edit, articles, at, ...","[anonymously, edit, article]"


In [37]:
# Names of the six label columns that appear in the training frame
# alongside 'id' and 'comment_text'.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]