In [17]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import os.path

np.random.seed(500)

In [18]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [19]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [20]:
len(train)

159571

In [21]:
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


Below code drops any row from the training set which does not contains any comment

In [22]:
train.dropna(subset=['comment_text'], inplace=True)
len(train)

159571

Below code make all comments lowercase

In [23]:
train['comment_text'] = [entry.lower() for entry in train['comment_text']]
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation\nwhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,d'aww! he matches this background colour i'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"hey man, i'm really not trying to edit war. it...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nmore\ni can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"you, sir, are my hero. any chance you remember...",0,0,0,0,0,0


In [24]:
test['comment_text'] = [entry.lower() for entry in test['comment_text']]
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule is more succesful then you'll...
1,0000247867823ef7,== from rfc == \n\n the title is fine as it is...
2,00013b17ad220c46,""" \n\n == sources == \n\n * zawe ashton on lap..."
3,00017563c3f7919a,":if you have a look back at the source, the in..."
4,00017695ad8997eb,i don't anonymously edit articles at all.


Below code tokenize the train and test data

In [25]:
train['comment_text']= [word_tokenize(entry) for entry in train['comment_text']]
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"[explanation, why, the, edits, made, under, my...",0,0,0,0,0,0
1,000103f0d9cfb60f,"[d'aww, !, he, matches, this, background, colo...",0,0,0,0,0,0
2,000113f07ec002fd,"[hey, man, ,, i, 'm, really, not, trying, to, ...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"[``, more, i, ca, n't, make, any, real, sugges...",0,0,0,0,0,0
4,0001d958c54c6e35,"[you, ,, sir, ,, are, my, hero, ., any, chance...",0,0,0,0,0,0


In [11]:
test['comment_text']= [word_tokenize(entry) for entry in test['comment_text']]
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,"[yo, bitch, ja, rule, is, more, succesful, the..."
1,0000247867823ef7,"[==, from, rfc, ==, the, title, is, fine, as, ..."
2,00013b17ad220c46,"[``, ==, sources, ==, *, zawe, ashton, on, lap..."
3,00017563c3f7919a,"[:, if, you, have, a, look, back, at, the, sou..."
4,00017695ad8997eb,"[i, do, n't, anonymously, edit, articles, at, ..."


Below code performs the Lemmatization over training and testing data

In [26]:
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [28]:
if os.path.isfile('final_train.csv'):
    train = pd.read_csv('final_train.csv');
else:
    for index,entry in enumerate(train['comment_text']):
        # Declaring Empty List to store the words that follow the rules for this step
        Final_words = []
        # Initializing WordNetLemmatizer()
        word_Lemmatized = WordNetLemmatizer()
        # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
        for word, tag in pos_tag(entry):
            # Below condition is to check for Stop words and consider only alphabets
            if word not in stopwords.words('english') and word.isalpha():
                word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
                Final_words.append(word_Final)
        # The final processed set of words for each iteration will be stored in 'text_final'
        train.loc[index,'comment_text_final'] = str(Final_words)
    train.to_csv("final_train.csv")
train.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_final
0,0,0000997932d777bf,"['explanation', 'why', 'the', 'edits', 'made',...",0,0,0,0,0,0,"['explanation', 'edits', 'make', 'username', '..."
1,1,000103f0d9cfb60f,"[""d'aww"", '!', 'he', 'matches', 'this', 'backg...",0,0,0,0,0,0,"['match', 'background', 'colour', 'seemingly',..."
2,2,000113f07ec002fd,"['hey', 'man', ',', 'i', ""'m"", 'really', 'not'...",0,0,0,0,0,0,"['hey', 'man', 'really', 'try', 'edit', 'war',..."
3,3,0001b41b1c6bb37e,"['``', 'more', 'i', 'ca', ""n't"", 'make', 'any'...",0,0,0,0,0,0,"['ca', 'make', 'real', 'suggestion', 'improvem..."
4,4,0001d958c54c6e35,"['you', ',', 'sir', ',', 'are', 'my', 'hero', ...",0,0,0,0,0,0,"['sir', 'hero', 'chance', 'remember', 'page']"


In [None]:
if os.path.isfile('final_test.csv'):
    test = pd.read_csv('final_test.csv');
else:
    for index,entry in enumerate(test['comment_text']):
        # Declaring Empty List to store the words that follow the rules for this step
        Final_words = []
        # Initializing WordNetLemmatizer()
        word_Lemmatized = WordNetLemmatizer()
        # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
        for word, tag in pos_tag(entry):
            # Below condition is to check for Stop words and consider only alphabets
            if word not in stopwords.words('english') and word.isalpha():
                word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
                Final_words.append(word_Final)
        # The final processed set of words for each iteration will be stored in 'text_final'
        test.loc[index,'comment_text_final'] = str(Final_words)
    test.to_csv("final_test.csv");
test.head()

In [None]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',
]
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
vectorizer.vocabulary_

In [12]:
X = vectorizer.transform(corpus)
print(X)
print(vectorizer.get_feature_names())
print(X.shape)

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045
  (1, 8)	0.281088674033753
  (1, 6)	0.281088674033753
  (1, 5)	0.5386476208856763
  (1, 3)	0.281088674033753
  (1, 1)	0.6876235979836938
  (2, 8)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 0)	0.511848512707169
  (3, 8)	0.38408524091481483
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 2)	0.5802858236844359
  (3, 1)	0.46979138557992045
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
(4, 9)
