# Data Cleaning
The following material is inspired by jagangupta in the following link. I will be using his function to remove pieces of text that might lead to overfitting (IP addresses, usernames, links). If the model learned that a particular IP address regularly posted toxic comments, it might pick up a decision rule that does not generalize to the test set.

In [None]:
import os
import re
from pathlib import Path
from string import punctuation

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

# Names of the six label columns in the training data.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Root folder of the Kaggle competition download. Set the JIGSAW_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the location this notebook was originally written against.
DATA_DIR = Path(os.environ.get(
    'JIGSAW_DATA_DIR',
    'C:\\Users\\harri\\.kaggle\\competitions\\jigsaw-toxic-comment-classification-challenge',
))

# Each CSV sits inside a folder of the same name. Missing cells are replaced
# with a single space so the text handling further down never sees NaN.
train = pd.read_csv(DATA_DIR / 'train.csv' / 'train.csv').fillna(' ')
test = pd.read_csv(DATA_DIR / 'test.csv' / 'test.csv').fillna(' ')

# Raw comment text of each split.
X_train = train['comment_text']
X_test = test['comment_text']

### Text Examples

Let's print the first 10 examples.


In [None]:
# Show the first ten raw training comments, each followed by blank lines so
# that consecutive comments are easy to tell apart.
for row in range(10):
    print(X_train[row], '\n', sep='\n')

### Class Imbalance

In [None]:
# Per-class totals: the sum of each label column.
class_counts = train[class_names].sum()

# Class occurrence plot
fig, ax = plt.subplots(figsize=(10, 5))
# x/y are passed by keyword: recent seaborn releases reject positional data
# arguments, and the keyword form also works on older releases.
sns.barplot(x=class_counts.index, y=class_counts.values, alpha=0.8, ax=ax)
ax.set_title("# per Class")
ax.set_ylabel('# of Occurrences', fontsize=12)
ax.set_xlabel('Type ', fontsize=12)

# Write each class total just above its bar.
for rect, label in zip(ax.patches, class_counts.values):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height + 5, label, ha='center', va='bottom')

plt.show()

### Word Cloud Visualization: Toxic Comments

In [None]:
from wordcloud import WordCloud, STOPWORDS
from PIL import Image

# Restrict to the comments labelled toxic.
toxic_subset = train[train.toxic == 1]
text = toxic_subset.comment_text.values

# STOPWORDS is wordcloud's built-in stop-word list.
wc = WordCloud(background_color="black", max_words=2000, stopwords=set(STOPWORDS))
# Join with a space so the last word of one comment is not fused with the
# first word of the next.
wc.generate(" ".join(text))

# show wordcloud
plt.figure(figsize=(20, 10))
plt.axis("off")
plt.title("Word Frequencies: Toxic Comments", fontsize=20)
# alpha is an imshow option; WordCloud.recolor does not accept it.
plt.imshow(wc.recolor(colormap="viridis"), alpha=0.95)
plt.show()

### Feature Building

In [None]:
# Stack the first two columns of train and test into one frame and renumber
# the rows 0..n-1 so the combined frame has a single continuous index.
df = pd.concat([train.iloc[:, :2], test.iloc[:, :2]], ignore_index=True)
df.shape

In [None]:
# Simple surface features computed per comment.

# Unique word count: number of distinct whitespace-separated tokens.
df['count_unique_word'] = df["comment_text"].apply(lambda x: len(set(str(x).split())))

# Punctuation count. `punctuation` is the constant brought in by
# `from string import punctuation`; the `string` module itself is not imported.
df["count_punctuations"] = df["comment_text"].apply(
    lambda x: sum(1 for c in str(x) if c in punctuation)
)

# Upper-case words count: tokens for which str.isupper() is true.
df["count_words_upper"] = df["comment_text"].apply(
    lambda x: sum(1 for w in str(x).split() if w.isupper())
)

### Cleaning Function
Cleaning out leaky features and preprocessing our text data.

In [None]:
import spacy 


_SPACY_MODEL_NAME = 'en_core_web_md'
_NLP_CACHE = {}


def _get_nlp(model_name=_SPACY_MODEL_NAME):
    """Load the spaCy pipeline on first use and reuse it on later calls."""
    if model_name not in _NLP_CACHE:
        _NLP_CACHE[model_name] = spacy.load(model_name)
    return _NLP_CACHE[model_name]


def clean(comment, nlp=None):
    """Strip leaky substrings from a comment and return its lemmatized text.

    Steps, in order: lowercase, turn newlines into spaces, remove IPv4-style
    addresses, ``[[...]]`` spans (treated as usernames), http/https URLs and a
    trailing ``d:dd`` pattern, then run the text through a spaCy pipeline and
    join the token lemmas with single spaces.

    Parameters
    ----------
    comment : str
        Raw comment text.
    nlp : callable, optional
        Pipeline called as ``nlp(text)``; it must return an iterable of tokens
        exposing ``lemma_``. Defaults to the cached ``en_core_web_md`` model,
        so the model is loaded once rather than on every call.

    Returns
    -------
    str
        Space-separated lemmas; whitespace-only tokens are dropped.
    """
    if nlp is None:
        nlp = _get_nlp()

    # conv to lowercase
    comment = comment.lower()
    # replace new line with a space so words on adjacent lines are not fused
    comment = re.sub(r"\n", " ", comment)
    # remove ip
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", comment)
    # remove username; the lazy match stops at the first closing bracket
    # instead of swallowing everything up to the last ']' in the comment
    comment = re.sub(r"\[\[.*?\]+", "", comment)
    # remove urls; stop at the first whitespace rather than at the last
    # occurrence of "com" anywhere in the comment
    comment = re.sub(r"https?://\S+", "", comment)
    # article ids
    # NOTE(review): only a single digit before the colon is consumed, so a
    # trailing "12:34" leaves "1" behind - confirm the intended pattern.
    comment = re.sub(r"\d:\d\d\s{0,5}$", "", comment)

    # run spacy pipeline on comment for tokenization and lemmatization.
    doc = nlp(comment)
    lemmas = (token.lemma_ for token in doc)
    return " ".join(lemma for lemma in lemmas if lemma.strip())

# Overwrite the comment column with its cleaned version, one comment at a time.
df['comment_text'] = df['comment_text'].map(clean)

### Adding Features

In [None]:
# Word-level TF-IDF over unigrams and bigrams. Flags are real booleans because
# recent scikit-learn releases validate parameter types.
tfidf = TfidfVectorizer(
    min_df=200,
    max_features=10000,
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# The vocabulary is fitted on train and test comments together.
tfidf.fit(df['comment_text'])

# df holds the train rows first and the test rows after them, so the row count
# of `train` marks the split. It is read before `train` is rebound below.
n_train = train.shape[0]
comments = df['comment_text']

# From here on `train` and `test` are sparse TF-IDF matrices, not DataFrames.
train = tfidf.transform(comments.iloc[:n_train])
test = tfidf.transform(comments.iloc[n_train:])


In [None]:
def _save_features(features, csv_path):
    """Write `features` to disk.

    Objects that provide ``to_csv`` (DataFrames) are written to `csv_path`
    unchanged. Anything else is treated as a SciPy sparse matrix, which has no
    ``to_csv``; it is saved in SciPy's ``.npz`` format next to `csv_path`
    (same file name, ``.npz`` extension) and can be read back with
    ``scipy.sparse.load_npz``.
    """
    if hasattr(features, 'to_csv'):
        features.to_csv(csv_path)
        return
    npz_path = os.path.splitext(csv_path)[0] + '.npz'
    sparse.save_npz(npz_path, sparse.csr_matrix(features))


# NOTE(review): both outputs go to the `train.csv` folder, as in the original
# cell - confirm that the test output is not meant to live under `test.csv`.
_save_features(train, 'C:\\Users\\harri\\.kaggle\\competitions\\jigsaw-toxic-comment-classification-challenge\\train.csv\\clean_train.csv')
_save_features(test, 'C:\\Users\\harri\\.kaggle\\competitions\\jigsaw-toxic-comment-classification-challenge\\train.csv\\clean_test.csv')