# CLEAN TEXT DATA

## 1. Import libraries

**Import neccessary packages and modules**

In [1]:
import time
import os
t = time.time()
import json
import string
import random
import math
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer
import tensorflow as tf
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline

**Import nlp packages and modules**

In [2]:
import nltk
# nltk.download()
import nltk, re, time
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

## 2. Load and Inspect Data

**Set directory**

In [3]:
input_dir = "../input/"

**Load train and test data**

In [4]:
train_data = pd.read_csv(input_dir+'train.csv') 
test_data = pd.read_csv(input_dir+'test.csv')

**Inspect train and test data**

In [5]:
train_data.head(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [6]:
print("Shape of train data:", train_data.shape)

Shape of train data: (159571, 8)


In [7]:
test_data.head(20)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
5,0001ea8717f6de06,Thank you for understanding. I think very high...
6,00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...
7,000247e83dcc1211,:Dear god this site is horrible.
8,00025358d4737918,""" \n Only a fool can believe in such numbers. ..."
9,00026d1092fe71cc,== Double Redirects == \n\n When fixing double...


In [8]:
print("Shape of test data:", test_data.shape)

Shape of test data: (153164, 2)


## 3. Preprocess the text data

In [9]:
# load list of stopwords
sw = set(stopwords.words("english"))
# load teh snowball stemmer
stemmer = SnowballStemmer("english")
# translator object to replace punctuations with space
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))

In [10]:
print(sw)

{'hasn', 'couldn', 'wasn', 'she', 'be', 'now', 'ain', "aren't", 'an', 'don', "she's", "that'll", 'do', 'needn', 'yours', 'mustn', 'being', 'after', 'during', 'there', 'that', 'how', 'aren', 'having', 'both', 'shan', 'him', 'than', 'when', 'it', 'were', 'they', 'further', 'same', 'ourselves', 'you', "you'd", 'whom', 'again', 'out', "isn't", 'once', 'other', 'am', 'y', 'them', 'd', 'yourselves', 'we', 't', 'isn', 'most', "it's", 'which', 'weren', 'into', 'hers', 'i', 'our', 'under', 'in', 'so', 'of', 'why', 'by', 'only', 'my', 'been', "don't", 'didn', 'against', 'can', "needn't", 'some', 'who', "you've", "shouldn't", 'until', 'won', "you'll", 'theirs', 'with', 'myself', 'own', 'should', 'those', 'his', "you're", 'are', 'and', 'doesn', 'any', 'had', 'because', 'from', 'ma', 'its', 'll', 'shouldn', "wasn't", "weren't", 'their', "hasn't", 'to', "didn't", 'this', 'me', 'about', 'nor', 'ours', 'has', 'her', 'if', "mightn't", 'yourself', 'very', 'too', 'a', 'your', 'while', 'these', 'below', "

**Function for preprocessing text**

In [11]:
def clean_text(text):    
    """
    A function for preprocessing text
    """    
    text = str(text)    
    # replacing the punctuations with no space,which in effect deletes the punctuation marks 
    text = text.translate(translator)    
    # remove stop word
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    text = " ".join(text)    
    # stemming
    text = [stemmer.stem(word) for word in text.split()]
    text = " ".join(text)     
    # Clean the text
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"[^a-z]", " ", text)
    text = re.sub(r"   ", " ", text) # Remove any extra spaces
    text = re.sub(r"  ", " ", text)
    
    return(text)

**Clean train and test data**

In [12]:
t1 = time.time()
train_data['comment_text'] = train_data['comment_text'].apply(clean_text)
print("Finished cleaning the train set.", "Time needed:", time.time()-t1,"sec")

Finished cleaning the train set. Time needed: 89.9083309173584 sec


In [13]:
t2 = time.time()
test_data['comment_text'] = test_data['comment_text'].apply(clean_text)
print("Finished cleaning the test set.", "Time needed:", time.time()-t2,"sec")

Finished cleaning the test set. Time needed: 79.10418558120728 sec


**Inspect the cleaned train and test data**

In [14]:
train_data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explan edit made usernam hardcor metallica fan...,0,0,0,0,0,0
1,000103f0d9cfb60f,aww match background colour seem stuck thank t...,0,0,0,0,0,0
2,000113f07ec002fd,hey man realli tri edit war guy constant remov...,0,0,0,0,0,0
3,0001b41b1c6bb37e,make real suggest improv wonder section statis...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chanc rememb page,0,0,0,0,0,0
5,00025465d4725e87,congratul well use tool well talk,0,0,0,0,0,0
6,0002bcb3da6cb337,cocksuck piss around work,1,1,1,0,1,0
7,00031b1e95af7921,vandal matt shirvington articl revert pleas ban,0,0,0,0,0,0
8,00037261f536c51d,sorri word nonsens offens anyway intend write ...,0,0,0,0,0,0
9,00040093b2687caa,align subject contrari dulithgow,0,0,0,0,0,0


In [15]:
test_data.head(10)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule succes ever what hate sad mof...
1,0000247867823ef7,rfc titl fine imo
2,00013b17ad220c46,sourc zaw ashton lapland
3,00017563c3f7919a,look back sourc inform updat correct form gues...
4,00017695ad8997eb,anonym edit articl
5,0001ea8717f6de06,thank understand think high would revert witho...
6,00024115d4cbde0f,pleas add nonsens wikipedia edit consid vandal...
7,000247e83dcc1211,dear god site horribl
8,00025358d4737918,fool believ number correct number lie ponder...
9,00026d1092fe71cc,doubl redirect fix doubl redirect blank outer ...


## 4. Create columns for length of the data

**A function for finding the length of text**

In [16]:
def find_length(text):
    """
    A function to find the length
    """
    text = str(text)
    return len(text.split())

**Create the column of text length in train and test data**

In [17]:
train_data['length'] = train_data['comment_text'].apply(find_length)
train_data.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,length
0,0000997932d777bf,explan edit made usernam hardcor metallica fan...,0,0,0,0,0,0,23
1,000103f0d9cfb60f,aww match background colour seem stuck thank t...,0,0,0,0,0,0,10
2,000113f07ec002fd,hey man realli tri edit war guy constant remov...,0,0,0,0,0,0,21
3,0001b41b1c6bb37e,make real suggest improv wonder section statis...,0,0,0,0,0,0,52
4,0001d958c54c6e35,sir hero chanc rememb page,0,0,0,0,0,0,5
5,00025465d4725e87,congratul well use tool well talk,0,0,0,0,0,0,6
6,0002bcb3da6cb337,cocksuck piss around work,1,1,1,0,1,0,4
7,00031b1e95af7921,vandal matt shirvington articl revert pleas ban,0,0,0,0,0,0,7
8,00037261f536c51d,sorri word nonsens offens anyway intend write ...,0,0,0,0,0,0,38
9,00040093b2687caa,align subject contrari dulithgow,0,0,0,0,0,0,4


In [18]:
test_data['length'] = test_data['comment_text'].apply(find_length)
test_data.head(10)

Unnamed: 0,id,comment_text,length
0,00001cee341fdb12,yo bitch ja rule succes ever what hate sad mof...,44
1,0000247867823ef7,rfc titl fine imo,4
2,00013b17ad220c46,sourc zaw ashton lapland,4
3,00017563c3f7919a,look back sourc inform updat correct form gues...,15
4,00017695ad8997eb,anonym edit articl,3
5,0001ea8717f6de06,thank understand think high would revert witho...,8
6,00024115d4cbde0f,pleas add nonsens wikipedia edit consid vandal...,17
7,000247e83dcc1211,dear god site horribl,4
8,00025358d4737918,fool believ number correct number lie ponder...,44
9,00026d1092fe71cc,doubl redirect fix doubl redirect blank outer ...,21


## 5. Save the modified train and test data

**Save directory**

In [19]:
save_dir = "../input/"

**Save the modified train and test data to the designated directory**

In [20]:
train_data.to_csv(save_dir+"modified_train_data.csv",header=True, index=False)

In [21]:
test_data.to_csv(save_dir+"modified_test_data.csv",header=True, index=False)

**Inpect the saved train and test csv**

In [22]:
pd.read_csv(save_dir+"modified_train_data.csv").head(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,length
0,0000997932d777bf,explan edit made usernam hardcor metallica fan...,0,0,0,0,0,0,23
1,000103f0d9cfb60f,aww match background colour seem stuck thank t...,0,0,0,0,0,0,10
2,000113f07ec002fd,hey man realli tri edit war guy constant remov...,0,0,0,0,0,0,21
3,0001b41b1c6bb37e,make real suggest improv wonder section statis...,0,0,0,0,0,0,52
4,0001d958c54c6e35,sir hero chanc rememb page,0,0,0,0,0,0,5
5,00025465d4725e87,congratul well use tool well talk,0,0,0,0,0,0,6
6,0002bcb3da6cb337,cocksuck piss around work,1,1,1,0,1,0,4
7,00031b1e95af7921,vandal matt shirvington articl revert pleas ban,0,0,0,0,0,0,7
8,00037261f536c51d,sorri word nonsens offens anyway intend write ...,0,0,0,0,0,0,38
9,00040093b2687caa,align subject contrari dulithgow,0,0,0,0,0,0,4


In [23]:
pd.read_csv(save_dir+"modified_train_data.csv").head(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,length
0,0000997932d777bf,explan edit made usernam hardcor metallica fan...,0,0,0,0,0,0,23
1,000103f0d9cfb60f,aww match background colour seem stuck thank t...,0,0,0,0,0,0,10
2,000113f07ec002fd,hey man realli tri edit war guy constant remov...,0,0,0,0,0,0,21
3,0001b41b1c6bb37e,make real suggest improv wonder section statis...,0,0,0,0,0,0,52
4,0001d958c54c6e35,sir hero chanc rememb page,0,0,0,0,0,0,5
5,00025465d4725e87,congratul well use tool well talk,0,0,0,0,0,0,6
6,0002bcb3da6cb337,cocksuck piss around work,1,1,1,0,1,0,4
7,00031b1e95af7921,vandal matt shirvington articl revert pleas ban,0,0,0,0,0,0,7
8,00037261f536c51d,sorri word nonsens offens anyway intend write ...,0,0,0,0,0,0,38
9,00040093b2687caa,align subject contrari dulithgow,0,0,0,0,0,0,4
