In [1]:
import numpy as np
import pandas as pd
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import time
import string
import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_stop_words(path='../datasets/stopwords.txt'):
    """Load the stop-word list from a text file with one entry per line.

    Parameters
    ----------
    path : str, optional
        Location of the stop-word file. Defaults to
        ``'../datasets/stopwords.txt'``, resolved against the current
        working directory.

    Returns
    -------
    set of str
        Every line of the file with leading/trailing whitespace removed.
    """
    # The context manager closes the handle even if reading raises.
    with open(path, 'r') as f:
        return {line.strip() for line in f}

def stemming(df):
    """Stem every whitespace-separated token of every text with Snowball.

    Parameters
    ----------
    df : pandas.Series
        Series of strings (despite the parameter name, this is a single
        text column, not a DataFrame).

    Returns
    -------
    pandas.Series
        Same index as the input; each text is rebuilt from its stemmed
        tokens joined by single spaces.

    The elapsed wall-clock time is printed as a side effect.
    """
    started = time.time()
    snowball = SnowballStemmer('english')

    def _stem_text(text):
        # split() without arguments also swallows repeated/leading whitespace.
        stems = []
        for token in text.split():
            stems.append(snowball.stem(token))
        return ' '.join(stems)

    stemmed = df.apply(_stem_text)
    finished = time.time()
    print('Stemming, time:', finished - started)
    return stemmed

def _drop_noise_tokens(series, stop_words, min_len=3, max_len=19):
    """Keep tokens that are not stop words and whose length is in [min_len, max_len]."""
    def _keep(text):
        return ' '.join(tok for tok in text.split()
                        if tok not in stop_words and min_len <= len(tok) <= max_len)
    return series.apply(_keep)


def _collapse_letter_runs(series, min_run):
    """Replace each run of `min_run` or more identical lowercase letters by one letter."""
    # A single backreference pattern does in one pass what 26 per-letter
    # patterns ('[a]{n,}', '[b]{n,}', ...) would do in 26 passes.
    return series.replace(r'([a-z])\1{%d,}' % (min_run - 1), r'\1', regex=True)


def normalization(df, stop_words=None):
    """Clean and stem a column of raw comment texts.

    Steps, in order:
      1. lowercase;
      2. replace URLs, then every character that is not a letter or an
         apostrophe, by a space;
      3. collapse runs of 3+ identical letters to one ('coooool' -> 'col');
      4. drop stop words and tokens shorter than 3 or longer than 19 chars;
      5. stem the remaining tokens via `stemming`;
      6. collapse runs of 2+ identical letters to one ('stuff' -> 'stuf');
      7. apply the token filter of step 4 again, since stemming and
         collapsing can produce new short tokens or stop words.

    Parameters
    ----------
    df : pandas.Series
        Series of strings (a single text column, despite the name).
        Non-string values such as NaN are not supported: step 1 calls
        ``.lower()`` on every element.
    stop_words : collection of str, optional
        Tokens to discard. When omitted, the list is loaded with
        `get_stop_words()`.

    Returns
    -------
    pandas.Series
        Cleaned texts with the same index; a text may become ``''``.

    Timing of the two cleaning phases is printed as a side effect.
    """
    time1 = time.time()
    df = df.apply(lambda x: x.lower())

    # Raw strings are essential here: in a normal string literal '\b' is a
    # backspace character, which silently prevents the URL pattern from
    # ever matching.
    patterns = [
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)',
        r"[^a-zA-Z']",
    ]
    for pattern in patterns:
        df = df.replace(pattern, ' ', regex=True)

    df = _collapse_letter_runs(df, 3)

    if stop_words is None:
        stop_words = get_stop_words()

    df = _drop_noise_tokens(df, stop_words)

    time2 = time.time()
    print('Cleaned data, time:', time2 - time1)
    df = stemming(df)
    time3 = time.time()

    # After stemming even doubled letters are squeezed, which is more
    # aggressive than the pre-stemming pass.
    df = _collapse_letter_runs(df, 2)
    df = _drop_noise_tokens(df, stop_words)
    time4 = time.time()

    print('Cleaned data after stemming, time:', time4 - time3)
    return df

In [3]:
# Read the raw splits in one go; every file is indexed by its `id` column.
train, test_X, test_y = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../datasets/train.csv',
        '../datasets/test.csv',
        '../datasets/test_labels.csv',
    )
)
# Last expression of the cell: preview the first training rows.
train.head()

Unnamed: 0_level_0,comment_text,toxic
id,Unnamed: 1_level_1,Unnamed: 2_level_1
c51ed32bc0300a61,October 2008 \n You currently appear to be eng...,0
7e1cb02f190eeccf,Jean-Louis Heinrich \n\nYou are editing contra...,0
c38f023ad1df8577,Putting 'citation required' on just about ever...,0
dfb8d36b145916eb,My Horse \n\nmy horse is probily a mix of ever...,0
b1ca628a1aa1c94f,"""\n\n Hiatus? \n\nEminem is finally done with ...",0


In [4]:
# Take an explicit copy: writing into a column selection of `train` is an
# assignment on a slice, which raises SettingWithCopyWarning (hidden here by
# the global warnings filter) and has version-dependent semantics in pandas.
X = train[['comment_text']].copy()
y = train[['toxic']]

print('Train dataset:')
X['comment_text'] = normalization(X['comment_text'])
print('Test dataset:')
test_X['comment_text'] = normalization(test_X['comment_text'])

# Comments that end up empty after cleaning get a fixed placeholder text.
X['comment_text'] = X['comment_text'].replace('', 'no comment')
test_X['comment_text'] = test_X['comment_text'].replace('', 'no comment')

Train dataset:
Cleaned data, time: 8.522449970245361
Lemmating, time: 7.326029062271118
Cleaned data after stemming, time:, time: 4.119665861129761
Test dataset:
Cleaned data, time: 3.8118820190429688
Lemmating, time: 2.4599390029907227
Cleaned data after stemming, time:, time: 1.8665552139282227


In [5]:
# Put the cleaned text back into the full training frame so the saved file
# carries the normalized `comment_text` together with the other columns.
train['comment_text'] = X['comment_text']
# Write both processed splits to the datasets directory; with to_csv's
# defaults the `id` index is written as the first column.
train.to_csv('../datasets/processed_train.csv')
test_X.to_csv('../datasets/processed_test.csv')