In [1]:
import os
import pandas as pd

In [2]:
dir_path = os.path.realpath('..')

In [46]:
path = 'data/raw/train.csv'

full_path = os.path.join(dir_path, path)
df_train = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_train.shape))

Dataset has 95851 rows, 7 columns.


In [47]:
path = 'data/raw/test.csv'

full_path = os.path.join(dir_path, path)
df_test = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_test.shape))

Dataset has 226998 rows, 1 columns.


## Data cleaning

In [48]:
# fill NaN with string "unknown"
df_train.fillna('unknown',inplace=True)
df_test.fillna('unknown',inplace=True)

## Clean text

In [28]:
import string
import nltk
nltk.data.path.append("/Users/joaeechew/dev/nltk_data")

from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary

In [15]:
print(df.comment_text[:10])

id
22256635     Nonsense?  kiss off, geek. what I said is true...
27450690     "\n\n Please do not vandalize pages, as you di...
54037174     "\n\n ""Points of interest"" \n\nI removed the...
77493077     Asking some his nationality is a Racial offenc...
79357270     The reader here is not going by my say so for ...
82428052         Fried chickens \n\nIs dat sum fried chickens?
87311443     Why can you put English for example on some pl...
114749757    Guy Fawkes \n\nim a resident in bridgwater and...
138560519    as far as nicknames go this article is embarra...
139353149    Woodland Meadows\nGood to hear that you correc...
Name: comment_text, dtype: object


In [21]:
df.comment_text.iloc[10234]

"Response to the lies in your RFC TAB \n\nSo what happens when all of your and  wikitricks and threats fail? Then what? Are you going to be willing to engage in reasonable discussion?\n\nDo you want to discuss this reasonably or merely continue to engage in this sily pantomime?\n\nYou are wrong about Canaen. I have nothing to do with him. Your personal attack on him is based on lies. Out of decency, you should remove them. And if you are not going to be decent, how will we progress?\n\n Vegan smegma \n\n male penis]]\n\nOn a more productive note, and frankly, I produce tons of the stuff, I would like to discuss with you the vegan aspects of smegma and whether or not you might consider it suitable for your inclusion. \n\nIf you are not sure what that means, there is a very good article on the subject here ; , although perhaps we ought consider starting a separate page on vegan smegma and other cruelty free animal products that may be considered by some vegan.\n\nFrom the vegan point of 

First start with simpler models (more cleaning).

I want to:
- Keep exclamation marks only as it may be an indicator for toxicity
- Keep uppercase as it might be toxic
- Remove stopwords (to try with and without)
- Stemming (to try with and without)
- Remove numbers

In [44]:
def preprocess_text(corpus):
    """Takes a corpus in list format and applies basic preprocessing steps of word tokenization,
     removing of english stop words, and lemmatization."""
    processed_corpus = []
    english_words = set(nltk.corpus.words.words())
    english_stopwords = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'[\w|!]+')
    for row in corpus:
        word_tokens = tokenizer.tokenize(row)
        word_tokens_no_digits = [t for t in word_tokens if not t.isdigit()]
        word_tokens_english = [t for t in word_tokens_no_digits if t in english_words or not t.isalpha()]
        word_tokens_no_stops = [t for t in word_tokens_english if not t in english_stopwords]
        word_tokens_no_stops_lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in word_tokens_no_stops]
        processed_corpus.append(word_tokens_no_stops_lemmatized)
    return processed_corpus

In [49]:
%%time
df_train.comment_text = preprocess_text(df_train.comment_text)
df_test.comment_text = preprocess_text(df_test.comment_text)

CPU times: user 1min 47s, sys: 1.47 s, total: 1min 49s
Wall time: 1min 54s


In [50]:
df_train[100:150]

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1065542064,"[3rd, party, cover, subject, v, r, P]",0,0,0,0,0,0
1086099304,"[good, eve, I, agree, mostly, fine, afternoon,...",0,0,0,0,0,0
1098067176,"[interesting, new, user, would, immense, knowl...",0,0,0,0,0,0
1099662066,"[March, stop, continue, vandalize, blocked, ad...",0,0,0,0,0,0
1102537245,"[spelling, Pluto, article, I, undid, article, ...",1,0,1,0,1,0
1108786298,"[photo, request]",0,0,0,0,0,0
1115316430,"[userpage!, click, codelyoko193]",0,0,0,0,0,0
1130875035,"[refuse, explain, reason, block]",0,0,0,0,0,0
1137429102,"[said, look, people, repost, material, true, a...",0,0,0,0,0,0
1148598836,"[vandalism, edit, Adam, please, explain, going...",0,0,0,0,0,0


## Output

In [51]:
path = 'data/interim/train.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

df_train.to_csv(full_path, header=True, index=True)

In [52]:
path = 'data/interim/test.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

df_test.to_csv(full_path, header=True, index=True)