In [31]:
import os
import pandas as pd

In [32]:
dir_path = os.path.realpath('..')

In [33]:
path = 'data/raw/train.csv'

full_path = os.path.join(dir_path, path)
df_train = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_train.shape))

Dataset has 95851 rows, 7 columns.


In [34]:
df_train.loc[63564363128]

comment_text     REDIRECT Talk:Lakewood Church Central Campus
toxic                                                       0
severe_toxic                                                0
obscene                                                     0
threat                                                      0
insult                                                      0
identity_hate                                               0
Name: 63564363128, dtype: object

In [35]:
path = 'data/raw/test.csv'

full_path = os.path.join(dir_path, path)
df_test = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_test.shape))

Dataset has 226998 rows, 1 columns.


## Data cleaning

In [36]:
# fill NaN with string "unknown"
df_train.fillna('unknown',inplace=True)
df_test.fillna('unknown',inplace=True)

## Clean text

In [37]:
import string
import nltk
nltk.data.path.append("/Users/joaeechew/dev/nltk_data")

from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary

from os import listdir
from collections import Counter

First start with simpler models (more cleaning).

I want to:
- Keep exclamation marks only as it may be an indicator for toxicity
- Keep uppercase as it might be toxic
- Remove stopwords (to try with and without)
- Stemming (to try with and without)
- Remove numbers

In [53]:
def process_text(corpus, vocab):
    """Takes a corpus in list format and applies basic preprocessing steps of word tokenization,
     removing of english stop words, and lemmatization. Returns processed corpus and vocab."""
    processed_corpus = []
    english_words = set(nltk.corpus.words.words())
    english_stopwords = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'[\w|!]+')
    for row in corpus:
        tokens = tokenizer.tokenize(row)
        tokens = [t for t in tokens if not t.isdigit()]
        tokens = [t for t in tokens if t in english_words or not t.isalpha()]
        tokens = [t for t in tokens if not t in english_stopwords]
        tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
        tokens = ' '.join(tokens)
        if tokens == '':
            tokens = 'cleaned'
        processed_corpus.append(tokens)
        vocab.update(tokens)
    return processed_corpus, vocab

In [57]:
%%time
vocab = Counter()
df_train.comment_text, vocab = process_text(df_train.comment_text, vocab)
df_test.comment_text, vocab = process_text(df_test.comment_text, vocab)

CPU times: user 1min 27s, sys: 297 ms, total: 1min 28s
Wall time: 1min 28s


In [58]:
print(vocab.most_common(50))
print(len(vocab))

[(' ', 8482273), ('e', 6353497), ('i', 3756433), ('a', 3665418), ('t', 3547298), ('n', 3343782), ('o', 3151722), ('r', 3030999), ('l', 2688386), ('s', 2445816), ('c', 1972028), ('d', 1809572), ('p', 1418099), ('u', 1414904), ('I', 1385482), ('m', 1313992), ('g', 1240393), ('h', 1028126), ('y', 941802), ('K', 679289), ('f', 641508), ('w', 615215), ('k', 614394), ('b', 589760), ('v', 584110), ('N', 420559), ('_', 404735), ('L', 402442), ('W', 333516), ('!', 318612), ('T', 172878), ('x', 149516), ('A', 134245), ('E', 111853), ('R', 67948), ('q', 60387), ('j', 54642), ('S', 50962), ('M', 49818), ('X', 42155), ('C', 41987), ('J', 38948), ('z', 38295), ('F', 37879), ('1', 36748), ('|', 33260), ('0', 33168), ('2', 30708), ('D', 29170), ('B', 28952)]
750


In [59]:
df_train[0:10]

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
22256635,kiss geek I said true I account,1,0,0,0,0,0
27450690,vandalize edit W S continue blocked,0,0,0,0,0,0
54037174,interest I removed interest section added kind...,0,0,0,0,0,0
77493077,nationality aware shown support towards community,0,0,0,0,0,0
79357270,The reader going say ethereal vocal style dark...,0,0,0,0,0,0
82428052,sum fried,0,0,0,0,0,0
87311443,put English example people like,0,0,0,0,0,0
114749757,Guy resident go carnival every year town enjoy...,0,0,0,0,0,0
138560519,far go article embarrassing fish golden fish f...,0,0,0,0,0,0
139353149,hear corrected,0,0,0,0,0,0


In [60]:
# keep tokens with a min occurrence
min_occurane = 5
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))

237


## Output

In [61]:
# save list to file
def save_list(lines, filename):
    # convert lines to a single blob of text data = '\n'.join(lines)
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()

In [62]:
# save tokens to a vocabulary file

path = 'data/processed/vocab.txt'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

save_list(tokens, full_path)

In [63]:
path = 'data/processed/train.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

df_train.to_csv(full_path, header=True, index=True)

In [64]:
path = 'data/processed/test.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

df_test.to_csv(full_path, header=True, index=True)