In [1]:
import os
import pandas as pd

In [2]:
dir_path = os.path.realpath('..')

In [3]:
path = 'data/raw/train.csv'

full_path = os.path.join(dir_path, path)
df_train = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_train.shape))

Dataset has 95851 rows, 7 columns.


In [4]:
path = 'data/raw/test.csv'

full_path = os.path.join(dir_path, path)
df_test = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_test.shape))

Dataset has 226998 rows, 1 columns.


## Data cleaning

In [5]:
# fill NaN with string "unknown"
df_train.fillna('unknown',inplace=True)
df_test.fillna('unknown',inplace=True)

## Clean text

In [6]:
import string
import nltk
nltk.data.path.append("/Users/joaeechew/dev/nltk_data")

from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary

from os import listdir
from collections import Counter

First start with simpler models (more cleaning).

I want to:
- Keep exclamation marks only as it may be an indicator for toxicity
- Keep uppercase as it might be toxic
- Remove stopwords (to try with and without)
- Stemming (to try with and without)
- Remove numbers

In [7]:
def process_text(corpus, vocab):
    """Takes a corpus in list format and applies basic preprocessing steps of word tokenization,
     removing of english stop words, and lemmatization. Returns processed corpus and vocab."""
    processed_corpus = []
    english_words = set(nltk.corpus.words.words())
    english_stopwords = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'[\w|!]+')
    for row in corpus:
        word_tokens = tokenizer.tokenize(row)
        word_tokens_no_digits = [t for t in word_tokens if not t.isdigit()]
        word_tokens_english = [t for t in word_tokens_no_digits if t in english_words or not t.isalpha()]
        word_tokens_no_stops = [t for t in word_tokens_english if not t in english_stopwords]
        word_tokens_no_stops_lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in word_tokens_no_stops]
        processed_corpus.append(word_tokens_no_stops_lemmatized)
        vocab.update(word_tokens_no_stops_lemmatized)
    return processed_corpus, vocab

In [8]:
%%time
vocab = Counter()
df_train.comment_text, vocab = process_text(df_train.comment_text, vocab)
df_test.comment_text, vocab = process_text(df_test.comment_text, vocab)

CPU times: user 1min 43s, sys: 621 ms, total: 1min 44s
Wall time: 1min 44s


In [9]:
print(vocab.most_common(50))
print(len(vocab))

[('I', 379991), ('WIKI_LINK', 313510), ('article', 124798), ('page', 118990), ('The', 101726), ('use', 70128), ('would', 63711), ('one', 59963), ('edit', 58939), ('like', 55635), ('talk', 51022), ('please', 49581), ('may', 45440), ('deletion', 44252), ('see', 41889), ('image', 41563), ('think', 39360), ('link', 38259), ('also', 35300), ('make', 34680), ('time', 34584), ('know', 33068), ('information', 30459), ('people', 29877), ('used', 29380), ('need', 28992), ('copyright', 28065), ('hi', 27704), ('free', 27098), ('EXTERNAL_LINK', 26877), ('made', 26835), ('policy', 25284), ('name', 23842), ('A', 23456), ('speedy', 22820), ('way', 22464), ('could', 22357), ('source', 22282), ('add', 21650), ('content', 21491), ('want', 20840), ('even', 20830), ('section', 20686), ('vandalism', 20336), ('good', 20328), ('get', 20327), ('fair', 20209), ('work', 19822), ('help', 19776), ('well', 19106)]
99177


In [10]:
df_train[0:10]

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
22256635,"[kiss, geek, I, said, true, I, account]",1,0,0,0,0,0
27450690,"[vandalize, edit, W, S, continue, blocked]",0,0,0,0,0,0
54037174,"[interest, I, removed, interest, section, adde...",0,0,0,0,0,0
77493077,"[nationality, aware, shown, support, towards, ...",0,0,0,0,0,0
79357270,"[The, reader, going, say, ethereal, vocal, sty...",0,0,0,0,0,0
82428052,"[sum, fried]",0,0,0,0,0,0
87311443,"[put, English, example, people, like]",0,0,0,0,0,0
114749757,"[Guy, resident, go, carnival, every, year, tow...",0,0,0,0,0,0
138560519,"[far, go, article, embarrassing, fish, golden,...",0,0,0,0,0,0
139353149,"[hear, corrected]",0,0,0,0,0,0


In [12]:
# keep tokens with a min occurrence
min_occurane = 5
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))

26196


## Output

In [13]:
# save list to file
def save_list(lines, filename):
    # convert lines to a single blob of text data = '\n'.join(lines)
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()

In [14]:
# save tokens to a vocabulary file

path = 'data/processed/vocab.txt'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

save_list(tokens, full_path)

In [15]:
path = 'data/processed/train.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

df_train.to_csv(full_path, header=True, index=True)

In [16]:
path = 'data/processed/test.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

df_test.to_csv(full_path, header=True, index=True)