In [1]:
import os
import pandas as pd

In [2]:
dir_path = os.path.realpath('..')

In [3]:
path = 'data/raw/train.csv'

full_path = os.path.join(dir_path, path)
df_train = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_train.shape))

Dataset has 95851 rows, 7 columns.


In [4]:
df_train.loc[63564363128]

comment_text     REDIRECT Talk:Lakewood Church Central Campus
toxic                                                       0
severe_toxic                                                0
obscene                                                     0
threat                                                      0
insult                                                      0
identity_hate                                               0
Name: 63564363128, dtype: object

In [5]:
path = 'data/raw/test.csv'

full_path = os.path.join(dir_path, path)
df_test = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_test.shape))

Dataset has 226998 rows, 1 columns.


## Data cleaning

In [6]:
# fill NaN with string "unknown"
df_train.fillna('unknown',inplace=True)
df_test.fillna('unknown',inplace=True)

## Clean text

In [7]:
import string
import nltk
nltk.data.path.append("/Users/joaeechew/dev/nltk_data")

from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary

from os import listdir
from collections import Counter

First start with simpler models (more cleaning).

I want to:
- Keep exclamation marks only as it may be an indicator for toxicity
- Keep uppercase as it might be toxic
- Remove stopwords (to try with and without)
- Stemming (to try with and without)
- Remove numbers

In [8]:
def process_text(corpus, vocab):
    """Takes a corpus in list format and applies basic preprocessing steps of word tokenization,
     removing of english stop words, and lemmatization. Returns processed corpus and vocab."""
    processed_corpus = []
    english_words = set(nltk.corpus.words.words())
    english_stopwords = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'[\w|!]+')
    for row in corpus:
        tokens = tokenizer.tokenize(row)
        tokens = [t for t in tokens if not t.isdigit()]
        tokens = [t for t in tokens if t in english_words or not t.isalpha()]
        tokens = [t for t in tokens if not t in english_stopwords]
        tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
        tokens = ' '.join(tokens)
        if tokens == '':
            tokens = 'cleaned'
        processed_corpus.append(tokens)
        vocab.update(tokens)
    return processed_corpus, vocab

In [9]:
%%time
vocab = Counter()
df_train.comment_text, vocab = process_text(df_train.comment_text, vocab)
df_test.comment_text, vocab = process_text(df_test.comment_text, vocab)

KeyboardInterrupt: 

In [10]:
print(vocab.most_common(50))
print(len(vocab))

[(' ', 26537), ('e', 19714), ('i', 11637), ('t', 11581), ('a', 11423), ('n', 10835), ('o', 10370), ('r', 9908), ('l', 8577), ('s', 7799), ('!', 6244), ('c', 6143), ('d', 5582), ('u', 4684), ('p', 4450), ('m', 4037), ('g', 4004), ('h', 3446), ('y', 3186), ('w', 2145), ('k', 2104), ('f', 1995), ('b', 1952), ('v', 1804), ('I', 1738), ('A', 929), ('T', 468), ('x', 422), ('G', 246), ('j', 222), ('q', 214), ('F', 177), ('H', 175), ('S', 157), ('_', 156), ('J', 149), ('z', 145), ('Y', 145), ('M', 127), ('C', 127), ('P', 122), ('R', 118), ('B', 117), ('E', 103), ('N', 96), ('D', 96), ('1', 86), ('|', 80), ('2', 77), ('0', 55)]
68


In [11]:
df_train[0:10]

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0
82428052,Fried chickens \n\nIs dat sum fried chickens?,0,0,0,0,0,0
87311443,Why can you put English for example on some pl...,0,0,0,0,0,0
114749757,Guy Fawkes \n\nim a resident in bridgwater and...,0,0,0,0,0,0
138560519,as far as nicknames go this article is embarra...,0,0,0,0,0,0
139353149,Woodland Meadows\nGood to hear that you correc...,0,0,0,0,0,0


In [12]:
# keep tokens with a min occurrence
min_occurane = 5
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))

65


## Output

In [13]:
# save list to file
def save_list(lines, filename):
    # convert lines to a single blob of text data = '\n'.join(lines)
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()

In [14]:
# save tokens to a vocabulary file

path = 'data/processed/vocab.txt'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

save_list(tokens, full_path)

In [15]:
path = 'data/processed/train.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

df_train.to_csv(full_path, header=True, index=True)

In [16]:
path = 'data/processed/test.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

df_test.to_csv(full_path, header=True, index=True)