In [7]:
import pandas as pd
import numpy as np
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
import re

# Directory holding the input CSVs, relative to this notebook.
PATH = '../../data/'

# Load both splits with the same reader call.
train, test = (
    pd.read_csv(PATH + filename) for filename in ('train.csv', 'test.csv')
)

# Report (rows, columns) of the training split.
print(train.shape)

(159571, 8)


In [8]:
# Lowercase contraction -> expansion lookup used when cleaning comments.
# Keys must be unique: a repeated key in a dict literal silently overwrites
# the earlier entry.
APO = {
    "aren't" : "are not",
    "can't" : "cannot",
    "couldn't" : "could not",
    "didn't" : "did not",
    "doesn't" : "does not",
    "don't" : "do not",
    "hadn't" : "had not",
    "hasn't" : "has not",
    "haven't" : "have not",
    "he'd" : "he would",
    "he'll" : "he will",
    "he's" : "he is",
    # "i'd" was listed twice ("I would" / "I had"); a single entry is kept,
    # using "would" like every other "'d" expansion in this table.
    "i'd" : "I would",
    "i'll" : "I will",
    "i'm" : "I am",
    "isn't" : "is not",
    "it's" : "it is",
    "it'll":"it will",
    "i've" : "I have",
    "let's" : "let us",
    "mightn't" : "might not",
    "mustn't" : "must not",
    "shan't" : "shall not",
    "she'd" : "she would",
    "she'll" : "she will",
    "she's" : "she is",
    "shouldn't" : "should not",
    "that's" : "that is",
    "there's" : "there is",
    "they'd" : "they would",
    "they'll" : "they will",
    "they're" : "they are",
    "they've" : "they have",
    "we'd" : "we would",
    "we're" : "we are",
    "weren't" : "were not",
    "we've" : "we have",
    "what'll" : "what will",
    "what're" : "what are",
    "what's" : "what is",
    "what've" : "what have",
    "where's" : "where is",
    "who'd" : "who would",
    "who'll" : "who will",
    "who're" : "who are",
    "who's" : "who is",
    "who've" : "who have",
    "won't" : "will not",
    "wouldn't" : "would not",
    "you'd" : "you would",
    "you'll" : "you will",
    "you're" : "you are",
    "you've" : "you have",
    "'re": " are",
    "wasn't": "was not",
    # Previously " will", which dropped the pronoun from the expansion.
    "we'll": "we will",
    "tryin'":"trying"
}

In [9]:
# Shared WordNet lemmatizer instance; clean() calls lem.lemmatize on each token.
lem = WordNetLemmatizer()
# Shared TweetTokenizer instance; clean() calls tok.tokenize to split comments.
tok = TweetTokenizer()

def clean(comment):
    """Normalize a raw comment into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; collapse newlines into a single space; strip
    dotted-quad IP addresses and ``[[...]]`` wiki-style markup; drop ``=``,
    ``"`` and ``,``; squeeze runs of spaces; tokenize with ``tok``; expand
    contractions through ``APO``; re-tokenize; lemmatize every token with
    ``lem`` using the ``'v'`` part-of-speech tag.

    Args:
        comment: Raw comment text (``str``).

    Returns:
        The cleaned text, or ``'na'`` when nothing is left after cleaning.
    """
    comment = comment.lower()
    comment = re.sub(r'\n+', ' ', comment)
    # Remove leaky elements like IP addresses. The dots are escaped so only
    # dotted quads match; an unescaped '.' matches any character and would
    # also delete plain digit runs such as '1234567'.
    comment = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', comment)
    # Remove [[...]] markup (usernames). The match is lazy so it stops at the
    # first closing bracket; a greedy '.*' runs to the LAST ']' in the comment
    # and discards all the text in between.
    comment = re.sub(r'\[\[.*?\]\]?', '', comment)
    comment = re.sub(r'[=",]', '', comment)
    comment = re.sub(r' +', ' ', comment)

    tokens = tok.tokenize(comment)
    # An expansion can hold several words ("do not"), so the expanded text is
    # joined and tokenized again before lemmatizing.
    expanded = ' '.join(APO.get(token, token) for token in tokens)
    tokens = tok.tokenize(expanded)

    text = ' '.join(lem.lemmatize(token, 'v') for token in tokens)
    return text if text else 'na'

# total number of whitespace-separated words
def word_count(comment):
    """Return how many whitespace-delimited words ``comment`` contains."""
    words = comment.split()
    return len(words)
# number of distinct whitespace-separated words (case-sensitive)
def unique_word_count(comment):
    """Return the number of distinct whitespace-delimited words in ``comment``."""
    distinct_words = {word for word in comment.split()}
    return len(distinct_words)
# number of runs of two or more consecutive question marks (e.g. ???)
def multi_question_mark_count(comment):
    """Return how many runs of 2+ consecutive ``?`` appear in ``comment``."""
    return sum(1 for _ in re.finditer(r'\?{2,}', comment))
# number of runs of two or more consecutive exclamation marks (e.g. !!!)
def multi_exclamation_mark_count(comment):
    """Return how many runs of 2+ consecutive ``!`` appear in ``comment``."""
    return sum(1 for _ in re.finditer(r'!{2,}', comment))
# number of ASCII uppercase letters (A-Z)
def uppercase_letter_count(comment):
    """Return the number of characters in ``comment`` matching ``[A-Z]``."""
    capitals = re.findall(r'[A-Z]', comment)
    return len(capitals)
# number of ellipses, i.e. runs of three or more periods (...)
def ellipsis_count(comment):
    """Return how many runs of 3+ consecutive ``.`` appear in ``comment``."""
    return sum(1 for _ in re.finditer(r'\.{3,}', comment))
# number of period runs: a single period and an ellipsis each count once
def period_count(comment):
    """Return how many runs of one or more ``.`` appear in ``comment``."""
    period_runs = re.findall(r'\.+', comment)
    return len(period_runs)
# count parentheses pairs
def parentheses_pair_count(comment):
    """Return the number of matched ``(`` ... ``)`` pairs in ``comment``.

    Every closing parenthesis that has an unmatched opening parenthesis
    somewhere before it forms one pair, so separate pairs and nested pairs are
    each counted. The earlier greedy pattern ``\\(.*\\)`` spanned from the
    first ``(`` to the last ``)`` on a line and therefore reported at most one
    match per line, e.g. 1 for ``"(a) and (b)"``.

    Args:
        comment: Text to scan (``str``).

    Returns:
        The number of matched pairs; unmatched ``(`` or ``)`` are ignored.
    """
    open_count = 0
    pairs = 0
    for char in comment:
        if char == '(':
            open_count += 1
        elif char == ')' and open_count:
            # Closes the most recent unmatched '('.
            open_count -= 1
            pairs += 1
    return pairs

# Replace missing comments with the placeholder 'na' so every row is a string.
# The result is assigned back to the column: calling fillna(inplace=True) on
# train['comment_text'] is chained in-place assignment, which raises a
# FutureWarning on recent pandas and does not update the frame at all under
# Copy-on-Write.
print('inplace na')
train['comment_text'] = train['comment_text'].fillna('na')
test['comment_text'] = test['comment_text'].fillna('na')

print('comment text cleaned')
train['comment_text_cleaned'] = train['comment_text'].apply(clean)
test['comment_text_cleaned'] = test['comment_text'].apply(clean)

# One row per feature:
# (progress label, column for the raw text, column for the cleaned text, function).
# Column names are kept exactly as before (including the 'parentheses_paird'
# spelling) so anything reading the exported CSVs keeps working.
FEATURE_SPECS = [
    ('word count', 'word_count', 'cleaned_word_count', word_count),
    ('unique word count', 'unique_word_count', 'cleaned_unique_word_count',
     unique_word_count),
    ('consecutive question marks', 'consecutive_question_marks',
     'cleaned_consecutive_question_marks', multi_question_mark_count),
    ('consecutive exclamation marks', 'consecutive_exclamation_marks',
     'cleaned_consecutive_exclamation_marks', multi_exclamation_mark_count),
    ('uppercase letters', 'uppercase_letters', 'cleaned_uppercase_letters',
     uppercase_letter_count),
    ('ellipsis', 'ellipsis', 'cleaned_ellipsis', ellipsis_count),
    ('period and ellipsis', 'period', 'cleaned_period', period_count),
    ('parentheses pairs', 'parentheses_paird', 'cleaned_parentheses_pair',
     parentheses_pair_count),
]

for label, raw_col, cleaned_col, feature_fn in FEATURE_SPECS:
    print(label)
    # Raw-text feature first, then the cleaned-text feature, so the column
    # order in each frame matches the previous copy-pasted version.
    for frame in (train, test):
        frame[raw_col] = frame['comment_text'].apply(feature_fn)
    for frame in (train, test):
        frame[cleaned_col] = frame['comment_text_cleaned'].apply(feature_fn)

print(train.columns.values)

inplace na
comment text cleaned
word count
unique word count
consecutive question marks
consecutive exclamation marks
uppercase letters
ellipsis
period and ellipsis
parentheses pairs
['id' 'comment_text' 'toxic' 'severe_toxic' 'obscene' 'threat' 'insult'
 'identity_hate' 'comment_text_cleaned' 'word_count' 'cleaned_word_count'
 'unique_word_count' 'cleaned_unique_word_count'
 'consecutive_question_marks' 'cleaned_consecutive_question_marks'
 'consecutive_exclamation_marks' 'cleaned_consecutive_exclamation_marks'
 'uppercase_letters' 'cleaned_uppercase_letters' 'ellipsis'
 'cleaned_ellipsis' 'period' 'cleaned_period' 'parentheses_paird'
 'cleaned_parentheses_pair']


In [10]:
# Persist both engineered frames next to the input data (index column included).
for frame, filename in ((train, 'cleaned_train.csv'), (test, 'cleaned_test.csv')):
    frame.to_csv(PATH + filename)
print('done')

done
