In [1]:
import re, os, sys, string, itertools
import numpy as np
import pandas as pd
from nltk.tokenize import WordPunctTokenizer, word_tokenize, StanfordSegmenter, sent_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [2]:
raw_train = pd.read_csv("./train.csv")
raw_test = pd.read_csv("./test.csv")
label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
raw_train = raw_train[:300]
raw_test = raw_test[:100]
#raw_train

In [3]:
# raw_train.comment_text.dtypes # Pandas string

In [4]:
# raw_train.comment_text.value_counts()

In [5]:
#for x in raw_train.comment_text.head(5):
#   print(x)

In [6]:
def basic_clean_data(series):
    series = series.copy()
    series = series.str.replace(r"\d", "")
    series = series.str.replace(r"[^a-zA-Z0-9.,\"!]+", " ")
    series = series.str.replace("'", "")
    series = series.str.replace(r"\\n{1,}", " line ")
    return series

In [7]:
def remove_ip_address(series): #comment_text as series
    series = series.copy()
    regulation = re.compile(r'(([0-9]{1,}\.){2,}[0-9]{1,})') #ip format
    series = series.str.replace(regulation, ' ')
    return series

In [8]:
def unique_list(l):
    ulist = []
    for x in l:
        if x not in ulist:
            ulist.append(x)
    return ulist

#test_x = "teeeeeeeeest, test, test, test if we have repeated words"
#test_x = ' '.join(unique_list(test_x.split(" ")))
#test_x: test, test if we have repeated words

In [9]:
def remove_repeat(series):
    series = series.copy()
    series = series.apply(lambda x: ' '.join(unique_list(str(x).split(" "))))
    pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
    series = series.apply(lambda x: pattern.sub(r"\1", x))
    return series

In [10]:
#test_x = "teeeeeeeeest, test, test, test if we have repeated words"
#pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
#test_x = pattern.sub(r"\1", test_x)
#test_x

In [11]:
# WordNet® is a large lexical database of English. Nouns, verbs, 
# adjectives and adverbs are grouped into sets of cognitive synonyms (synsets), 
# each expressing a distinct concept

def get_part_of_speech(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [12]:
def normalize(text):
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    stop_words = set(stopwords.words('english'))
    # one to one mapping the filters characters to its len's " "
    translate_map = str.maketrans(filters, " " * len(filters)) 
    
    text = text.lower()
    text = text.translate(translate_map)
    
    tokens = nltk.word_tokenize(text)
    
    tags = nltk.pos_tag(tokens)
    
    normalized_text = [WordNetLemmatizer().lemmatize(tag[0], pos=get_part_of_speech(tag[1])) for tag in tags if tag[0] not in stop_words if len(tag[0]) > 2]

    return normalized_text

In [13]:
def preprocess(series, 
               basic_clean = True, remove_ip= True,
               normalization = True,remove_rep = True):
    series = series.copy()
    
    if basic_clean:
        series = basic_clean_data(series)
    
    if remove_ip:
        series = remove_ip_address(series)
        
    if normalization:
        series = series.apply(lambda x: normalize(x))
    
    if remove_rep:
        series = remove_repeat(series)
    
    return series

In [14]:
raw_train.comment_text = preprocess(raw_train.comment_text)
raw_test.comment_text = preprocess(raw_test.comment_text)

In [15]:
#raw_train

In [16]:
#write out
raw_train.to_csv("./train_preprocessed_v1.csv", index=False)
raw_test.to_csv("./test_preprocessed_v1.csv", index=False)

In [17]:
train_preprocessed = pd.read_csv("./train_preprocessed_v1.csv")

In [18]:
train_preprocessed.comment_text

0      ['explanation', 'edits', 'make', 'username', '...
1      ['aww', 'match', 'background', 'colour', 'seem...
2      ['hey', 'man', 'really', 'try', 'edit', 'war',...
3      ['make', 'real', 'suggestion', 'improvement', ...
4          ['sir', 'hero', 'chance', 'remember', 'page']
                             ...                        
295    ['user', 'worthless', 'goddamn', 'faggot', 'fu...
296    ['unsourced', 'aesthetic', 'opinion', 'therefo...
297    ['image', 'popclassic', 'jpg', 'tag', 'image',...
298       ['fuck', 'administrator', 'authority', 'tell']
299                         ['adorably', 'disingenuous']
Name: comment_text, Length: 300, dtype: object