In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
pd.options.display.float_format = '{:20.4f}'.format

In [2]:
test = pd.read_csv("data/jigsaw-unintended-bias-in-toxicity-classification/test.csv")
train = pd.read_csv("data/jigsaw-unintended-bias-in-toxicity-classification/train.csv")

In [17]:
stops = set(stopwords.words('english'))
stops

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [45]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer

# intialize stemmer
ps = PorterStemmer() 
ls = LancasterStemmer()

# define stopwords
stops = set(stopwords.words('english'))

approved_stop_words = {"not", "get", "against", "haven", "haven't","aren't", 
                       "aren", "should", "shouldn", "shouldn't", "themselves", 
                       "them", "under", "over", 'won', "won't", "wouldn'", 
                       "wouldn't"}
stops = stops - approved_stop_words

def clean_text(text, stop_ws, stemmer=ps):
    
    t = text.replace("-", " ").split(" ")
    t = [w.strip(string.punctuation) for w in t]
    t = [w.lower() for w in t if w not in stop_ws]
    
    if stemmer:
        t = [stemmer.stem(w) for w in t]
    
    return t

In [46]:
train.columns

Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count',
       'stemmed_comments'],
      dtype='object')

In [19]:
train['stemmed_comments'] = train.comment_text.apply(lambda x: clean_text(x, stops))

In [47]:
train.rename({'stemmed_comments': 'clean_porter'}, inplace=True, axis=1)

In [53]:
train['clean_lancaster'] = train.comment_text.apply(lambda x: clean_text(x, stops, ls))

In [54]:
train['clean_unstemmed'] = train.comment_text.apply(lambda x: clean_text(x, stops, False))

In [55]:
train.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count,clean_porter,clean_lancaster,clean_unstemmed
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0.0,0,4,"[thi, cool, it', like, would, want, mother, re...","[thi, cool, it's, lik, would, want, moth, read...","[this, cool, it's, like, would, want, mother, ..."
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0.0,0,4,"[thank, thi, would, make, life, lot, less, anx...","[thank, thi, would, mak, lif, lot, less, anxy,...","[thank, this, would, make, life, lot, less, an..."
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0.0,0,4,"[thi, urgent, design, problem, kudo, take, ver...","[thi, urg, design, problem, kudo, tak, very, i...","[this, urgent, design, problem, kudos, taking,..."
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,...,0,0,0,0,0.0,0,4,"[is, someth, i'll, abl, instal, site, when, re...","[is, someth, i'll, abl, instal, sit, when, rel...","[is, something, i'll, able, install, site, whe..."
4,59856,0.8936,haha you guys are a bunch of losers.,0.0213,0.0,0.0213,0.8723,0.0,0.0,0.0,...,0,0,1,0,0.0,4,47,"[haha, guy, bunch, loser]","[hah, guy, bunch, los]","[haha, guys, bunch, losers]"


In [56]:
train.to_pickle('train_stemmed.pkl')

In [21]:
test['stemmed_comments'] = test.comment_text.apply(lambda x: clean_text(x, stops))

In [49]:
test.rename({'stemmed_comments': 'clean_porter'}, inplace=True, axis=1)

In [51]:
test['clean_lancaster'] = test.comment_text.apply(lambda x: clean_text(x, stops, ls))

In [52]:
test['clean_unstemmed'] = test.comment_text.apply(lambda x: clean_text(x, stops, False))

In [57]:
test.head()

Unnamed: 0,id,comment_text,clean_porter,clean_lancaster,clean_unstemmed
0,7000000,Jeff Sessions is another one of Trump's Orwell...,"[jeff, session, anoth, one, trump', orwellian,...","[jeff, sess, anoth, on, trump's, orwel, cho, h...","[jeff, sessions, another, one, trump's, orwell..."
1,7000001,I actually inspected the infrastructure on Gra...,"[i, actual, inspect, infrastructur, grand, chi...","[i, act, inspect, infrastruct, grand, chief, s...","[i, actually, inspected, infrastructure, grand..."
2,7000002,No it won't . That's just wishful thinking on ...,"[no, won't, , that', wish, think, democrat, fa...","[no, won't, , that's, wish, think, democr, fau...","[no, won't, , that's, wishful, thinking, democ..."
3,7000003,Instead of wringing our hands and nibbling the...,"[instead, wring, hand, nibbl, peripheri, issu,...","[instead, wring, hand, nibbl, periphery, issu,...","[instead, wringing, hands, nibbling, periphery..."
4,7000004,how many of you commenters have garbage piled ...,"[mani, comment, garbag, pile, high, yard, bald...","[many, com, garb, pil, high, yard, bald, tir, ...","[many, commenters, garbage, piled, high, yard,..."


In [58]:
test.to_pickle('test_stemmed.pkl')

In [36]:
def make_ngrams(clean, n=2):
    grams = []
    
    for i in range(len(clean)-(n-1)):
        grams.append(' '.join(clean[i:i+n]))

    return grams

def ngrams(preprocessed, n):
    '''
    Covert a list of preprocessed strings into ngrams of length n.
    Should return X ngrams where X is equal to the length of the string less (n - 1).
    '''
    ngrams_tuples = []
    
    # ensure that all ngrams are of length n by specifying list position of 
    # first item in last ngram
    last_ngram_start = len(preprocessed) - (n - 1)     
    
    # for each string from position i through last ngram start position, create 
    # a tuple of length n
    for i in range(last_ngram_start):
        ngrams_tuples.append(tuple(preprocessed[i:i + n]))

    return ngrams_tuples

In [37]:
sentence = "Hi my name is Robert and I like wine and cheese, but only on Tuesday."

In [41]:
cleaned_sentence = clean_text(sentence, stops)
cleaned_sentence

['hi', 'name', 'robert', 'i', 'like', 'wine', 'chees', 'tuesday']

In [39]:
ngrams(cleaned_sentence, 5)

[('hi', 'name', 'robert', 'i', 'like'),
 ('name', 'robert', 'i', 'like', 'wine'),
 ('robert', 'i', 'like', 'wine', 'chees'),
 ('i', 'like', 'wine', 'chees', 'tuesday')]

In [40]:
make_ngrams(cleaned_sentence, 5)

['hi name robert i like',
 'name robert i like wine',
 'robert i like wine chees',
 'i like wine chees tuesday']

In [None]:
def find_top_k(items, k):
    '''
    Find the K most frequently occuring items
    Inputs:
        items: a list of items
        k: integer 
    Returns: sorted list of K tuples
    '''
    # create our dict and our holder for key/value tuples
    counter_tuples = []

    # iterate through the list, creating a dict key with value 1 if the item
    # isn't already represented, and incrementing value by 1 if it is
    ct_dict = iterate_dict(items)
        
    # iterate through the dict and append key/value pairs in tuple forms at a
    # list; sort tuples in list and return top k values
    for i, v in ct_dict.items():
        counter_tuples.append((i, v))

    tuples_sorted = sort_count_pairs(counter_tuples)

    top_k = tuples_sorted[:k]

    return top_k

In [59]:
def add_text_cleaning_cols(df):
    
    df['split'] = df["comment_text"].apply(lambda x: x.split(" "))
    df['cleaned_w_stopwords'] = df["comment_text"].apply(clean_text,args=(None,False),)

    df['cleaned_no_stem'] = df["comment_text"].apply(clean_text,)
    df['cleaned_porter'] = df["comment_text"].apply(clean_text,args=("Porter",),)
    df['cleaned_lancaster'] = df["comment_text"].apply(clean_text,args=("Lancaster",),)

    df['perc_upper'] = df["comment_text"].apply(lambda x: round((len(re.findall(r'[A-Z]',x)) / len(x)), 3))

    df['num_exclam'] = df["comment_text"].apply(lambda x:(len(re.findall(r'!',x))))
    
    df['num_words'] = df["split"].apply(lambda x: len(x))
    
    df['perc_stopwords'] = round((df.num_words - df['cleaned_no_stem'].apply(lambda x: len(x)))/df.num_words,3) 
    
    df['num_upper_words'] = df["split"].apply(lambda x: sum(map(str.isupper, x)) 
)