### Loading data

In [1]:
import pandas as pd
import numpy as np
import operator
import re
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from textblob import TextBlob

Using TensorFlow backend.


In [2]:
dat_tr = pd.read_csv('train.csv')
dat_te = pd.read_csv('test.csv')

In [3]:
# Make sure all 'comment_text' values are string
# NOTE(review): astype(str) also stringifies missing values (a NaN becomes the text 'nan'),
# so null comments can no longer be detected afterwards — confirm this is intended.
dat_tr["comment_text"] = dat_tr["comment_text"].astype(str)
dat_te["comment_text"] = dat_te["comment_text"].astype(str)

In [4]:
df = pd.concat([dat_tr ,dat_te], sort = False)

### Loading embeddings

In [5]:
def load_embed(file, encoding='latin'):
    """Load a text embedding file into a ``{word: vector}`` dictionary.

    Every line of the file is split on single spaces; the first field is
    used as the word and the remaining fields are converted to a float32
    numpy array.

    Parameters
    ----------
    file : str or path-like
        Path of the embedding file to read.
    encoding : str, default 'latin'
        Encoding used to decode the file. The default keeps the previous
        hard-coded behaviour; pass e.g. ``'utf-8'`` for files whose
        non-ASCII words must be decoded correctly.

    Returns
    -------
    dict
        Mapping from word to its float32 vector.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Use a context manager so the file handle is closed as soon as the
    # dictionary is built (the files read here are several GB each).
    with open(file, encoding=encoding) as handle:
        embeddings_index = dict(get_coefs(*line.split(" ")) for line in handle)

    return embeddings_index

In [6]:
embed_glove = load_embed('glove.840B.300d.txt')

In [7]:
embed_paragram = load_embed('paragram_300_sl999.txt')

### Function to build vocab and check coverage of embeddings

In [8]:
def build_vocab(texts):
    """Count whitespace-separated tokens over a pandas Series of strings.

    Returns a dict mapping each token to its number of occurrences, with
    keys in order of first appearance.
    """
    counts = {}
    tokenized = texts.apply(lambda comment: comment.split())
    for tokens in tokenized.values:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

In [9]:
vocab = build_vocab(df["comment_text"])

In [10]:
def check_coverage(vocab, embeddings_index):
    """Report how much of a vocabulary is covered by an embedding.

    Prints two percentages: the share of distinct vocabulary entries that
    have an embedding, and the share of all token occurrences (entries
    weighted by their count) that have an embedding.

    Parameters
    ----------
    vocab : dict
        Maps each token to its occurrence count.
    embeddings_index : dict
        Maps each known word to its vector; only key membership is used.

    Returns
    -------
    list of (str, int)
        The tokens without an embedding together with their counts,
        ordered from most to least frequent.
    """
    unknown_words = {}
    nb_known_types = 0
    nb_known_words = 0
    nb_unknown_words = 0
    for word, count in vocab.items():
        # Explicit membership test instead of a bare ``except:``, which
        # would also have hidden unrelated errors (even KeyboardInterrupt).
        if word in embeddings_index:
            nb_known_types += 1
            nb_known_words += count
        else:
            unknown_words[word] = count
            nb_unknown_words += count

    # Guard both ratios so an empty vocabulary reports 0% instead of
    # raising ZeroDivisionError.
    total_words = nb_known_words + nb_unknown_words
    vocab_ratio = nb_known_types / len(vocab) if vocab else 0.0
    text_ratio = nb_known_words / total_words if total_words else 0.0

    print('Found embeddings for {:.3%} of vocab'.format(vocab_ratio))
    print('Found embeddings for {:.3%} of all text'.format(text_ratio))

    # Ascending stable sort followed by a reversal keeps the original
    # ordering of ties.
    return sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

In [11]:
oov_glove = check_coverage(vocab, embed_glove)

Found embeddings for 15.519% of vocab
Found embeddings for 89.608% of all text


In [12]:
oov_paragram = check_coverage(vocab, embed_paragram)

Found embeddings for 8.796% of vocab
Found embeddings for 78.632% of all text


This means that only 15.519% of the unique tokens in the vocabulary have an embedding in GloVe, and only 8.796% in paragram. The second line of each output shows that about 10% of all token occurrences in the text have no embedding (and are therefore unusable) with GloVe, and about 21% with paragram. This needs to be improved. First, let us look at some of the most frequent OOV (out-of-vocabulary) words in the data.

In [13]:
oov_glove[:10]

[("isn't", 42192),
 ("That's", 39614),
 ("won't", 31075),
 ("he's", 25672),
 ("Trump's", 24673),
 ("aren't", 21696),
 ("wouldn't", 20611),
 ('Yes,', 20040),
 ('that,', 19210),
 ("wasn't", 19084)]

In [14]:
oov_paragram[:10]

[('I', 908830),
 ('The', 459088),
 ('Trump', 165619),
 ('It', 162342),
 ('You', 152288),
 ('If', 151757),
 ('And', 134926),
 ('This', 128001),
 ('They', 106274),
 ('We', 96380)]

The problems found here are that the vocabulary contains tokens with attached punctuation (e.g. a trailing comma ",") and contractions (e.g. "isn't") that are not found in GloVe. For paragram, the main problem is capitalization (upper- vs. lower-case letters). In the next step, the text will be transformed to increase the coverage of the embeddings: lowercasing capital letters, eliminating contractions (by expanding them), eliminating punctuation problems (by separating punctuation from words), spelling correction, and number elimination (by masking digits).

### Text manipulation to increase coverage of the embeddings

1. Lowering capitals

In [15]:
# For every vocab word found in the embedding, add its lowercase form to the embedding (same vector) if the lowercase form is not there yet
def add_lower(embedding, vocab):
    """Add lowercase aliases to ``embedding`` in place.

    For each word of ``vocab`` that has a vector in ``embedding`` but whose
    lowercase form does not, the lowercase form is registered with the same
    vector. The number of added entries is printed; nothing is returned.
    """
    added = 0
    for token in vocab:
        if token not in embedding:
            continue
        lowered = token.lower()
        if lowered in embedding:
            continue
        embedding[lowered] = embedding[token]
        added += 1
    print(f"Added {added} words to embedding")

In [16]:
add_lower(embed_glove, vocab)
oov_glove = check_coverage(vocab, embed_glove)

Added 25061 words to embedding
Found embeddings for 15.638% of vocab
Found embeddings for 89.637% of all text


In [17]:
add_lower(embed_paragram, vocab)
oov_paragram = check_coverage(vocab, embed_paragram)

Added 0 words to embedding
Found embeddings for 8.796% of vocab
Found embeddings for 78.632% of all text


In [19]:
df['lowered_comment'] = df['comment_text'].apply(lambda x: x.lower())

2. Eliminating contractions

In [20]:
# Lookup table from an English contraction to its expanded form.
contraction_mapping = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
    "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
    "I'm": "I am", "I've": "I have",
    "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have",
    "i'm": "i am", "i've": "i have",
    "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
    "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
    "so've": "so have", "so's": "so as", "this's": "this is",
    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "here's": "here is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not",
    "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
    "we'll've": "we will have", "we're": "we are", "we've": "we have",
    "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have",
    "when's": "when is", "when've": "when have",
    "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
    "why's": "why is", "why've": "why have",
    "will've": "will have", "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
    "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have",
}

In [21]:
# Function to clean contractions based on the map above
def clean_contractions(text, mapping):
    """Expand the contractions of ``text`` using ``mapping``.

    Apostrophe look-alikes are first normalised to a plain ``'``. The text
    is then split on single spaces and every piece that is a key of
    ``mapping`` is replaced by its value; all other pieces are kept as is.
    """
    for quote_like in ("’", "‘", "´", "`"):
        text = text.replace(quote_like, "'")
    pieces = text.split(" ")
    return ' '.join(mapping.get(piece, piece) for piece in pieces)

In [22]:
df['treated_comment'] = df['lowered_comment'].apply(lambda x: clean_contractions(x, contraction_mapping))

In [23]:
vocab = build_vocab(df['treated_comment'])

In [24]:
oov_glove = check_coverage(vocab, embed_glove)

Found embeddings for 13.506% of vocab
Found embeddings for 90.394% of all text


In [25]:
oov_paragram = check_coverage(vocab, embed_paragram)

Found embeddings for 13.699% of vocab
Found embeddings for 90.399% of all text


The coverage of paragram embedding is now as good as GloVe

3. Eliminating punctuations

In [26]:
# Characters that clean_special_chars() surrounds with spaces so they become separate tokens.
# The backslash before '×' is written as an explicit '\\': the previous '\×' was an invalid
# escape sequence that only produced the same two characters by accident and triggers a
# warning on recent Python versions. The resulting string value is unchanged.
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

In [27]:
# Replacements applied by clean_special_chars() before punctuation is padded.
# The key '“' used to be listed twice with the same value; the redundant second
# literal is dropped, which leaves the resulting dict and its key order unchanged.
punct_mapping = {
    "‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ",
    "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'",
    '“': '"', '”': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/',
    'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3',
    'π': 'pi',
}

In [28]:
def clean_special_chars(text, punct, mapping):
    """Normalise special characters and isolate punctuation in ``text``.

    Three passes run in this order:
    1. every key of ``mapping`` is replaced by its value;
    2. every character of ``punct`` is surrounded by a space on each side;
    3. a fixed set of leftover sequences (zero-width space, ellipsis,
       byte-order mark, two Devanagari words) is replaced or removed.
    """
    for symbol, replacement in mapping.items():
        text = text.replace(symbol, replacement)

    for mark in punct:
        text = text.replace(mark, ' ' + mark + ' ')

    leftovers = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}
    for junk, replacement in leftovers.items():
        text = text.replace(junk, replacement)

    return text

In [29]:
df['treated_comment'] = df['treated_comment'].apply(lambda x: clean_special_chars(x, punct, punct_mapping))

In [30]:
vocab = build_vocab(df['treated_comment'])

In [31]:
oov_glove = check_coverage(vocab, embed_glove)

Found embeddings for 54.207% of vocab
Found embeddings for 99.723% of all text


In [32]:
oov_paragram = check_coverage(vocab, embed_paragram)

Found embeddings for 57.633% of vocab
Found embeddings for 99.739% of all text


In [33]:
oov_glove[:20]

[('brexit', 2043),
 ('theglobeandmail', 1423),
 ('québec', 1365),
 ('drumpf', 1294),
 ('deplorables', 1274),
 ('trumpcare', 869),
 ('sb91', 841),
 ('theguardian', 795),
 ('klastri', 754),
 ('trumpism', 599),
 ('ᴀ', 551),
 ('✰', 550),
 ('ᴀɴᴅ', 540),
 ('―', 507),
 ('auwe', 500),
 ('naïve', 497),
 ('»', 469),
 ('«', 454),
 ('trumpsters', 416),
 ('trumpian', 410)]

4. Misspells and other ambiguous texts

There are still a lot of OOV words. Most of them are related to politics (Trump, Hillary, Obama, Justin Trudeau) and news platforms. Here, an attempt is made to map these OOV words manually to in-vocabulary equivalents. It should be noted that this is not an exhaustive list. 

In [34]:
# Manual replacements for frequent out-of-vocabulary strings.
# correct_spelling() applies these as plain substring replacements in insertion order,
# so the order of the entries is significant and is kept exactly as before.
# Fix: the value for 'airhostess' used to be broken across two physical lines inside
# the string literal, which is a syntax error; it is now the single string 'air hostess'.
# NOTE(review): keys containing upper-case letters ('Qoura', 'Whta', 'doI', 'theBest',
# 'Etherium') presumably never match here because the text is lower-cased first — confirm.
mispell_dict = {
    'theglobeandmail': 'news', 'theguardian': 'news', 'québec': 'quebec',
    'drumpf': 'trump', 'trumpcare': 'trump', 'trumpism': 'trump', 'trumpian': 'trump',
    'trumpsters': 'trump', 'ᴀɴᴅ': 'and', 'naïve': 'naive', 'québécois': 'quebecois',
    'ᴛʜᴇ': 'the', 'montréal': 'montreal', 'ʜᴏᴍᴇ': 'home', 'ᴜᴘ': 'up', 'ʙʏ': 'by',
    'yᴏᴜ': 'you', 'ᴀᴛ': 'at', 'koncerned': 'concerned', 'thedonald': 'trump',
    'trumpkins': 'trump', 'washingtontimes': 'news', 'ᴄʜᴇᴄᴋ': 'check', 'ғᴏʀ': 'for',
    'ᴄᴏᴍᴘᴜᴛᴇʀ': 'computer', 'ᴛʜɪs': 'this', 'ᴍᴏɴᴛʜ': 'month', 'ᴡᴏʀᴋɪɴɢ': 'working',
    'chrétien': 'chretien', 'ᴊᴏʙ': 'job', 'ᴏғ': 'of', 'ʜᴏᴜʀʟʏ': 'hourly', 'ᴡᴇᴇᴋ': 'week',
    'ʟɪɴᴋ': 'link', 'ᴛᴏ': 'to', 'ʜᴀᴠᴇ': 'have', 'ᴄᴀɴ': 'can', 'ᴇɴᴅ': 'end',
    '😀': 'smiley', '😂': 'laugh', '😉': 'wink',
    'trumpies': 'trump', 'trumpty': 'trump', 'trumpettes': 'trump',
    '😃': 'smiley', '😊': 'smiley',
    'torontosun': 'news', 'vancouversun': 'news', 'theintercept': 'news',
    'realdonaldtrump': 'trump', 'trumpland': 'trump', 'drump': 'trump',
    'trumpnuts': 'trump', 'trumpo': 'trump', 'nationalobserver': 'news',
    'thefederalist': 'news', 'trumpanzees': 'trump', 'trumpski': 'trump',
    'hawaiʻi': 'hawaii', 'trumpites': 'trump', 'trumpie': 'trump',
    'americamagazine': 'news', 'thecanadianencyclopedia': 'news',
    'trumpians': 'trump', 'trumptards': 'trump',
    'colour': 'color', 'centre': 'center', 'favourite': 'favorite',
    'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater',
    'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization',
    'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
    'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist',
    'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much',
    'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best',
    'howdoes': 'how does', 'Etherium': 'Ethereum', 'narcissit': 'narcissist',
    'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota',
    'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what',
    'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
    'demonitization': 'demonetization', 'demonetisation': 'demonetization',
    'pokémon': 'pokemon',
}

In [35]:
def correct_spelling(x, dic):
    """Apply every ``wrong -> right`` pair of ``dic`` to the string ``x``.

    Replacements are plain substring replacements (not whole-word matches)
    and are applied one after another in the dictionary's iteration order,
    so a later pair sees the result of the earlier ones.
    """
    for wrong, right in dic.items():
        x = x.replace(wrong, right)
    return x

In [36]:
df['treated_comment'] = df['treated_comment'].apply(lambda x: correct_spelling(x, mispell_dict))

In [37]:
vocab = build_vocab(df['treated_comment'])

In [38]:
oov_glove = check_coverage(vocab, embed_glove)

Found embeddings for 54.223% of vocab
Found embeddings for 99.739% of all text


In [39]:
oov_paragram = check_coverage(vocab, embed_paragram)

Found embeddings for 57.651% of vocab
Found embeddings for 99.754% of all text


5. Remove numbers

In [40]:
def clean_numbers(x):
    """Mask runs of digits in ``x`` with ``#`` characters.

    Runs of five or more digits become ``#####``; remaining runs of four,
    three and two digits become the same number of ``#``. Single digits
    are left untouched.
    """
    # Longest pattern first: each pass only sees digits that the previous
    # passes did not already mask.
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, mask in masks:
        x = re.sub(pattern, mask, x)
    return x

In [41]:
df['treated_comment'] = df['treated_comment'].apply(lambda x: clean_numbers(x))

### Applying the steps above for train and test data

In [42]:
# For train data: lowercase, expand contractions, clean special characters,
# fix known misspellings, then mask numbers (same order as the exploration above)
dat_tr['treated_comment'] = (
    dat_tr['comment_text']
    .apply(lambda x: x.lower())
    .apply(lambda x: clean_contractions(x, contraction_mapping))
    .apply(lambda x: clean_special_chars(x, punct, punct_mapping))
    .apply(lambda x: correct_spelling(x, mispell_dict))
    .apply(lambda x: clean_numbers(x))
)

In [43]:
# For test data: lowercase, expand contractions, clean special characters,
# fix known misspellings, then mask numbers (same order as the exploration above)
dat_te['treated_comment'] = (
    dat_te['comment_text']
    .apply(lambda x: x.lower())
    .apply(lambda x: clean_contractions(x, contraction_mapping))
    .apply(lambda x: clean_special_chars(x, punct, punct_mapping))
    .apply(lambda x: correct_spelling(x, mispell_dict))
    .apply(lambda x: clean_numbers(x))
)

### Meta embedding

In this project, a weighted average of the two embeddings is used as the meta embedding: a weight of 0.7 for GloVe and 0.3 for paragram. A word that appears in only one of the two embeddings keeps that embedding's vector unchanged.

In [162]:
len(embed_glove)

2221077

In [163]:
len(embed_paragram)

1703756

In [234]:
def build_meta(embed1, embed2, weight1, weight2):
    """Combine two word embeddings into one meta embedding.

    Parameters
    ----------
    embed1, embed2 : dict
        Map each word to its vector. Vectors must support multiplication
        by a scalar and addition with each other (e.g. numpy arrays).
    weight1, weight2 : float
        Weights applied to the vectors of ``embed1`` and ``embed2`` for
        words present in both embeddings.

    Returns
    -------
    dict
        One entry per word of the union of both vocabularies. A word found
        in both inputs gets ``embed1[word]*weight1 + embed2[word]*weight2``;
        a word found in only one input keeps that input's vector object
        (unweighted, not copied).
    """
    meta_embed = {}

    # Two passes over the dictionaries replace the previous union and
    # difference sets, which were rebuilt from the full key lists several
    # times. The result order is now deterministic: words of embed1 first,
    # then the words that only exist in embed2.
    for word, vector1 in embed1.items():
        if word in embed2:
            meta_embed[word] = vector1 * weight1 + embed2[word] * weight2
        else:
            meta_embed[word] = vector1

    for word, vector2 in embed2.items():
        if word not in embed1:
            meta_embed[word] = vector2

    return meta_embed

In [235]:
meta_embed=build_meta(embed_glove,embed_paragram,0.7,0.3)

In [236]:
len(meta_embed)

2912339

### Feature Engineering

TextBlob polarity value will be added for sentiment analysis of the comments. 

In [54]:
dat_te['polarity'] = dat_te['treated_comment'].apply(lambda comment: TextBlob(comment).polarity)

In [55]:
dat_tr['polarity'] = dat_tr['treated_comment'].apply(lambda comment: TextBlob(comment).polarity)

Total number of words; Number of unique words; Textblob word subjectivity

In [57]:
dat_te['subjectivity'] = dat_te['treated_comment'].apply(lambda comment: TextBlob(comment).subjectivity)
dat_tr['subjectivity'] = dat_tr['treated_comment'].apply(lambda comment: TextBlob(comment).subjectivity)

In [67]:
dat_te['unique_word_count']=dat_te['treated_comment'].apply(lambda x:len(set(x.split())))
dat_tr['unique_word_count']=dat_tr['treated_comment'].apply(lambda x:len(set(x.split())))

In [70]:
dat_te['total_word_count']=dat_te['treated_comment'].apply(lambda x:len(x.split()))
dat_tr['total_word_count']=dat_tr['treated_comment'].apply(lambda x:len(x.split()))

### Exporting dataset and meta-embedding file

In [148]:
# Saving data
# NOTE(review): index=False is not passed, so pandas also writes the DataFrame index as an
# extra leading unnamed column — confirm that the code reading these files expects it.
dat_tr.to_csv("train_treated.csv")
dat_te.to_csv("test_treated.csv")

In [237]:
# Saving meta embedding pickle file
with open('meta_embed.pickle', 'wb') as handle:
    pickle.dump(meta_embed, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [150]:
with open('glove.pickle', 'wb') as handle:
    pickle.dump(embed_glove, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [151]:
with open('embed_paragram.pickle', 'wb') as handle:
    pickle.dump(embed_paragram, handle, protocol=pickle.HIGHEST_PROTOCOL)