In [113]:
import pandas as pd 
import numpy as np
import re
import difflib as diff
import spellchecker


## Preprocessing

Preprocessing is a vital part of the data cleaning process and can lead to huge gains in performance

In [114]:
# Read the raw training tweets into the working frame and preview the first rows.
train_csv_path = "../Data/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


We will have to do a few things in order to prepare this data for training

1. Clean up the text features
2. Turn selected text into target indices for easier prediction

It would be possible to train an encoder-decoder model that uses the selected text directly as its target. For ease of programming and prediction, however, we decided it would be better to predict indices and then use those indices to slice the original text.

To do that we will need to find the starting and ending indices for the selected text within the true text

In [115]:
# Whitespace-tokenise every tweet; str() coerces any non-string cell before splitting.
df["text_split"] = df["text"].map(lambda tweet: str(tweet).split())
df["text_split"].head()

0          [I`d, have, responded,, if, I, were, going]
1    [Sooo, SAD, I, will, miss, you, here, in, San,...
2                      [my, boss, is, bullying, me...]
3                 [what, interview!, leave, me, alone]
4    [Sons, of, ****,, why, couldn`t, they, put, th...
Name: text_split, dtype: object

In [116]:
# Same whitespace tokenisation for the labelled span; str() coerces non-string cells first.
df["selected_text_split"] = df["selected_text"].map(lambda span: str(span).split())
df["selected_text_split"]

0              [I`d, have, responded,, if, I, were, going]
1                                              [Sooo, SAD]
2                                           [bullying, me]
3                                       [leave, me, alone]
4                                        [Sons, of, ****,]
                               ...                        
27476                                            [d, lost]
27477                                    [,, don`t, force]
27478                     [Yay, good, for, both, of, you.]
27479                     [But, it, was, worth, it, ****.]
27480    [All, this, flirting, going, on, -, The, ATG, ...
Name: selected_text_split, Length: 27481, dtype: object

Both text and selected text have now been split at the word level, so next we need to match indices. The following code finds the index of the selected text within the full text. These indices will become the target feature.

In [117]:
def index_finder(text, selected):
    """Locate the first selected token inside the tokenised tweet.

    Args:
        text: sequence of tokens for the full tweet.
        selected: sequence of tokens for the selected span; only element 0 is used.

    Returns:
        Position of the first occurrence of ``selected[0]`` in ``text``.

    Raises:
        ValueError: if ``selected[0]`` is not an element of ``text``.
        IndexError: if ``selected`` is empty.
    """
    first_token = selected[0]
    return text.index(first_token)
# Row-wise lookup: pass each row's token lists to index_finder and keep the result
# as the candidate start index.
df["initial_indice"] = df.apply(
    lambda record: index_finder(record["text_split"], record["selected_text_split"]),
    axis=1,
)
df["initial_indice"]

ValueError: ("'onna' is not in list", 'occurred at index 18')

In [118]:
# Inspect the row at position 18, the one named in the ValueError above.
df.iloc[18]

textID                                                     af3fed7fc3
text                       is back home now      gonna miss every one
selected_text                                                    onna
sentiment                                                    negative
text_split             [is, back, home, now, gonna, miss, every, one]
selected_text_split                                            [onna]
Name: 18, dtype: object

The error above comes from an unintended quirk in the data. Whoever entered the data meant to input "gonna" but actually entered "onna". We will fix this case manually and hope there aren't too many more mistakes.

In [119]:
# Patch the mislabelled span for row label 18 with a single indexer call.
# The previous chained form (df.selected_text_split[18] = ...) writes through an
# intermediate Series, which raises SettingWithCopyWarning and does not update df
# at all when pandas copy-on-write is active. `.at` sets exactly one cell, so the
# list is stored as the cell value rather than being broadcast.
df.at[18, "selected_text_split"] = ["gonna"]
df["selected_text_split"].iloc[18]

['gonna']

Now lets try again

In [120]:
def index_finder(text, selected):
    """Return where the selected span starts within the tweet tokens.

    Only the first element of ``selected`` is looked up in ``text``.

    Raises:
        ValueError: when that element does not occur in ``text``.
        IndexError: when ``selected`` has no elements.
    """
    start_position = text.index(selected[0])
    return start_position
# Retry the row-wise lookup now that row 18 has been patched by hand.
df["initial_indice"] = df.apply(
    lambda record: index_finder(record["text_split"], record["selected_text_split"]),
    axis=1,
)
df["initial_indice"]

ValueError: ("'.no' is not in list", 'occurred at index 27')

It happened again. It is worth counting how many times this error occurs in order to gauge which approach to take.

In [121]:
# Running count of rows whose first selected token cannot be located in the tweet.
errors = 0


def index_finder(text, selected):
    """Probe one row and count it in ``errors`` if the lookup fails.

    This variant exists only to measure how often the exact-match lookup breaks;
    it never returns an index.

    Args:
        text: sequence of tokens for the full tweet.
        selected: sequence of tokens for the selected span; only element 0 is used.

    Returns:
        None, always. The result is reported through the module-level ``errors``
        counter, which is incremented once per failed lookup.
    """
    global errors
    try:
        text.index(selected[0])
    except (ValueError, IndexError):
        # ValueError: the token is absent from `text`.
        # IndexError: `selected` is empty, so there is no first token.
        # Anything else (wrong types, interrupts) is a real bug and must surface
        # rather than being silently counted as a data error.
        errors += 1


# Run the counting probe over every row; whatever index_finder returns is stored in
# initial_indice, and the figure of interest is the `errors` counter shown below.
df["initial_indice"] = df.apply(
    lambda record: index_finder(record["text_split"], record["selected_text_split"]),
    axis=1,
)
errors

1821

There are far too many to fix by hand: 1821 is an unreasonable number of rows for two people to sit down and correct. We will turn to a more elegant solution. Difflib is a library that can find the most similar word, and for our purposes it will be good enough.

In [122]:
def index_finder(text, selected):
    """Find the start index via fuzzy matching on the first selected token.

    ``difflib.get_close_matches`` (default settings) is asked for the tweet tokens
    that most resemble ``selected[0]``; the position of the first candidate it
    returns is looked up in ``text``.

    Raises:
        IndexError: if ``selected`` is empty or difflib returns no candidate.
    """
    candidates = diff.get_close_matches(selected[0], text)
    best_match = candidates[0]
    return text.index(best_match)
# Row-wise lookup again, this time through the difflib-based index_finder.
df["initial_indice"] = df.apply(
    lambda record: index_finder(record["text_split"], record["selected_text_split"]),
    axis=1,
)

IndexError: ('list index out of range', 'occurred at index 27')

In [123]:
# Inspect the row at position 27, the one named in the IndexError above.
df.iloc[27]

textID                                                        bdc32ea43c
text                   On the way to Malaysia...no internet access to...
selected_text                                               .no internet
sentiment                                                       negative
text_split             [On, the, way, to, Malaysia...no, internet, ac...
selected_text_split                                      [.no, internet]
initial_indice                                                      None
Name: 27, dtype: object

This was also unexpected. The selected text starts with a period: the third period in a group of three. In this case we will have to do a much better job of cleaning the data. The next step in fixing this problem is to use regex to remove the punctuation.

In [147]:
# Token-level patterns, compiled once. Raw strings avoid invalid-escape warnings.
_LINK_TOKEN_RE = re.compile(r"^((https|http|ftp|file)?:\/\/).*")
_TIME_RE = re.compile(r"[0-9]+:[0-9]+(am|AM|pm|PM)?")
_NUMBER_RE = re.compile(r"\d+")
_CURSE_RE = re.compile(r"\*+")


def clean(row):
    """Normalise one tweet string into a list of lower-case tokens.

    The string is split on whitespace first, and each token is then handled on
    its own:

    * a token starting with ``@`` becomes ``<user>``;
    * a token starting with ``#`` becomes ``<hash>``;
    * a token that starts with ``://`` or a http/https/ftp/file scheme becomes
      ``<link>``;
    * otherwise ``.`` becomes a space (so ``a...b`` yields two tokens), ``,`` and
      ``'`` are dropped, clock times such as ``10:30pm`` become ``<date>``, any
      remaining digit run becomes ``<num>`` and any run of ``*`` becomes
      ``<curse>``.

    Working per token matters: applying the ``^@.*`` / ``^#.*`` / link patterns
    to the whole row replaced an entire tweet with a single placeholder whenever
    it merely started with a mention, hashtag or URL, and never touched those
    items elsewhere in the row. The time pattern is applied before the number
    pattern because the number pattern would otherwise consume the digits the
    time pattern needs.

    Args:
        row: the raw tweet text as a ``str``.

    Returns:
        list[str]: cleaned, lower-cased tokens (placeholders are lower-case too).
    """
    tokens = []
    for word in row.split():
        if word.startswith("@"):
            tokens.append("<user>")
        elif word.startswith("#"):
            tokens.append("<hash>")
        elif _LINK_TOKEN_RE.match(word):
            # Checked before punctuation stripping so the dots in the URL survive
            # long enough for the token to be recognised as a link.
            tokens.append("<link>")
        else:
            word = word.replace(".", " ").replace(",", "").replace("'", "")
            word = _TIME_RE.sub("<DATE>", word)
            word = _NUMBER_RE.sub("<NUM>", word)
            word = _CURSE_RE.sub("<CURSE>", word)
            # Replacing '.' with a space can split one word into several tokens.
            tokens.extend(word.lower().split())
    return tokens
# Rebuild both token columns from the raw strings using clean(); str() coerces any
# non-string cell before cleaning.
for source_column in ("text", "selected_text"):
    df[source_column + "_split"] = df[source_column].map(lambda raw: clean(str(raw)))
df["text_split"]




0               [i`d, have, responded, if, i, were, going]
1        [sooo, sad, i, will, miss, you, here, in, san,...
2                             [my, boss, is, bullying, me]
3                     [what, interview!, leave, me, alone]
4        [sons, of, <curse>, why, couldn`t, they, put, ...
                               ...                        
27476    [wish, we, could, come, see, u, on, denver, hu...
27477    [i`ve, wondered, about, rake, to, the, client,...
27478    [yay, good, for, both, of, you, enjoy, the, br...
27479                   [but, it, was, worth, it, <curse>]
27480    [all, this, flirting, going, on, -, the, atg, ...
Name: text_split, Length: 27481, dtype: object

The function below checks the tweets for common misspellings and corrects them. This operation can take some time.

In [146]:
# Spell-checker instance used by check_spelling; constructed with no arguments,
# i.e. whatever defaults the spellchecker package provides.
spell = spellchecker.SpellChecker()

# Placeholder tokens that must never be "corrected" into dictionary words.
_SPELLING_PLACEHOLDERS = frozenset(
    ["<curse>", "<num>", "<user>", "<hash>", "<link>", "<date>"]
)


def check_spelling(row):
    """Replace misspelled tokens in ``row`` with the checker's best suggestion.

    Every occurrence of an unknown word is replaced, not just the first one.
    Placeholder tokens (``<curse>``, ``<num>``, ``<user>``, ``<hash>``,
    ``<link>``, ``<date>``) are left untouched, and a token is kept as-is when
    the checker has no suggestion for it (``spell.correction`` returning
    ``None``), so ``None`` never ends up in the token list.

    Args:
        row: list of string tokens for one tweet. It is updated in place.

    Returns:
        The same list object that was passed in, after correction.
    """
    unknown_words = set(spell.unknown(row)) - _SPELLING_PLACEHOLDERS
    if not unknown_words:
        return row

    # Look each distinct word up once; correction is the expensive step.
    corrections = {}
    for word in unknown_words:
        suggestion = spell.correction(word)
        if suggestion is not None:
            corrections[word] = suggestion

    # spell.unknown reports words in lower case, so match case-insensitively.
    row[:] = [corrections.get(token.lower(), token) for token in row]
    return row
# Spell-correct the token lists of both columns, then preview the tweet tokens.
for token_column in ("text_split", "selected_text_split"):
    df[token_column] = df[token_column].map(check_spelling)

df["text_split"].head()

In [None]:
# Preview the first rows of the selected-span token column.
df.selected_text_split.head()

And we can check out our new text data

In [0]:
# Persist the preprocessed frame. `to_csv` is a DataFrame method; the pandas module
# itself has no `to_csv`, so `pd.to_csv(...)` raises AttributeError.
# NOTE(review): the *_split columns hold Python lists, which CSV stores as their
# string representation; confirm that whatever reads this file parses them back.
df.to_csv("preprocessed_train.csv")