### Load data


In [54]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))

from lib.data_preparation import DataPreparation


data_set = DataPreparation.remove_stopwords(DataPreparation.load_data())
data_set = DataPreparation.clean_data(data_set)


data_set = data_set.sample(frac=0.005).reset_index(drop=True)
data_set

Unnamed: 0,positive,negative,neutral
0,would great trick happy,far worst day since january th unhappy crying...,decide new legislative party leader future cou...
1,nailed it brought tears eyes,played fm long time really should unhappy,bjp rushes shake uppercaste tag
2,mini campervan happy,tell unhappy,govt
3,thanks recent follow much appreciated happy want,nani lies unconscious,antiterror operation lucknows outskirts ends
4,definitely looking gave try anyway happy,blast okay love you,quaint digs budget or even less
5,happy,year haha unhappy,offered help


---


### Tokenization

In [55]:
import nltk
import pandas as pd

tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
tokenized_data_frame = pd.DataFrame()
for column in data_set.columns:
    tokenized_data_frame[column] = data_set[column].astype(str).apply(tokenizer.tokenize)

print(tokenized_data_frame.shape)


(6, 3)


In [56]:
tokenized_data_frame

Unnamed: 0,positive,negative,neutral
0,"[would, great, trick, happy]","[far, worst, day, since, january, th, unhappy,...","[decide, new, legislative, party, leader, futu..."
1,"[nailed, it, brought, tears, eyes]","[played, fm, long, time, really, should, unhappy]","[bjp, rushes, shake, uppercaste, tag]"
2,"[mini, campervan, happy]","[tell, unhappy]",[govt]
3,"[thanks, recent, follow, much, appreciated, ha...","[nani, lies, unconscious]","[antiterror, operation, lucknows, outskirts, e..."
4,"[definitely, looking, gave, try, anyway, happy]","[blast, okay, love, you]","[quaint, digs, budget, or, even, less]"
5,[happy],"[year, haha, unhappy]","[offered, help]"


---

### Stemmming

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stemmed_data = pd.DataFrame()

for column in tokenized_data_frame.columns:
    stemmed_data[column] = tokenized_data_frame[column].apply(lambda row: [stemmer.stem(word) for word in row])
print(stemmed_data)


---

### Lemmatization

In [57]:
import spacy

nlp = spacy.load('en_core_web_md')
lemmatized_words = pd.DataFrame()
for column in data_set.columns:
    lemmatized_words[column] = data_set[column].astype(str).apply(
        lambda row: [token.lemma_ for token in nlp(row)]
    )
    
lemmatized_words

Unnamed: 0,positive,negative,neutral
0,"[would, great, trick, happy]","[far, bad, day, , since, january, th, unhappy...","[decide, new, legislative, party, leader, futu..."
1,"[nail, it, bring, tear, eye]","[play, fm, long, time, really, should, unhappy]","[bjp, rush, shake, uppercaste, tag]"
2,"[mini, campervan, happy]","[tell, unhappy]",[govt]
3,"[thank, recent, follow, much, appreciated, hap...","[nani, lie, unconscious]","[antiterror, operation, lucknow, outskirt, end]"
4,"[definitely, looking, , gave, try, anyway, ha...","[blast, okay, love, you]","[ , quaint, dig, budget, or, even, less]"
5,[happy],"[year, haha, unhappy]","[offer, help]"


---


### Stemming + misspellings

#### misspelling correction with jaccard_distance

In [66]:
from nltk.corpus import words
from rapidfuzz import process
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
from nltk.corpus import words
import nltk

nltk.download('words')

stemmer = PorterStemmer()
corrected_stemmed_data = pd.DataFrame()

def get_closest_word(word, threshold=80):
    match = process.extractOne(word, words.words(), score_cutoff=threshold)
    if match :
        return match[0]
    return word




[nltk_data] Downloading package words to /home/codespace/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [67]:
for column in tokenized_data_frame.columns:
    corrected_stemmed_data[column] = tokenized_data_frame[column].apply(
        lambda row: [stemmer.stem(get_closest_word(token)) for token in row])


corrected_stemmed_data

Unnamed: 0,positive,negative,neutral
0,"[would, great, trick, happi]","[far, worst, day, sinc, a, th, unhappi, cri, cri]","[decid, new, legisl, parti, leader, futur, cou..."
1,"[a, it, brought, a, angeley]","[splay, chaffman, long, time, realli, should, ...","[b, brush, shake, as, tag]"
2,"[adminicl, am, happi]","[tell, unhappi]",[g]
3,"[thank, recent, follow, much, appreci, happi, ...","[a, alli, unconsci]","[an, oper, know, outskirt, amend]"
4,"[definit, look, gave, tri, anyway, happi]","[blast, a, love, you]","[quaint, dig, budget, or, even, less]"
5,[happi],"[year, a, unhappi]","[goffer, help]"


---
### Lemmatization with misspelling

In [69]:
corrected_lemmatizide_data = pd.DataFrame()

for column in data_set.columns:
    corrected_lemmatizide_data[column] = data_set[column].astype(str).apply(
        lambda row: [token.lemma_ 
                     for token in nlp(" ".join(get_closest_word(word) 
                                               for word in row.split()))]
    )


In [62]:
corrected_lemmatizide_data

Unnamed: 0,positive,negative,neutral
0,"[would, great, trick, happy]","[far, bad, day, since, a, th, unhappy, cry, cry]","[decide, new, legislative, party, leader, futu..."
1,"[a, it, bring, a, angeleyes]","[splay, chaffman, long, time, really, should, ...","[b, brush, shake, as, tag]"
2,"[adminicle, be, happy]","[tell, unhappy]",[g]
3,"[thank, recent, follow, much, appreciate, happ...","[a, Allies, unconscious]","[an, operation, know, outskirt, amend]"
4,"[definitely, looking, gave, try, anyway, happy]","[blast, a, love, you]","[quaint, dig, budget, or, even, less]"
5,[happy],"[year, a, unhappy]","[goffer, help]"


---
### Correct Slang words 