### Load data


In [32]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))

from lib.data_preparation import DataPreparation


# Load the corpus, then strip stop words followed by punctuation
# (same order of operations as before).
data_set = DataPreparation.remove_stopwords(DataPreparation.load_data())
data_set = DataPreparation.remove_punctuation(data_set)

# Continue with a 20% random subsample of the rows. The seed is fixed so
# that a kernel restart draws the same rows and the outputs shown in the
# cells below stay reproducible.
SAMPLE_FRACTION = 0.2
SAMPLE_SEED = 42
data_set = (
    data_set
    .sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
    .reset_index(drop=True)  # renumber rows 0..n-1 after shuffling
)
print(data_set)

                                              positive  \
0                            Surround positivity happy   
1    Heres full art Glyph Keeper mummy token intere...   
2              protect layer fat KISSES TheFashionIcon   
3                                                  see   
4                                      hype real happy   
..                                                 ...   
232                                      figures happy   
233  Thats kaki tane amazing Foyound place Germany ...   
234  Superstar thanks much kind words Look forward ...   
235                                    Good luck happy   
236                                        Thank happy   

                                              negative  \
0    good morning nightmare im still wondering madd...   
1    but use purple flowercrown display picture unh...   
2                                      flavorful dream   
3             tired theres nothing great combo unhappy   
4            

---


### Tokenization

In [37]:
import nltk
import pandas as pd

# Regex tokenizer: every maximal run of word characters (\w+) becomes a token.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

# Build the tokenized frame in one step, one list-of-tokens Series per column.
# NOTE(review): astype(str) turns missing values into the literal text "nan",
# which is then emitted as the token ['nan'] — confirm whether data_set can
# contain NaN cells.
tokenized_data_frame = pd.DataFrame(
    {
        name: data_set[name].astype(str).apply(tokenizer.tokenize)
        for name in data_set.columns
    }
)


In [38]:
# Show the tokenized frame: every cell now holds a list of word tokens.
print(tokenized_data_frame)

                                              positive  \
0                        [Surround, positivity, happy]   
1    [Heres, full, art, Glyph, Keeper, mummy, token...   
2        [protect, layer, fat, KISSES, TheFashionIcon]   
3                                                [see]   
4                                  [hype, real, happy]   
..                                                 ...   
232                                   [figures, happy]   
233  [Thats, kaki, tane, amazing, Foyound, place, G...   
234  [Superstar, thanks, much, kind, words, Look, f...   
235                                [Good, luck, happy]   
236                                     [Thank, happy]   

                                              negative  \
0    [good, morning, nightmare, im, still, wonderin...   
1    [but, use, purple, flowercrown, display, pictu...   
2                                   [flavorful, dream]   
3      [tired, theres, nothing, great, combo, unhappy]   
4            

---

### Stemming

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return the Porter stem of every token in one row's token list."""
    return [stemmer.stem(token) for token in tokens]


# Apply the stemmer column by column to the already tokenized frame.
stemmed_data = pd.DataFrame(
    {
        name: tokenized_data_frame[name].apply(_stem_tokens)
        for name in tokenized_data_frame.columns
    }
)
print(stemmed_data)


                                              positive  \
0                             [surround, posit, happi]   
1    [here, full, art, glyph, keeper, mummi, token,...   
2          [protect, layer, fat, kiss, thefashionicon]   
3                                                [see]   
4                                  [hype, real, happi]   
..                                                 ...   
232                                     [figur, happi]   
233  [that, kaki, tane, amaz, foyound, place, germa...   
234  [superstar, thank, much, kind, word, look, for...   
235                                [good, luck, happi]   
236                                     [thank, happi]   

                                              negative  \
0    [good, morn, nightmar, im, still, wonder, madd...   
1    [but, use, purpl, flowercrown, display, pictur...   
2                                      [flavor, dream]   
3           [tire, there, noth, great, combo, unhappi]   
4            

---

### Lemmatization

In [47]:
import spacy

# Load the medium English spaCy pipeline once; it is reused for every column.
nlp = spacy.load('en_core_web_md')
lemmatized_words = pd.DataFrame()
for column in data_set.columns:
    # Lemmatization runs on the raw (untokenized) strings of data_set,
    # because spaCy performs its own tokenization.
    texts = data_set[column].astype(str)
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # invoking nlp() once per row; it yields one Doc per input, in order.
    lemmatized_words[column] = pd.Series(
        [[token.lemma_ for token in doc] for doc in nlp.pipe(texts.tolist())],
        index=texts.index,  # keep row alignment with data_set
        dtype=object,       # each cell is a list of lemma strings
    )

print(lemmatized_words)

                                              positive  \
0                        [surround, positivity, happy]   
1    [Heres, full, art, Glyph, Keeper, mummy, token...   
2          [protect, layer, fat, kiss, thefashionicon]   
3                                                [see]   
4                                  [hype, real, happy]   
..                                                 ...   
232                                    [figure, happy]   
233  [that, s, kaki, tane, amazing, Foyound, place,...   
234  [Superstar, thank, much, kind, word, look, for...   
235                                [good, luck, happy]   
236                                     [thank, happy]   

                                              negative  \
0    [good, morning, nightmare, I, m, still, wonder...   
1    [but, use, purple, flowercrown, display, pictu...   
2                                   [flavorful, dream]   
3    [tired, there, s, nothing, great, combo, unhappy]   
4            