### Load data


In [16]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))

from lib.data_preparation import DataPreparation


# Load the corpus and pass it through the project's stop-word and punctuation
# helpers (in that order) before any further processing.
data_set = DataPreparation.remove_stopwords(DataPreparation.load_data())
data_set = DataPreparation.remove_punctuation(data_set)

# Keep a 2% sample so the later per-word steps stay fast. The fixed
# random_state makes the draw identical on every Restart & Run All, so the
# outputs saved in the notebook stay reproducible.
data_set = data_set.sample(frac=0.02, random_state=42).reset_index(drop=True)
print(data_set)

                                             positive  \
0                                 Im really out happy   
1   Well fun Thanks stopping giving shit tonight f...   
2                           Haha thanks fansnim happy   
3     class hall missions heroic dungeons surprised D   
4   link Seventeens Yokohama Concert English subs DDL   
5                                          Look happy   
6                                      Ill like happy   
7                          Everyone getting punched D   
8               Facts Parrots Minecraft coming soon D   
9   think Im probably beyond repair sounds tea eti...   
10                                a scan tumblr happy   
11                               thanks recent follow   
12                      Thanks Lala Much needed happy   
13                                                uss   
14                RockSoc still going beginning smile   
15              inspiration everyone Miss Jones happy   
16                           Pl

---


### Tokenization

In [17]:
import nltk
import pandas as pd

# Tokens are maximal runs of word characters (\w+); everything else is dropped.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

# Build the tokenized frame in one go: every cell is cast to str and replaced
# by the list of tokens the tokenizer extracts from it.
tokenized_data_frame = pd.DataFrame({
    name: texts.astype(str).apply(tokenizer.tokenize)
    for name, texts in data_set.items()
})


In [18]:
# Show the tokenized frame: each cell now holds the list of tokens produced
# by the regexp tokenizer instead of the raw text.
print(tokenized_data_frame)

                                             positive  \
0                            [Im, really, out, happy]   
1   [Well, fun, Thanks, stopping, giving, shit, to...   
2                      [Haha, thanks, fansnim, happy]   
3   [class, hall, missions, heroic, dungeons, surp...   
4   [link, Seventeens, Yokohama, Concert, English,...   
5                                       [Look, happy]   
6                                  [Ill, like, happy]   
7                     [Everyone, getting, punched, D]   
8        [Facts, Parrots, Minecraft, coming, soon, D]   
9   [think, Im, probably, beyond, repair, sounds, ...   
10                           [a, scan, tumblr, happy]   
11                           [thanks, recent, follow]   
12                [Thanks, Lala, Much, needed, happy]   
13                                              [uss]   
14          [RockSoc, still, going, beginning, smile]   
15        [inspiration, everyone, Miss, Jones, happy]   
16                       [Play,

---

### Stemming

In [19]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return the Porter stem of every token in ``tokens``, order preserved."""
    return [stemmer.stem(token) for token in tokens]


# One stemmed column per tokenized column; each cell stays a list of strings.
stemmed_data = pd.DataFrame({
    name: token_lists.apply(_stem_tokens)
    for name, token_lists in tokenized_data_frame.items()
})
print(stemmed_data)


                                             positive  \
0                            [im, realli, out, happi]   
1   [well, fun, thank, stop, give, shit, tonight, ...   
2                       [haha, thank, fansnim, happi]   
3   [class, hall, mission, heroic, dungeon, surpri...   
4   [link, seventeen, yokohama, concert, english, ...   
5                                       [look, happi]   
6                                  [ill, like, happi]   
7                            [everyon, get, punch, d]   
8            [fact, parrot, minecraft, come, soon, d]   
9   [think, im, probabl, beyond, repair, sound, te...   
10                           [a, scan, tumblr, happi]   
11                            [thank, recent, follow]   
12                   [thank, lala, much, need, happi]   
13                                              [uss]   
14                 [rocksoc, still, go, begin, smile]   
15               [inspir, everyon, miss, jone, happi]   
16                           [p

---

### Lemmatization

In [20]:
import spacy

nlp = spacy.load('en_core_web_md')


def _lemmatize_text(text):
    """Run ``text`` through the spaCy pipeline and return its tokens' lemmas."""
    return [token.lemma_ for token in nlp(text)]


# Lemmatization starts from the raw (untokenized) text because spaCy does its
# own tokenization; every cell is cast to str before being parsed.
lemmatized_words = pd.DataFrame({
    name: texts.astype(str).apply(_lemmatize_text)
    for name, texts in data_set.items()
})

print(lemmatized_words)

                                             positive  \
0                          [I, m, really, out, happy]   
1   [well, fun, thank, stop, give, shit, tonight, ...   
2                       [haha, thank, fansnim, happy]   
3   [class, hall, mission, heroic, dungeon, surpri...   
4   [link, Seventeens, Yokohama, Concert, English,...   
5                                       [look, happy]   
6                                  [ill, like, happy]   
7                       [everyone, getting, punch, d]   
8          [Facts, Parrots, Minecraft, come, soon, d]   
9   [think, I, m, probably, beyond, repair, sound,...   
10                           [a, scan, tumblr, happy]   
11                            [thank, recent, follow]   
12                   [thank, Lala, much, need, happy]   
13                                              [uss]   
14                 [RockSoc, still, go, begin, smile]   
15        [inspiration, everyone, Miss, Jones, happy]   
16                          [pl

---


### Stemming + misspellings

#### Misspelling correction with Jaccard distance

In [None]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
from nltk.corpus import words
import nltk

nltk.download('words')

correct_words = words.words()
stemmer = PorterStemmer()


def _char_bigrams(text):
    """Return the set of character bigrams of ``text`` (empty if len < 2)."""
    return set(ngrams(text, 2))


# Pre-compute every dictionary word's character bigrams once and group them by
# first letter. Words with no bigrams (single characters) are left out so that
# jaccard_distance is never called with two empty sets, which would divide by
# zero. Matching is case-sensitive, like the dictionary itself.
candidate_bigrams_by_initial = {}
for candidate in correct_words:
    candidate_bigrams = _char_bigrams(candidate)
    if candidate_bigrams:
        candidate_bigrams_by_initial.setdefault(candidate[0], []).append(
            (candidate, candidate_bigrams)
        )

# Cache of token -> corrected word so a repeated token is only searched once.
jaccard_corrections = {}


def correct_spelling_jaccard(word):
    """Return the dictionary word closest to ``word`` by Jaccard distance.

    The distance is computed on *character* bigrams, and only dictionary words
    that start with the same letter as ``word`` are considered. Tokens that are
    too short to have a bigram, or whose first letter has no candidates, are
    returned unchanged. Ties go to the first candidate in dictionary order.
    """
    if word not in jaccard_corrections:
        word_bigrams = _char_bigrams(word)
        candidates = (
            candidate_bigrams_by_initial.get(word[0], []) if word_bigrams else []
        )
        jaccard_corrections[word] = min(
            candidates,
            key=lambda pair: jaccard_distance(word_bigrams, pair[1]),
            default=(word, None),
        )[0]
    return jaccard_corrections[word]


# Correct every token first, then stem the corrected word.
corrected_stemmed_data = pd.DataFrame()
for column in tokenized_data_frame.columns:
    corrected_stemmed_data[column] = tokenized_data_frame[column].apply(
        lambda row: [stemmer.stem(correct_spelling_jaccard(word)) for word in row]
    )
print(corrected_stemmed_data.head())
        


In [None]:

from nltk.metrics.distance  import edit_distance 

correct_words = words.words()
stemmer = PorterStemmer()

# Group the dictionary by first letter once, so each token is compared only
# against the words sharing its initial instead of rescanning the whole list.
words_by_initial = {}
for candidate in correct_words:
    if candidate:
        words_by_initial.setdefault(candidate[0], []).append(candidate)

# Cache of token -> corrected word; edit_distance is expensive and tokens repeat.
edit_distance_corrections = {}


def correct_spelling_edit_distance(word):
    """Return the dictionary word with the smallest edit distance to ``word``.

    Only dictionary words starting with the same letter as ``word`` are
    considered. If there are none, ``word`` is returned unchanged. Ties go to
    the first candidate in dictionary order.
    """
    if word not in edit_distance_corrections:
        candidates = words_by_initial.get(word[:1], [])
        edit_distance_corrections[word] = min(
            candidates,
            key=lambda candidate: edit_distance(word, candidate),
            default=word,
        )
    return edit_distance_corrections[word]


# Correct every token of every column, then stem the corrected word.
corrected_stemmed_words = pd.DataFrame()
for column in tokenized_data_frame.columns:
    corrected_stemmed_words[column] = tokenized_data_frame[column].apply(
        lambda row: [
            stemmer.stem(correct_spelling_edit_distance(word)) for word in row
        ]
    )
print(corrected_stemmed_words.head())