### Load data


In [None]:
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))

from lib.data_preparation import DataPreparation


# Share of the cleaned rows kept for the experiments below.
SAMPLE_FRACTION = 0.005
# Fixed seed so that Restart & Run All draws the same rows every time;
# without it each run of this cell produces a different subset.
SAMPLE_SEED = 42

# Load the data and pass it through the project's stop-word removal and
# cleaning helpers (see lib.data_preparation for what each step does).
data_set = DataPreparation.clean_data(
    DataPreparation.remove_stopwords(DataPreparation.load_data())
)

# Keep a small, reproducible random sample and renumber the rows from 0.
data_set = (
    data_set
    .sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
data_set

---


### Tokenization

In [None]:
import nltk
import pandas as pd

# Tokens are maximal runs of word characters; everything else is a separator.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')


def tokenize_column(series):
    """Cast every cell of ``series`` to ``str`` and split it into tokens."""
    return series.astype(str).apply(tokenizer.tokenize)


# One column of token lists per column of the input frame.
tokenized_data_frame = pd.DataFrame(
    {name: tokenize_column(data_set[name]) for name in data_set.columns}
)

print(tokenized_data_frame.shape)


In [None]:
# Display the tokenized frame (each cell holds a list of tokens).
tokenized_data_frame

---

### Stemming

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def stem_tokens(tokens):
    """Return a new list with the Porter stem of every token in ``tokens``."""
    return list(map(stemmer.stem, tokens))


# Build the stemmed frame column by column from the tokenized frame.
stemmed_data = pd.DataFrame()
for name in tokenized_data_frame.columns:
    stemmed_data[name] = tokenized_data_frame[name].apply(stem_tokens)
stemmed_data


---

### Lemmatization

In [None]:
import spacy

nlp = spacy.load('en_core_web_md')


def lemmatize_series(texts):
    """Lemmatize every cell of a column with the loaded spaCy pipeline.

    Each cell is cast to ``str`` and the whole column is streamed through
    ``nlp.pipe`` so spaCy can batch the documents, instead of invoking the
    pipeline once per cell.

    Parameters
    ----------
    texts : pandas.Series
        Column whose cells should be lemmatized.

    Returns
    -------
    pandas.Series
        Object-dtype series on the same index as ``texts``; each cell is the
        list of ``token.lemma_`` strings for the corresponding document.
    """
    as_text = texts.astype(str)
    lemmas = [[token.lemma_ for token in doc] for doc in nlp.pipe(as_text)]
    # dtype=object keeps each list as a single cell value.
    return pd.Series(lemmas, index=texts.index, dtype=object)


lemmatized_words = pd.DataFrame()
for column in data_set.columns:
    lemmatized_words[column] = lemmatize_series(data_set[column])

lemmatized_words

---


### Stemming + misspellings

#### Misspelling correction with fuzzy matching (rapidfuzz)

In [None]:
from nltk.corpus import words
from rapidfuzz import process
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
from nltk.corpus import words
import nltk

# Fetch the NLTK "words" corpus that get_closest_word reads as its dictionary.
nltk.download('words')

# Stemmer applied to each token after spelling correction.
stemmer = PorterStemmer()
# Starts empty; the correction loop in the next cell fills it column by column.
corrected_stemmed_data = pd.DataFrame()

# Lazily filled copy of the NLTK word list. The function is called once per
# token, so the corpus must not be re-read on every call.
_VOCABULARY = []


def get_closest_word(word, threshold=80):
    """Return the dictionary word that best matches ``word``.

    Candidates come from the NLTK ``words`` corpus and the best one is chosen
    by ``rapidfuzz.process.extractOne`` with its default scorer.

    Parameters
    ----------
    word : str
        Token to look up.
    threshold : int or float, default 80
        Passed to ``extractOne`` as ``score_cutoff``; candidates scoring below
        it are rejected.

    Returns
    -------
    str
        The best-matching corpus entry, or ``word`` unchanged when no
        candidate reaches the threshold.
    """
    if not _VOCABULARY:
        # First call only: load the corpus once and reuse it afterwards.
        _VOCABULARY.extend(words.words())
    match = process.extractOne(word, _VOCABULARY, score_cutoff=threshold)
    if match:
        return match[0]
    return word




In [None]:
def stem_corrected_tokens(tokens):
    """Spell-correct each token with ``get_closest_word``, then stem it."""
    return [stemmer.stem(get_closest_word(token)) for token in tokens]


# Fill the frame one column at a time from the tokenized data.
for name in tokenized_data_frame.columns:
    corrected_stemmed_data[name] = tokenized_data_frame[name].apply(stem_corrected_tokens)


corrected_stemmed_data

---
### Lemmatization with misspelling correction

In [None]:
def lemmatize_with_correction(text):
    """Spell-correct the whitespace-separated words of ``text``, then lemmatize.

    Each word is replaced by ``get_closest_word(word)``, the words are joined
    back with single spaces, and the result is run through the spaCy pipeline;
    the lemma of every resulting token is returned as a list.
    """
    corrected_text = " ".join(map(get_closest_word, text.split()))
    return [token.lemma_ for token in nlp(corrected_text)]


corrected_lemmatizide_data = pd.DataFrame()
for name in data_set.columns:
    text_column = data_set[name].astype(str)
    corrected_lemmatizide_data[name] = text_column.apply(lemmatize_with_correction)


In [None]:
# Display the spell-corrected, lemmatized frame built in the previous cell.
corrected_lemmatizide_data

---
### Correct Slang words 

In [None]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.classes.tokenizer import SocialTokenizer
import pandas as pd

# Initialize text preprocessor (see the ekphrasis documentation for the exact
# effect of each option).
text_processor = TextPreProcessor(
    normalize=['url', 'email', 'percent', 'money', 'phone', 'time', 'date', 'number'],
    annotate={"hashtag", "allcaps", "elongated", "repeated", "emphasis", "censored"},
    fix_html=True,
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=True,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons]
)


def normalize_social_text(value):
    """Run one cell through ``text_processor`` and re-join its tokens.

    Missing values are returned untouched. The null test happens before the
    ``str`` cast: casting first turns NaN/None into the literal text
    "nan"/"None", which would make the null test always pass and send that
    text through the preprocessor.

    Parameters
    ----------
    value : object
        Raw cell value; anything non-null is cast to ``str``.

    Returns
    -------
    object
        The space-joined tokens from ``pre_process_doc``, or ``value`` itself
        when it is null.
    """
    if pd.isnull(value):
        return value
    return " ".join(text_processor.pre_process_doc(str(value)))


# Process data
processed_data = pd.DataFrame()
for column in data_set.columns:
    processed_data[column] = data_set[column].apply(normalize_social_text)

In [None]:
# Display the frame of text produced by the ekphrasis preprocessor.
processed_data

: 