### Load data


In [None]:
import sys
import os
# Put the parent of the current working directory on sys.path so the
# project-local `lib` package imported below can be resolved.
# NOTE(review): presumably the parent directory is the repository root - confirm.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))

from lib.data_preparation import DataPreparation

# Fraction of the loaded rows kept for the experiments below, and the seed that
# makes the random subsample identical on every Restart & Run All.
SAMPLE_FRACTION = 0.005
RANDOM_SEED = 42

data_set = DataPreparation().load_data()

# Without a fixed random_state each run would draw a different subset, so the
# outputs of the later preprocessing cells could not be reproduced or compared.
data_set = (
    data_set
    .sample(frac=SAMPLE_FRACTION, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)
data_set

---


### Tokenization

In [None]:
import nltk
import pandas as pd

# Tokens are the matches of the pattern \w+ (runs of word characters).
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

# The bound method is handed to apply directly; each tweet becomes a token list.
raw_tweets = data_set['tweet']
data_set['processed_tweet'] = raw_tweets.apply(tokenizer.tokenize)
data_set



---

### Stemming

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Whitespace-split every tweet, then push each resulting word through the stemmer.
tokens = data_set['tweet'].apply(lambda tweet: tweet.split())
data_set['processed_tweet'] = tokens.apply(
    lambda tweet_words: list(map(stemmer.stem, tweet_words))
)
data_set

---

### Lemmatization

In [None]:
# The model loaded below must be installed first:
#   python -m spacy download en_core_web_md
import spacy

nlp = spacy.load('en_core_web_md')


def _lemmas(text):
    """Return the lemma of every token that ``nlp`` produces for ``text``."""
    return [token.lemma_ for token in nlp(text)]


data_set['processed_tweet'] = data_set['tweet'].apply(_lemmas)
data_set

---


### Stemming + misspellings

#### Misspelling correction with rapidfuzz fuzzy matching

In [None]:
from nltk.corpus import words
from rapidfuzz import process
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
from nltk.corpus import words
import nltk

# Download the NLTK 'words' corpus; get_closest_word reads its candidate
# words from it via words.words().
nltk.download('words')

# Stemmer instance used by the stemming cells of this notebook.
stemmer = PorterStemmer()
# Empty frame; not referenced by any cell visible in this notebook.
corrected_stemmed_data = pd.DataFrame()

# Filled on first use so the corpus word list is built once instead of once per
# looked-up token (the lookup runs for every word of every tweet).
_WORD_LIST_CACHE = []


def _english_words():
    """Return the NLTK ``words`` corpus list, loading it only on the first call."""
    if not _WORD_LIST_CACHE:
        _WORD_LIST_CACHE.extend(words.words())
    return _WORD_LIST_CACHE


def get_closest_word(word, threshold=80):
    """Return the corpus word that best matches ``word``, or ``word`` itself.

    Args:
        word: Token to look up.
        threshold: Forwarded to ``process.extractOne`` as ``score_cutoff``;
            candidates scoring below it are not accepted.

    Returns:
        The matched corpus word when ``extractOne`` finds one that reaches the
        cutoff, otherwise the unchanged input ``word``.
    """
    match = process.extractOne(word, _english_words(), score_cutoff=threshold)
    # extractOne yields None when nothing reaches the cutoff; otherwise the
    # matched choice is the first element of the returned tuple.
    if match is None:
        return word
    return match[0]




In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def _correct_then_stem(word):
    """Replace ``word`` by the result of ``get_closest_word``, then stem it."""
    return stemmer.stem(get_closest_word(word))


# Whitespace-split each tweet and apply correction + stemming word by word.
tokens = data_set['tweet'].apply(lambda tweet: tweet.split())
data_set['processed_tweet'] = tokens.apply(
    lambda tweet_words: [_correct_then_stem(w) for w in tweet_words]
)
data_set

---
### Lemmatization with misspelling correction

In [None]:
corrected_lemmatizide_data = pd.DataFrame()

nlp = spacy.load('en_core_web_md')


def _lemmatize_corrected(tweet):
    """Correct each whitespace-separated word of ``tweet``, then lemmatize.

    The corrected words are re-joined with single spaces before being handed
    to ``nlp``; the lemma of every resulting token is returned as a list.
    """
    corrected_text = " ".join(get_closest_word(word) for word in tweet.split())
    return [token.lemma_ for token in nlp(corrected_text)]


data_set['processed_tweet'] = data_set['tweet'].apply(_lemmatize_corrected)
data_set

---
### Correct slang words

In [None]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.classes.tokenizer import SocialTokenizer
import pandas as pd

# Options for the ekphrasis pipeline, collected in one place and unpacked into
# the constructor below.
preprocessor_options = dict(
    normalize=['url', 'email', 'percent', 'money', 'phone', 'time', 'date', 'number'],
    annotate={"hashtag", "allcaps", "elongated", "repeated", "emphasis", "censored"},
    fix_html=True,
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=True,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons],
)
text_processor = TextPreProcessor(**preprocessor_options)


def _preprocess_tweet(tweet):
    """Run ``tweet`` through ``text_processor``; null values are returned as-is."""
    if pd.notnull(tweet):
        return text_processor.pre_process_doc(tweet)
    return tweet


data_set['processed_tweet'] = data_set['tweet'].apply(_preprocess_tweet)

In [None]:
# Display the column written by the most recently executed preprocessing cell.
data_set["processed_tweet"]