### Load data

In [3]:
import nltk
import sys
import os
# Put the parent of the current working directory at the front of the import
# path so the project-local `src` package can be imported below
# (presumably the notebook is run from a sub-folder of the project root).
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))
from  src.data_cleaning import DataCleaning

# Load the dataset through the project helper; later cells read its
# "positive" and "negative" columns.
data_frame = DataCleaning.load_data()
# data_frame_cleaned_from_stop_words = DataCleaning.remove_stopwords(data_frame)



# data_frame = data_frame.sample(frac=0.02)

# data_set = DataCleaning.remove_stopwords(data_frame)


---

### Tokenization

In [5]:
def tokenization(data_frame) -> list:
    """Split every text of ``data_frame`` into word tokens.

    Each element is passed through an NLTK ``RegexpTokenizer`` built from the
    word-character pattern used below, so only the substrings matching that
    pattern are kept.

    Args:
        data_frame: Iterable of text strings, e.g. ``data_frame["positive"]``.

    Returns:
        A list holding one list of tokens per input text, in input order.
    """
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    return [word_tokenizer.tokenize(text) for text in data_frame]



In [8]:
import pandas as pd

# Tokenize every text of the "positive" column.
tokens_positive = tokenization(data_frame=data_frame["positive"])
# One row per text; pandas pads shorter token lists, so the column count is
# the length of the longest token list.
pd.DataFrame(tokens_positive).shape


# print(tokens_positive[0])

(1186, 28)

---

### Stemming

In [None]:
def stemming(data_frame) -> list:
    """Tokenize each text and reduce every token to its Porter stem.

    Args:
        data_frame: Iterable of text strings; it is handed to
            ``tokenization`` unchanged.

    Returns:
        A list with one list of stemmed tokens per input text, in input order.
    """
    from nltk.stem import PorterStemmer

    porter = PorterStemmer()
    return [
        [porter.stem(token) for token in tokens]
        for tokens in tokenization(data_frame)
    ]

In [None]:
# Stem every token of the "positive" column; the result is a list of token
# lists (one per text).
stemmed_words = stemming(data_frame=data_frame["positive"])
# print(stemmed_words)

---

### Lemmatization

In [None]:
import spacy
def lemmatization(data_frame) -> list:
    """Lemmatize every text with the spaCy ``en_core_web_md`` pipeline.

    The pipeline is loaded on each call, every text is parsed with it, and the
    ``lemma_`` of each resulting token is collected.

    Args:
        data_frame: Iterable of text strings, e.g. ``data_frame["positive"]``.

    Returns:
        A list with one list of lemmas per input text, in input order.
    """
    nlp = spacy.load('en_core_web_md')
    return [[token.lemma_ for token in nlp(text)] for text in data_frame]

In [None]:
# Lemmatize every text of the "positive" column.
lemmaized_words = lemmatization(data_frame=data_frame["positive"])
# Preview only the first rows: printing the lemmas of the whole column floods
# the notebook output and bloats the saved file.
lemmaized_words[:5]

---

### Stemming + misspellings

#### Misspelling correction with Jaccard distance

In [None]:
from nltk.stem import PorterStemmer
import nltk 
from nltk.metrics.distance import jaccard_distance 
from nltk.util import ngrams
nltk.download('words') 
from nltk.corpus import words 

def stemming_with_misspelling_correction(data_frame) -> list:
    """Spell-correct each token by character-bigram similarity, then stem it.

    Every token produced by ``tokenization`` is looked up in the NLTK ``words``
    corpus:

    * a token that is already a dictionary word is kept as is (previously it
      could be swapped for an earlier dictionary word that happens to have the
      same bigram set, e.g. ``dada`` -> ``dad``);
    * otherwise it is replaced by the dictionary word whose set of character
      bigrams has the smallest Jaccard distance to the token's bigrams (the
      earliest such word wins ties);
    * tokens without bigrams (single characters) are left unchanged.

    The resulting word is stemmed with ``PorterStemmer``.

    Args:
        data_frame: Iterable of text strings; it is handed to
            ``tokenization`` unchanged.

    Returns:
        A list with one list of stemmed tokens per input text, in input order.
    """
    vocabulary = words.words()
    known_words = set(vocabulary)
    stemmer = PorterStemmer()
    # token -> stem of its correction, so a repeated token triggers the
    # dictionary scan only once.
    stem_cache = {}

    def closest_vocabulary_word(word):
        """Return the dictionary word closest to ``word`` (or ``word`` itself)."""
        if word in known_words:
            return word
        word_bigrams = set(ngrams(word, 2))
        if not word_bigrams:
            return word
        best_word, best_distance = word, None
        for candidate in vocabulary:
            # Build the candidate's bigram set once per comparison.
            candidate_bigrams = set(ngrams(candidate, 2))
            if not candidate_bigrams:
                continue
            distance = jaccard_distance(word_bigrams, candidate_bigrams)
            # Strict "<" keeps the first minimal candidate, like min() does.
            if best_distance is None or distance < best_distance:
                best_word, best_distance = candidate, distance
        return best_word

    stemmed_words = []
    for tokens in tokenization(data_frame):
        corrected_data = []
        for word in tokens:
            if word not in stem_cache:
                stem_cache[word] = stemmer.stem(closest_vocabulary_word(word))
            corrected_data.append(stem_cache[word])
        stemmed_words.append(corrected_data)
    return stemmed_words


In [None]:
# Exercise the Jaccard-based corrector defined in this section (the cell used
# to call plain `stemming`, so the corrector was never run).
print(stemming_with_misspelling_correction(data_frame=data_frame["positive"][:5]))


#### Edit distance method (Levenshtein distance)

In [None]:

from nltk.metrics.distance  import edit_distance 

def stemming_with_levenshtein_distance(data_frame) -> list:
    """Spell-correct each token by edit distance, then stem it.

    Every token produced by ``tokenization`` is replaced by the entry of the
    NLTK ``words`` corpus that starts with the same character and has the
    smallest Levenshtein distance to it (the earliest such entry wins ties).
    Tokens with no candidate are kept unchanged. The chosen word is stemmed
    with ``PorterStemmer``.

    Args:
        data_frame: Iterable of text strings; it is handed to
            ``tokenization`` unchanged.

    Returns:
        A list with one list of stemmed tokens per input text, in input order.
    """
    vocabulary = words.words()
    known_words = set(vocabulary)
    stemmer = PorterStemmer()

    # Bucket the dictionary by first character once instead of filtering the
    # whole corpus for every token; buckets keep the corpus order, so ties are
    # broken exactly as a scan of the full list would break them.
    candidates_by_initial = {}
    for candidate in vocabulary:
        candidates_by_initial.setdefault(candidate[:1], []).append(candidate)

    # token -> stem of its correction; repeated tokens are resolved only once.
    stem_cache = {}

    def corrected_stem(word):
        """Return the stem of the dictionary word closest to ``word``."""
        if word in known_words:
            # Distance 0 is only reachable by the identical word, so the
            # search would return `word` anyway; skip the distance work.
            closest_word = word
        else:
            candidates = candidates_by_initial.get(word[:1], [])
            if candidates:
                closest_word = min(
                    candidates,
                    key=lambda candidate: edit_distance(word, candidate),
                )
            else:
                closest_word = word
        return stemmer.stem(closest_word)

    stemmed_words = []
    for tokens in tokenization(data_frame):
        corrected_data = []
        for word in tokens:
            if word not in stem_cache:
                stem_cache[word] = corrected_stem(word)
            corrected_data.append(stem_cache[word])
        stemmed_words.append(corrected_data)
    return stemmed_words


In [None]:
# Try the Levenshtein-based corrector on the first five "positive" texts only.
print(stemming_with_levenshtein_distance(data_frame=data_frame["positive"][:5]))

### Lemmatization with misspelling correction

In [None]:
import spacy
from nltk.corpus import words
from nltk.metrics.distance import edit_distance
import nltk

nltk.download('words')

def lemmatization_with_misspelling(data_frame) -> list:
    """Spell-correct each spaCy token by edit distance, then lemmatize it.

    Every text is parsed with the spaCy ``en_core_web_md`` pipeline. Each
    token's text is replaced by the entry of the NLTK ``words`` corpus that
    starts with the same character and has the smallest Levenshtein distance
    to it (the earliest such entry wins ties); token texts with no candidate
    are kept unchanged. The chosen word is then run through the pipeline on
    its own and the lemma of its first token is collected.

    Args:
        data_frame: Iterable of text strings, e.g. ``data_frame["positive"]``.

    Returns:
        A list with one list of lemmas per input text, in input order.
    """
    nlp = spacy.load('en_core_web_md')
    vocabulary = words.words()
    known_words = set(vocabulary)

    # Bucket the dictionary by first character once instead of filtering the
    # whole corpus for every token; buckets keep the corpus order, so ties are
    # broken exactly as a scan of the full list would break them.
    candidates_by_initial = {}
    for candidate in vocabulary:
        candidates_by_initial.setdefault(candidate[:1], []).append(candidate)

    # token text -> lemma of its correction; repeated tokens cost one lookup
    # instead of another dictionary scan and another pipeline run.
    lemma_cache = {}

    def corrected_lemma(word_text):
        """Return the lemma of the dictionary word closest to ``word_text``."""
        if word_text in known_words:
            # Distance 0 is only reachable by the identical word, so the
            # search would return it anyway; skip the distance work.
            closest_word = word_text
        else:
            candidates = candidates_by_initial.get(word_text[:1], [])
            if candidates:
                closest_word = min(
                    candidates,
                    key=lambda candidate: edit_distance(word_text, candidate),
                )
            else:
                closest_word = word_text
        # The corrected word is lemmatized in isolation, not in sentence context.
        return nlp(closest_word)[0].lemma_

    lemmatized_words = []
    for text in data_frame:
        corrected_data = []
        for token in nlp(text):
            word_text = token.text
            if word_text not in lemma_cache:
                lemma_cache[word_text] = corrected_lemma(word_text)
            corrected_data.append(lemma_cache[word_text])
        lemmatized_words.append(corrected_data)

    return lemmatized_words


In [None]:
# Try the correcting lemmatizer on the first five "positive" texts only.
print(lemmatization_with_misspelling(data_frame=data_frame["positive"][:5]))


---

### Correct slang words

In [2]:
def ekphrasis_preprocessing(data_set) -> list:
    """Run every text of ``data_set`` through an ekphrasis ``TextPreProcessor``.

    A new pre-processor is configured on each call with the normalisation,
    annotation, hashtag/contraction unpacking and elongation spell-correction
    options listed below, a lower-casing ``SocialTokenizer`` and the ekphrasis
    emoticon dictionary.

    Args:
        data_set: Iterable of text strings, e.g. ``data_frame["negative"]``.

    Returns:
        A list with the result of ``pre_process_doc`` for each input text, in
        input order.
    """
    from ekphrasis.classes.preprocessor import TextPreProcessor
    from ekphrasis.dicts.emoticons import emoticons
    from ekphrasis.classes.tokenizer import SocialTokenizer

    social_tokenize = SocialTokenizer(lowercase=True).tokenize
    preprocessor = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'time', 'date', 'number'],
        annotate={"hashtag", "allcaps", "elongated", "repeated", "emphasis", "censored"},
        fix_html=True,
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        tokenizer=social_tokenize,
        dicts=[emoticons],
    )
    return [preprocessor.pre_process_doc(text) for text in data_set]
        


In [3]:
# Pre-process the first five "negative" texts; the returned token lists are
# shown as the cell output.
ekphrasis_preprocessing(data_frame["negative"][:5])

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading english - 1grams ...


[['how', 'unhappy', 'some', 'dogs', 'like', 'it', 'though'],
 ['talking',
  'to',
  'my',
  'over',
  'driver',
  'about',
  'where',
  'i',
  'am',
  'goinghe',
  'said',
  'he',
  "'",
  'd',
  'love',
  'to',
  'go',
  'to',
  'new',
  'york',
  'too',
  'but',
  'since',
  'trump',
  'it',
  "'",
  's',
  'probably',
  'not'],
 ['does',
  'anybody',
  'know',
  'if',
  'the',
  'rand',
  "'",
  's',
  'likely',
  'to',
  'fall',
  'against',
  'the',
  'dollar',
  '?',
  'i',
  'got',
  'some',
  'money',
  'i',
  'need',
  'to',
  'change',
  'into',
  'r',
  'but',
  'it',
  'keeps',
  'getting',
  'stronger',
  'unhappy'],
 ['i', 'miss', 'going', 'to', 'gigs', 'in', 'liverpool', 'unhappy'],
 ['there', 'isnt', 'a', 'new', 'riverdale', 'tonight', '?', 'unhappy']]

In [3]:
# `data_frame_cleaned_from_stop_words` is only assigned in commented-out code
# in the loading cell, so on a fresh kernel this cell raised NameError. Build
# it here, right before use, with the call that the commented-out line shows.
# NOTE(review): assumes `DataCleaning.remove_stopwords` returns a frame that
# still has a "negative" column and does not modify `data_frame` - confirm.
data_frame_cleaned_from_stop_words = DataCleaning.remove_stopwords(data_frame)
ekphrasis_preprocessing(data_frame_cleaned_from_stop_words["negative"][:5])



  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading english - 1grams ...
Reading english - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading english - 1grams ...


[['how', 'unhappy', 'some', 'dogs', 'like', 'it', 'though'],
 ['talking',
  'to',
  'my',
  'over',
  'driver',
  'about',
  'where',
  'i',
  'am',
  'goinghe',
  'said',
  'he',
  "'",
  'd',
  'love',
  'to',
  'go',
  'to',
  'new',
  'york',
  'too',
  'but',
  'since',
  'trump',
  'it',
  "'",
  's',
  'probably',
  'not'],
 ['does',
  'anybody',
  'know',
  'if',
  'the',
  'rand',
  "'",
  's',
  'likely',
  'to',
  'fall',
  'against',
  'the',
  'dollar',
  '?',
  'i',
  'got',
  'some',
  'money',
  'i',
  'need',
  'to',
  'change',
  'into',
  'r',
  'but',
  'it',
  'keeps',
  'getting',
  'stronger',
  'unhappy'],
 ['i', 'miss', 'going', 'to', 'gigs', 'in', 'liverpool', 'unhappy'],
 ['there', 'isnt', 'a', 'new', 'riverdale', 'tonight', '?', 'unhappy']]