### Load data

In [2]:
import nltk
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))
from  src.data_cleaning import DataCleaning

data_frame = DataCleaning.load_data()




---

### Tokenization

In [3]:
def tokenization(data_frame) -> list:
    tokenizer = tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = []
    for data in data_frame:
        tokens.append(tokenizer.tokenize(data))
    return tokens



In [4]:
tokens_positive = tokenization(data_frame=data_frame["positive"])
# print(tokens_positive[0])

---

### Stemmming

In [5]:
def stemming(data_frame) -> list:
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    stemmed_words = []
    tokenized_data_frame = tokenization(data_frame)
    for data in tokenized_data_frame:
        stemmed_words.append([stemmer.stem(word) for word in data])
    return stemmed_words

In [6]:
stemmed_words = stemming(data_frame=data_frame["positive"])
# print(stemmed_words)

---

### Lemmatization

In [7]:
import spacy
def lemmatization(data_frame) -> list:
    nlp = spacy.load('en_core_web_md')
    lemmatized_words = []
    for data in data_frame:
        data = nlp(data)
        lemmatized_words.append([token.lemma_ for token in data])
    return lemmatized_words

In [8]:
lemmaized_words = lemmatization(data_frame=data_frame["positive"])
print(lemmaized_words)

[['an', 'insperation', 'in', 'all', 'aspect', ':', 'fashion'], [' ', 'fitness'], [' ', 'beauty', 'and', 'personality', '.', ':)', 'kiss', 'thefashionicon'], ['Apka', 'Apna', 'Awam', 'Ka', 'Channel', 'Frankline', 'Tv', 'Aam', 'Admi', 'production', 'please', 'visit', 'or', 'like', ' ', 'Share', ':)', 'Fb', 'Page', ':', '...'], ['beautiful', 'album', 'from', ' ', 'the', 'great', 'unsung', 'guitar', 'genius', 'of', 'our', 'time', '-', 'and', 'I', 'have', 'meet', 'the', 'great', 'backstage'], ['good', 'luck', 'to', 'rich', 'riding', 'for', 'great', 'project', 'in', 'this', 'Sunday', '.', 'can', 'you', 'donate', '?'], ['Omg', 'he', '...', 'kiss', '...', 'he', 'cry', 'with', 'joy'], ['happy', 'anniv', 'ming', 'and', 'papi', '!', '!', '!', '!', '!', 'love', 'love', 'happy'], ['thank', 'happy'], ['come', 'on', 'tweep'], [' ', 'join', ' ', 'vote', 'for', 'the', 'singer', '!', 'do', 'spread', 'the', 'word', '.', ':D'], ['thank', 'for', 'the', 'great', 'review', '!', 'smile'], ['Yay', 'another', '

---

### Stemming + misspellings

#### misspelling correction with jaccard_distance

In [9]:
from nltk.stem import PorterStemmer
import nltk 
from nltk.metrics.distance import jaccard_distance 
from nltk.util import ngrams
nltk.download('words') 
from nltk.corpus import words 

def stemming_with_misspelling_correction(data_frame) -> list:
    correct_words = words.words()
    stemmer = PorterStemmer()
    stemmed_words = []
    tokenized_data_frame = tokenization(data_frame)
    for data in tokenized_data_frame:
        corrected_data = []
        for word in data:
            distances = []
            word_bigrams = set(ngrams(word, 2))
            if word_bigrams:
                distances = [
                    (jaccard_distance(word_bigrams, set(ngrams(w, 2))), w)
                    for w in correct_words
                    if set(ngrams(w, 2))
                ]
            closest_word = min(distances, key=lambda x: x[0])[1] if distances else word
            stemmed_word = stemmer.stem(closest_word)
            corrected_data.append(stemmed_word)
        stemmed_words.append(corrected_data)
    return stemmed_words


[nltk_data] Downloading package words to /home/hasabir/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [10]:
print(stemming(data_frame=data_frame["positive"][:5]))


[['an', 'insper', 'in', 'all', 'aspect', 'fashion'], ['fit'], ['beauti', 'and', 'person', 'kiss', 'thefashionicon'], ['apka', 'apna', 'awam', 'ka', 'channel', 'franklin', 'tv', 'aam', 'admi', 'product', 'pleas', 'visit', 'or', 'like', 'share', 'fb', 'page'], ['beauti', 'album', 'from', 'the', 'greatest', 'unsung', 'guitar', 'geniu', 'of', 'our', 'time', 'and', 'i', 've', 'met', 'the', 'great', 'backstag']]


#### Edit distance Method (Levenshtein distance)

In [11]:

from nltk.metrics.distance  import edit_distance 

def stemming_with_levenshtein_distance(data_frame) -> list:
    correct_words = words.words()
    stemmer = PorterStemmer()
    stemmed_words = []
    tokenized_data_frame = tokenization(data_frame)
    for data in tokenized_data_frame:
        corrected_data = []
        for word in data:
            temp = [(edit_distance(word, w),w) for w in correct_words if w[0]==word[0]] 
            closest_word = min(temp, key=lambda x: x[0])[1] if temp else word
            stemmed_word = stemmer.stem(closest_word)
            corrected_data.append(stemmed_word)
        stemmed_words.append(corrected_data)
    return stemmed_words


In [12]:
print(stemming_with_levenshtein_distance(data_frame=data_frame["positive"][:5]))

[['a', 'inspher', 'in', 'all', 'aspect', 'fascio'], ['fit'], ['beauti', 'and', 'person', 'k', 'thessalonian'], ['aka', 'ana', 'adam', 'k', 'chane', 'franklin', 't', 'adam', 'adai', 'productida', 'pace', 'vicia', 'o', 'le', 'shane', 'f', 'page'], ['bealtin', 'album', 'from', 'the', 'greaten', 'unsung', 'guitar', 'geniu', 'of', 'our', 'time', 'and', 'i', 'v', 'met', 'the', 'great', 'backstag']]


### Lemmatization with misspelling

In [13]:
import spacy
from nltk.corpus import words
from nltk.metrics.distance import edit_distance
import nltk

nltk.download('words')

def lemmatization_with_misspelling(data_frame):
    nlp = spacy.load('en_core_web_md')
    lemmatized_words = []
    correct_words = words.words()
    
    for data in data_frame:
        data = nlp(data)
        corrected_data = []
        
        for word in data:
            word_text = word.text
            temp = [(edit_distance(word_text, w), w) for w in correct_words if w[0] == word_text[0]]
            closest_word = min(temp, key=lambda x: x[0])[1] if temp else word_text
            lemma = nlp(closest_word)[0].lemma_
            corrected_data.append(lemma)
        
        lemmatized_words.append(corrected_data)
    
    return lemmatized_words


[nltk_data] Downloading package words to /home/hasabir/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [14]:
print(lemmatization_with_misspelling(data_frame=data_frame["positive"][:5]))


[['a', 'inspheration', 'in', 'all', 'aspect', ':', 'fascio'], [' ', 'fitness'], [' ', 'beauty', 'and', 'personality', '.', ':)', 'k', 'thessalonian'], ['aka', 'Ana', 'adam', 'k', 'chane', 'Franklin', 't', 'adam', 'Adai', 'productidae', 'pace', 'vicia', 'o', 'les', ' ', 'shane', ':)', 'f', 'page', ':', '...'], ['bealtine', 'album', 'from', ' ', 'the', 'greaten', 'unsung', 'guitar', 'genius', 'of', 'our', 'time', '-', 'and', 'I', "'", 'meet', 'the', 'great', 'backstage']]


---

In [2]:

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.dicts.emoticons import emoticons

text_processor = TextPreProcessor(
    normalize=['url', 'email', 'percent', 'money', 'phone', 'time', 'date', 'number'],
    annotate={"hashtag", "allcaps", "elongated", "repeated", "emphasis", "censored"},
    fix_html=True,
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=True,
    tokenizer='social_tokenizer',
    dicts=[emoticons]
)


for data in data_frame["positive"][:5]:
    print(text_processor.pre_process_doc(data))


Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading english - 1grams ...
generating cache file for faster loading...
reading ngrams /home/hasabir/.ekphrasis/stats/english/counts_1grams.txt
Reading english - 2grams ...
generating cache file for faster loading...
reading ngrams /home/hasabir/.ekphrasis/stats/english/counts_2grams.txt


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading english - 1grams ...


NameError: name 'data_frame' is not defined