### Download Dataset

In [None]:
%%bash
wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv

### Install Dependecies

In [2]:
!pip install emoji
!pip install symspellpy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Imports

In [82]:
import re
import pandas as pd
import emoji as e
import spacy
import pkg_resources
import requests
from symspellpy import SymSpell, Verbosity
import nltk
nltk.download('words')
from nltk.corpus import words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from xgboost import XGBClassifier

[nltk_data] Downloading package words to /home/ilya-prg/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Text Preprocessing

In [30]:
def load_spellcorrector():
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
      "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
      "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
    
    return sym_spell

In [31]:
dataset1 = pd.read_csv("./data/full_dataset/goemotions_1.csv")

In [32]:
HTML_CLEANER = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
URL_CLEANER = re.compile(r'https?://\S+|www\.\S+')
nlp = spacy.load("en_core_web_sm")
sym_spell = load_spellcorrector()

In [33]:
slang_meanings = {
    "brb": "Be right back",
    "lmao": "Laughing my ass off",
    "lol": "Laughing out loud",
    "ppl": "People",
    "afk": "Away from keyboard",
    "asap": "As soon as possible",
    "btw": "By the way",
    "fyi": "For your information",
    "gg": "Good game",
    "gl": "Good luck",
    "gr8": "Great",
    "idk": "I don't know",
    "ikr": "I know, right?",
    "imo": "In my opinion",
    "imho": "In my humble opinion",
    "irl": "In real life",
    "jk": "Just kidding",
    "l8r": "Later",
    "lmk": "Let me know",
    "nvm": "Never mind",
    "omw": "On my way",
    "rofl": "Rolling on the floor laughing",
    "smh": "Shaking my head",
    "tbh": "To be honest",
    "tldr": "Too long; didn't read",
    "ttyl": "Talk to you later",
    "w/e": "Whatever",
    "w/o": "Without",
    "wtf": "What the fuck",
    "yolo": "You only live once",
    "b4": "Before",
    "cu": "See you",
    "dm": "Direct message",
    "fomo": "Fear of missing out",
    "ftw": "For the win",
    "gtg": "Got to go",
    "hmu": "Hit me up",
    "nsfw": "Not safe for work",
    "op": "Original poster",
    "srsly": "Seriously",
    "tmi": "Too much information",
    "u": "You",
    "ur": "Your",
    "yw": "You're welcome"
}

emoticon_map = {
        ":)": "Smile",
        ":‑)": "Smile",
        ":(": "Sad",
        ":‑(": "Sad",
        ":D": "Big grin",
        ":‑D": "Big grin",
        ";)": "Wink",
        ";‑)": "Wink",
        ":P": "Tongue out",
        ":‑P": "Tongue out",
        ":O": "Surprise",
        ":‑O": "Surprise",
        ":|": "Neutral",
        ":‑|": "Neutral",
        ":*": "Kiss",
        ":‑*": "Kiss",
        ":/": "Confused",
        ":‑/": "Confused",
        ">:(": "Angry",
        ">:‑(": "Angry",
        "XD": "Laughing hard",
        "x‑D": "Laughing hard",
        ":‑[": "Sad",
        ":‑]": "Happy",
        ":-{": "Sad",
        ":-}": "Happy",
        ":-@": "Screaming",
        ":-#": "Sealed lips",
        ":-X": "Sealed lips",
        ":-!": "Exclamation",
        ":-&": "Tongue-tied",
        ":-+": "Confused",
        ":-^": "Smirk",
        ":-<": "Sad"
    }
correlation_whitelist = {
    "reddit",
    "youtube",
    "facebook",
    "twitter",
    "tiktok",
    "snapchat",
    "instagram",
    "whatsapp",
    "linkedin",
    "\"",
    "!",
    "?"
}


In [34]:
def remove_punctuation(text): 
    punctuations = '''()-[]{};:,<>./@#$%^&*_~’'''
    for x in text: 
        if x in punctuations: 
            text = text.replace(x, "") 
    return text

In [35]:
def replace_words_with_dict(text, dictionary):
    sorted_words = sorted(dictionary.keys(), key=len, reverse=True)
    
    for word in sorted_words:
        pattern = r'\b' + re.escape(word) + r'\b'
        text = re.sub(pattern, dictionary[word], text, flags=re.IGNORECASE)
    
    return text.lower()


In [36]:
def text_lemmatization(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

In [11]:
def text_correction(text, sym_spell=sym_spell, whitelist=correlation_whitelist):
    words = text.split()
    corrected_words = []
    for word in words:
        if word in whitelist:
            corrected_words.append(word)
        else:
            suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
            if suggestions:
                best = suggestions[0].term
                if best != word:
                    corrected_words.append(best)
                else:
                    corrected_words.append(word)
            else:
                corrected_words.append(word)
    return " ".join(corrected_words)

In [40]:
def remove_repeats_if_not_real_word(text, wordlist=set(words.words())):
    tokens = text.split()
    new_tokens = []
    
    for token in tokens:
        clean_token = re.sub(r'(.)\1{2,}', r'\1', token.lower())
        
        if token.lower() in wordlist:
            new_tokens.append(token)
        else:
            new_tokens.append(clean_token)
            
    return " ".join(new_tokens)


In [41]:
def text_preprocessing(text):
    cleaned_text = text.lower() # Lower case text
    cleaned_text = replace_words_with_dict(cleaned_text, slang_meanings) # Convert slang to normal words
    cleaned_text = re.sub(r'\d+', '', cleaned_text) # Remove numbers
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', cleaned_text) # keep only english words
    cleaned_text = remove_punctuation(cleaned_text) # Remove some punctuation
    cleaned_text = cleaned_text.strip() # Remove Whitespaces
    cleaned_text = re.sub(HTML_CLEANER, '', cleaned_text) # Remove HTML
    cleaned_text = re.sub(URL_CLEANER, '', cleaned_text) # Remove URLs
    cleaned_text = e.demojize(cleaned_text) # Convert emojis to text
    cleaned_text = replace_words_with_dict(cleaned_text, emoticon_map) # Convert emoticons to text
    cleaned_text = remove_repeats_if_not_real_word(cleaned_text) # reduce repeating characters by 2
    cleaned_text = text_correction(cleaned_text) # Autocorrect
    cleaned_text = text_lemmatization(cleaned_text) # Lemantization
    return cleaned_text

In [48]:
%%time
for i in range(10):
    text = dataset1.iloc[i].text
    print(f"{text} -> {text_preprocessing(text)}\n")

That game hurt. -> that game hurt

 >sexuality shouldn’t be a grouping category It makes you different from othet ppl so imo it fits the definition of "grouping"  -> sexuality should not be a grouping category it make you different from other people so in my opinion it fit the definition of group

You do right, if you don't care then fuck 'em! -> you do right if you do not care then fuck pm

Man I love reddit. -> man a love reddit

[NAME] was nowhere near them, he was by the Falcon.  -> name be nowhere near they he be by the falcon

Right? Considering it’s such an important document, I should know the damned thing backwards and forwards... thanks again for the help! -> right consider its such an important document a should know the damned thing backwards and forwards thank again for the help

He isn't as big, but he's still quite popular. I've heard the same thing about his content. Never watched him much. -> he be not as big but he a still quite popular a be hear the same thing about 

In [49]:
%%time
dataset1["preprocessed_text"] = dataset1.text.apply(text_preprocessing)
dataset1["preprocessed_text"].head(5)

CPU times: user 2min 36s, sys: 18.7 ms, total: 2min 36s
Wall time: 2min 37s


0                                       that game hurt
1    sexuality should not be a grouping category it...
2         you do right if you do not care then fuck pm
3                                    man a love reddit
4        name be nowhere near they he be by the falcon
Name: preprocessed_text, dtype: object

# Tokenization

In [50]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(dataset1["preprocessed_text"])

vocab = vectorizer.get_feature_names_out()
print(f"Number of unique words: {len(vocab)}")
print(vocab)
### my text preprocessing kinda not works :(

Number of unique words: 17835
['aah' 'aaron' 'aba' ... 'zookeeper' 'zoom' 'zugzwang']


In [75]:
vectorizer = TfidfVectorizer(
    max_features=5000,
    min_df=5,
    max_df=0.7,
)
X = vectorizer.fit_transform(dataset1["preprocessed_text"])
vocab = vectorizer.get_feature_names_out()
print(f"Number of unique words: {len(vocab)}")
print(vocab)

Number of unique words: 5000
['aba' 'abandon' 'ability' ... 'zombie' 'zone' 'zoom']


# Simple Model

In [76]:
# Simple model to see difference between changes of text_processing functions
y_columns = dataset1.columns[9:-1]
y = dataset1[y_columns]

In [77]:
X.shape, y.shape

((70000, 5000), (70000, 28))

In [81]:
lr = OneVsRestClassifier(LogisticRegression(max_iter=50000, C=1.0, random_state=42), n_jobs=-1)
scores = cross_val_score(lr, X, y, cv=5, scoring="accuracy")
lr.fit(X, y)
print(f"Model Accuracy on Train Dataset: {scores.mean()}")

Model Accuracy on Train Dataset: 0.12137142857142857


In [84]:
xgb = XGBClassifier(n_jobs=-1)
scores = cross_val_score(xgb, X, y, cv=5, scoring="accuracy")
xgb.fit(X, y)
print(f"Model Accuracy on Train Dataset: {scores.mean()}")

Model Accuracy on Train Dataset: 0.12404285714285712


To be continue