In [None]:
import gzip
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import gensim.downloader as api
import nltk
import pandas as pd
import itertools

# 1. Dataset Preparation & Analysis

## 1.2 Data Loading

In [None]:
# Read the gzip-compressed JSON dataset. The context manager closes the file
# handle as soon as parsing finishes instead of leaking it for the kernel's lifetime.
with gzip.open('goemotions.json.gz') as file:
    json_file = json.load(file)

## 1.3 Feature Extraction and Distribution Plots

In [None]:
# Turn the parsed JSON records into a 2-D array so each field can be taken as a column.
# NOTE(review): assumes every record has the layout [post, emotion, sentiment] -- confirm against the data file.
json_file = np.asarray(json_file)

# Column 0 holds the post text, column 1 the emotion label, column 2 the sentiment label.
posts = json_file[:, 0]
emotions = json_file[:, 1]
sentiments = json_file[:, 2]

In [None]:
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True


def plot_label_distribution(labels, title, filename, rotate_ticks=False):
    """Plot, save as PDF, and show a histogram of the number of posts per label.

    Parameters
    ----------
    labels : array-like
        One label per post.
    title : str
        Title drawn above the histogram.
    filename : str
        Path of the PDF file the figure is written to.
    rotate_ticks : bool, default False
        Rotate the x tick labels by 45 degrees (useful when there are many labels).
    """
    # One bin per distinct label, rather than a hard-coded bin count.
    n_labels = len(np.unique(labels))
    fig, ax = plt.subplots()
    ax.hist(labels, bins=range(n_labels + 1), align="left", ec="white")
    if rotate_ticks:
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    ax.set_ylabel("number of posts")
    ax.set_title(title)
    fig.savefig(filename, format="pdf")
    plt.show()


plot_label_distribution(emotions, "Distribution of the Emotion Label", "emotion.pdf", rotate_ticks=True)
plot_label_distribution(sentiments, "Distribution of the Sentiment Label", "sentiment.pdf")

# 2. Words as Features

## 2.1 Data Processing

In [None]:
# Bag-of-words vectorizer created with its default settings; it is fitted on the posts in the next cell.
vectorizer = CountVectorizer()

In [None]:
# Fit the vectorizer on every post and build the count matrix (one row per post, one column per token).
text_dataset = vectorizer.fit_transform(posts)

In [None]:
"""
each column in the feature vector refers to a word/token.
each row in the feature vector is a post.
if said post uses a word in the vectorizer's vocabulary exactly once, then it will be shown as a 1
in the corresponding index of the vectorizer's vocabulary.
therefore, adding up the column for each column will give you the frequency of each word
"""
def word_frequencies(dataset, vocabulary):
    """Count how often each vocabulary word occurs in a post-by-token count matrix.

    Each row of ``dataset`` is a post and each column is a token, so summing a
    column gives the total number of occurrences of that token over all posts.

    Parameters
    ----------
    dataset : matrix of shape (n_posts, n_tokens)
        Count matrix, e.g. the result of ``CountVectorizer.fit_transform``.
    vocabulary : dict
        Maps each word to the index of its column in ``dataset``.

    Returns
    -------
    tuple
        ``(frequencies, total_sum)``: ``frequencies`` maps each word in
        ``vocabulary`` to its total count, and ``total_sum`` is the sum of all
        column totals of ``dataset``.
    """
    # Sum the matrix that was passed in (not a notebook-level global). Sparse
    # matrices return a 1 x n_tokens matrix here, so flatten it to 1-D.
    column_totals = np.asarray(dataset.sum(axis=0)).ravel()

    # Total number of token occurrences in the whole matrix.
    total_sum = column_totals.sum()

    # Plain Python numbers, looked up through each word's column index.
    counts = column_totals.tolist()
    return {word: counts[index] for word, index in vocabulary.items()}, total_sum

# Per-word counts and the overall token count, computed over all posts (before the train/test split).
frequencies, total_sum = word_frequencies(text_dataset, vectorizer.vocabulary_)

In [None]:
# Show only the most frequent words: the dict has one entry per vocabulary word,
# so displaying it whole floods the cell output.
pd.Series(frequencies).sort_values(ascending=False).head(20)

In [None]:
# Total number of token occurrences counted across all posts.
total_sum

## 2.2 Training and Testing Split

In [None]:
# 2.2 Splitting the dataset
# A fixed seed keeps the 80/20 split, and every number derived from it, identical across kernel restarts.
RANDOM_STATE = 42
training_set, test_set = train_test_split(json_file, train_size=0.8, random_state=RANDOM_STATE)

# Same column layout as the full array: 0 = post text, 1 = emotion label, 2 = sentiment label.
training = {
    "posts": training_set[:, 0],
    "emotions": training_set[:, 1],
    "sentiments": training_set[:, 2],
}
test = {
    "posts": test_set[:, 0],
    "emotions": test_set[:, 1],
    "sentiments": test_set[:, 2],
}

# The embedding cells further down refer to the post texts as X_train / X_test.
# Bind those names here so the notebook runs top to bottom on a fresh kernel.
X_train = training["posts"]
X_test = test["posts"]

# 3. Embeddings as Features

## 3.1 Embedding Model Loading

In [None]:
# Load the pretrained "word2vec-google-news-300" embedding model through gensim's downloader API.
w2v_model = api.load("word2vec-google-news-300")

## 3.2 Word Extraction

In [None]:
# nltk.word_tokenize needs the Punkt tokenizer data. Request it by name: a bare
# nltk.download() opens the interactive downloader and stalls "Run All".
# Newer NLTK releases look for the data under "punkt_tab", older ones under "punkt".
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

In [None]:
# One list of tokens per training post, produced by NLTK's English word tokenizer.
word_tokens = list(map(lambda post: nltk.word_tokenize(post, language="english"), X_train))

In [None]:
# Merge the per-post token lists into one flat list, then count each distinct token.
flattened_tokens = [token for post_tokens in word_tokens for token in post_tokens]
training_set_tokens = pd.Series(flattened_tokens).value_counts()

print(training_set_tokens)
print(training_set_tokens.sum(), "= total number of tokens in the training dataset")

## 3.3 Computing Embeddings

In [None]:
def get_post_embedding(model, post):
    """Return the average embedding of the tokens in ``post`` that ``model`` knows.

    Parameters
    ----------
    model : embedding model
        Must support ``word in model`` and ``model[list_of_words]``.
    post : str
        Raw post text; it is tokenized with ``nltk.word_tokenize``.

    Returns
    -------
    numpy.ndarray or list
        Element-wise mean of the embeddings of the in-vocabulary tokens, or an
        empty list when none of the post's tokens is in the model.
    """
    known_words = [token for token in nltk.word_tokenize(post) if token in model]
    if not known_words:
        # No token has an embedding; callers receive an empty placeholder.
        return []
    # Look the vectors up in the model that was passed in, not a notebook-level global.
    return np.mean(model[known_words], axis=0)

In [None]:
# Average-embedding representation of every training post, in the same order as X_train.
training_post_embeddings = list(map(lambda text: get_post_embedding(w2v_model, text), X_train))

## 3.4 Display the Hit Rates

### Training Hit Rates

In [None]:
# Keep only the training tokens that the embedding model has a vector for, with their counts.
valid_training_tokens = pd.Series(
    {word: count for word, count in training_set_tokens.items() if word in w2v_model}
)

In [None]:
# Counts of the training tokens found in the embedding model, followed by their total.
print(valid_training_tokens)
print(valid_training_tokens.sum(), "= total number of valid tokens in the training dataset")

In [None]:
# Hit rate: percentage of training token occurrences whose word is present in the embedding model.
training_hitrate = (valid_training_tokens.sum()/training_set_tokens.sum())*100

In [None]:
# Report the training hit rate rounded to two decimal places.
print("{:.2f}% hitrate in the training dataset".format(training_hitrate))

### Testing Hit Rates

In [None]:
# Tokenize every test post, pool all tokens together, and count each distinct token.
testing_set_tokens = pd.Series(
    [
        token
        for post in X_test
        for token in nltk.word_tokenize(post, language="english")
    ]
).value_counts()

In [None]:
# Per-token counts for the test posts, followed by the total number of token occurrences.
print(testing_set_tokens)
print(testing_set_tokens.sum(), "= total number of tokens in testing dataset")

In [None]:
# Keep only the test tokens that the embedding model has a vector for, with their counts.
valid_testing_tokens = pd.Series(
    {word: count for word, count in testing_set_tokens.items() if word in w2v_model}
)

In [None]:
# Counts of the test tokens found in the embedding model, followed by their total.
print(valid_testing_tokens)
print(valid_testing_tokens.sum(), "= total number of valid tokens in the testing dataset")

In [None]:
# Hit rate: percentage of test token occurrences whose word is present in the embedding model.
testing_hitrate = (valid_testing_tokens.sum()/testing_set_tokens.sum())*100

In [None]:
# Report the test hit rate rounded to two decimal places.
print("{:.2f}% hitrate in the testing dataset".format(testing_hitrate))