In [None]:
import gzip
import json

import matplotlib.pyplot as plt
import numpy as np
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [None]:
# Load the gzipped JSON dataset. The context manager closes the file handle
# as soon as parsing finishes (or fails) instead of leaking it for the
# lifetime of the kernel.
with gzip.open('goemotions.json.gz') as file:
    json_file = json.load(file)

In [None]:
# Turn the parsed JSON records into a 2-D array so whole columns can be sliced.
# np.asarray returns an existing ndarray unchanged, so re-running this cell is safe.
json_file = np.asarray(json_file)

# Column layout as used throughout this notebook: 0 -> post text,
# 1 -> emotion label, 2 -> sentiment label.
# NOTE(review): assumes every record is a [post, emotion, sentiment] triple — confirm against the data file.
posts, emotions, sentiments = json_file[:, 0], json_file[:, 1], json_file[:, 2]

In [None]:
plt.rcParams["figure.figsize"] = [7.50, 3.50]
plt.rcParams["figure.autolayout"] = True


def _plot_label_distribution(labels, bins, title, pdf_path, rotate_ticks=False):
    """Draw, save and show a histogram of one label column.

    labels       -- 1-D sequence of label values, one entry per post
    bins         -- bin edges handed straight to ``Axes.hist``
    title        -- title shown above the plot
    pdf_path     -- path of the PDF file the figure is written to
    rotate_ticks -- tilt the x tick labels by 45 degrees (useful when there are many labels)

    Returns the created figure.
    """
    # One explicit figure/axes pair per plot: nothing depends on pyplot's
    # implicit "current figure", so two histograms can never share an axes.
    fig, ax = plt.subplots()
    ax.hist(labels, bins=bins, align="left", ec="white")
    if rotate_ticks:
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    ax.set_ylabel("number of posts")
    ax.set_title(title)
    # Save before show(): the figure may no longer be available afterwards.
    fig.savefig(pdf_path, format="pdf")
    plt.show()
    return fig


_plot_label_distribution(emotions, range(29), "Distribution of the Emotion Label", "emotion.pdf", rotate_ticks=True)
_plot_label_distribution(sentiments, range(5), "Distribution of the Sentiment Label", "sentiment.pdf");

In [None]:
# 2.1. Vectorizer
# Bag-of-words counter with default settings. This single instance is shared:
# later cells call fit_transform on it again, which replaces its vocabulary.
vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from every post and build the post-by-word count matrix.
text_dataset = vectorizer.fit_transform(posts)

In [None]:
"""
Each column of the feature matrix corresponds to one word/token in the vectorizer's vocabulary,
and each row corresponds to one post.
An entry holds the number of times that post uses that word, so a post that uses a word
exactly once has a 1 at the column index the vocabulary assigns to that word.
Therefore, summing each column over all rows gives the total frequency of each word.
"""
def word_frequencies(dataset, vocabulary):
    """Count how often every vocabulary word occurs across the whole dataset.

    Parameters
    ----------
    dataset : 2-D count matrix
        One row per post and one column per word. May be a scipy sparse
        matrix, an ``np.matrix`` or a plain ``ndarray``.
    vocabulary : dict
        Maps each word to the index of its column in ``dataset``.

    Returns
    -------
    (dict, number)
        A ``word -> total count`` dictionary and the sum of all counts.
    """
    # Column sums. A sparse matrix returns a 1xV np.matrix here while a dense
    # array returns a 1-D array; asarray + ravel flattens both to shape (V,).
    column_totals = np.asarray(dataset.sum(axis=0)).ravel()

    # Total number of counted tokens in the dataset.
    total_sum = column_totals.sum()

    # Look each word's total up through its column index; .item() converts the
    # numpy scalar to a plain Python number.
    frequencies = {
        word: column_totals[index].item() for word, index in vocabulary.items()
    }
    return frequencies, total_sum

# Per-word totals and overall token count for the vocabulary currently held by the vectorizer.
frequencies, total_sum = word_frequencies(text_dataset, vectorizer.vocabulary_)

In [None]:
# 2.2 Splitting the dataset
# 80/20 train/test split of the dataset rows. The fixed random_state makes the
# shuffle, and therefore every score computed from it, reproducible across
# kernel restarts.
training_set, test_set = train_test_split(json_file, train_size=0.8, random_state=42)

# training set: column 0 holds the post text, the remaining columns hold the labels
X_train = training_set[:, 0]
y_train = training_set[:, 1:]

# test set: same column layout
X_test = test_set[:, 0]
y_test = test_set[:, 1:]

# 2.3.1 BASE MNB

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Re-fit the shared vectorizer on the training posts only; this replaces the
# vocabulary it learned earlier from the full set of posts.
X_train_mnb = vectorizer.fit_transform(X_train)

In [None]:
# Base model: Multinomial Naive Bayes with default hyper-parameters.
mnb_cl = MultinomialNB()

In [None]:
# Training labels: column 0 of y_train is the emotion, column 1 the sentiment.
# (These names are rebound here from the full-dataset columns to the training subset.)
emotions = y_train[:, 0]
sentiments = y_train[:, 1]

# fit() returns the estimator itself, so fitting mnb_cl twice would leave both
# names pointing at ONE model trained on whichever target was fitted last.
# The emotions model is therefore an independent copy; mnb_cl itself is
# fitted on the sentiments, as before.
mnb_emotions = clone(mnb_cl).fit(X_train_mnb, emotions)
mnb_sentiments = mnb_cl.fit(X_train_mnb, sentiments)

In [None]:
# Mean accuracy of each base MNB model on the training matrix (not the held-out test set).
score_base_mnb_emotions = mnb_emotions.score(X_train_mnb, emotions)
print("emotions score: ", score_base_mnb_emotions)

score_base_mnb_sentiments = mnb_sentiments.score(X_train_mnb, sentiments)
print("sentiments score: ", score_base_mnb_sentiments)

### Testing Base MNB

In [None]:
# transform only (no fit): encode the test posts with the vocabulary already learned by the vectorizer.
X_test_base_mnb = vectorizer.transform(X_test)

In [None]:
# Ground-truth test labels: emotion in column 0, sentiment in column 1.
test_emotions, test_sentiments = y_test[:, 0], y_test[:, 1]

# Predictions of the two MNB models for the vectorized test posts.
predicted_emotions = mnb_emotions.predict(X_test_base_mnb)
predicted_sentiments = mnb_sentiments.predict(X_test_base_mnb)

## 2.4.1 MNB Base Report

### Report for emotions

In [None]:
# Per-class precision / recall / F1 for the emotion predictions on the test set.
print(classification_report(test_emotions, predicted_emotions))

In [None]:
# Confusion matrix for emotions: true labels are passed first, predictions second.
emotions_cf_matrix = confusion_matrix(test_emotions, predicted_emotions)

In [None]:
import seaborn as sns
# Visualise the emotion confusion matrix as a heatmap.
sns.heatmap(emotions_cf_matrix)

### Report for sentiments

In [None]:
# Per-class precision / recall / F1 for the sentiment predictions on the test set.
print(classification_report(test_sentiments, predicted_sentiments))

In [None]:
# Confusion matrix for sentiments: true labels are passed first, predictions second.
sentiments_cf_matrix = confusion_matrix(test_sentiments, predicted_sentiments)

In [None]:
# Visualise the sentiment confusion matrix as a heatmap.
sns.heatmap(sentiments_cf_matrix)

# 2.3.5 TOP DECISION TREE

In [None]:
from sklearn import tree
from sklearn import preprocessing
from sklearn.tree import export_graphviz

In [None]:
# Re-fit the shared vectorizer on the training posts and build the count matrix for the decision tree.
X_train_dt = vectorizer.fit_transform(X_train)

In [None]:
# Search space for the decision-tree grid search:
# 2 criteria x 2 depths x 3 split sizes = 12 candidate settings.
hyper_parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 5],
    'min_samples_split': [4, 5, 6],
}

In [None]:
# Exhaustive search over hyper_parameters for a decision tree, scored with 3-fold cross-validation.
top_dt = GridSearchCV(tree.DecisionTreeClassifier(), hyper_parameters, cv=3)

In [None]:
# Training labels: column 0 of y_train is the emotion, column 1 the sentiment.
emotions = y_train[:, 0]
sentiments = y_train[:, 1]

# fit() returns the search object itself, so fitting top_dt twice would leave
# both names pointing at ONE search fitted on whichever target came last.
# The emotions search is therefore an independent copy with the same
# configuration; top_dt itself is fitted on the sentiments, as before.
dt_emotions = clone(top_dt).fit(X_train_dt, emotions)
dt_sentiments = top_dt.fit(X_train_dt, sentiments)

In [None]:
# Score each target with the search that was fitted for it (dt_emotions /
# dt_sentiments) rather than through the shared top_dt, which only reflects
# its most recent fit. Scores are computed on the training matrix.
score_dt_emotions = dt_emotions.score(X_train_dt, emotions)
score_dt_sentiments = dt_sentiments.score(X_train_dt, sentiments)
print("emotions score: ", score_dt_emotions)
print("sentiments score: ", score_dt_sentiments)

In [None]:
# Show the best decision tree configuration selected by each grid search.
print(dt_emotions.best_estimator_)
print(dt_sentiments.best_estimator_)

## Testing Top Decision Tree

In [None]:
# transform only (no fit): encode the test posts with the vocabulary already learned by the vectorizer.
X_test_dt = vectorizer.transform(X_test)

In [None]:
# Predict both targets for the vectorized test posts (overwrites the MNB predictions of the same name).
predicted_emotions = dt_emotions.predict(X_test_dt)
predicted_sentiments = dt_sentiments.predict(X_test_dt)

In [None]:
# Ground-truth test labels: emotion in column 0, sentiment in column 1.
test_emotions, test_sentiments = y_test[:, 0], y_test[:, 1]

## 2.4.5 Top Decision Tree Report

### Report for emotions

In [None]:
# Per-class precision / recall / F1 for the decision tree's emotion predictions on the test set.
print(classification_report(test_emotions, predicted_emotions))

In [None]:
# Confusion matrix for emotions: true labels are passed first, predictions second.
emotions_cf_matrix = confusion_matrix(test_emotions, predicted_emotions)

In [None]:
# Visualise the emotion confusion matrix as a heatmap.
sns.heatmap(emotions_cf_matrix)

### Report for sentiments

In [None]:
# Per-class precision / recall / F1 for the decision tree's sentiment predictions on the test set.
print(classification_report(test_sentiments, predicted_sentiments))

In [None]:
# Confusion matrix for sentiments: true labels are passed first, predictions second.
sentiments_cf_matrix = confusion_matrix(test_sentiments, predicted_sentiments)

In [None]:
# Visualise the sentiment confusion matrix as a heatmap.
sns.heatmap(sentiments_cf_matrix)