# **Pre-requisite steps**

In [None]:
import gzip
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from google.colab import drive
from sklearn.naive_bayes import MultinomialNB

In [None]:
# For Colab
# Mounts Google Drive at /content/drive so the dataset path used by the loading
# cell resolves. Skip this cell when running outside Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the dataset from the mounted Drive.
# The encoding is given explicitly: JSON text is UTF-8, and relying on the
# platform default can mis-decode non-ASCII characters on other machines.
with open('/content/drive/MyDrive/goemotions.json', 'r', encoding='utf-8') as f:
  json_file = json.load(f)

# -- For local --
# (context manager so the gzip handle is closed once the data is parsed)
# with gzip.open('goemotions.json.gz', 'rt', encoding='utf-8') as f:
#   json_file = json.load(f)

In [None]:
# Promote the parsed JSON rows to an array so whole columns can be sliced out.
json_file = np.asarray(json_file)

# Independent copies of the first three columns, in order:
# column 0 -> posts, column 1 -> emotions, column 2 -> sentiments.
posts, emotions, sentiments = (json_file[:, column].copy() for column in range(3))

Define helpers

In [None]:
def train_and_predict(classifier, X_train, Y_Train, X_test):
  '''
  Fit a classifier on the training data, then label the test inputs.

  classifier: object exposing fit(X, y) and predict(X); the return value of
              fit() is ignored, so the object is expected to train in place
  X_train: training inputs
  Y_Train: training targets
  X_test: inputs to label once training has finished
  return whatever classifier.predict(X_test) produces
  '''
  # Training comes first; predict() below relies on the fitted state.
  classifier.fit(X_train, Y_Train)

  predictions = classifier.predict(X_test)
  return predictions

def generate_stats(Y_test, Y_pred, output_filename):
  '''
  Report how well the predictions match the true labels.

  Prints a classification report, draws the confusion matrix, writes the
  figure to output_filename + ".pdf" and finally shows it.

  Y_test: true labels
  Y_pred: predicted labels
  output_filename: target path of the PDF, without the ".pdf" extension
  '''
  # Text report
  report = classification_report(Y_test, Y_pred)
  print(report)

  # Confusion matrix. Note: the figure size is set through rcParams, so it
  # stays in effect for any figure created after this call as well.
  plt.rcParams['figure.figsize'] = [20, 20]
  ConfusionMatrixDisplay.from_predictions(Y_test, Y_pred)

  # Save before show(), while the figure is still the current one.
  pdf_path = output_filename + ".pdf"
  plt.savefig(pdf_path, format="pdf")
  plt.show()

# **TOP MNB and Base-DT With stopwords**

In [None]:
# 2.1. Vectorizer
# CountVectorizer with default settings, fitted on the full `posts` array
# (i.e. before the train/test split). The fit_transform result is not stored;
# it is the last expression, so the notebook displays it as the cell output.
vectorizer = CountVectorizer()
vectorizer.fit_transform(posts)

In [None]:
# 2.2 Splitting the dataset
# 80/20 train/test split over whole rows, so every post stays aligned with its
# emotion and sentiment labels. The fixed random_state makes the split -- and
# therefore every metric reported below -- reproducible across kernel restarts.
SPLIT_SEED = 42
training_set, test_set = train_test_split(json_file, train_size=0.8, random_state=SPLIT_SEED)

# Same column layout as the full dataset:
# column 0 -> posts, column 1 -> emotions, column 2 -> sentiments.
columns = ("posts", "emotions", "sentiments")

# training dict / test dict: one entry per column, keyed by column name
training = {name: training_set[:, index] for index, name in enumerate(columns)}
test = {name: test_set[:, index] for index, name in enumerate(columns)}

In [None]:
# Vectorized posts: encode both splits with the vocabulary held by `vectorizer`
vectorized_training_posts, vectorized_test_posts = (
    vectorizer.transform(split["posts"]) for split in (training, test)
)

## **TOP-MNB**

**Initiate the classifier**

In [None]:
# Grid search over the MultinomialNB `alpha` values listed below.
parameters = {"alpha": [0.5, 0, 0.25, 0.75]}
classifier = GridSearchCV(MultinomialNB(), parameters)
# NOTE: `best_estimator_` only exists once the search has been fitted; reading
# it on this unfitted object raises AttributeError and stops "Run All".
# Inspect `classifier.best_estimator_` after training instead.

### **Classify Emotion**

In [None]:
# train and predict
Y_pred = train_and_predict(classifier, vectorized_training_posts, training["emotions"], vectorized_test_posts)

# The grid search is fitted at this point, so the selected model (and its
# alpha) can be reported rather than assumed.
print(classifier.best_estimator_)

In [None]:
# Print the report and save the confusion matrix for the emotion predictions
generate_stats(
    Y_test=test["emotions"],
    Y_pred=Y_pred,
    output_filename="top mnb alpha 0.5 emotions",
)

### **Sentiments**


In [None]:
# Fit the same grid-search object on the sentiment labels, then label the test posts
Y_pred = train_and_predict(
    classifier,
    vectorized_training_posts,
    training["sentiments"],
    vectorized_test_posts,
)

In [None]:
# Print the report and save the confusion matrix for the sentiment predictions
generate_stats(
    Y_test=test["sentiments"],
    Y_pred=Y_pred,
    output_filename="top mnb alpha 0.5 sentiments",
)

## **Base DT**

In [None]:
from sklearn import tree

### **Classify emotion**

In [None]:
# Decision tree built with no constructor arguments
dtc = tree.DecisionTreeClassifier()

# Emotion: fit on the training posts, label the test posts, then report
Y_pred = train_and_predict(
    dtc,
    vectorized_training_posts,
    training["emotions"],
    vectorized_test_posts,
)
generate_stats(
    Y_test=test["emotions"],
    Y_pred=Y_pred,
    output_filename="Base DT emotions",
)

### **Classify sentiments**

In [None]:
# Sentiments: a fresh decision tree (no constructor arguments), fitted on the
# training posts, used to label the test posts, then reported
dtc = tree.DecisionTreeClassifier()
Y_pred = train_and_predict(
    dtc,
    vectorized_training_posts,
    training["sentiments"],
    vectorized_test_posts,
)
generate_stats(
    Y_test=test["sentiments"],
    Y_pred=Y_pred,
    output_filename="Base DT sentiments",
)

# **Removing stopwords**

In [None]:
# Vectorizer configured with stop_words="english", fitted on the full `posts`
# array. Only the learned vocabulary is needed here, so fit() is enough.
vectorizer_stp_wrds = CountVectorizer(stop_words="english")
vectorizer_stp_wrds.fit(posts)

# NOTE(review): these two names are the ones used for the with-stopwords
# matrices earlier in the notebook; from this cell on they hold the
# stop-word-filtered encodings.
vectorized_test_posts, vectorized_training_posts = (
    vectorizer_stp_wrds.transform(split['posts']) for split in (test, training)
)

## **TOP MNB**

In [None]:
# Candidate `alpha` values the grid search tries for MultinomialNB
parameters = dict(alpha=[0.5, 0, 0.25, 0.75])
top_mnb_classifier = GridSearchCV(estimator=MultinomialNB(), param_grid=parameters)

### **Top MNB - Emotions**

In [None]:
# Emotions: fit the grid search on the training posts, label the test posts, then report
Y_pred = train_and_predict(
    top_mnb_classifier,
    vectorized_training_posts,
    training['emotions'],
    vectorized_test_posts,
)
generate_stats(
    Y_test=test["emotions"],
    Y_pred=Y_pred,
    output_filename="Top MNB removed stopwords emotions",
)

### **Top MNB - Sentiments**

In [None]:
# Sentiments: fit the grid search on the sentiment labels, label the test posts, then report
Y_pred = train_and_predict(top_mnb_classifier, vectorized_training_posts, training['sentiments'] ,vectorized_test_posts)
# The output name must say "sentiments": with the "emotions" name this PDF
# overwrote the confusion matrix saved by the emotions cell.
generate_stats(test["sentiments"],Y_pred, "Top MNB removed stopwords sentiments")

## **Base DT**

### **Base DT - Emotions**

In [None]:
# Base-DT without stopwords: decision tree built with no constructor arguments
dtc = tree.DecisionTreeClassifier()

# Emotion: fit on the training posts, label the test posts, then report
Y_pred = train_and_predict(
    dtc,
    vectorized_training_posts,
    training['emotions'],
    vectorized_test_posts,
)
generate_stats(
    Y_test=test["emotions"],
    Y_pred=Y_pred,
    output_filename="Base DT removed stopwords emotions",
)

### **Base DT - Sentiments**

In [None]:
# Base-DT without stopwords: a fresh decision tree built with no constructor arguments
dtc = tree.DecisionTreeClassifier()

# Sentiments: fit on the training posts, label the test posts, then report
Y_pred = train_and_predict(
    dtc,
    vectorized_training_posts,
    training['sentiments'],
    vectorized_test_posts,
)
generate_stats(
    Y_test=test["sentiments"],
    Y_pred=Y_pred,
    output_filename="Base DT removed stopwords sentiments",
)