# Task 2: Words as Features

## Imports

In [1]:
import gzip
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn import tree

## 2.1 Extracting tokens/words and their frequencies

In [2]:
# Load the gzipped GoEmotions JSON dump; json.load reads directly from the
# opened (binary) file handle
with gzip.open("dataset/goemotions.json.gz", "rb") as f:
    data = json.load(f)

dataset = np.array(data)

# Columns 0, 1 and 2 are used as the posts, emotions and sentiments
posts, emotions, sentiments = dataset[:, 0], dataset[:, 1], dataset[:, 2]

# Learn the vocabulary and build the document-term matrix in one step
vectorizer = CountVectorizer()
posts_features = vectorizer.fit_transform(posts)

# Report the number of distinct tokens (size of the vocabulary)
print(
    "Number of unique tokens (size of vocabulary) in the dataset:",
    len(vectorizer.get_feature_names_out()),
)

Number of unique tokens (size of vocabulary) in the dataset: 30449


## 2.2 Split the dataset into 80% for training and 20% for testing

In [None]:
# One shared 80/20 split (fixed seed): the post features, the emotion labels
# and the sentiment labels are all split in the same call
(
    x_train, x_test,
    y_train_emo, y_test_emo,
    y_train_sen, y_test_sen,
) = train_test_split(
    posts_features,
    emotions,
    sentiments,
    test_size=0.2,
    random_state=0,
)

## 2.4 Saving performance for each classifier and classification task (Emotions or Sentiments)

In [None]:
def save_performance(model_name, classification_task, y_test, y_pred, hyper_parameters=None):
    """Append one model's evaluation results to 'performance.txt'.

    The entry consists of a banner (model name, classification task and the
    string form of ``hyper_parameters``), followed by the confusion matrix
    and the classification report computed from ``y_test`` and ``y_pred``.
    The file is opened in append mode, so earlier entries are kept.
    """
    banner_rule = '===========================================================\n'

    with open('performance.txt', 'a') as report_file:
        # Banner: model name + classification task + hyper parameters
        title = ''.join([
            model_name, ' for --> ', classification_task,
            '\n(Hyper Parameters: ', str(hyper_parameters), ')\n',
        ])
        report_file.write(banner_rule + title + banner_rule)

        # Confusion matrix section
        matrix_text = str(confusion_matrix(y_test, y_pred))
        report_file.write(''.join(['\nConfusion Matrix:\n-----------------\n\n', matrix_text, '\n']))

        # Classification report section
        report_text = classification_report(y_test, y_pred)
        report_file.write(''.join(['\nClassification Report:\n----------------------\n\n', report_text, '\n\n']))

## 2.3 Training and testing with classifiers

### 2.3.1 Base Multinomial Naive Bayes (Base-MNB) with default parameters

In [None]:
# Base MNB: train, predict and log the results
def BaseMNBClassifier(feature, label, x_test, y_test, classification_task):
    """Fit a default-parameter MultinomialNB and save its test performance.

    The model is trained on ``feature``/``label``, used to predict
    ``x_test``, and the predictions are passed with ``y_test`` to
    ``save_performance`` under the given ``classification_task`` name.
    """
    predictions = MultinomialNB().fit(feature, label).predict(x_test)
    save_performance(
        'Multinomial Naive Bayes Model',
        classification_task,
        y_test,
        predictions,
    )

# Base MNB on the emotion labels
BaseMNBClassifier(
    feature=x_train,
    label=y_train_emo,
    x_test=x_test,
    y_test=y_test_emo,
    classification_task='Emotions',
)

# Base MNB on the sentiment labels
BaseMNBClassifier(
    feature=x_train,
    label=y_train_sen,
    x_test=x_test,
    y_test=y_test_sen,
    classification_task='Sentiments',
)

### 2.3.2 Base Decision Tree (Base-DT) with default parameters

In [None]:
# Base DT: train, predict and log the results
def BaseDTClassifier(feature, label, x_test, y_test, classification_task):
    """Fit a default-parameter decision tree and save its test performance.

    The tree is trained on ``feature``/``label``, used to predict
    ``x_test``, and the predictions are passed with ``y_test`` to
    ``save_performance`` under the given ``classification_task`` name.
    """
    predictions = tree.DecisionTreeClassifier().fit(feature, label).predict(x_test)
    save_performance(
        'Base Decision Tree Model',
        classification_task,
        y_test,
        predictions,
    )

# Base DT on the emotion labels
BaseDTClassifier(
    feature=x_train,
    label=y_train_emo,
    x_test=x_test,
    y_test=y_test_emo,
    classification_task='Emotions',
)

# Base DT on the sentiment labels
BaseDTClassifier(
    feature=x_train,
    label=y_train_sen,
    x_test=x_test,
    y_test=y_test_sen,
    classification_task='Sentiments',
)

### 2.3.3 Base Multi-Layered Perceptron (Base-MLP) with default parameters

### 2.3.4 Top Multinomial Naive Bayes (Top-MNB)

In [None]:
# Hyper parameter grid searched for Top MNB
# NOTE(review): alpha=0 asks for no smoothing; how scikit-learn treats it
# (warning / clipping) appears to depend on the installed version - confirm
# this value is intended.
MNBparams = {'alpha': (0, 0.5, 1.0, 1.5)}

# Function for training and testing with Top MNB
def TopMNBClassifier(feature, label, x_test, y_test, classification_task):
    """Grid-search MultinomialNB over MNBparams and save its test performance.

    Args:
        feature: Training feature matrix.
        label: Training labels for the chosen classification task.
        x_test: Test feature matrix to predict on.
        y_test: True labels for ``x_test``.
        classification_task: Task name written to the performance log.

    The performance entry records the hyper parameter combination selected
    by the grid search (``best_params_``), not the whole search grid, so the
    log states which model actually produced the reported metrics.
    """
    TopMNB = GridSearchCV(estimator=MultinomialNB(), param_grid=MNBparams)
    TopMNB.fit(feature, label)
    # With the default refit=True, predict() uses the best estimator found
    y_pred = TopMNB.predict(x_test)
    save_performance('Top Multinomial Naive Bayes Model', classification_task, y_test, y_pred, TopMNB.best_params_)
    
# Top MNB on the emotion labels
TopMNBClassifier(
    feature=x_train,
    label=y_train_emo,
    x_test=x_test,
    y_test=y_test_emo,
    classification_task='Emotions',
)

# Top MNB on the sentiment labels
TopMNBClassifier(
    feature=x_train,
    label=y_train_sen,
    x_test=x_test,
    y_test=y_test_sen,
    classification_task='Sentiments',
)

### 2.3.5 Top Decision Tree (Top-DT)

In [None]:
# Hyper parameter grid searched for Top DT
DTparams = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 10],
    'min_samples_split': [2, 5, 10],
}

# Function for training and testing with Top DT
def TopDTClassifier(feature, label, x_test, y_test, classification_task):
    """Grid-search a decision tree over DTparams and save its test performance.

    Args:
        feature: Training feature matrix.
        label: Training labels for the chosen classification task.
        x_test: Test feature matrix to predict on.
        y_test: True labels for ``x_test``.
        classification_task: Task name written to the performance log.

    The performance entry records the hyper parameter combination selected
    by the grid search (``best_params_``), not the whole search grid, so the
    log states which model actually produced the reported metrics.
    """
    TopDT = GridSearchCV(estimator=tree.DecisionTreeClassifier(), param_grid=DTparams)
    TopDT.fit(feature, label)
    # With the default refit=True, predict() uses the best estimator found
    y_pred = TopDT.predict(x_test)
    save_performance('Top Decision Tree Model', classification_task, y_test, y_pred, TopDT.best_params_)
    
# Top DT on the emotion labels
TopDTClassifier(
    feature=x_train,
    label=y_train_emo,
    x_test=x_test,
    y_test=y_test_emo,
    classification_task='Emotions',
)

# Top DT on the sentiment labels
TopDTClassifier(
    feature=x_train,
    label=y_train_sen,
    x_test=x_test,
    y_test=y_test_sen,
    classification_task='Sentiments',
)

### 2.3.6 Top Multi-Layered Perceptron (Top-MLP)