# Task 2: Words as Features

## Imports

In [1]:
import gzip
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn import tree

## 2.1 Extracting tokens/words and their frequencies

In [2]:
# Parse the gzip-compressed JSON dataset directly from the archive stream
with gzip.open("dataset/goemotions.json.gz", "rb") as archive:
    data = json.load(archive)

dataset = np.array(data)

# Extracting posts (column 0), emotions (column 1) and sentiments (column 2)
posts, emotions, sentiments = dataset[:, 0], dataset[:, 1], dataset[:, 2]

# Learn the vocabulary and build the document-term matrix in a single pass
vectorizer = CountVectorizer()
posts_features = vectorizer.fit_transform(posts)

# Printing the number of tokens (size of the vocabulary)
vocabulary_size = len(vectorizer.get_feature_names_out())
print("Number of unique tokens (size of vocabulary) in the dataset:", vocabulary_size)

Number of unique tokens (size of vocabulary) in the dataset: 30449


## 2.2 Split the dataset into 80% for training and 20% for testing

In [3]:
# Splitting the post features, emotions and sentiments in one call so the rows stay aligned
# (80% training / 20% testing, fixed seed)
(x_train, x_test,
 y_train_emo, y_test_emo,
 y_train_sen, y_test_sen) = train_test_split(
    posts_features,
    emotions,
    sentiments,
    test_size=0.2,
    random_state=0,
)

## 2.4 Saving performance for each classifier and classification task (Emotions or Sentiments)

In [4]:
# Function to save output to text files
def save_output(model_name, classification_task, y_test, y_pred, fileName, hyper_parameters=None):
    """Append one model's evaluation (header, confusion matrix, report) to a text file.

    Args:
        model_name: Display name of the model, written in the section header.
        classification_task: Name of the task being evaluated (the notebook passes
            'Emotions' or 'Sentiments').
        y_test: True labels of the test samples.
        y_pred: Labels predicted by the model for the same samples.
        fileName: Path of the text file. It is opened in append mode, so every call
            adds a new section after whatever the file already contains.
        hyper_parameters: Optional description of the hyper parameters; its str()
            form is written in the header (defaults to None).
    """
    with open(fileName, 'a') as report_file:
        # Model name + hyper parameters + classification task, framed by two rules
        banner = '===========================================================\n'
        heading = f'{model_name} for --> {classification_task}\n(Hyper Parameters: {hyper_parameters!s})\n'
        report_file.write(banner + heading + banner)

        # Confusion matrix
        matrix_text = str(confusion_matrix(y_test, y_pred))
        report_file.write(f'\nConfusion Matrix:\n-----------------\n\n{matrix_text}\n')

        # Classification report
        report_text = classification_report(y_test, y_pred)
        report_file.write(f'\nClassification Report:\n----------------------\n\n{report_text}\n\n')

## 2.3 Training and testing with classifiers

### 2.3.1 Base Multinomial Naive Bayes (Base-MNB) with default parameters

#### Function for training and testing with Base MNB

In [5]:
def BaseMNBClassifier(feature, label, x_test, y_test, classification_task, fileName):
    """Train a Multinomial Naive Bayes model with default parameters and log its test results.

    Args:
        feature: Training feature matrix.
        label: Training labels for the chosen task.
        x_test: Test feature matrix.
        y_test: True labels of the test samples.
        classification_task: Name of the task, written in the report header.
        fileName: Text file that save_output appends the report to.
    """
    classifier = MultinomialNB()
    classifier.fit(feature, label)
    predictions = classifier.predict(x_test)
    save_output(
        model_name='Multinomial Naive Bayes Model',
        classification_task=classification_task,
        y_test=y_test,
        y_pred=predictions,
        fileName=fileName,
    )

#### Training and testing with Base MNB

In [None]:
# Classification Task: Emotions
BaseMNBClassifier(feature=x_train, label=y_train_emo, x_test=x_test, y_test=y_test_emo,
                  classification_task='Emotions', fileName='performance.txt')

# Classification Task: Sentiments
BaseMNBClassifier(feature=x_train, label=y_train_sen, x_test=x_test, y_test=y_test_sen,
                  classification_task='Sentiments', fileName='performance.txt')

### 2.3.2 Base Decision Tree (Base-DT) with default parameters

#### Function for training and testing with Base DT

In [6]:
def BaseDTClassifier(feature, label, x_test, y_test, classification_task, fileName):
    """Train a Decision Tree with default parameters and log its test results.

    Args:
        feature: Training feature matrix.
        label: Training labels for the chosen task.
        x_test: Test feature matrix.
        y_test: True labels of the test samples.
        classification_task: Name of the task, written in the report header.
        fileName: Text file that save_output appends the report to.
    """
    classifier = tree.DecisionTreeClassifier()
    classifier.fit(feature, label)
    predictions = classifier.predict(x_test)
    save_output(
        model_name='Base Decision Tree Model',
        classification_task=classification_task,
        y_test=y_test,
        y_pred=predictions,
        fileName=fileName,
    )

#### Training and testing with Base DT

In [None]:
# Classification Task: Emotions
BaseDTClassifier(feature=x_train, label=y_train_emo, x_test=x_test, y_test=y_test_emo,
                 classification_task='Emotions', fileName='performance.txt')

# Classification Task: Sentiments
BaseDTClassifier(feature=x_train, label=y_train_sen, x_test=x_test, y_test=y_test_sen,
                 classification_task='Sentiments', fileName='performance.txt')

### 2.3.3 Base Multi-Layered Perceptron (Base-MLP) with default parameters

#### Function for training and testing with Base MLP

In [7]:
# The assignment asks for default parameters, but on a dataset this large the MLP takes too long to train.
# Hence the maximum number of iterations is capped at 2. (Takes about 2 minutes to train for each classification task)
def BaseMLPClassifier(feature, label, x_test, y_test, classification_task, fileName):
    """Train a Multi-Layered Perceptron (max_iter=2, other parameters default) and log its test results.

    Args:
        feature: Training feature matrix.
        label: Training labels for the chosen task.
        x_test: Test feature matrix.
        y_test: True labels of the test samples.
        classification_task: Name of the task, written in the report header.
        fileName: Text file that save_output appends the report to.
    """
    classifier = MLPClassifier(max_iter=2)
    classifier.fit(feature, label)
    predictions = classifier.predict(x_test)
    save_output(
        model_name='Base Multi-Layered Perceptron Model',
        classification_task=classification_task,
        y_test=y_test,
        y_pred=predictions,
        fileName=fileName,
    )

#### Training and testing with Base MLP

In [None]:
# Classification Task: Emotions
BaseMLPClassifier(feature=x_train, label=y_train_emo, x_test=x_test, y_test=y_test_emo,
                  classification_task='Emotions', fileName='performance.txt')

# Classification Task: Sentiments
BaseMLPClassifier(feature=x_train, label=y_train_sen, x_test=x_test, y_test=y_test_sen,
                  classification_task='Sentiments', fileName='performance.txt')

### 2.3.4 Top Multinomial Naive Bayes (Top-MNB)

#### Function for training and testing with Top MNB

In [8]:
# Hyper parameters for MNB
# NOTE(review): alpha=0 means no smoothing; how scikit-learn treats it (clipped to a tiny
# positive value vs. used as-is) appears to depend on the installed version — confirm.
MNBparams = {'alpha': (0, 0.5, 1.0, 1.5)}

def TopMNBClassifier(feature, label, x_test, y_test, classification_task, fileName):
    """Grid-search Multinomial Naive Bayes over MNBparams and log the selected model's test results.

    The report header records the hyper parameters the search actually selected
    (GridSearchCV.best_params_) rather than the whole candidate grid, so the log
    shows which configuration produced the reported metrics.

    Args:
        feature: Training feature matrix.
        label: Training labels for the chosen task.
        x_test: Test feature matrix.
        y_test: True labels of the test samples.
        classification_task: Name of the task, written in the report header.
        fileName: Text file that save_output appends the report to.
    """
    search = GridSearchCV(estimator=MultinomialNB(), param_grid=MNBparams)
    search.fit(feature, label)
    y_pred = search.predict(x_test)
    # best_params_ only exists after fit(); it is the winning combination from MNBparams.
    save_output('Top Multinomial Naive Bayes Model', classification_task, y_test, y_pred, fileName,
                search.best_params_)

#### Training and testing with Top MNB

In [None]:
# Classification Task: Emotions
TopMNBClassifier(feature=x_train, label=y_train_emo, x_test=x_test, y_test=y_test_emo,
                 classification_task='Emotions', fileName='performance.txt')

# Classification Task: Sentiments
TopMNBClassifier(feature=x_train, label=y_train_sen, x_test=x_test, y_test=y_test_sen,
                 classification_task='Sentiments', fileName='performance.txt')

### 2.3.5 Top Decision Tree (Top-DT)

#### Function for training and testing with Top DT

In [9]:
# Hyper parameters for Top DT
DTparams = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 10],
    'min_samples_split': [2, 5, 10],
}

# Function for training and testing with Top DT
def TopDTClassifier(feature, label, x_test, y_test, classification_task, fileName):
    """Grid-search a Decision Tree over DTparams and log the selected model's test results.

    The report header records the hyper parameters the search actually selected
    (GridSearchCV.best_params_) rather than the whole candidate grid, so the log
    shows which configuration produced the reported metrics.

    Args:
        feature: Training feature matrix.
        label: Training labels for the chosen task.
        x_test: Test feature matrix.
        y_test: True labels of the test samples.
        classification_task: Name of the task, written in the report header.
        fileName: Text file that save_output appends the report to.
    """
    search = GridSearchCV(estimator=tree.DecisionTreeClassifier(), param_grid=DTparams)
    search.fit(feature, label)
    y_pred = search.predict(x_test)
    # best_params_ only exists after fit(); it is the winning combination from DTparams.
    save_output('Top Decision Tree Model', classification_task, y_test, y_pred, fileName,
                search.best_params_)

#### Training and testing with Top DT

In [None]:
# Classification Task: Emotions
TopDTClassifier(feature=x_train, label=y_train_emo, x_test=x_test, y_test=y_test_emo,
                classification_task='Emotions', fileName='performance.txt')

# Classification Task: Sentiments
TopDTClassifier(feature=x_train, label=y_train_sen, x_test=x_test, y_test=y_test_sen,
                classification_task='Sentiments', fileName='performance.txt')

### 2.3.6 Top Multi-Layered Perceptron (Top-MLP)

#### Function for training and testing with Top MLP

In [10]:
# Hyper parameters for Top-MLP
MLPparams = {'activation': ('logistic', 'tanh', 'relu', 'identity'),
            'hidden_layer_sizes': ((120,), (30, 30)),
            'solver': ('adam', 'sgd')}

# Because the dataset is too large, the MLP model takes too long to train. Hence, the maximum number of
# iterations is set to 2. (Takes about 2 minutes to train for each combination of parameters and classification task)
def TopMLPClassifier(feature, label, x_test, y_test, classification_task, fileName):
    """Grid-search a Multi-Layered Perceptron over MLPparams and log the selected model's test results.

    Every candidate is trained with max_iter=2. The report header records the
    hyper parameters the search actually selected (GridSearchCV.best_params_)
    rather than the whole candidate grid, so the log shows which configuration
    produced the reported metrics.

    Args:
        feature: Training feature matrix.
        label: Training labels for the chosen task.
        x_test: Test feature matrix.
        y_test: True labels of the test samples.
        classification_task: Name of the task, written in the report header.
        fileName: Text file that save_output appends the report to.
    """
    search = GridSearchCV(estimator=MLPClassifier(max_iter=2), param_grid=MLPparams)
    search.fit(feature, label)
    y_pred = search.predict(x_test)
    # best_params_ only exists after fit(); it is the winning combination from MLPparams.
    save_output('Top Multi-Layered Perceptron Model', classification_task, y_test, y_pred, fileName,
                search.best_params_)

#### Training and testing with Top MLP

In [None]:
# Classification Task: Emotions
TopMLPClassifier(feature=x_train, label=y_train_emo, x_test=x_test, y_test=y_test_emo,
                 classification_task='Emotions', fileName='performance.txt')

# Classification Task: Sentiments
TopMLPClassifier(feature=x_train, label=y_train_sen, x_test=x_test, y_test=y_test_sen,
                 classification_task='Sentiments', fileName='performance.txt')

## 2.5 Exploration with stop words removal

### Removing stop words with parameter in CountVectorizer

In [None]:
# Vectorizer configured with scikit-learn's built-in English stop-word list
stopWordsVectorizer = CountVectorizer(stop_words='english')
# Creates a document-term matrix from the same posts
postsWithoutStopWords = stopWordsVectorizer.fit_transform(raw_documents=posts)

### Splitting dataset without stop words into training (80%) and testing (20%) 

In [None]:
# Splitting the stop-word-free post features, emotions and sentiments in one call so the rows
# stay aligned (80% training / 20% testing, fixed seed)
(x_train_stop, x_test_stop,
 y_train_emo_stop, y_test_emo_stop,
 y_train_sen_stop, y_test_sen_stop) = train_test_split(
    postsWithoutStopWords,
    emotions,
    sentiments,
    test_size=0.2,
    random_state=0,
)

### Training and testing new dataset with all models from 2.3

#### Model: Base MNB

In [None]:
# Classification Task: Emotions
BaseMNBClassifier(feature=x_train_stop, label=y_train_emo_stop, x_test=x_test_stop, y_test=y_test_emo_stop,
                  classification_task='Emotions', fileName='experiment.txt')

# Classification Task: Sentiments
BaseMNBClassifier(feature=x_train_stop, label=y_train_sen_stop, x_test=x_test_stop, y_test=y_test_sen_stop,
                  classification_task='Sentiments', fileName='experiment.txt')

#### Model: Base DT

In [None]:
# Classification Task: Emotions
BaseDTClassifier(feature=x_train_stop, label=y_train_emo_stop, x_test=x_test_stop, y_test=y_test_emo_stop,
                 classification_task='Emotions', fileName='experiment.txt')

# Classification Task: Sentiments
BaseDTClassifier(feature=x_train_stop, label=y_train_sen_stop, x_test=x_test_stop, y_test=y_test_sen_stop,
                 classification_task='Sentiments', fileName='experiment.txt')

#### Model: Base MLP

In [None]:
# Classification Task: Emotions
BaseMLPClassifier(feature=x_train_stop, label=y_train_emo_stop, x_test=x_test_stop, y_test=y_test_emo_stop,
                  classification_task='Emotions', fileName='experiment.txt')

# Classification Task: Sentiments
BaseMLPClassifier(feature=x_train_stop, label=y_train_sen_stop, x_test=x_test_stop, y_test=y_test_sen_stop,
                  classification_task='Sentiments', fileName='experiment.txt')

#### Model: Top MNB

In [None]:
# Classification Task: Emotions
TopMNBClassifier(feature=x_train_stop, label=y_train_emo_stop, x_test=x_test_stop, y_test=y_test_emo_stop,
                 classification_task='Emotions', fileName='experiment.txt')

# Classification Task: Sentiments
TopMNBClassifier(feature=x_train_stop, label=y_train_sen_stop, x_test=x_test_stop, y_test=y_test_sen_stop,
                 classification_task='Sentiments', fileName='experiment.txt')

#### Model: Top DT

In [None]:
# Classification Task: Emotions
TopDTClassifier(feature=x_train_stop, label=y_train_emo_stop, x_test=x_test_stop, y_test=y_test_emo_stop,
                classification_task='Emotions', fileName='experiment.txt')

# Classification Task: Sentiments
TopDTClassifier(feature=x_train_stop, label=y_train_sen_stop, x_test=x_test_stop, y_test=y_test_sen_stop,
                classification_task='Sentiments', fileName='experiment.txt')

#### Model: Top MLP

In [None]:
# Classification Task: Emotions
TopMLPClassifier(feature=x_train_stop, label=y_train_emo_stop, x_test=x_test_stop, y_test=y_test_emo_stop,
                 classification_task='Emotions', fileName='experiment.txt')

# Classification Task: Sentiments
TopMLPClassifier(feature=x_train_stop, label=y_train_sen_stop, x_test=x_test_stop, y_test=y_test_sen_stop,
                 classification_task='Sentiments', fileName='experiment.txt')