## 3. Embeddings as Features

In [None]:
import gzip
import json

import gensim.downloader as api
import matplotlib.pyplot as plt  # needed by the plt.show() calls in the MLP helpers
import nltk
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# from sklearn.metrics import plot_confusion_matrix
# from sklearn.svm import SVC

In [None]:
# Pre-trained Word2Vec model trained on the Google News dataset, covering 3 million words and phrases
embeddings = api.load('word2vec-google-news-300')

In [None]:
# The vocab attribute was removed from KeyedVectors in Gensim 4.0.0.
# Use KeyedVectors' .key_to_index dict, .index_to_key list, and the methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.

In [None]:
# Read the gzipped JSON dump and parse it straight from the file handle
with gzip.open("dataset/goemotions.json.gz", "rb") as f:
    data = json.load(f)

# 2-D array view of the records so that columns can be sliced out
dataset = np.array(data)

# Column 0 -> posts, column 1 -> emotions, column 2 -> sentiments
posts, emotions, sentiments = dataset[:, 0], dataset[:, 1], dataset[:, 2]

## 3.7 save output

In [None]:
# Append one evaluation record (header, confusion matrix, classification report) to a text file
def save_output(model_name, classification_task, y_test, y_pred, fileName, hyper_parameters=None):
    """Append the evaluation of one model run to `fileName`.

    Args:
        model_name: Human-readable model name written in the header.
        classification_task: Name of the task written in the header.
        y_test: True labels.
        y_pred: Predicted labels.
        fileName: Path of the text file; opened in append mode.
        hyper_parameters: Optional value written (via str()) in the header.
    """
    separator = '===========================================================\n'

    with open(fileName, 'a') as report_file:
        # Header: model name + classification task + hyper parameters
        header = ''.join([model_name, ' for --> ', classification_task,
                          '\n(Hyper Parameters: ', str(hyper_parameters), ')\n'])
        report_file.writelines([separator, header, separator])

        # Confusion matrix section
        matrix_text = str(confusion_matrix(y_test, y_pred))
        report_file.write('\nConfusion Matrix:\n-----------------\n\n' + matrix_text + '\n')

        # Classification report section
        report_text = classification_report(y_test, y_pred)
        report_file.write('\nClassification Report:\n----------------------\n\n' + report_text + '\n\n')

### 3.2

In [None]:
# Turns every reddit post into its list of tokens, e.g. [['That','game','hurt','.'],...]
def tokenizePosts(posts):
    """Return a list holding the nltk word tokenization of each post, in input order."""
    return [nltk.word_tokenize(text) for text in posts]


## TODO: decide whether the tokens should be cleaned (e.g. no punctuation, symbols, etc.)

In [None]:
# Tokenize every post once; the resulting token lists are what gets split into train/test sets
tokenizedPostsList = tokenizePosts(posts)

In [None]:
# Splitting the tokenized posts, emotions and sentiments into training and testing sets
# (20% held out for testing, fixed random_state so the split is reproducible)
# ex) x_train looks like [['That','game','hurt','.'],...]
x_train, x_test, y_train_emo, y_test_emo, y_train_sen, y_test_sen = train_test_split(tokenizedPostsList, emotions, sentiments, test_size=0.2, random_state=0)

In [None]:
# Total token count over a collection of tokenized posts (e.g. our training set)
def countTokens(x_train):
    """Return the sum of the lengths of all posts in `x_train`."""
    return sum(len(tokens) for tokens in x_train)

In [None]:
# Total number of tokens in the training split
countTokens(x_train)

## 3.3 

In [None]:
# Average the word vectors of each post into a single fixed-size feature vector
def computeAvgPostEmbedding(embeddings, tokenizedPostsList):
    """Return one averaged embedding per post, in the same order as the input.

    Args:
        embeddings: Gensim 4 KeyedVectors-like object exposing `key_to_index`,
            `vector_size` and list indexing (`embeddings[list_of_words]`).
        tokenizedPostsList: Iterable of posts, each a list of string tokens.

    Returns:
        A list with exactly one 1-D vector per input post. Posts with no
        in-vocabulary token get an all-zero vector, so the result stays
        aligned index-for-index with the label arrays. Dropping such posts
        would make the feature and label counts disagree.
    """
    # key_to_index is a dict, so membership is O(1); scanning the
    # index_to_key list would cost O(vocabulary size) for every token.
    vocab = embeddings.key_to_index
    avgPostEmbeddingsList = []

    for post in tokenizedPostsList:
        # Keep only the words that have an embedding
        filteredPost = [w for w in post if w in vocab]
        if filteredPost:
            avgPostEmbeddingsList.append(np.mean(embeddings[filteredPost], axis=0))
        else:
            # No known word: use a neutral zero vector as a placeholder
            avgPostEmbeddingsList.append(np.zeros(embeddings.vector_size, dtype=np.float32))

    return avgPostEmbeddingsList

In [None]:
# Wrap the first tokenized post in a list, because computeAvgPostEmbedding iterates over a list of posts
onePost = [tokenizedPostsList[0]]

In [None]:
# Show the averaged embedding computed for the single sample post
print( "This is an average embedding of one reddit post 'That game hurt.' : \n", computeAvgPostEmbedding(embeddings, onePost))

## 3.4 Computing overall hit rate

In [None]:
# Print the percentage of tokens in a set of posts that have an embedding
def computeHitRate(embeddings, postList, setType):
    """Print the overall hit rate of `postList` against the embedding vocabulary.

    Args:
        embeddings: Gensim 4 KeyedVectors-like object exposing `key_to_index`.
        postList: Iterable of posts, each a list of string tokens.
        setType: Label printed in front of the result (e.g. "Train set").

    The hit rate is (tokens found in the vocabulary) / (all tokens) * 100,
    rounded to two decimals when printed. A post list with no tokens
    reports 0.0 instead of raising ZeroDivisionError.
    """
    # Dict membership is O(1); the index_to_key list would be scanned per token.
    vocab = embeddings.key_to_index

    totalTokens = 0
    hitTokens = 0
    for post in postList:
        totalTokens += len(post)
        hitTokens += sum(1 for w in post if w in vocab)

    hitRate = hitTokens / totalTokens * 100 if totalTokens else 0.0
    print(setType," hit rate: ", round(hitRate,2) , "%")

In [None]:
# Hit rate of the training-set tokens against the loaded `embeddings` vocabulary
computeHitRate(embeddings, x_train, "Train set")

In [None]:
# Hit rate of the test-set tokens against the loaded `embeddings` vocabulary
computeHitRate(embeddings, x_test, "Test set")

## Data prep

In [None]:
# Averaged-embedding features for the training posts (x_train)
trainAvgEmbeddingsList = computeAvgPostEmbedding(embeddings,x_train)

In [None]:
# Averaged-embedding features for the test posts (x_test)
testAvgEmbeddingsList = computeAvgPostEmbedding(embeddings,x_test)

## 3.5 Base-MLP

Base Multi-Layered Perceptron (Base-MLP) with default parameters

In [None]:
# We are asked to use the default parameters, but with a dataset this large the MLP takes too long to train.
# So the only non-default setting is max_iter=2. (Takes about 2 minutes to train for each classification task)
def BaseMLPClassifier(feature, label, x_test, y_test, classification_task, fileName):
    """Train the base MLP, plot its confusion matrix and log its performance.

    Args:
        feature: Training feature vectors.
        label: Training labels.
        x_test: Test feature vectors.
        y_test: True test labels.
        classification_task: Task name written to the output file.
        fileName: Text file that save_output appends the results to.
    """
    # Train, then predict on the held-out set
    model = MLPClassifier(max_iter=2)
    predictions = model.fit(feature, label).predict(x_test)

    # Show the confusion matrix figure
    ConfusionMatrixDisplay.from_predictions(y_test, predictions)
    plt.show()

    # Persist the metrics
    save_output('Base Multi-Layered Perceptron Model', classification_task, y_test, predictions, fileName)

In [None]:
# Classification Task: Emotions (results are appended to googleNews_performance.txt)
BaseMLPClassifier(trainAvgEmbeddingsList, y_train_emo, testAvgEmbeddingsList, y_test_emo, 'Emotions', 'googleNews_performance.txt')

# Classification Task: Sentiments (same features, sentiment labels, same output file)
BaseMLPClassifier(trainAvgEmbeddingsList, y_train_sen, testAvgEmbeddingsList, y_test_sen, 'Sentiments', 'googleNews_performance.txt')

## 3.6 Top-MLP

In [None]:
# Hyper-parameter grid for Top-MLP.
# GridSearchCV requires every value to be a list of candidates, including
# settings with a single candidate. A bare scalar (2, 1.0) or a bare string
# ('constant' in parentheses is still a string, not a tuple) is rejected.
MLPparams = {
    'activation': ['relu', 'logistic'],
    'hidden_layer_sizes': [(80,), (10, 10)],
    'solver': ['adam', 'sgd'],
    'max_iter': [2],
    'learning_rate_init': [1.0],
    'learning_rate': ['constant'],
}

# The dataset is too large for the MLP to train in reasonable time, so the grid caps the number of
# iterations at 2. (Takes about 2 minutes to train for each combination of parameters and classification task)
def TopMLPClassifier(feature, label, x_test, y_test, classification_task, fileName):
    """Grid-search an MLP over MLPparams, plot its confusion matrix and log its performance.

    Args:
        feature: Training feature vectors.
        label: Training labels.
        x_test: Test feature vectors.
        y_test: True test labels.
        classification_task: Task name written to the output file.
        fileName: Text file that save_output appends the results to.
    """
    # Search the grid, then predict on the held-out set
    search = GridSearchCV(estimator=MLPClassifier(), param_grid=MLPparams)
    predictions = search.fit(feature, label).predict(x_test)

    # Show the confusion matrix figure
    ConfusionMatrixDisplay.from_predictions(y_test, predictions)
    plt.show()

    # Persist the metrics together with the searched grid
    save_output('Top Multi-Layered Perceptron Model', classification_task, y_test, predictions, fileName, MLPparams)

In [None]:
# Classification Task: Emotions (results are appended to googleNews_performance.txt)
TopMLPClassifier(trainAvgEmbeddingsList, y_train_emo, testAvgEmbeddingsList, y_test_emo, 'Emotions', 'googleNews_performance.txt')

# Classification Task: Sentiments (same features, sentiment labels, same output file)
TopMLPClassifier(trainAvgEmbeddingsList, y_train_sen, testAvgEmbeddingsList, y_test_sen, 'Sentiments', 'googleNews_performance.txt')

## 3.8 exploration

In [None]:
# Print the names under the 'models' entry of the gensim downloader's info
print(list(api.info()['models'].keys()))

### wiki embeddings

In [None]:
# Load the 'glove-wiki-gigaword-100' pre-trained model (presumably 100-dimensional GloVe vectors, per its name)
wikiEmbeddings = api.load('glove-wiki-gigaword-100')

In [None]:
# x_train as averaged wiki embeddings
trainWiki = computeAvgPostEmbedding(wikiEmbeddings, x_train)

# x_test as averaged wiki embeddings
testWiki = computeAvgPostEmbedding(wikiEmbeddings, x_test)

In [None]:
# we chose top-mlp for our best model.

In [None]:
# Classification Task: Emotions with wiki embeddings.
# Pass the emotion label splits; `label` and `y_test` only exist as parameter names inside TopMLPClassifier.
TopMLPClassifier(trainWiki, y_train_emo, testWiki, y_test_emo, 'Emotions', 'wiki_performance.txt')

In [None]:
# Classification Task: Sentiments with wiki embeddings.
# Pass the sentiment label splits; `label` and `y_test` only exist as parameter names inside TopMLPClassifier.
TopMLPClassifier(trainWiki, y_train_sen, testWiki, y_test_sen, 'Sentiments', 'wiki_performance.txt')

### twitter embeddings

In [None]:
# Load the 'glove-twitter-200' pre-trained model.
# The variable name must be `twitterEmbeddings`, which is what the feature-building cell reads.
twitterEmbeddings = api.load('glove-twitter-200')

In [None]:
# x_train as averaged twitter embeddings
trainTwitter = computeAvgPostEmbedding(twitterEmbeddings, x_train)

# x_test as averaged twitter embeddings
testTwitter = computeAvgPostEmbedding(twitterEmbeddings, x_test)

In [None]:
# we chose top-mlp for our best model.

In [None]:
# Classification Task: Emotions with twitter embeddings.
# Pass the emotion label splits; `label` and `y_test` only exist as parameter names inside TopMLPClassifier.
TopMLPClassifier(trainTwitter, y_train_emo, testTwitter, y_test_emo, 'Emotions', 'twitter_performance.txt')

In [None]:
# Classification Task: Sentiments with twitter embeddings.
# Pass the sentiment label splits; `label` and `y_test` only exist as parameter names inside TopMLPClassifier.
TopMLPClassifier(trainTwitter, y_train_sen, testTwitter, y_test_sen, 'Sentiments', 'twitter_performance.txt')