### Data Collection

In [None]:
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

id = '16g474hdNsaNx0_SnoKuqj2BuwSEGdnbt'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('training_data.csv')  

id = '1-7hj0sF3Rc5G6POKdkpbDXm_Q6BWFDPU'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('testing_data.csv')  

import pandas as pd
training_data = pd.read_csv("/content/training_data.csv")
testing_data = pd.read_csv("/content/testing_data.csv")

print("------------------------------------")
print("Size of training dataset: {0}".format(len(training_data)))
print("Size of testing dataset: {0}".format(len(testing_data)))
print("------------------------------------")

print("------------------------------------")
print("Sample Data")
print("LABEL: {0} / SENTENCE: {1}".format(training_data.iloc[-1,0], training_data.iloc[-1,1]))
print("------------------------------------")


In [None]:
# Preview the first rows of the training DataFrame (displayed as the cell's last expression).
training_data.head()

In [None]:
# Split each DataFrame into two parallel Python lists:
# the raw post texts ('posts' column) and their labels ('type' column).
training_posts, training_labels = (
    training_data[column].tolist() for column in ('posts', 'type')
)
testing_posts, testing_labels = (
    testing_data[column].tolist() for column in ('posts', 'type')
)

### URL Removal

In [None]:
import re

def remove_url(text):
    """Return ``text`` with every URL-like substring deleted.

    A match is a run of word characters (the scheme), ``://``, a host made of
    dot-separated labels, and any number of ``/segment`` parts that extend up
    to the next whitespace or slash. Matches are replaced by the empty string,
    so surrounding whitespace is left untouched.
    """
    url_pattern = r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'
    return re.compile(url_pattern).sub('', text)

# Strip URLs from every post in both splits, rebinding the same list names.
training_posts = list(map(remove_url, training_posts))
testing_posts = list(map(remove_url, testing_posts))

In [None]:
"""
You are asked to pre-process the training set by integrating several text pre-processing techniques
 (e.g. tokenisation, removing numbers, converting to lowercase, removing stop words, stemming, etc.).
You should test and justify the reason why you apply the specific preprocessing techniques based on the test result in section
"""
import string
import nltk
from nltk.stem import WordNetLemmatizer

# NLTK does not bundle its data files. Fetch the resources this notebook uses
# (tokenizer models, stop-word list, WordNet) so the cell also works on a
# fresh kernel; already-present resources are not downloaded again.
# 'punkt_tab' and 'omw-1.4' are only required by some NLTK releases; asking
# for them on a release that does not know them is reported, not raised.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, quiet=True)

lemmatizer = WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()
# Stored as a set: preprocess() tests every token for membership, and a set
# lookup is constant-time whereas the original list was scanned linearly.
stopwords = set(nltk.corpus.stopwords.words('english'))

def preprocess(text):
    """Normalise one post and return it as a single space-joined string.

    Steps, in order: tokenise with ``nltk.word_tokenize``, lower-case, drop
    stop words, drop punctuation-only tokens, drop all-digit tokens, apply the
    Porter stemmer, then the WordNet lemmatizer.

    Relies on the module-level ``stopwords``, ``stemmer`` and ``lemmatizer``.

    Args:
        text: the raw post text.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    # Tokenize the text and lower-case once up front
    tokens = [token.lower() for token in nltk.word_tokenize(text)]
    # Remove the stop words
    tokens = [token for token in tokens if token not in stopwords]
    # Remove the punctuation. The check is per character: the previous
    # `token not in string.punctuation` was a substring test, so
    # multi-character punctuation tokens such as '...', '--' or the quote
    # tokens "``" / "''" were kept.
    tokens = [
        token for token in tokens
        if not all(char in string.punctuation for char in token)
    ]
    # Remove the numbers
    tokens = [token for token in tokens if not token.isdigit()]
    # Stem the tokens
    tokens = [stemmer.stem(token) for token in tokens]
    # lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

# Run preprocess() over every training post (URLs were stripped in the previous section);
# each element of the result is one space-joined string of normalised tokens.
clean_training_posts = [preprocess(post) for post in training_posts]

In [None]:
"""
In this section, you are to implement three input representation components, including 
1) Word Embedding Construction Module, 
2) Pretrained Word Embedding Module, and 
3) Input Concatenation Module. For training, you are free to choose hyperparameters [Lab2,Lab4,Lab5] 
(e.g. dimension of embeddings, learning rate, epochs, etc.).

First, you are asked to build a word embedding model (for representing word vectors, 
such as word2vec-CBOW, word2vec-Skip gram, fastText, and Glove) 
for the input embedding of your sequence model. 
Note that we used one-hot vectors as inputs for the sequence model in the Lab3 and Lab4.
 You are required to complete the following sections in the format

Preprocess data for word embeddings: You are to use and preprocess MBTI dataset 
(the one provided in the Section 1) for training word embeddings [Lab2]. 
This can be different from the preprocessing technique that you used in Section 1. 
You can use both the training and testing datasets in order to train the word embedding.
"""

# Preprocess data for word embeddings
# `clean_training_posts` already exists: the preprocessing section computed it
# with this exact preprocess() call on the same `training_posts`, so the
# expensive pass over the training set is not repeated here. Only the testing
# split still needs to be cleaned.
clean_testing_posts = [preprocess(post) for post in testing_posts]

"""
 You are to build a training model for word embeddings. 
 You are required to articulate the hyperparameters you choose (dimension of embeddings and window size) in the report.
 Note that any word embeddings model (e.g. word2vec-CBOW, word2vec-Skip gram, fasttext, glove) can be applied.
"""

# Build the training model for word embeddings
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

# Hyperparameters
embedding_dim = 100  # width of each word vector
window_size = 5      # number of token ids the embedding model consumes per sample
# Vocabulary size = number of distinct tokens in the cleaned training posts.
# (The previous len(clean_training_posts) counted posts, not words.)
vocab_size = len({token for post in clean_training_posts for token in post.split()})


# Build the word embedding training model
class TrainingModelForWordEmbeddings(nn.Module):
    """Window-based word-embedding model.

    Embeds ``window_size`` token ids, concatenates the embeddings, and maps
    them through one linear layer to log-probabilities over the vocabulary.

    Args:
        embedding_dim: width of each word vector.
        window_size: number of token ids per sample.
        vocab_size: number of distinct token ids. Optional for backward
            compatibility: when omitted, the module-level ``vocab_size`` is
            used, as the class always did.
    """
    def __init__(self, embedding_dim, window_size, vocab_size=None):
        super(TrainingModelForWordEmbeddings, self).__init__()
        if vocab_size is None:
            # Fall back to the notebook-level hyperparameter. The parameter
            # shadows the global name, hence the explicit globals() lookup.
            vocab_size = globals()['vocab_size']
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim * window_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map integer ids of shape (batch, window_size) to (batch, vocab_size) log-probabilities."""
        x = self.embedding(x)
        # Concatenate the window's embeddings into one row per sample.
        x = x.view(-1, self.embedding_dim * self.window_size)
        x = self.linear(x)
        x = self.softmax(x)
        return x

# Instantiate the word-embedding training model with the hyperparameters defined above
# (the class reads the module-level `vocab_size` for its layer sizes).
model = TrainingModelForWordEmbeddings(embedding_dim, window_size)

In [None]:
"""
You are asked to extract and apply the pretrained word embedding. Gensim provides several pretrained word embeddings, 
you can find those in the gensim github. You can select the pretrained word embedding that would be useful for the assignment 1 task,
 personality type classification

"""
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# extract and apply the pretrained word embedding
import gensim.downloader as api
# select the pretrained word embedding that would be useful for the assignment 1 task, personality type classification
# Keep the gensim object under its own name so its vocabulary stays available
# for mapping words to rows of the matrix.
pretrained_vectors = api.load('glove-twitter-25')
pretrained_embedding = torch.from_numpy(pretrained_vectors.vectors).float().to(device)

# Install the pretrained table as a frozen embedding layer.
# Previously only `model.embedding.weight` was replaced, leaving
# `model.embedding_dim` and `model.linear` sized for the old embedding width,
# so forward() reshaped the looked-up vectors with the wrong row length.
# The dependent attributes are now rebuilt from the pretrained matrix's width.
# NOTE(review): this replaces rather than concatenates embeddings, and the ids
# fed to the model must index rows of the pretrained matrix — confirm intent.
model.embedding = nn.Embedding.from_pretrained(pretrained_embedding, freeze=True)
model.embedding_dim = model.embedding.embedding_dim
model.linear = nn.Linear(model.embedding_dim * model.window_size, model.linear.out_features)
model.to(device)

In [None]:

# Build the training model for word embeddings
class Model(nn.Module):
    """Window-based embedding model: embed, flatten the window, project, log-softmax.

    Args:
        embedding_dim: width of each word vector.
        window_size: number of token ids per sample.
        vocab_size: size of the embedding table and of the output layer.
    """

    def __init__(self, embedding_dim, window_size, vocab_size):
        super().__init__()
        self.embedding_dim, self.window_size = embedding_dim, window_size
        flattened_width = embedding_dim * window_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(flattened_width, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-probabilities with one row per ``window_size`` input ids."""
        row_width = self.embedding_dim * self.window_size
        flattened = self.embedding(x).view(-1, row_width)
        return self.softmax(self.linear(flattened))

# Rebind `model` to a fresh Model instance and move it to the selected device.
model = Model(embedding_dim, window_size, vocab_size)
model.to(device)

# save the model
# NOTE(review): no training happens between construction and this call, so the
# file holds the randomly initialised weights — presumably the save was meant
# to run after training; confirm.
torch.save(model.state_dict(), 'model.pth')

In [None]:
# Encode the cleaned training posts as lists of integer token ids.
# Each cleaned post is a space-joined string, so it is split back into tokens
# first. The previous version built the vocabulary from whole posts and then
# iterated over the characters of each post, so the id lookup failed with
# KeyError.
# NOTE(review): the sequences are variable-length; padding is not applied here.
tokenized_training_posts = [post.split() for post in clean_training_posts]

# get vocab: every distinct token, sorted so the ids are reproducible
vocab = sorted({token for tokens in tokenized_training_posts for token in tokens})

word_to_id = {word: i for i, word in enumerate(vocab)}
encoded_training_posts = [
    [word_to_id[token] for token in tokens]
    for tokens in tokenized_training_posts
]


In [None]:
# train the model
# Full-batch training: every epoch runs one forward/backward pass over the whole input.
# NOTE(review): `clean_training_posts` holds the space-joined strings returned by
# preprocess(), so torch.tensor(clean_training_posts) cannot build a numeric tensor —
# presumably the integer-encoded posts, cut into rows of `window_size` ids, were
# intended; confirm.
# NOTE(review): the targets come from the 'type' column; if those are string labels
# they need an integer encoding before torch.tensor(...).long() — TODO confirm.
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_function = nn.NLLLoss()  # expects log-probabilities, which the model's LogSoftmax provides
epochs = 10
for epoch in range(epochs):
    optimizer.zero_grad()
    output = model(torch.tensor(clean_training_posts).long().to(device))
    loss = loss_function(output, torch.tensor(training_labels).long().to(device))
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch + 1}/{epochs} Loss: {loss.item()}')
print('Training completed')

In [None]:
# Build Sequence Model (Bi-directional model)
class SequenceModel(nn.Module):
    """Bi-directional LSTM classifier over sequences of token ids.

    Token ids are embedded, run through a single-layer bidirectional LSTM, and
    the final hidden states of both directions are concatenated and projected
    to log-probabilities.

    Args:
        embedding_dim: width of each word vector, also used as the LSTM hidden size.
        window_size: stored on the instance for compatibility; the forward
            pass accepts sequences of any length.
        vocab_size: number of distinct token ids. When omitted, the
            module-level ``vocab_size`` is used, as the class always did.
        num_classes: width of the output layer. When omitted it equals
            ``vocab_size``, which preserves the original output width.
    """
    def __init__(self, embedding_dim, window_size, vocab_size=None, num_classes=None):
        super(SequenceModel, self).__init__()
        if vocab_size is None:
            # The parameter shadows the notebook-level name, hence globals().
            vocab_size = globals()['vocab_size']
        if num_classes is None:
            num_classes = vocab_size
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True so the LSTM consumes (batch, seq_len, embedding_dim).
        self.lstm = nn.LSTM(embedding_dim, embedding_dim, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(embedding_dim * 2, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map ids of shape (batch, seq_len) — or a single 1-D sequence — to (batch, num_classes) log-probabilities."""
        if x.dim() == 1:
            # A lone sequence is treated as a batch of one.
            x = x.unsqueeze(0)
        embedded = self.embedding(x)
        # The previous version flattened the embeddings to
        # (-1, embedding_dim * window_size) before the LSTM; its last
        # dimension then no longer matched the LSTM's input_size.
        _, (hidden, _) = self.lstm(embedded)
        # `hidden` is (2, batch, embedding_dim): final forward state, final backward state.
        summary = torch.cat((hidden[0], hidden[1]), dim=1)
        return self.softmax(self.linear(summary))

# Build the sequence model
# This rebinds `model`: the object previously bound to that name is no longer reachable through it.
model = SequenceModel(embedding_dim, window_size)






In [None]:
# Train Sequence Model (Bi-directional model)
# Mini-batch training over consecutive slices of the training lists (no shuffling).
# Hyperparameters
learning_rate = 0.001
batch_size = 32
num_epochs = 10

# Define the loss function
# NOTE(review): CrossEntropyLoss applies log-softmax itself, while the model's last layer
# is already a LogSoftmax — redundant rather than wrong; consider NLLLoss. Confirm.
criterion = nn.CrossEntropyLoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
# NOTE(review): `clean_training_posts` contains preprocessed strings, so
# torch.tensor(batch_inputs) cannot build a numeric tensor — presumably integer-encoded,
# equal-length sequences were intended; confirm. The same applies to `training_labels`
# if the 'type' column holds strings — TODO confirm.
for epoch in range(num_epochs):
    for i in range(0, len(clean_training_posts), batch_size):
        batch_inputs = clean_training_posts[i:i+batch_size]
        batch_labels = training_labels[i:i+batch_size]
        batch_inputs = torch.tensor(batch_inputs)
        batch_labels = torch.tensor(batch_labels)
        batch_inputs = batch_inputs.long()
        batch_labels = batch_labels.long()
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
    # Reports the loss of the last mini-batch of the epoch only.
    print('Epoch: {}, Loss: {}'.format(epoch, loss.item()))

    
    

In [None]:
"""
You are to apply Semantic-Syntactic word relationship tests for the trained word embeddings and visualise the 
result of Semantic-Syntactic word relationship tests.
"""
# test the trained word embedding
def test_word_embedding(model, test_posts, test_labels):
    test_posts = torch.tensor(test_posts)
    test_labels = torch.tensor(test_labels)
    test_posts = test_posts.long()
    test_labels = test_labels.long()
    outputs = model(test_posts)
    _, predicted = torch.max(outputs.data, 1)
    correct = (predicted == test_labels).sum().item()
    return correct / len(test_labels)

# Report the model's classification accuracy on the testing split.
# NOTE(review): `clean_testing_posts` holds preprocessed strings, while
# test_word_embedding() converts its inputs to an integer tensor — presumably
# integer-encoded posts and labels were intended here; confirm.
print('Test accuracy: {}'.format(test_word_embedding(model, clean_testing_posts, testing_labels)))

In [None]:
# evaluate with the testing dataset and provide the table with f1 of test set
def f1_score_table(model, test_posts, test_labels):
    """Return the macro-averaged F1 score of ``model`` on the given examples.

    Args:
        model: an ``nn.Module`` mapping integer inputs to per-class scores of
            shape (n_examples, n_classes).
        test_posts: integer-encoded inputs (nested list, array or tensor).
        test_labels: integer class ids, one per example.

    Returns:
        The macro F1 score as computed by ``sklearn.metrics.f1_score``.
    """
    # Imported locally under a distinct name. The previous body assigned to a
    # local called `f1_score` and called `f1_score(...)` in the same statement,
    # which raises UnboundLocalError. A local import also keeps the function
    # working if the global name `f1_score` gets rebound to a number.
    from sklearn.metrics import f1_score as sklearn_f1_score

    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    test_posts = torch.as_tensor(test_posts).long().to(target_device)
    test_labels = torch.as_tensor(test_labels).long()

    # Inference only: eval mode, no autograd graph; restore the mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(test_posts)
    finally:
        model.train(was_training)

    predicted = outputs.argmax(dim=1)
    # scikit-learn works on host arrays, so move both vectors to CPU/NumPy.
    return sklearn_f1_score(
        test_labels.cpu().numpy(), predicted.cpu().numpy(), average='macro'
    )

# Report the F1 score returned by f1_score_table() for the testing split.
print('F1 score: {}'.format(f1_score_table(model, clean_testing_posts, testing_labels)))

In [None]:
# Performance Evaluation with Different Sequence Models
# Build the sequence model
model = SequenceModel(embedding_dim, window_size)
# The optimizer must be built from THIS model's parameters. Reusing the
# optimizer created for the previously trained model would keep stepping the
# old model's weights and leave the new model untrained.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train the model
# NOTE(review): `clean_training_posts` holds preprocessed strings; torch.tensor()
# needs numeric, equal-length rows — presumably encoded posts were intended; confirm.
for epoch in range(num_epochs):
    for i in range(0, len(clean_training_posts), batch_size):
        batch_inputs = clean_training_posts[i:i+batch_size]
        batch_labels = training_labels[i:i+batch_size]
        batch_inputs = torch.tensor(batch_inputs)
        batch_labels = torch.tensor(batch_labels)
        batch_inputs = batch_inputs.long()
        batch_labels = batch_labels.long()
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
    print('Epoch: {}, Loss: {}'.format(epoch, loss.item()))
# test the trained model
test_accuracy = test_word_embedding(model, clean_testing_posts, testing_labels)
print('Test accuracy: {}'.format(test_accuracy))
# evaluate with the testing dataset and provide the table with f1 of test set
# The score is computed once and kept under its own name; binding it to
# `f1_score` would shadow the sklearn function of the same name.
test_f1 = f1_score_table(model, clean_testing_posts, testing_labels)
print('F1 score: {}'.format(test_f1))



In [None]:
# draw a graph of the f1 score of the testing set against the number of epochs
import matplotlib.pyplot as plt
# `num_epochs` is left untouched: rebinding it to a range made the following
# range(num_epochs) call fail and broke any re-run of the training cells.
# f1_score_table() returns a single score for the model as it is now, so the
# figure shows one point at the final epoch.
# NOTE(review): a real F1-vs-epoch curve needs the score recorded after every
# training epoch; plot that list here once it exists.
final_f1 = f1_score_table(model, clean_testing_posts, testing_labels)
plt.plot([num_epochs], [final_f1], marker='o')
plt.xlim(0, num_epochs + 1)
plt.xticks(range(1, num_epochs + 1))
plt.xlabel('Number of Epochs')
plt.ylabel('F1 Score')
plt.show()




In [None]:
#@title Personality Type Prediction
# Colab form field: the text entered in the form is bound to `text` (empty by default).

text = "" #@param {type:"string"}

# predict the input text's personality type using the trained word embedding
def predict_personality_type(model, text):
    """Predict the class index for one input.

    Args:
        model: an ``nn.Module`` mapping integer ids of shape (batch, seq_len)
            to per-class scores.
        text: either a raw string or an already integer-encoded sequence
            (list, array or tensor). A string is cleaned with the notebook's
            ``remove_url`` and ``preprocess`` and encoded with ``word_to_id``;
            tokens missing from ``word_to_id`` are dropped.

    Returns:
        The index of the highest-scoring class, as a Python int.

    Raises:
        ValueError: if a string input contains no token known to ``word_to_id``
            (including the empty string).
    """
    if isinstance(text, str):
        # torch.tensor() cannot convert a string, so encode it the same way
        # the training posts are prepared.
        # NOTE(review): assumes `word_to_id` maps single tokens to ids — confirm.
        tokens = preprocess(remove_url(text)).split()
        token_ids = [word_to_id[token] for token in tokens if token in word_to_id]
        if not token_ids:
            raise ValueError('text contains no tokens known to the vocabulary')
        inputs = torch.tensor(token_ids)
    else:
        inputs = torch.as_tensor(text)

    inputs = inputs.long()
    if inputs.dim() == 1:
        # A single sequence becomes a batch of one.
        inputs = inputs.unsqueeze(0)
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = inputs.to(first_param.device)

    # Inference only: eval mode, no autograd graph; restore the mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(inputs)
    finally:
        model.train(was_training)

    _, predicted = torch.max(outputs, 1)
    return predicted.item()

# Run the prediction for the form input and print the value returned by
# predict_personality_type() — the predicted class index, not a type name.
print('Personality Type: {}'.format(predict_personality_type(model, text)))
