### Data Collection

In [None]:
# Code to download file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# Order matters here: the Colab user is authenticated first, and only then are the
# application-default credentials requested (presumably made available by that
# authentication step - TODO confirm) and attached to the GoogleAuth object.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used below to fetch the dataset files by ID.
drive = GoogleDrive(gauth)

# Google Drive file IDs of the two datasets, keyed by the local filename each one is
# saved under. Keeping them in a mapping avoids a module-level variable named `id`
# (which shadowed the builtin `id()` for the rest of the notebook) and removes the
# copy-pasted download code.
drive_files = {
    'training_data.csv': '16g474hdNsaNx0_SnoKuqj2BuwSEGdnbt',
    'testing_data.csv': '1-7hj0sF3Rc5G6POKdkpbDXm_Q6BWFDPU',
}
for filename, file_id in drive_files.items():
    # GetContentFile saves the file under `filename`, relative to the working directory.
    downloaded = drive.CreateFile({'id': file_id})
    downloaded.GetContentFile(filename)

import pandas as pd
# Read the files back from the same relative paths they were just written to,
# instead of assuming the working directory is /content.
training_data = pd.read_csv('training_data.csv')
testing_data = pd.read_csv('testing_data.csv')

# Summarise how many rows each split contains.
divider = "------------------------------------"
size_report = (
    divider,
    "Size of training dataset: {0}".format(len(training_data)),
    "Size of testing dataset: {0}".format(len(testing_data)),
    divider,
)
print("\n".join(size_report))

# Show one example: the LAST training row, column 0 as the label and column 1 as the text.
sample_report = (
    divider,
    "Sample Data",
    "LABEL: {0} / SENTENCE: {1}".format(training_data.iloc[-1, 0], training_data.iloc[-1, 1]),
    divider,
)
print("\n".join(sample_report))


In [None]:
# Preview the first rows of the training DataFrame; as the last expression of the
# cell, the result is rendered as the cell output.
training_data.head()

In [None]:
# Pull the text and label columns out of each DataFrame as plain Python lists.
# 'posts' holds the input text and 'type' holds the matching label; the two lists
# produced for a split are aligned row by row.
POST_COLUMN, LABEL_COLUMN = 'posts', 'type'

# Training split
training_posts, training_labels = (
    training_data[column].tolist() for column in (POST_COLUMN, LABEL_COLUMN)
)

# Testing split
testing_posts, testing_labels = (
    testing_data[column].tolist() for column in (POST_COLUMN, LABEL_COLUMN)
)

### URL Removal

In [None]:
import re

# Matches "<scheme>://<host>" followed by optional ".<label>" parts and optional
# "/<segment>" parts, where a segment runs up to the next whitespace or slash.
_URL_PATTERN = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')


def remove_url(text):
    """Return ``text`` with every URL-like substring deleted.

    Args:
        text: Input string to clean.

    Returns:
        The string with each match replaced by the empty string. Surrounding
        whitespace is left untouched, so removing a URL from the middle of a
        sentence leaves two adjacent spaces.
    """
    return _URL_PATTERN.sub('', text)

# Strip URLs from every post in both splits; the names are rebound to the cleaned lists.
training_posts = list(map(remove_url, training_posts))
testing_posts = list(map(remove_url, testing_posts))

In [None]:
"""
You are asked to pre-process the training set by integrating several text pre-processing techniques
 (e.g. tokenisation, removing numbers, converting to lowercase, removing stop words, stemming, etc.).
You should test and justify the reason why you apply the specific preprocessing techniques based on the test result in section
"""
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data this cell relies on so the notebook survives a fresh kernel:
# the stop-word corpus (loaded just below) and the tokenizer models used by
# nltk.word_tokenize ('punkt' on older NLTK releases, 'punkt_tab' on newer ones).
# Without them the lookups raise LookupError. Downloads are skipped when up to date.
for _nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=True)

lemmatizer = WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()
# Kept as a list (as returned by NLTK) so any existing use of `stopwords` keeps working.
stopwords = nltk.corpus.stopwords.words('english')

def preprocess(text):
    """Normalise one post into a space-separated string of stemmed tokens.

    Steps, in order: tokenise, lowercase, drop English stop words, drop tokens
    made up solely of punctuation characters, drop all-digit tokens, then stem
    what remains with the module-level ``stemmer``.

    Args:
        text: Raw post text (str).

    Returns:
        str: The surviving stems joined by single spaces ('' if nothing survives).
    """
    # Set membership is O(1); scanning the module-level stop-word list once per
    # token is needlessly slow on long posts.
    stop_set = set(stopwords)
    punctuation_chars = set(string.punctuation)

    cleaned = []
    for token in nltk.word_tokenize(text):
        token = token.lower()
        if token in stop_set:
            continue
        # Drop tokens consisting only of punctuation. The previous test
        # (`token not in string.punctuation`) was a substring check, so
        # multi-character tokens such as '...' or '--' slipped through.
        if all(char in punctuation_chars for char in token):
            continue
        # Only pure-digit tokens are removed; mixed tokens like '2nd' are kept.
        if token.isdigit():
            continue
        cleaned.append(stemmer.stem(token))
    return ' '.join(cleaned)

# Run every training post through `preprocess`.
# NOTE(review): the word-embedding section further down recomputes this exact list.
clean_training_posts = [preprocess(post) for post in training_posts]

In [None]:
"""
In this section, you are to implement three input representation components, including 
1) Word Embedding Construction Module, 
2) Pretrained Word Embedding Module, and 
3) Input Concatenation Module. For training, you are free to choose hyperparameters [Lab2,Lab4,Lab5] 
(e.g. dimension of embeddings, learning rate, epochs, etc.).

First, you are asked to build a word embedding model (for representing word vectors, 
such as word2vec-CBOW, word2vec-Skip gram, fastText, and Glove) 
for the input embedding of your sequence model. 
Note that we used one-hot vectors as inputs for the sequence model in the Lab3 and Lab4.
 You are required to complete the following sections in the format

Preprocess data for word embeddings: You are to use and preprocess MBTI dataset 
(the one provided in the Section 1) for training word embeddings [Lab2]. 
This can be different from the preprocessing technique that you used in Section 1. 
You can use both the training and testing datasets in order to train the word embedding.
"""

# Preprocess data for word embeddings: run the same `preprocess` pipeline over both
# splits so the training and testing text are cleaned identically.
clean_training_posts = list(map(preprocess, training_posts))
clean_testing_posts = list(map(preprocess, testing_posts))


In [None]:
# pretrained word embedding
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Features: the cleaned posts produced by the preprocessing step.
X_train, X_test = clean_training_posts, clean_testing_posts

# Targets: LabelEncoder maps each distinct label to an integer class index
# (this is NOT a one-hot encoding). The encoder is fitted on the training labels
# only and then applied to both splits.
encoder = LabelEncoder().fit(training_labels)
y_train = encoder.transform(training_labels)
y_test = encoder.transform(testing_labels)

# input concatenation
def input_concatenation(input_1, input_2):
    """Join two tensors along dimension 1.

    Args:
        input_1: First tensor.
        input_2: Second tensor; must match ``input_1`` in every dimension except 1.

    Returns:
        A tensor holding ``input_1`` followed by ``input_2`` along dimension 1.
    """
    joined = torch.cat([input_1, input_2], 1)
    return joined

# word embedding construction
class WordEmbedding(nn.Module):
    """Embed two index inputs, concatenate the embeddings and project to 4 outputs."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector.
        """
        super().__init__()
        # Lookup table with one `embedding_dim`-sized vector per vocabulary index.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # The two flattened embeddings are placed side by side, hence `embedding_dim * 2`.
        self.linear = nn.Linear(embedding_dim * 2, 4)

    def forward(self, input_1, input_2):
        """Return the linear projection of the two concatenated, flattened embeddings.

        Each input is embedded and flattened to a single row. Because the linear layer
        expects exactly ``embedding_dim * 2`` features, this works when each input
        holds a single index.
        """
        flat_pair = [self.embedding(indices).view(1, -1) for indices in (input_1, input_2)]
        return self.linear(input_concatenation(*flat_pair))

# Build Sequence Model (Bi-directional model)
class SequenceModel(nn.Module):
    """Bidirectional LSTM applied to two index sequences with shared weights.

    Both inputs go through the same embedding table and the same LSTM; their
    per-token outputs are concatenated along the sequence dimension and projected
    to 4 values per token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Create the embedding table, the bidirectional LSTM and the output layer.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector (LSTM input size).
            hidden_dim: LSTM hidden size per direction.
        """
        super(SequenceModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True because forward() shapes its inputs as
        # (batch=1, seq_len, embedding_dim). With the default (seq_len, batch, feature)
        # layout every token was treated as its own length-1 sequence, so the LSTM
        # never saw any context. Parameter names and shapes are unaffected.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        # A bidirectional LSTM emits forward and backward states, hence hidden_dim * 2.
        self.linear = nn.Linear(hidden_dim * 2, 4)

    def forward(self, input_1, input_2):
        """Encode both index sequences and project the concatenated outputs.

        Args:
            input_1: Tensor of token indices for the first sequence.
            input_2: Tensor of token indices for the second sequence.

        Returns:
            Tensor of shape (1, len_1 + len_2, 4): one 4-value projection per token
            of the two sequences, in order.
        """
        # Use the layer's own size instead of a notebook-level `embedding_dim` global,
        # so the model does not depend on which cells happened to run before it.
        embedding_dim = self.embedding.embedding_dim
        embedded_1 = self.embedding(input_1).view(1, -1, embedding_dim)
        embedded_2 = self.embedding(input_2).view(1, -1, embedding_dim)
        # Only the per-token outputs are used; the final (h, c) states are discarded.
        output_1, _ = self.lstm(embedded_1)
        output_2, _ = self.lstm(embedded_2)
        concatenated = input_concatenation(output_1, output_2)
        out = self.linear(concatenated)
        return out

# Build the model
# NOTE: the model is deliberately not instantiated in this cell. `vocab_size`,
# `embedding_dim` and `hidden_dim` are only defined in the pretrained-embedding cell
# below, so constructing it here raised NameError on a fresh kernel. That later cell
# builds `model` once those values exist.


In [None]:
"""
 Extract and apply the pretrained word embedding.
 Gensim provides several pretrained word embeddings; you can find them in the Gensim GitHub repository.
 You can select the pretrained word embedding that would be useful for personality type classification.
"""

# Extract the pretrained word embedding to use for personality type classification
import gensim
from gensim.models import KeyedVectors
word_embedding = gensim.models.KeyedVectors.load_word2vec_format(
    'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', binary=True)
# Sanity check of the loaded vectors (the result is not displayed because this is
# not the last expression of the cell).
word_embedding.most_similar(positive=['thinking', 'feeling'])

# Word -> row-index mapping of the pretrained vectors.
# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`; on older
# releases `vocab` maps each word to an entry whose `.index` holds the row number.
# Normalise both cases to a plain {word: int} dict.
vocab = getattr(word_embedding, 'key_to_index', None)
if vocab is None:
    vocab = {word: entry.index for word, entry in word_embedding.vocab.items()}

vocab_size = len(vocab)
embedding_dim = word_embedding.vector_size
hidden_dim = 100

# apply the pretrained word embedding
def apply_pretrained_embedding(word_embedding, vocab):
    """Build an embedding matrix whose row ``i`` is the pretrained vector of the word mapped to ``i``.

    Args:
        word_embedding: Pretrained vectors. Must support ``word_embedding[word]``
            (raising ``KeyError`` for unknown words) and expose ``vector_size``.
        vocab: Mapping of word -> row index. Values may be plain ints or objects
            that carry the row number in an ``index`` attribute (as the vocabulary
            entries of gensim < 4 do).

    Returns:
        torch.Tensor of shape ``(len(vocab), word_embedding.vector_size)``.
        Words missing from ``word_embedding`` get a row drawn uniformly from [0, 1).
    """
    # Size the matrix from the arguments rather than from notebook-level globals,
    # so the function works for any vocabulary and regardless of cell order.
    num_rows = len(vocab)
    vector_dim = word_embedding.vector_size
    embedding_matrix = torch.zeros(num_rows, vector_dim)
    for word, entry in vocab.items():
        # Accept both {word: int} and {word: object-with-.index} vocabularies.
        row = getattr(entry, 'index', entry)
        try:
            vector = word_embedding[word]
        except KeyError:
            embedding_matrix[row] = torch.rand(vector_dim)
        else:
            # Convert explicitly: the lookup may return a NumPy array (gensim does),
            # which should not be assigned into a tensor row as-is.
            embedding_matrix[row] = torch.as_tensor(vector, dtype=embedding_matrix.dtype)
    return embedding_matrix

# apply the pretrained word embedding
embedding_matrix = apply_pretrained_embedding(word_embedding, vocab)

# Build the model
model = SequenceModel(vocab_size, embedding_dim, hidden_dim)

# Load the pretrained vectors into the model's embedding layer. Previously the matrix
# was computed but never used, so the model kept its random initial embeddings.
# copy_ writes into the existing parameter (which stays trainable) and does not alias
# `embedding_matrix`.
with torch.no_grad():
    model.embedding.weight.copy_(embedding_matrix)
