In [12]:
"""
Developing a personality type classification model using Word Embedding models and Recurrent Neural Networks (RNN).
Focus on classifying only a Thinking (T) - Feeling (F) aspect from the 4 axes.
"""

# Uses the Myers-Briggs Type Indicator (MBTI) Personality Type Dataset.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Activation

In [13]:
# Read the training and testing splits from their CSV files
training_data, testing_data = (
    pd.read_csv(csv_path)
    for csv_path in ('content/training_data.csv', "content/testing_data.csv")
)

# Report the number of rows in each split
print(
    "------------------------------------",
    "Size of training dataset: {0}".format(len(training_data)),
    "Size of testing dataset: {0}".format(len(testing_data)),
    "------------------------------------",
    sep="\n",
)

# Show the label and text of the last training row as a sample
print(
    "------------------------------------",
    "Sample Data",
    "LABEL: {0} / SENTENCE: {1}".format(training_data.iloc[-1,0], training_data.iloc[-1,1]),
    "------------------------------------",
    sep="\n",
)

------------------------------------
Size of training dataset: 7808
Size of testing dataset: 867
------------------------------------
------------------------------------
Sample Data
LABEL: F / SENTENCE: 'Half of it is going straight to charity, another quarter going straight to scientific research, an eighth to the parkour community, a sixteenth to towards spreading information about health and...|||Find a path or suffer more.|||http://personalitycafe.com/enneagram-personality-theory-forum/85323-enneagram-type-mbti-type-compared-statistics.html yep.|||I kind of anchor on Fi and Ne makes having Ni really fun. INFP for me as they tire me out less and our views tend to align more.|||The two ESTPs I have gotten the chance to know seem to experience much more than other people who have been on the planet for the same amount of time and are quite the renaissance (wo)men.  Is this...|||I don't really have a best friend ISTP(passion-amateur group co-founder), INTJ(intellectual and various sma

In [14]:
# Preview the first rows of the training DataFrame (label in 'type', text in 'posts')
training_data.head()

Unnamed: 0,type,posts
0,F,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,T,'I'm finding the lack of me in these posts ver...
2,T,'Good one _____ https://www.youtube.com/wat...
3,T,"'Dear INTP, I enjoyed our conversation the o..."
4,T,'You're fired.|||That's another silly misconce...


In [15]:
# Pull the post texts and their labels out of each DataFrame as plain Python lists

# Training split: posts first, then the corresponding labels
training_posts, training_labels = (
    training_data[column].tolist() for column in ('posts', 'type')
)

# Testing split: posts first, then the corresponding labels
testing_posts, testing_labels = (
    testing_data[column].tolist() for column in ('posts', 'type')
)

In [None]:
# Show the first five raw training posts between divider lines
print(
    "------------------------------------",
    "Training Posts",
    "------------------------------------",
    training_posts[:5],
    "------------------------------------",
    sep="\n",
)


## 1.1. URL Removal

In [16]:
#

In [17]:

# remove url from posts
def remove_url(text):
    """Return ``text`` with every URL-like token deleted.

    A token is treated as a URL when it starts with ``http``; it is removed
    up to (but not including) the next whitespace character.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

# remove_url works on a single string, so apply it post by post and store the
# result in a new 'clean_posts' column; the raw 'posts' column is left untouched.
training_data['clean_posts'] = training_data['posts'].apply(remove_url)


In [18]:
# Preview the first rows again (the recorded output shows a 'clean_posts' column next to 'posts')
training_data.head()

Unnamed: 0,type,posts,clean_posts
0,F,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,' and intj moments sportscenter not top ten...
1,T,'I'm finding the lack of me in these posts ver...,'I'm finding the lack of me in these posts ver...
2,T,'Good one _____ https://www.youtube.com/wat...,"'Good one _____ course, to which I say I k..."
3,T,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o..."
4,T,'You're fired.|||That's another silly misconce...,'You're fired.|||That's another silly misconce...


In [19]:
#pre-process the training set by integrating several text pre-processing techniques (e.g. tokenisation, removing numbers, converting to lowercase, removing stop words, stemming, etc.).
#  You should test and justify the reason why you apply the specific preprocessing techniques based on the test results.

# Removing all irrelevant characters (Numbers and Punctuation).
def remove_punctuation(text):
    """Return ``text`` with every character that is neither a word character
    nor whitespace removed.

    Underscores count as word characters for the regex and are therefore kept.
    """
    non_word_pattern = re.compile(r'[^\w\s]')
    return non_word_pattern.sub('', text)

def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

#  Convert all characters into lowercase.

def lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# tokenize the posts
def tokenize(text):
    """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens

# remove stop words
def remove_stopwords(text):
    """Drop English stop words from a sequence of tokens.

    Args:
        text: iterable of token strings (despite the name, not a raw string).

    Returns:
        A new list containing the tokens that are not in NLTK's English
        stop-word list, in their original order.
    """
    # A set gives constant-time membership tests; with the plain list every
    # token triggered a linear scan over all stop words.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    return [word for word in text if word not in stopwords]

# stem the words
def stem(text):
    """Return a list holding the Porter stem of each token in ``text``.

    A fresh ``PorterStemmer`` is created on every call.
    """
    porter = nltk.stem.PorterStemmer()
    return list(map(porter.stem, text))

# lemmatize the words

def lemmatize(text):
    """Return a list holding the WordNet lemma of each token in ``text``.

    Args:
        text: iterable of token strings.

    Returns:
        A new list with every token passed through
        ``WordNetLemmatizer.lemmatize`` (default part of speech).
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in text]


# Backward-compatible alias for the original (misspelled) function name.
lematize = lemmatize

def remove_short_words(text):
    """Return the tokens of ``text`` that are longer than two characters."""
    return list(filter(lambda word: len(word) > 2, text))

def tokens_to_string(tokens):
    """Join the tokens into one string, separated by single spaces."""
    separator = " "
    return separator.join(tokens)

def preprocess_testing_data(posts):
    """Run the full text-cleaning pipeline on one post or on a collection of posts.

    The steps, in order: strip punctuation, strip digits, lower-case, tokenize,
    remove stop words, stem, lemmatize, drop tokens of length <= 2, and join the
    remaining tokens back into a single space-separated string.

    Relies on the helper functions of the same names (``remove_punctuation``,
    ``remove_numbers``, ``lowercase``, ``tokenize``, ``remove_stopwords``,
    ``stem``, ``lemmatize``, ``remove_short_words``, ``tokens_to_string``)
    being defined in the notebook.

    Args:
        posts: a single post as ``str``, or an iterable of such strings
            (e.g. a list). Despite the function name, nothing here is specific
            to the testing split.

    Returns:
        The cleaned string for a ``str`` input, or a list of cleaned strings
        (same order) for an iterable input.
    """
    # The regex and str helpers below only accept a single string, so a
    # collection of posts is cleaned element by element.
    if not isinstance(posts, str):
        return [preprocess_testing_data(post) for post in posts]

    # Character-level cleaning on the raw string
    posts = remove_punctuation(posts)
    posts = remove_numbers(posts)
    posts = lowercase(posts)
    # Token-level cleaning on the list of tokens
    posts = tokenize(posts)
    posts = remove_stopwords(posts)
    posts = stem(posts)
    posts = lemmatize(posts)
    posts = remove_short_words(posts)
    # Back to a single string for downstream vectorisation
    posts = tokens_to_string(posts)
    return posts

# training_posts is a list of strings, while the pipeline's regex/str steps need a
# single string, so clean the posts one at a time.
processed_training_posts = [preprocess_testing_data(post) for post in training_posts]



In [20]:
# build a training model for word embeddings. You are required to articulate the hyperparameters [Lab2] you choose (dimension of embeddings and window size)
# and the reason why you choose them.
# use the GloVe embedding model to train the model.
# You should test and justify the reason why you choose the specific hyperparameters based on the test results.
# start with a dimension of 100 and a window size of 5.
# You should test and justify the reason why you choose the specific hyperparameters based on the test results.

def build_model(embedding_dim, window_size, max_length=200, vocab_size=20000, num_classes=4):
    """Build and compile an Embedding -> LSTM -> softmax text classifier.

    Args:
        embedding_dim: size of each learned word vector in the Embedding layer.
        window_size: accepted for interface compatibility but NOT used by this
            architecture (the Embedding layer is trained end-to-end and has no
            context-window hyperparameter).
        max_length: number of token ids per input sequence (``input_length`` of
            the Embedding layer). Defaults to the previously hard-coded 200.
        vocab_size: number of distinct token ids the Embedding layer accepts.
            Defaults to the previously hard-coded 20000.
        num_classes: number of units in the softmax output layer. Defaults to
            the previously hard-coded 4.
            NOTE(review): the notebook states it classifies only the
            Thinking/Feeling axis, which suggests ``num_classes=2`` - confirm
            against the label encoding used for training.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy loss,
        the Adam optimizer and accuracy as the reported metric.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    # Drops whole embedding channels rather than individual elements
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    # One softmax unit per class; categorical cross-entropy expects one-hot targets
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [22]:
def remove_numbers(text):
    """Delete every run of digits from ``text`` and return the result."""
    digits = re.compile(r'\d+')
    return digits.sub('', text)

# Strip digit runs from every post in 'clean_posts' and write the result back to the same column
training_data['clean_posts'] = training_data['clean_posts'].apply(remove_numbers)

In [23]:
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered_text = text.lower()
    return lowered_text

# Lower-case every post in 'clean_posts' and write the result back to the same column
training_data['clean_posts'] = training_data['clean_posts'].apply(lowercase)

In [21]:
# #Preprocess data for word embeddings
# # drop the posts column
# training_data = training_data.drop(columns=['posts'])
# testing_data = testing_data.drop(columns=['posts'])

In [24]:

# Tokenization
def tokenize(text):
    """Return the list of tokens NLTK's ``word_tokenize`` produces for ``text``."""
    token_list = nltk.word_tokenize(text)
    return token_list

# Replace each post string in 'clean_posts' with the output of tokenize (a list of tokens)
training_data['clean_posts'] = training_data['clean_posts'].apply(tokenize)


In [29]:
# Removing Stopwords
def remove_stopwords(tokens):
    """Drop English stop words from a sequence of tokens.

    Args:
        tokens: iterable of token strings.

    Returns:
        A new list with the tokens that are not in NLTK's English stop-word
        list, in their original order.
    """
    # Load the stop-word list once per call and keep it in a set. The original
    # called stopwords.words('english') inside the comprehension, i.e. once for
    # every token, and then scanned the resulting list linearly.
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    return [word for word in tokens if word not in english_stopwords]

# Filter stop words out of every entry in 'clean_posts' (expects token lists, as produced by tokenize)
training_data['clean_posts'] = training_data['clean_posts'].apply(remove_stopwords)

In [26]:

# stemming
from nltk.stem import PorterStemmer
# One shared Porter stemmer instance, reused for every call to stem()
stemmer = PorterStemmer()


def stem(tokens):
    """Return a new list with each token replaced by its Porter stem."""
    return list(map(stemmer.stem, tokens))

# Stem every entry in 'clean_posts' (expects token lists) and write the result back to the same column
training_data['clean_posts'] = training_data['clean_posts'].apply(stem)

In [27]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
# One shared WordNet lemmatizer instance, reused for every call to lemmatize()
lemmatizer = WordNetLemmatizer()


def lemmatize(tokens):
    """Return a new list with each token replaced by its WordNet lemma."""
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# Apply the lemmatize() function to each token list. Passing the WordNetLemmatizer
# instance itself fails because the object is not callable.
training_data['clean_posts'] = training_data['clean_posts'].apply(lemmatize)

In [28]:
# Convert the list of tokens back into a single string
def tokens_to_string(tokens):
    """Concatenate the tokens into one string with a single space between them."""
    joined = " ".join(tokens)
    return joined

# Join each token list in 'clean_posts' back into one space-separated string
training_data['clean_posts'] = training_data['clean_posts'].apply(tokens_to_string)

In [None]:

# Remove words whose length is 2 characters or fewer
def remove_short_words(tokens):
    """Return a list of the tokens that have more than two characters."""
    kept = []
    for word in tokens:
        if len(word) > 2:
            kept.append(word)
    return kept

# 'clean_posts' holds space-joined strings at this point (see the tokens_to_string step),
# so split each post into tokens, drop the short ones and re-join. Applying
# remove_short_words directly to a string would iterate over single characters and
# discard every one of them.
training_data['clean_posts'] = training_data['clean_posts'].apply(
    lambda post: " ".join(remove_short_words(post.split()))
)