In [1]:
"""
Developing a personality type classification model using Word Embedding models and Recurrent Neural Networks (RNN).
Focus on classifying only a Thinking (T) - Feeling (F) aspect from the 4 axes.
"""

# use the (MBTI) Myers-Briggs Personality Type Dataset, 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model
from tensorflow.keras.models import model_from_json
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Activation

In [33]:
# load the data
training_data = pd.read_csv('content/training_data.csv')
testing_data = pd.read_csv("content/testing_data.csv")

print("------------------------------------")
print("Size of training dataset: {0}".format(len(training_data)))
print("Size of testing dataset: {0}".format(len(testing_data)))
print("------------------------------------")

print("------------------------------------")
print("Sample Data")
print("LABEL: {0} / SENTENCE: {1}".format(training_data.iloc[-1,0], training_data.iloc[-1,1]))
print("------------------------------------")

------------------------------------
Size of training dataset: 7808
Size of testing dataset: 867
------------------------------------
------------------------------------
Sample Data
LABEL: F / SENTENCE: 'Half of it is going straight to charity, another quarter going straight to scientific research, an eighth to the parkour community, a sixteenth to towards spreading information about health and...|||Find a path or suffer more.|||http://personalitycafe.com/enneagram-personality-theory-forum/85323-enneagram-type-mbti-type-compared-statistics.html yep.|||I kind of anchor on Fi and Ne makes having Ni really fun. INFP for me as they tire me out less and our views tend to align more.|||The two ESTPs I have gotten the chance to know seem to experience much more than other people who have been on the planet for the same amount of time and are quite the renaissance (wo)men.  Is this...|||I don't really have a best friend ISTP(passion-amateur group co-founder), INTJ(intellectual and various sma

In [34]:
training_data.head()

Unnamed: 0,type,posts
0,F,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,T,'I'm finding the lack of me in these posts ver...
2,T,'Good one _____ https://www.youtube.com/wat...
3,T,"'Dear INTP, I enjoyed our conversation the o..."
4,T,'You're fired.|||That's another silly misconce...


In [35]:
# Extract the labels and posts and store into List

# Get the list of training data (posts)
training_posts=training_data['posts'].tolist()
# Get the list of corresponding labels for the training data (posts)
training_labels=training_data['type'].tolist()

# Get the list of testing data (posts)
testing_posts=testing_data['posts'].tolist()
# Get the list of corresponding labels for the testing data (posts)
testing_labels=testing_data['type'].tolist()

## 1.1. URL Removal

In [36]:
# remove rows with missing values in posts column
training_data = training_data.dropna(subset=['posts'])
testing_data = testing_data.dropna(subset=['posts'])

In [37]:

# remove url from posts column
def remove_url(text):
    return re.sub(r'http\S+', '', text)

training_data['clean_posts'] = training_data['posts'].apply(remove_url)
testing_data['clean_posts'] = testing_data['posts'].apply(remove_url)


In [38]:
training_data.head()

Unnamed: 0,type,posts,clean_posts
0,F,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,' and intj moments sportscenter not top ten...
1,T,'I'm finding the lack of me in these posts ver...,'I'm finding the lack of me in these posts ver...
2,T,'Good one _____ https://www.youtube.com/wat...,"'Good one _____ course, to which I say I k..."
3,T,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o..."
4,T,'You're fired.|||That's another silly misconce...,'You're fired.|||That's another silly misconce...


In [45]:
#pre-process the training set by integrating several text pre-processing techniques (e.g. tokenisation, removing numbers, converting to lowercase, removing stop words, stemming, etc.).
#  You should test and justify the reason why you apply the specific preprocessing techniques based on the test results.

# Removing all irrelevant characters (Numbers and Punctuation).
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

#  Convert all characters into lowercase.

def lowercase(text):
    return text.lower()

# Tokenization
def tokenize(text):
    return nltk.word_tokenize(text)

# Removing Stopwords
def remove_stopwords(tokens):
    return [word for word in tokens if word not in nltk.corpus.stopwords.words('english')]

# stemming
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stem(tokens):
    return [stemmer.stem(word) for word in tokens]

# Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

# Remove the words having length <= 2 —
def remove_short_words(tokens):
    return [word for word in tokens if len(word) > 2]


# Convert the list of tokens into back to the string
def tokens_to_string(tokens):
    return " ".join(tokens)

def preprocess_training_data(posts):
    # Apply the preprocessing techniques to the training data
    posts = posts.apply(remove_punctuation)
    posts = posts.apply(remove_numbers)
    posts = posts.apply(lowercase)
    posts = posts.apply(tokenize)
    posts = posts.apply(remove_stopwords)
    posts = posts.apply(stem)
    posts = posts.apply(lemmatize)
    posts = posts.apply(remove_short_words)
    posts = posts.apply(tokens_to_string)
    return posts

training_data = preprocess_training_data(training_data['clean_posts'])



LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/home/retech/nltk_data'
    - '/home/retech/anaconda3/nltk_data'
    - '/home/retech/anaconda3/share/nltk_data'
    - '/home/retech/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [42]:
# pre-process the training set
def preprocess_posts(posts):
    # remove punctuation
    posts = [''.join(c for c in post if c not in string.punctuation) for post in posts]
    # remove numbers
    posts = [''.join(c for c in post if c not in string.digits) for post in posts]
    # remove whitespace
    posts = [post.strip() for post in posts]
    # lowercase
    posts = [str(post).lower() for post in posts]
    # remove stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    posts = [post.split() for post in posts]
    posts = [[word for word in post if word not in stopwords] for post in posts]
    posts = [' '.join(post) for post in posts]
    # tokenize
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(posts)
    posts = tokenizer.texts_to_sequences(posts)
    # stemming
    posts = [nltk.PorterStemmer().stem(post) for post in posts]
    #lemmatization
    posts = [nltk.WordNetLemmatizer().lemmatize(post) for post in posts]
    # Convert the list of tokens into back to the string
    posts = [' '.join(post) for post in posts]
    return posts

training_data = preprocess_posts(training_data['clean_posts'])
training_data.head()

AttributeError: 'list' object has no attribute 'lower'