# Task for Today  

***

## Personality Type Prediction  

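Given *data about posts people have made*, let's try to predict the **personality type** of a given person. Specifically, we will predict the first letter of their MBTI type: **extroverted (E)** or **introverted (I)**.  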
  
We will use a TensorFlow RNN to make our predictions.

# Getting Started

In [1]:
import numpy as np
import pandas as pd

from nltk.corpus import stopwords

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

import tensorflow as tf

In [3]:
# Read the MBTI posts dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('data/mbti_1.csv')

In [4]:
# Show column names, dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


# Preprocessing

In [5]:
# List the distinct values of the label column.
data['type'].unique()

array(['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
       'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ'],
      dtype=object)

In [27]:
def preprocess_inputs(df, vocab_length=10000):
    """Convert the raw MBTI frame into padded token sequences and binary labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'posts' column (raw text) and a 'type' column
        (four-letter MBTI code such as 'INFJ').
    vocab_length : int, default 10000
        Passed to the Keras ``Tokenizer`` as ``num_words`` and returned
        unchanged so the caller can size its embedding layer.

    Returns
    -------
    texts : numpy.ndarray
        Integer token ids, one row per input row, padded at the end to
        ``max_seq_length`` columns.
    labels : numpy.ndarray
        1 where the MBTI code starts with 'E', 0 otherwise.
    max_seq_length : numpy integer
        Length of the longest tokenised post (the padded width of ``texts``).
    vocab_length : int
        The ``vocab_length`` argument, echoed back.
    label_mapping : dict[str, int]
        The MBTI-code -> 0/1 mapping that was applied.

    Raises
    ------
    ValueError
        If ``df['type']`` contains a value that is not one of the 16 MBTI codes.
    """
    posts = df['posts']
    labels = df['type']

    # Process text data
    # A set gives O(1) membership tests; the NLTK list would be scanned per token.
    stop_words = set(stopwords.words('english'))

    # Lower-case, split on whitespace and drop stop words. str.split() with no
    # arguments never yields tokens with surrounding whitespace, so no separate
    # strip() pass is needed.
    texts = [
        [word for word in text.lower().split() if word not in stop_words]
        for text in posts
    ]

    tokenizer = Tokenizer(num_words=vocab_length)
    tokenizer.fit_on_texts(texts)

    texts = tokenizer.texts_to_sequences(texts)

    max_seq_length = np.max([len(text) for text in texts])

    texts = pad_sequences(texts, maxlen=max_seq_length, padding='post')

    # Process label data
    label_values = [
        'INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
        'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ'
    ]

    # Built-in int() replaces np.int, which was removed in NumPy 1.24.
    label_mapping = {label: int(label[0] == 'E') for label in label_values}

    # Fail loudly on unexpected labels instead of letting strings/NaN leak
    # into the training targets.
    unknown = set(labels.unique()) - set(label_mapping)
    if unknown:
        raise ValueError(f"Unexpected personality type(s): {sorted(map(str, unknown))}")

    # .map yields an integer Series directly; .replace on an object column
    # depends on implicit downcasting.
    labels = np.array(labels.map(label_mapping))

    return texts, labels, max_seq_length, vocab_length, label_mapping

In [28]:
# Tokenise/pad the posts and binarise the labels for the whole dataset in one pass.
texts, labels, max_seq_length, vocab_length, label_mapping = preprocess_inputs(data)

["'http://www.youtube.com/watch?v=qsxhcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03pma1qa1rooo1_500.jpg|||enfp", 'intj', 'moments', 'https://www.youtube.com/watch?v=iz7le1g4xm4', 'sportscenter', 'top', 'ten', 'plays', 'https://www.youtube.com/watch?v=ucdfze1etec', 'pranks|||what', 'life-changing', 'experience', 'life?|||http://www.youtube.com/watch?v=vxzeywwrdw8', 'http://www.youtube.com/watch?v=u8ejam5dp3e', 'repeat', 'today.|||may', 'perc', 'experience', 'immerse', 'you.|||the', 'last', 'thing', 'infj', 'friend', 'posted', 'facebook', 'committing', 'suicide', 'next', 'day.', 'rest', 'peace~', 'http://vimeo.com/22842206|||hello', 'enfj7.', 'sorry', 'hear', 'distress.', 'natural', 'relationship', 'perfection', 'time', 'every', 'moment', 'existence.', 'try', 'figure', 'hard', 'times', 'times', 'growth,', 'as...|||84389', '84390', 'http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg', 'http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.

In [24]:
# Sanity-check the preprocessing outputs: one heading/value pair per line group.
summary_items = [
    ("Text sequences:\n", texts.shape),
    ("\nLabels:\n", labels.shape),
    ("\nMax sequence length:\n", max_seq_length),
    ("\nVocab length:\n", vocab_length),
    ("\nLabel mapping:\n", label_mapping),
]
for heading, value in summary_items:
    print(heading, value)

Text sequences:
 (8675, 859)

Labels:
 (8675,)

Max sequence length:
 859

Vocab length:
 10000

Label mapping:
 {'INFJ': 0, 'ENTP': 1, 'INTP': 0, 'INTJ': 0, 'ENTJ': 1, 'ENFJ': 1, 'INFP': 0, 'ENFP': 1, 'ISFP': 0, 'ISTP': 0, 'ISFJ': 0, 'ISTJ': 0, 'ESTP': 1, 'ESFP': 1, 'ESTJ': 1, 'ESFJ': 1}


In [25]:
# Hold out 30% of the rows for final evaluation; the fixed seed keeps the split repeatable.
(texts_train, texts_test,
 labels_train, labels_test) = train_test_split(
    texts,
    labels,
    train_size=0.7,
    random_state=123,
)

# Training

In [26]:
embedding_dim = 512

# One integer token id per position; width matches the padded sequences.
inputs = tf.keras.Input(shape=(max_seq_length,))

# The sequence length is already fixed by `inputs`, so the deprecated
# `input_length` argument is not passed.
embedding = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=embedding_dim
)(inputs)

# return_sequences=True keeps the GRU output at every timestep so that the
# Flatten layer below can feed all of them to the classifier.
gru = tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(
        units=256,
        return_sequences=True
    )
)(embedding)

flatten = tf.keras.layers.Flatten()(gru)

# Single sigmoid unit: probability of the positive class (label 1).
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(flatten)


model = tf.keras.Model(inputs, outputs)


model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)


# NOTE(review): newer Keras releases appear to require weights-only checkpoint
# paths to end in '.weights.h5'. If upgrading, rename the file here and in the
# later load_weights call together.
history = model.fit(
    texts_train,
    labels_train,
    validation_split=0.2,  # part of the training split is held out for validation
    batch_size=32,
    epochs=5,
    callbacks=[
        # Keep only the best-scoring weights seen during training.
        tf.keras.callbacks.ModelCheckpoint('./model.h5', save_best_only=True, save_weights_only=True)
    ]
)

Epoch 1/5
  6/152 [>.............................] - ETA: 18:36 - loss: 0.6691 - accuracy: 0.6298 - auc: 0.5457

KeyboardInterrupt: 

# Results

In [None]:
# Restore the weights saved by the ModelCheckpoint callback during training.
model.load_weights('./model.h5')

In [None]:
# Report the compiled loss and metrics (accuracy, AUC) on the held-out test split.
model.evaluate(texts_test, labels_test)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/s3g0MJcJZyA