## Import Packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

# Download required NLTK data
nltk.download('punkt')  # Needed for tokenizing words.
nltk.download('stopwords')  # Needed for stop words that are used in the preprocessing.
nltk.download('wordnet')  # Needed for lemmatization.

## Add 4 Classifier Labels

In [13]:
# Load the raw dataset; the path is relative to the notebook's working directory.
file_path = '../data/data.csv'
df = pd.read_csv(file_path)

In [45]:
# Preview the loaded frame (pandas elides the middle rows in the rendering).
df

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...


In [17]:
# Rough size of each user's text: the number of pieces produced by splitting on a single space.
df['length'] = df['posts'].map(lambda post: len(post.split(' ')))
df['length'].describe()

count    8675.000000
mean     1262.678963
std       317.261077
min         4.000000
25%      1081.000000
50%      1314.000000
75%      1497.000000
max      2212.000000
Name: length, dtype: float64

In [46]:
"""
Create four binary label columns, one per MBTI dichotomy: 0 marks the first letter of each pair (I, N, F, P) and 1 marks the second (E, S, T, J).
"""

def label_mbti_ie(row):
    """Encode the I/E axis: 0 when the row's type contains "I", otherwise 1."""
    return 0 if "I" in row['type'] else 1
    
def label_mbti_ns(row):
    """Encode the N/S axis: 0 when the row's type contains "N", otherwise 1."""
    return 0 if "N" in row['type'] else 1

def label_mbti_ft(row):
    """Encode the F/T axis: 0 when the row's type contains "F", otherwise 1."""
    return 0 if "F" in row['type'] else 1

def label_mbti_pj(row):
    """Encode the P/J axis: 0 when the row's type contains "P", otherwise 1."""
    return 0 if "P" in row['type'] else 1

# Derive the four binary target columns by running each labeller over every row.
for column_name, labeller in (
    ('I/E', label_mbti_ie),
    ('N/S', label_mbti_ns),
    ('F/T', label_mbti_ft),
    ('P/J', label_mbti_pj),
):
    df[column_name] = df.apply(labeller, axis=1)

df

Unnamed: 0,type,posts,I/E,N/S,F/T,P/J
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0,0,0,1
1,ENTP,'I'm finding the lack of me in these posts ver...,1,0,1,0
2,INTP,'Good one _____ https://www.youtube.com/wat...,0,0,1,0
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",0,0,1,1
4,ENTJ,'You're fired.|||That's another silly misconce...,1,0,1,1
...,...,...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,0,1,0,0
8671,ENFP,'So...if this thread already exists someplace ...,1,0,0,0
8672,INTP,'So many questions when i do these things. I ...,0,0,1,0
8673,INFP,'I am very conflicted right now when it comes ...,0,0,0,0


In [47]:
# Text preprocessing function
def preprocess(text):
    """Turn one raw post string into a list of cleaned, lemmatised word tokens.

    Steps, in order: strip URLs, lowercase, tokenise with NLTK, keep only purely
    alphabetic tokens that are not English stop words, then lemmatise what is left.

    Args:
        text: Raw text to clean (a single string).

    Returns:
        list[str]: Lemmatised tokens in their original order.
    """
    text = re.sub(r'http\S+', '', text)  # Removes URLs that start with http
    text = re.sub(r'www\S+', '', text)   # Removes URLs that start with www

    text = text.lower()  # Lowercase
    tokens = word_tokenize(text)  # Tokenize

    # Build the stop-word set once per call. Calling stopwords.words() inside the
    # comprehension would rebuild the whole list for every token, and a set gives
    # constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

# Apply preprocessing: each row of 'processed_posts' holds the token list returned by preprocess().
df['processed_posts'] = df['posts'].apply(preprocess)

In [48]:
from sklearn.model_selection import train_test_split

# Features are the token lists; targets are the four binary dichotomy columns.
X = df['processed_posts']
y = df[['I/E', 'N/S', 'F/T', 'P/J']]
# A single split over the 4-column label frame keeps the rows of all four targets aligned,
# so separate per-label splits are not needed. 20% of the rows are held out and
# random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
# Sanity check: feature and label splits should have matching row counts.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6940,), (1735,), (6940, 4), (1735, 4))

## Create TF-IDF data and Save

In [50]:

# TF-IDF
# The posts are already token lists, so the tokenizer is the identity function;
# lowercasing is switched off because preprocess() has already lowercased the text.
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
# Fit on the training split only, then convert the result to a dense array.
# NOTE(review): .toarray() materialises the full documents-by-vocabulary matrix; with the
# shape printed below that is probably several GB of RAM — confirm this is acceptable.
tfidf_matrix_train = tfidf_vectorizer.fit_transform(X_train).toarray()
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Show TF-IDF result
print("TF-IDF Train Matrix Shape:", tfidf_matrix_train.shape)
print("TF-IDF Feature Names:", tfidf_feature_names)

# Test split: reuse the vectorizer fitted above (transform only, no refit) so the
# test matrix has the same columns as the training matrix.
tfidf_matrix_test = tfidf_vectorizer.transform(X_test).toarray()

# Show TF-IDF result
print("TF-IDF Test Matrix Shape:", tfidf_matrix_test.shape)
print("TF-IDF Feature Names:", tfidf_feature_names)



TF-IDF Train Matrix Shape: (6940, 71597)
TF-IDF Feature Names: ['aa' 'aaa' 'aaaa' ... 'ﾉｼ' 'ﾟ' 'ﾟдﾟщ']
TF-IDF Test Matrix Shape: (1735, 71597)
TF-IDF Feature Names: ['aa' 'aaa' 'aaaa' ... 'ﾉｼ' 'ﾟ' 'ﾟдﾟщ']


In [51]:
# Selecting one label column from the label frame yields a one-dimensional series.
y_train['I/E'].shape

(6940,)

In [52]:
# from sklearn.model_selection import train_test_split

# From here on X_train / X_test refer to the dense TF-IDF matrices instead of the token lists.
X_train, X_test = tfidf_matrix_train, tfidf_matrix_test

# Split the 4-column label frames into one target series per dichotomy.
y1_train, y2_train, y3_train, y4_train = (
    y_train[column] for column in ('I/E', 'N/S', 'F/T', 'P/J')
)
y1_test, y2_test, y3_test, y4_test = (
    y_test[column] for column in ('I/E', 'N/S', 'F/T', 'P/J')
)

In [53]:
# Sanity check: the training matrix and all four training label vectors share a row count.
X_train.shape, y1_train.shape, y2_train.shape, y3_train.shape, y4_train.shape

((6940, 71597), (6940,), (6940,), (6940,), (6940,))

In [54]:
# Sanity check: the test matrix and all four test label vectors share a row count.
X_test.shape, y1_test.shape, y2_test.shape, y3_test.shape, y4_test.shape

((1735, 71597), (1735,), (1735,), (1735,), (1735,))

In [55]:
# Save data
import os

# np.save does not create missing folders, so make sure both target directories
# exist before writing (a fresh checkout would otherwise fail here).
os.makedirs('../data/tfidf/train', exist_ok=True)
os.makedirs('../data/tfidf/test', exist_ok=True)

np.save('../data/tfidf/train/X_train.npy', X_train)
np.save('../data/tfidf/train/y1_train.npy', y1_train)
np.save('../data/tfidf/train/y2_train.npy', y2_train)
np.save('../data/tfidf/train/y3_train.npy', y3_train)
np.save('../data/tfidf/train/y4_train.npy', y4_train)

np.save('../data/tfidf/test/X_test.npy', X_test)
np.save('../data/tfidf/test/y1_test.npy', y1_test)
np.save('../data/tfidf/test/y2_test.npy', y2_test)
np.save('../data/tfidf/test/y3_test.npy', y3_test)
np.save('../data/tfidf/test/y4_test.npy', y4_test)

In [4]:
# Read back the arrays written by the save step (features first, then the four label vectors).
X_train, y1_train, y2_train, y3_train, y4_train = map(np.load, (
    '../data/tfidf/train/X_train.npy',
    '../data/tfidf/train/y1_train.npy',
    '../data/tfidf/train/y2_train.npy',
    '../data/tfidf/train/y3_train.npy',
    '../data/tfidf/train/y4_train.npy',
))

X_test, y1_test, y2_test, y3_test, y4_test = map(np.load, (
    '../data/tfidf/test/X_test.npy',
    '../data/tfidf/test/y1_test.npy',
    '../data/tfidf/test/y2_test.npy',
    '../data/tfidf/test/y3_test.npy',
    '../data/tfidf/test/y4_test.npy',
))

In [5]:
# Bundle each split as (features, y1, y2, y3, y4) — the order train_mbti_model_mlp unpacks.
train_inputs = (X_train, y1_train, y2_train, y3_train, y4_train)
test_inputs = (X_test, y1_test, y2_test, y3_test, y4_test)

In [6]:
# Confirm the reloaded training arrays have consistent shapes.
X_train.shape, y1_train.shape, y2_train.shape, y3_train.shape, y4_train.shape

((6940, 71597), (6940,), (6940,), (6940,), (6940,))

In [7]:
# Confirm the reloaded test arrays have consistent shapes.
X_test.shape, y1_test.shape, y2_test.shape, y3_test.shape, y4_test.shape

((1735, 71597), (1735,), (1735,), (1735,), (1735,))

In [8]:
# Labels are 0/1, so each sum is the number of label-1 rows for that dichotomy in the test split.
sum(y1_test), sum(y2_test), sum(y3_test), sum(y4_test)

(382, 246, 798, 669)

In [17]:
# Loading the data
# X_loaded = np.load('../data/tfidf/X.npy')
# y1_loaded = np.load('../data/tfidf/y1.npy')
# y2_loaded = np.load('../data/tfidf/y2.npy')
# y3_loaded = np.load('../data/tfidf/y3.npy')
# y4_loaded = np.load('../data/tfidf/y4.npy')

# X_train, X_test, y1_train, y1_test = train_test_split(X_loaded, y1_loaded, test_size=0.2, random_state=42)
# _, _, y2_train, y2_test = train_test_split(X_loaded, y2_loaded, test_size=0.2, random_state=42)
# _, _, y3_train, y3_test = train_test_split(X_loaded, y3_loaded, test_size=0.2, random_state=42)
# _, _, y4_train, y4_test = train_test_split(X_loaded, y4_loaded, test_size=0.2, random_state=42)

/Users/kzzzz/Desktop/24Spring/Deep Learning/Project/Detecting_Your_MBTI_Using_NLP/code


## MLP

In [3]:
import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Conv1D, GlobalMaxPooling1D, Dropout, Reshape, Flatten
import numpy as np

2024-05-04 16:08:15.815951: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
def _make_head(name):
    """Build one binary-classification head: Dense(128, relu) -> Dense(1, sigmoid)."""
    return Sequential([
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')
    ], name=name)


input_shape = X_train.shape[1]
# Pass the per-sample shape as a 1-tuple: a bare int is rejected by newer Keras releases.
inputs = Input(shape=(input_shape,))

# Trunk shared by all four dichotomy classifiers.
shared_layers = Sequential([
    Dense(1024, activation='relu'),
    Dense(256, activation='relu')
])
shared_output = shared_layers(inputs)

# The head names double as output names: the loss/metric dicts below and the label
# dicts passed to fit()/evaluate() are keyed by them.
output1_layers = _make_head('output1')
output2_layers = _make_head('output2')
output3_layers = _make_head('output3')
output4_layers = _make_head('output4')

# Connect each sequential output to the shared output
output1 = output1_layers(shared_output)
output2 = output2_layers(shared_output)
output3 = output3_layers(shared_output)
output4 = output4_layers(shared_output)

mlp_model = Model(inputs=inputs, outputs=[output1, output2, output3, output4], name='mlp_model')
mlp_model.compile(optimizer='adam',
              loss={'output1': 'binary_crossentropy', 'output2': 'binary_crossentropy',
                    'output3': 'binary_crossentropy', 'output4': 'binary_crossentropy'},
              metrics={'output1': 'accuracy', 'output2': 'accuracy',
                       'output3': 'accuracy', 'output4': 'accuracy'})


In [15]:
from sklearn.model_selection import train_test_split

def train_mbti_model_mlp(model, train_inputs, test_inputs, epochs=3, batch_size=64):
    """Fit the four-output model, evaluate it on the test split and return its predictions.

    Args:
        model: Compiled model exposing ``fit``, ``evaluate`` and ``predict``, whose
            outputs are named 'output1' .. 'output4'.
        train_inputs: Tuple ``(X_train, y1_train, y2_train, y3_train, y4_train)``.
        test_inputs: Tuple ``(X_test, y1_test, y2_test, y3_test, y4_test)``.
        epochs: Number of training epochs (defaults to 3).
        batch_size: Mini-batch size used while fitting (defaults to 64).

    Returns:
        tuple: ``(y_predict_all, test_scores)``. ``y_predict_all`` is the 4-tuple of
        per-output predictions for ``X_test``; ``test_scores`` is whatever
        ``model.evaluate`` returned.
    """
    X_train, y1_train, y2_train, y3_train, y4_train = train_inputs
    X_test, y1_test, y2_test, y3_test, y4_test = test_inputs

    # Labels are passed as dicts keyed by output name so each head gets its own target.
    model.fit(x=X_train,
              y={'output1': y1_train, 'output2': y2_train, 'output3': y3_train, 'output4': y4_train},
              epochs=epochs,
              batch_size=batch_size)
    test_scores = model.evaluate(x=X_test,
                                 y={'output1': y1_test, 'output2': y2_test, 'output3': y3_test, 'output4': y4_test},
                                 verbose=1)

    # predict() yields one prediction array per output head, in output order.
    y1_predict, y2_predict, y3_predict, y4_predict = model.predict(X_test)
    y_predict_all = (y1_predict, y2_predict, y3_predict, y4_predict)

    print(f'Test Scores: {test_scores}')
    return y_predict_all, test_scores

In [16]:
y_predict_all, results = train_mbti_model_mlp(mlp_model, train_inputs, test_inputs)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Test Scores: [3.0573952198028564, 0.7825258374214172, 0.5729391574859619, 0.723351001739502, 0.978579580783844, 0.8409221768379211, 0.8783861398696899, 0.8259366154670715, 0.7273775339126587]


In [17]:
# The score list is read as: total loss, then four per-output losses, then four per-output accuracies.
loss_total = results[0]
loss_output1, loss_output2, loss_output3, loss_output4 = (results[i] for i in range(1, 5))
acc_output1, acc_output2, acc_output3, acc_output4 = (results[i] for i in range(5, 9))

# Unweighted mean of the four per-dichotomy accuracies.
average_accuracy = (acc_output1 + acc_output2 + acc_output3 + acc_output4) / 4
print((acc_output1, acc_output2, acc_output3, acc_output4))
print(f'Average Accuracy: {average_accuracy}')

(0.8409221768379211, 0.8783861398696899, 0.8259366154670715, 0.7273775339126587)
Average Accuracy: 0.8181556165218353


In [18]:
def calculate_accuracy(y_true, y_pred):
    """Return the fraction of rows in which every predicted label equals the true label."""
    # A row counts as correct only when all of its columns agree.
    row_is_exact_match = np.all(y_true == y_pred, axis=1)
    return np.mean(row_is_exact_match)

def threshold_prediction(value, threshold):
    """Map a score to a hard label: 1 when it reaches the threshold, otherwise 0."""
    return 1 if value >= threshold else 0

y1_predict, y2_predict, y3_predict, y4_predict = y_predict_all
threshold = 0.5

# Vectorised thresholding: scores >= threshold become 1, the rest 0. ravel() flattens
# each head's predictions to one label per sample.
y1_predict_thresholded = (np.asarray(y1_predict) >= threshold).astype(int).ravel()
y2_predict_thresholded = (np.asarray(y2_predict) >= threshold).astype(int).ravel()
y3_predict_thresholded = (np.asarray(y3_predict) >= threshold).astype(int).ravel()
y4_predict_thresholded = (np.asarray(y4_predict) >= threshold).astype(int).ravel()

# np.asarray makes the reshape work whether the labels are NumPy arrays or pandas Series.
y1_test_reshaped = np.asarray(y1_test).reshape(-1, 1)
y2_test_reshaped = np.asarray(y2_test).reshape(-1, 1)
y3_test_reshaped = np.asarray(y3_test).reshape(-1, 1)
y4_test_reshaped = np.asarray(y4_test).reshape(-1, 1)

# One row per sample, one column per dichotomy, for both predictions and ground truth.
y_pred_combined = np.column_stack((y1_predict_thresholded,
                                   y2_predict_thresholded,
                                   y3_predict_thresholded,
                                   y4_predict_thresholded))

y_test_combined = np.hstack((y1_test_reshaped,
                             y2_test_reshaped,
                             y3_test_reshaped,
                             y4_test_reshaped))

accuracy = calculate_accuracy(y_test_combined, y_pred_combined)

In [19]:
# Exact-match accuracy: a sample counts only when all four dichotomies are predicted correctly.
print(f'Accuracy across all four categories: {accuracy}')

Accuracy across all four categories: 0.47780979827089337
