In [1]:
import nltk

DOWNLOAD_NLTK = False

if DOWNLOAD_NLTK:
    nltk.download("stopwords")
    nltk.download("wordnet")

In [2]:
import re

import nltk
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

InteractiveShell.ast_node_interactivity = "all"

In [3]:
df = pd.read_csv("/home/jason/mbti_model/mbti_1.csv")
df = df.head(100)
df.shape

(100, 2)

In [4]:
lemmatiser = WordNetLemmatizer()

# Remove the stop words for speed
useless_words = stopwords.words("english")

# Remove these from the posts
unique_type_list = [
    "INFJ",
    "ENTP",
    "INTP",
    "INTJ",
    "ENTJ",
    "ENFJ",
    "INFP",
    "ENFP",
    "ISFP",
    "ISTP",
    "ISFJ",
    "ISTJ",
    "ESTP",
    "ESFP",
    "ESTJ",
    "ESFJ",
]
unique_type_list = [x.lower() for x in unique_type_list]

In [5]:
# Splitting the MBTI personality into 4 letters and binarizing it

b_Pers = {"I": 0, "E": 1, "N": 0, "S": 1, "F": 0, "T": 1, "J": 0, "P": 1}
b_Pers_list = [{0: "I", 1: "E"}, {0: "N", 1: "S"}, {0: "F", 1: "T"}, {0: "J", 1: "P"}]


def translate_personality(personality):
    # transform mbti to binary vector
    return [b_Pers[l] for l in personality]


# To show result output for personality prediction
def translate_back(personality):
    # transform binary vector to mbti personality
    s = ""
    for i, l in enumerate(personality):
        s += b_Pers_list[i][l]
    return s


list_personality_bin = np.array([translate_personality(p) for p in df.type])
# print("Binarize MBTI list: \n%s" % list_personality_bin)

In [6]:
def pre_process_text(data, remove_stop_words=True, remove_mbti_profiles=True):
    list_personality = []
    list_posts = []
    len_data = len(data)
    i = 0

    for row in data.iterrows():
        # check code working
        # i+=1
        # if (i % 500 == 0 or i == 1 or i == len_data):
        #     print("%s of %s rows" % (i, len_data))

        # Remove and clean comments
        posts = row[1].posts

        # Remove url links
        temp = re.sub(
            "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            " ",
            posts,
        )

        # Remove Non-words - keep only words
        temp = re.sub("[^a-zA-Z]", " ", temp)

        # Remove spaces > 1
        temp = re.sub(" +", " ", temp).lower()

        # Remove multiple letter repeating words
        temp = re.sub(r"([a-z])\1{2,}[\s|\w]*", "", temp)

        # Remove stop words
        if remove_stop_words:
            temp = " ".join(
                [
                    lemmatiser.lemmatize(w)
                    for w in temp.split(" ")
                    if w not in useless_words
                ]
            )
        else:
            temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(" ")])

        # Remove MBTI personality words from posts
        if remove_mbti_profiles:
            for t in unique_type_list:
                temp = temp.replace(t, "")

        # transform mbti to binary vector
        type_labelized = translate_personality(
            row[1].type
        )  # or use lab_encoder.transform([row[1].type])[0]
        list_personality.append(type_labelized)
        # the cleaned data temp is passed here
        list_posts.append(temp)

    # returns the result
    list_posts = np.array(list_posts)
    list_personality = np.array(list_personality)
    return list_posts, list_personality


list_posts, list_personality = pre_process_text(
    df, remove_stop_words=True, remove_mbti_profiles=True
)

print("Example :")
# print("\nPost before preprocessing:\n\n", data.posts[0])
# print("\nPost after preprocessing:\n\n", list_posts[0])
print("\nMBTI before preprocessing:\n\n", df.type[0])
print("\nMBTI after preprocessing:\n\n", list_personality[0])

Example :

MBTI before preprocessing:

 INFJ

MBTI after preprocessing:

 [0 0 0 0]


In [7]:
nRow, nCol = list_personality.shape
print(f'No. of posts = {nRow}  and No. of Personalities = {nCol} ')

No. of posts = 100  and No. of Personalities = 4 


In [8]:
# Vectorizing the database posts to a matrix of token counts for the model
cntizer = CountVectorizer(analyzer="word", max_features=1000, max_df=0.7, min_df=0.1)
# the feature should be made of word n-gram
# Learn the vocabulary dictionary and return term-document matrix
print("Using CountVectorizer :")
X_cnt = cntizer.fit_transform(list_posts)

# The enumerate object yields pairs containing a count and a value (useful for obtaining an indexed list)
feature_names = list(enumerate(cntizer.get_feature_names()))
print("10 feature names can be seen below")
print(feature_names[0:10])

# For the Standardization or Feature Scaling Stage :-
# Transform the count matrix to a normalized tf or tf-idf representation
tfizer = TfidfTransformer()

# Learn the idf vector (fit) and transform a count matrix to a tf-idf representation
print("\nUsing Tf-idf :")

print("Now the dataset size is as below")
X_tfidf = tfizer.fit_transform(X_cnt).toarray()
print(X_tfidf.shape)

Using CountVectorizer :
10 feature names can be seen below
[(0, 'ability'), (1, 'able'), (2, 'absolutely'), (3, 'accurate'), (4, 'achieve'), (5, 'act'), (6, 'action'), (7, 'actually'), (8, 'add'), (9, 'admit')]

Using Tf-idf :
Now the dataset size is as below
(100, 636)


In [9]:
personality_type = [
    "IE: Introversion (I) / Extroversion (E)",
    "NS: Intuition (N) / Sensing (S)",
    "FT: Feeling (F) / Thinking (T)",
    "JP: Judging (J) / Perceiving (P)",
]

for l in range(len(personality_type)):
    print(personality_type[l])

IE: Introversion (I) / Extroversion (E)
NS: Intuition (N) / Sensing (S)
FT: Feeling (F) / Thinking (T)
JP: Judging (J) / Perceiving (P)


In [10]:
print("For MBTI personality type : %s" % translate_back(list_personality[0,:]))
print("Y : Binarized MBTI 1st row: %s" % list_personality[0,:])

For MBTI personality type : INFJ
Y : Binarized MBTI 1st row: [0 0 0 0]


In [11]:
# Posts in tf-idf representation
X = X_tfidf

In [12]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased") # "bert-large-uncased"

for l in range(len(personality_type)):

    Y = list_personality[:, l]

    x_token = [
        tokenizer.encode(str(i)) # max_length=maxlen, pad_to_max_length=False, truncation=True)
        for i in X
    ]

    
    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        np.array(x_token), Y, test_size=0.33, random_state=7
    )

    # fit model on training data
    model = XGBClassifier()
    model.fit(X_train, y_train)

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)

    print("%s Accuracy: %.2f%%" % (personality_type[l], accuracy * 100.0))

2021-07-23 20:22:05.152007: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-07-23 20:22:05.152062: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Token indices sequence length is longer than the specified maximum sequence length for this model (1788 > 512). Running this sequence through the model will result in indexing errors
  np.array(x_token), Y, test_size=0.33, random_state=7


ValueError: Please reshape the input data X into 2-dimensional matrix.

### try bert/keras

In [None]:
# import tensorflow as tf
# import transformers

# tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased") # "bert-large-uncased"

In [None]:
# def get_keras_model(x_train):
#     input_word_ids = tf.keras.layers.Input(
#         shape=(x_train.shape[1],), dtype=tf.int32, name="input_word_ids"
#     )
    
#     bert_layer = transformers.TFBertModel.from_pretrained("bert-base-uncased") # "bert-large-uncased"
#     bert_outputs = bert_layer(input_word_ids)[0]
#     pred = tf.keras.layers.Dense(1, activation="sigmoid")(bert_outputs[:, 0, :])
# #     pred = tf.keras.layers.Dense(16, activation="softmax")(bert_outputs[:, 0, :])

#     model = tf.keras.models.Model(inputs=input_word_ids, outputs=pred)
#     loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
#     metrics = tf.metrics.BinaryAccuracy()
#     model.compile(
#         loss=loss,
#         optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
#         metrics=metrics,
#     )
#     return model

In [None]:
# batch_size = 16
# maxlen = 512

# for l in range(len(personality_type)):

#     Y = list_personality[:, l]

#     x_token = [
#         tokenizer.encode(str(i), max_length=maxlen, pad_to_max_length=False, truncation=True)
#         for i in X
#     ]

    
#     # split data into train and test sets
#     X_train, X_test, y_train, y_test = train_test_split(
#         np.array(x_token), Y, test_size=0.33, random_state=7
#     )

#     model = get_keras_model(X_train)
#     model.summary()

#     print("Started training")
#     model.fit(
#         X_train,
#         y_train,
#         validation_data=(X_test, y_test),
#         verbose=1,
#         epochs=20,
#         batch_size=batch_size,
#         callbacks=[tf.keras.callbacks.EarlyStopping(patience=5)],
#     )

#     # make predictions for test data
#     y_pred = model.predict(X_test)
#     predictions = [round(value) for value in y_pred]
#     # evaluate predictions
#     accuracy = accuracy_score(y_test, predictions)

#     print("%s Accuracy: %.2f%%" % (personality_type[l], accuracy * 100.0))
#     break