In [43]:
import pandas as pd
import numpy as np 
from keras.preprocessing.text import text_to_word_sequence, one_hot, hashing_trick, Tokenizer


# Data Prep

We start by loading the data and splitting each user's posts into individual comments, so that every comment becomes its own data point.


In [44]:
mbti_data_original = pd.read_csv('mbti_data.csv')

# Each input row is one user: column 0 is stored as that user's 'type' label and
# column 1 holds all of the user's posts joined together with '|||'.
# Build one small frame per user and concatenate once at the end; calling
# pd.concat inside the loop re-copies every previously collected row on each
# iteration, which is quadratic in the number of users.
per_user_frames = []
for user_id in range(len(mbti_data_original)):
    p_type = mbti_data_original.iloc[user_id, 0]
    row_data = mbti_data_original.iloc[user_id, 1].split('|||')  # The ||| is how the data is split in the file
    per_user_frames.append(
        pd.DataFrame({'type': p_type, 'comment': row_data, 'user': user_id})
    )

# The index is deliberately not reset: it restarts at 0 for every user, so each
# comment keeps its position within that user's list of posts.
# pd.concat raises on an empty list, so fall back to an empty frame.
mbti_split = pd.concat(per_user_frames) if per_user_frames else pd.DataFrame()

mbti_split.shape


(422845, 3)

Next, we remove URLs from the comments, split each comment into a sequence of words, count the total number of unique words, and then use the md5 hash function to map each word to an integer.

In [45]:
import re

# URL-like text: http(s)://, www. or domain.tld/ prefixes, allowing balanced
# parentheses inside the URL. Compiled once instead of once per comment.
url_pattern = re.compile(
    r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
    flags=re.MULTILINE,
)

#remove url comments since they won't relate well to the yelp data
mbti_split['comment'] = mbti_split['comment'].apply(lambda text: url_pattern.sub('', text))

# Comments that consisted only of a URL are now empty strings; mark them as
# missing and drop them. The replacement is assigned back to the column:
# calling replace(..., inplace=True) on the column selection is chained
# assignment and is not guaranteed to write through to mbti_split.
mbti_split['comment'] = mbti_split['comment'].replace('', np.nan)
mbti_split.dropna(subset=['comment'], inplace=True)

#vectorise
mbti_split['seq_comment'] = mbti_split['comment'].apply(text_to_word_sequence)

# Count distinct words with a set. The previous apply(pd.Series).stack() built
# one Series per comment and a very wide intermediate frame just to count them.
vocab_size = len({word for seq in mbti_split['seq_comment'] for word in seq})

# The hash space is 1.5x the vocabulary size (presumably to reduce collisions).
# hashing_trick is given the raw comment text, not the pre-split word list.
hash_space = round(vocab_size * 1.5)
mbti_split['vectorised'] = mbti_split['comment'].apply(
    lambda text: hashing_trick(text, hash_space, hash_function='md5')
)


In [50]:

# Preview the first 200 rows to spot-check the tokenised ('seq_comment') and hashed ('vectorised') columns.
mbti_split.head(200)


Unnamed: 0,type,comment,user,seq_comment,vectorised
0,INFJ,',0,['],[73200]
2,INFJ,enfp and intj moments sportscenter not top ...,0,"[enfp, and, intj, moments, sportscenter, not, ...","[36362, 147875, 92936, 81320, 67477, 47566, 13..."
3,INFJ,What has been the most life-changing experienc...,0,"[what, has, been, the, most, life, changing, e...","[97637, 175511, 33942, 50366, 18468, 102440, 1..."
4,INFJ,On repeat for most of today.,0,"[on, repeat, for, most, of, today]","[135915, 88078, 66714, 18468, 28759, 33041]"
5,INFJ,May the PerC Experience immerse you.,0,"[may, the, perc, experience, immerse, you]","[125654, 50366, 167561, 136920, 71501, 40536]"
6,INFJ,The last thing my INFJ friend posted on his fa...,0,"[the, last, thing, my, infj, friend, posted, o...","[50366, 121498, 7594, 59001, 484, 57573, 17385..."
7,INFJ,Hello ENFJ7. Sorry to hear of your distress. I...,0,"[hello, enfj7, sorry, to, hear, of, your, dist...","[25695, 19303, 101691, 118588, 157170, 28759, ..."
8,INFJ,84389 84390 ...,0,"[84389, 84390]","[150130, 88496]"
9,INFJ,Welcome and stuff.,0,"[welcome, and, stuff]","[45424, 147875, 78425]"
10,INFJ,Game. Set. Match.,0,"[game, set, match]","[119708, 116879, 148401]"


In [47]:
# NOTE(review): the frame's index is written as an unnamed first column, and the
# list-valued 'seq_comment' / 'vectorised' columns are written as text — confirm
# that whatever reads this file back handles both.
mbti_split.to_csv('mbti_data_preprocessed.csv') # cache the preprocessed frame on disk so the split/clean steps need not be re-run

In [48]:
#Export a vocab list to be combined with the yelp vocab
# Collect every distinct word once, in order of first appearance. dict.fromkeys
# keeps insertion order, matching what apply(pd.Series).stack().unique()
# returned, without building a Series per comment.
unique_words = dict.fromkeys(
    word for seq in mbti_split['seq_comment'] for word in seq
)
# Keep `words` as a 1-D object array, the same type .unique() produced.
words = np.array(list(unique_words), dtype=object)
# One word per line; with a single column the delimiter is never emitted.
np.savetxt("vocab_list_mbti.csv", words, delimiter=",", fmt='%s')