In [397]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd

# Import all we need from sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

from sklearn import metrics
from sklearn.utils import shuffle

import nltk
import re
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('wordnet')

import spacy
import scipy

pd.set_option('max_colwidth', 400)

nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package wordnet to /Users/ilya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [398]:
df = pd.read_csv('/Users/ilya/Downloads/mbti_1.csv')

df.loc[:,'IE'] = [x[0:1] for x in df['type']]
df.loc[:,'NS'] = [x[1:2] for x in df['type']]
df.loc[:,'TF'] = [x[2:3] for x in df['type']]
df.loc[:,'JP'] = [x[3:4] for x in df['type']]
df.head()

Unnamed: 0,type,posts,IE,NS,TF,JP
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments https://www.youtube.com/watch?v=iz7lE1g4XM4 sportscenter not top ten plays https://www.youtube.com/watch?v=uCdfze1etec pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8 http://www.youtube.com/...,I,N,F,J
1,ENTP,'I'm finding the lack of me in these posts very alarming.|||Sex can be boring if it's in the same position often. For example me and my girlfriend are currently in an environment where we have to creatively use cowgirl and missionary. There isn't enough...|||Giving new meaning to 'Game' theory.|||Hello *ENTP Grin* That's all it takes. Than we converse and they do most of the flirting while I ...,E,N,T,P
2,INTP,"'Good one _____ https://www.youtube.com/watch?v=fHiGbolFFGw|||Of course, to which I say I know; that's my blessing and my curse.|||Does being absolutely positive that you and your best friend could be an amazing couple count? If so, than yes. Or it's more I could be madly in love in case I reconciled my feelings (which at...|||No, I didn't; thank you for a link!|||So-called Ti-Si loop (an...",I,N,T,P
3,INTJ,"'Dear INTP, I enjoyed our conversation the other day. Esoteric gabbing about the nature of the universe and the idea that every rule and social code being arbitrary constructs created...|||Dear ENTJ sub, Long time no see. Sincerely, Alpha|||None of them. All other types hurt in deep existential ways that I want no part of.|||Probably a sliding scale that depends on individual preferences...",I,N,T,J
4,ENTJ,'You're fired.|||That's another silly misconception. That approaching is logically is going to be the key to unlocking whatever it is you think you are entitled to. Nobody wants to be approached with BS...|||But guys... he REALLY wants to go on a super-duper-long-ass vacation. C'mon guys. His boss just doesn't listen or get it. He even approached him logically and everything.|||Never mind. J...,E,N,T,J


In [399]:
df.loc[:,'html_link'] = [post.count('html') for post in df['posts']]
df.loc[:,'words_per_comment'] = [len(post.split(' '))/50 for post in df['posts']]
df.loc[:,'exclam_mark'] = [post.count('!') for post in df['posts']]
df.loc[:,'question_mark'] = [post.count('?') for post in df['posts']]
df.head(4)

Unnamed: 0,type,posts,IE,NS,TF,JP,html_link,words_per_comment,exclam_mark,question_mark
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments https://www.youtube.com/watch?v=iz7lE1g4XM4 sportscenter not top ten plays https://www.youtube.com/watch?v=uCdfze1etec pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8 http://www.youtube.com/...,I,N,F,J,0,11.56,3,18
1,ENTP,'I'm finding the lack of me in these posts very alarming.|||Sex can be boring if it's in the same position often. For example me and my girlfriend are currently in an environment where we have to creatively use cowgirl and missionary. There isn't enough...|||Giving new meaning to 'Game' theory.|||Hello *ENTP Grin* That's all it takes. Than we converse and they do most of the flirting while I ...,E,N,T,P,0,23.88,0,5
2,INTP,"'Good one _____ https://www.youtube.com/watch?v=fHiGbolFFGw|||Of course, to which I say I know; that's my blessing and my curse.|||Does being absolutely positive that you and your best friend could be an amazing couple count? If so, than yes. Or it's more I could be madly in love in case I reconciled my feelings (which at...|||No, I didn't; thank you for a link!|||So-called Ti-Si loop (an...",I,N,T,P,0,17.38,4,12
3,INTJ,"'Dear INTP, I enjoyed our conversation the other day. Esoteric gabbing about the nature of the universe and the idea that every rule and social code being arbitrary constructs created...|||Dear ENTJ sub, Long time no see. Sincerely, Alpha|||None of them. All other types hurt in deep existential ways that I want no part of.|||Probably a sliding scale that depends on individual preferences...",I,N,T,J,0,22.02,3,11


In [400]:
ie_df = df.groupby('IE').count().reset_index()[['IE','type']].rename(columns = {'type':'count','IE':'type'})
ie_df['group'] = 'IE'

ns_df = df.groupby('NS').count().reset_index()[['NS','type']].rename(columns = {'type':'count','NS':'type'})
ns_df['group'] = 'NS'

tf_df = df.groupby('TF').count().reset_index()[['TF','type']].rename(columns = {'type':'count','TF':'type'})
tf_df['group'] = 'TF'

jp_df = df.groupby('JP').count().reset_index()[['JP','type']].rename(columns = {'type':'count','JP':'type'})
jp_df['group'] = 'JP'

stat_df = ie_df.append(ns_df).append(tf_df).append(jp_df)

stat_df.loc[:,'percentage'] = [round(x/len(df)*100,2) for x in stat_df['count']]

fig = px.bar(stat_df, x="group", y="count", color="type", title="Long-Form Input", text = 'percentage')

fig.update_layout(height = 600, width = 800)

fig.show()

In [401]:
def multiple_replace(dict, text):
  # Create a regular expression  from the dictionary keys
  regex = re.compile("(%s)" % "|".join(map(re.escape, dict.keys())))

  # For each match, look-up corresponding value in dictionary
  return regex.sub(lambda mo: dict[mo.string[mo.start():mo.end()]], text)

def lemmatize_string(item):
    #lemmatizer = WordNetLemmatizer()
    #return ' '.join([lemmatizer.lemmatize(w) for w in item.split(' ')])
    doc = nlp(item)
    return ' '.join([token.lemma_ for token in doc if token.lemma_ != '-PRON-'])

def stem_string(item):
    ps = PorterStemmer()
    return ' '.join([ps.stem(w) for w in item.split(' ')])

pronouns = ['i','you','he','she','it','we','they','there','here','that']
verbs = ['do','don','does','doesn','is','isn','are','aren','can','will','won','did','didn','would','wouldn','could','couldn']
contractions = ['s','ve','t','re','m']

contraction_list = [(p+'\''+c) for c in contractions for p in pronouns]+[(v+'\''+c) for c in contractions for v in verbs]

contraction_dict = {}

for key in contraction_list:
    contraction_dict[key] = ''

In [402]:
def post_clean(item):

    pattern = re.compile(r'http.[^|]*')
    string = re.sub(pattern, '', item)

    pattern = re.compile(r'\|\|\|')
    string = re.sub(pattern, '', string)
    
    pattern = re.compile(r'[0-9]')
    string = re.sub(pattern, '', string).lower()

    mtbi_acrynym_dict = {'istj':'','isfj':'',
                         'infj':'','intj':'',
                         'istp':'','isfp':'',
                         'infp':'','intp':'',
                         'estp':'','esfp':'',
                         'enfp':'','entp':'',
                         'estj':'','esfj':'',
                         'enfj':'','entj':''}

    string = multiple_replace(mtbi_acrynym_dict, string)

    #string6 = lemmatize(string5)
    string = multiple_replace(contraction_dict, string)

    all_punctuation_list = '''!()-[]{};:"\|,+<>./?@#$%^&*_~='''

    punct_dict = dict()

    for x in all_punctuation_list:
        punct_dict[x] = ' '

    string = multiple_replace(punct_dict, string)

    return string

In [403]:
df['posts'] = df['posts'].apply(lambda x: post_clean(x))
print('Cleaning done!')

df.loc[:,'IE_hot'] = [1 if x == 'I' else 0 for x in df['IE']]
df.loc[:,'NS_hot'] = [1 if x == 'N' else 0 for x in df['NS']]
df.loc[:,'TF_hot'] = [1 if x == 'T' else 0 for x in df['TF']]
df.loc[:,'JP_hot'] = [1 if x == 'J' else 0 for x in df['JP']]
df.head(4)

Cleaning done!


Unnamed: 0,type,posts,IE,NS,TF,JP,html_link,words_per_comment,exclam_mark,question_mark,IE_hot,NS_hot,TF_hot,JP_hot
0,INFJ,' and moments what has been the most life changing experience in your life may the perc experience immerse you the last thing my friend posted on his facebook before committing suicide the next day rest in peace hello sorry to hear of your distress only natural for a relationship to not be perfection all the time in every moment of existence try to figure the hard times as times of...,I,N,F,J,0,11.56,3,18,1,1,0,1
1,ENTP,' finding the lack of me in these posts very alarming sex can be boring if in the same position often for example me and my girlfriend are currently in an environment where we have to creatively use cowgirl and missionary there enough giving new meaning to 'game' theory hello grin all it takes than we converse and they do most of the flirting while i acknowledge their presence and ...,E,N,T,P,0,23.88,0,5,0,1,1,0
2,INTP,'good one of course to which i say i know my blessing and my curse does being absolutely positive that you and your best friend could be an amazing couple count if so than yes or more i could be madly in love in case i reconciled my feelings which at no i thank you for a link so called ti si loop and it can stem from any current topic obsession can be deadly like wh...,I,N,T,P,0,17.38,4,12,1,1,1,0
3,INTJ,'dear i enjoyed our conversation the other day esoteric gabbing about the nature of the universe and the idea that every rule and social code being arbitrary constructs created dear sub long time no see sincerely alphanone of them all other types hurt in deep existential ways that i want no part of probably a sliding scale that depends on individual preferences like everything...,I,N,T,J,0,22.02,3,11,1,1,1,1


In [349]:
df[['html_link', 'words_per_comment', 'exclam_mark', 'question_mark', 'IE_hot']].corr()

Unnamed: 0,html_link,words_per_comment,exclam_mark,question_mark,IE_hot
html_link,1.0,-0.022849,-0.001913,0.038555,-0.016849
words_per_comment,-0.022849,1.0,0.051825,-0.113651,-0.006831
exclam_mark,-0.001913,0.051825,1.0,0.090001,-0.141437
question_mark,0.038555,-0.113651,0.090001,1.0,-0.040108
IE_hot,-0.016849,-0.006831,-0.141437,-0.040108,1.0


In [203]:
counter = 0

post_list = []

for post in df['posts']:
    post_list.append(lemmatize_string(post))
    counter = counter+1
    if counter%100 == 0:
        print(counter, end = ' ')

100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 

In [409]:
df['posts'] = post_list
df = df[['posts','html_link','words_per_comment','exclam_mark', 'question_mark', 'IE_hot','NS_hot','TF_hot','JP_hot']]

df.loc[:,'html_link'] = MinMaxScaler(feature_range=(0,1)).fit_transform(df[['html_link']])
df.loc[:,'words_per_comment'] = MinMaxScaler(feature_range=(0,1)).fit_transform(df[['words_per_comment']])
df.loc[:,'exclam_mark'] = MinMaxScaler(feature_range=(0,1)).fit_transform(df[['exclam_mark']])
df.loc[:,'question_mark'] = MinMaxScaler(feature_range=(0,1)).fit_transform(df[['question_mark']])

df = shuffle(df)

train_df = df.iloc[:round(len(df)*0.8)]
test_df = df.iloc[round(len(df)*0.8):]

train_df.head(4)

Unnamed: 0,posts,html_link,words_per_comment,exclam_mark,question_mark,IE_hot,NS_hot,TF_hot,JP_hot
4272,hi i d like to change username to fuliajulia thank kitteh hey so i haven t see anybody talk about but ej arendee delete channel not start this thread to speculate about mental health or anything but as be talk about in damnit why ya always get to be so soul searchy be difficult to insult because i want to hurt damn aura of innocence i be interested in what type...,0.0,0.666667,0.018265,0.090909,1,1,0,0
3325,well honestly would like and i know that be go to heaven when i die and go to live in eternal happiness with everyone that matter most to definitely a the first one have always dream of live in a medieval chronicle of narnia type of world p girlfriend lion pet o o medium size no food kid plastic drink outta waterfall jumpi understand man b...,0.0,0.28125,0.0,0.049587,1,1,1,0
5461,can i quote mr t stay in school fool wish i would have get ged now be a semi driver money be good but not the life i want but the life i have to choose i have dyslexia and yes have be a while eylrid be drive a semi and try to learn how to repair pattern and build box for for a foundry have much time for anything anymore work hour a day wow like the quote sta...,0.0,0.575181,0.031963,0.057851,1,1,0,1
7565,i want to date someone base just on personality type a lot more to each individual i can see how someone may come to the conclusion that a particular personality type would be read look at funny stuff online do something physical find a new project i hope really just hypothetical yup this happen to a lot too i know when i be a child i read all the time ...,0.0,0.668478,0.063927,0.033058,1,1,1,0


In [415]:
# Initialize tfidf vectorizer
vectorizer = TfidfVectorizer(stop_words = 'english', max_df = 0.9, min_df = 0.1)

vector_train = vectorizer.fit_transform(train_df['posts'])
vector_test = vectorizer.transform(test_df['posts'])

train_df2 = pd.DataFrame.sparse.from_spmatrix(vector_train, columns = vectorizer.get_feature_names()).\
    merge(train_df[['html_link', 'words_per_comment', 'exclam_mark','question_mark']].reset_index(), left_index = True, right_index= True).drop(columns = 'index')

test_df2 = pd.DataFrame.sparse.from_spmatrix(vector_test, columns = vectorizer.get_feature_names()).\
    merge(test_df[['html_link', 'words_per_comment', 'exclam_mark','question_mark']].reset_index(), left_index = True, right_index= True).drop(columns = 'index')

sparse_train_df = scipy.sparse.csr_matrix(train_df2.values)
sparse_test_df = scipy.sparse.csr_matrix(test_df2.values)

In [416]:
# IE model instance
ie_classifier = XGBClassifier()
ie_classifier.fit(sparse_train_df, train_df['IE_hot'])

ns_classifier = XGBClassifier()
ns_classifier.fit(sparse_train_df, train_df['NS_hot'])

tf_classifier = XGBClassifier()
tf_classifier.fit(sparse_train_df, train_df['TF_hot'])

jp_classifier = XGBClassifier()
jp_classifier.fit(sparse_train_df, train_df['JP_hot'])

XGBClassifier()

In [417]:
# run predictons
ie_pred = ie_classifier.predict(sparse_test_df)

ns_pred = ns_classifier.predict(sparse_test_df)

tf_pred = tf_classifier.predict(sparse_test_df)

jp_pred = jp_classifier.predict(sparse_test_df)

In [383]:
def feature_importances(classifier):
    feature_importance_dict = {}
    for x,y in zip(list(train_df2.columns), list(tf_classifier.feature_importances_)):
        feature_importance_dict[x] = y
    return {k: v for k, v in sorted(feature_importance_dict.items(), key=lambda item: item[1])}

In [418]:
accuracy_metrics_df = pd.DataFrame()

accuracy_metrics_df['Metric'] = ['accuracy', 'recall', 'precison', 'f1_score']

accuracy_metrics_df.loc[:,'IE'] = [metrics.accuracy_score(test_df['IE_hot'], ie_pred),
                                   metrics.recall_score(test_df['IE_hot'], ie_pred),
                                   metrics.precision_score(test_df['IE_hot'], ie_pred),
                                   metrics.f1_score(test_df['IE_hot'], ie_pred)]

accuracy_metrics_df.loc[:,'NS'] = [metrics.accuracy_score(test_df['NS_hot'], ns_pred),
                                   metrics.recall_score(test_df['NS_hot'], ns_pred),
                                   metrics.precision_score(test_df['NS_hot'], ns_pred),
                                   metrics.f1_score(test_df['NS_hot'], ns_pred)]

accuracy_metrics_df.loc[:,'TF'] = [metrics.accuracy_score(test_df['TF_hot'], tf_pred),
                                   metrics.recall_score(test_df['TF_hot'], tf_pred),
                                   metrics.precision_score(test_df['TF_hot'], tf_pred),
                                   metrics.f1_score(test_df['TF_hot'], tf_pred)]

accuracy_metrics_df.loc[:,'JP'] = [metrics.accuracy_score(test_df['TF_hot'], jp_pred),
                                   metrics.recall_score(test_df['TF_hot'], jp_pred),
                                   metrics.precision_score(test_df['TF_hot'], jp_pred),
                                   metrics.f1_score(test_df['TF_hot'], jp_pred)]

accuracy_metrics_df

Unnamed: 0,Metric,IE,NS,TF,JP
0,accuracy,0.76196,0.864553,0.604611,0.545821
1,recall,0.998488,1.0,0.551282,0.052564
2,precison,0.762702,0.864553,0.561358,0.455556
3,f1_score,0.864812,0.927357,0.556274,0.094253


In [217]:
accuracy_metrics_df.round(2)

Unnamed: 0,Metric,IE,NS,TF,JP
0,accuracy,0.78,0.85,0.75,0.51
1,recall,0.99,1.0,0.72,0.15
2,precison,0.78,0.85,0.73,0.43
3,f1_score,0.87,0.92,0.72,0.23


In [10]:
import joblib
joblib.dump(tfidf_svc, 'mbti_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [159]:
# Write two tweets as strings, one which you want to classify as Trump and one as Trudeau
post = ['I think this would be a great move for my career if I got that offer!']

# Vectorize each tweet using the TF-IDF vectorizer's transform method
post_vectorized = tfidf_vectorizer.transform(post)

tfidf_svc.predict(post_vectorized)

array(['F'], dtype=object)

In [2]:
#this is a change