Get spacy and gensim:

``conda install -c conda-forge spacy``\
``python -m spacy download en_core_web_lg``\
``conda install -c conda-forge gensim``


In [1]:
import pandas as pd

df = pd.read_csv('mbti_data\mbti_1.csv')
# split_posts = df['posts'].to_string().split('|||')

In [2]:
import numpy as np

# break up each row into its constituent sentences
# drop any row that has more than 50 sentences (these often have ||||||| in them)
l = df['posts'].values.tolist()
targets = df['type'].values.tolist()
new_list_1 = [x.split('|||') for x in l]

for i in range(len(new_list_1)):
    new_list_1[i].insert(0, targets[i])
    
new_list = [x for x in new_list_1 if len(x) <= 51]
df_mbti = pd.DataFrame(new_list)


In [7]:
from sklearn.model_selection import train_test_split

text_X_train, text_X_test, text_y_train, text_y_test = train_test_split(df_mbti.drop(columns=[0]),
                                                    df_mbti[0],
                                                    test_size = 0.2,
                                                    random_state = 123)

# reshape train data so each sentence is on its own row, remove Nones
text_X_train = text_X_train.to_numpy().reshape(-1,1)
text_y_train = np.c_[[text_y_train for i in range(50)]].T.reshape(-1,1)
text_train_data = np.c_[text_y_train, text_X_train]
text_train_data = text_train_data[text_train_data[:,1] != None]

text_X_test = text_X_test.to_numpy().reshape(-1,1)
text_y_test = np.c_[[text_y_test for i in range(50)]].T.reshape(-1,1)
text_test_data = np.c_[text_y_test, text_X_test]
text_test_data = text_test_data[text_test_data[:,1] != None]


print(text_test_data)

[['ISFP'
  "Now that you mention it, I see a lot of Christian Grey style ENTJ-ness in Narcisse (I'm pretty sure Anastasia was also an INFJ). Their overall relationship quite reminds me of Fifty Shades, which is..."]
 ['ISFP'
  "I'm a high school senior so don't judge me! I get to school like an hour early and I head to the art room but it's locked so I knock on the office door to see if the teacher's inside. My teacher..."]
 ['ISFP'
  "Definitely ENTP! Don't over think this. It takes one to know one."]
 ...
 ['INTP'
  "Not yet. I could do it. But I most match with Ne and Se what just can't work. .-."]
 ['INTP' "I've read very much about it. xD"]
 ['INTP'
  "At first I thoght I'm an ISTJ, but I'm not sure anymore. But there is so much N. I'm very creative, esspecially when writing music. I have soo many ideas for covers when I'm listening to music. I...'"]]


In [2]:

import spacy

# load language model
# nlp_model = spacy.load('en_core_web_sm') # 96 features
nlp_model = spacy.load('en_core_web_lg') # 300 features

text = "knock knocked bring brought"

doc = nlp_model(text)

print(doc[0].vector[:10])
print(doc.vector[:10])

from scipy.spatial.distance import cosine
knock = doc[0].vector
knocked = doc[1].vector
bring = doc[2].vector
brought = doc[3].vector

print()
print(knock.shape)

print(brought @ brought.T)
brought_2 = knocked - knock + bring
print(brought_2 @ brought_2.T)
print(brought_2 @ brought.T)


print(cosine(brought, brought))
print(cosine(brought_2, brought))
print(cosine(bring, brought))
print(cosine(knocked, brought))
print(cosine(knock, knocked))



[-0.48385    0.067619  -0.11923    0.29677   -0.0024814 -0.43181
 -0.15166    0.10901    0.35108    1.7757   ]
[-0.23949926 -0.00695597 -0.06731676  0.24106626 -0.01212585 -0.3377375
 -0.11993999  0.08199837 -0.0297675   2.4574    ]

(300,)
21.791866
30.923595
19.217073
0
0.2597215175628662
0.2976776957511902
0.5519660711288452
0.24341750144958496


In [3]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]

num_features = 300
model = Doc2Vec(documents, vector_size=num_features, window=2, min_count=1, workers=4)

from gensim.test.utils import get_tmpfile


fname = get_tmpfile("my_doc2vec_model")


model.save(fname)

model = Doc2Vec.load(fname)  # you can continue training with the loaded model!


vector2 = model.infer_vector(["'Dear INTP, I enjoyed our conversation the other day. Esoteric gabbing about the nature of the un..."])
vector3 = model.infer_vector(["'Good one _____ https://www.youtube.com/watch?v=fHiGbolFFGw|||Of course, to which I say I know; t..."])
vector4 = model.infer_vector(["'I'm finding the lack of me in these posts very alarming.|||Sex can be boring if it's in the same po..."])


print(vector4.shape)
print(vector3)
print(vector2)


(300,)
[-1.29316410e-03  5.33098006e-04 -2.77362065e-04 -4.31931025e-04
  7.84529431e-04 -1.64688006e-03 -1.02754484e-03 -1.61585526e-03
 -4.81061346e-04  1.14485261e-03 -5.94238169e-04 -1.47988205e-03
  1.08578883e-03 -1.36727991e-03 -1.08169345e-03  9.51017952e-04
  1.25757535e-03 -1.68117083e-04  1.47246674e-03  1.22477091e-03
  1.20116470e-04  1.05490486e-04 -4.52334993e-04 -1.20382314e-03
 -2.99960288e-04  3.90275323e-04 -1.56172470e-03  8.74682853e-04
 -9.18047212e-04  2.58347398e-04  5.96967759e-04  1.59238791e-03
 -3.32279102e-04  4.42817021e-04  1.11137633e-03  3.98861564e-04
  6.49326248e-04  8.27140582e-04 -8.43588088e-04  3.04858288e-04
 -1.30302319e-03  2.97580555e-04 -9.19331738e-04 -1.48688024e-03
  1.06117921e-03 -2.26273434e-04 -6.94630435e-04 -1.56279723e-03
 -2.68328178e-04  3.10669158e-04 -5.29445533e-04 -1.20211521e-03
  2.89882038e-04  1.29755621e-03 -1.59570330e-03  1.29298365e-03
 -1.51893951e-03  9.00059938e-04  2.20980255e-05  1.27322460e-03
 -6.34803670e-04 -