# Libraries

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from numpy.random import random_sample
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import category_encoders as ce

In [3]:
ps = PorterStemmer()

# Open CSV

In [4]:
data = pd.read_csv('mbti_1.csv')

In [5]:
data

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


# Cleaning text

In [7]:
def cleaning_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in stopwords.words ('english')]
    text = ' '.join(text)
    return text

In [18]:
corpus = []
for row in range(0,100):
    res = cleaning_text(data['posts'][row])
    corpus.append(res)

In [19]:
corpus

['http www youtub com watch v qsxhcwe krw http media tumblr com tumblr lfouy pma qa rooo jpg enfp intj moment http www youtub com watch v iz le g xm sportscent top ten play http www youtub com watch v ucdfz etec prank life chang experi life http www youtub com watch v vxzeywwrdw http www youtub com watch v u ejam dp e repeat today may perc experi immers last thing infj friend post facebook commit suicid next day rest peac http vimeo com hello enfj sorri hear distress natur relationship perfect time everi moment exist tri figur hard time time growth http wallpaperpass com upload friendship boy girl wallpap jpg http asset dornob com wp content upload round home design jpg welcom stuff http playeress com wp content upload red red pokemon master jpg game set match prozac wellbrutin least thirti minut move leg mean move sit desk chair weed moder mayb tri edibl healthier altern basic come three item determin type whichev type want would like use given type cognit function whatnot left thing 

# X & y

In [20]:
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = data.iloc[:,0].values

In [21]:
print(X)
print(y)

[[0 0 0 ... 1 0 0]
 [0 0 0 ... 0 7 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['INFJ' 'ENTP' 'INTP' ... 'INTP' 'INFP' 'INFP']


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

ValueError: Found input variables with inconsistent numbers of samples: [100, 8675]

# Naive Bayes

In [23]:
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

NameError: name 'X_train' is not defined

In [24]:
accuracy_score(y_test, y_pred)

NameError: name 'y_test' is not defined

## One hot encoder

In [26]:
data['type'].describe()

count     8675
unique      16
top       INFP
freq      1832
Name: type, dtype: object

In [30]:
encoder = ce.OneHotEncoder(cols='type', use_cat_names=True)

In [32]:
train, Test = train_test_split(data, random_state=42)

In [33]:
train_encoded = encoder.fit_transform(train)
test_encoded = encoder.transform(Test)

In [34]:
test_encoded

Unnamed: 0,type_ENFP,type_INFP,type_INTJ,type_INFJ,type_ISFP,type_INTP,type_ISTJ,type_ESTJ,type_ENTP,type_ENFJ,type_ISTP,type_ENTJ,type_ISFJ,type_ESFJ,type_ESFP,type_ESTP,posts
2802,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,'This. When I lie it's to avoid an unreasonabl...
2166,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,'I said zero flexibility and little time for d...
1919,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,'This has to be written with bias or something...
360,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,'HAuhuHAuh You might be right Muhicz. I'm just...
1115,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,'Her parents are kind of the go to college = g...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6401,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,"'In general, I think you should do what makes ..."
1033,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"'Dear ISTPs, 7000 posts and this doesn't qua..."
6652,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,'RSPCA called to rescue lizard that turns out ...
3541,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,'It doesn't really sound like you read him wro...
