Initial Tests!
==========

Imports and preprocessing
---------------------------------------

In [1]:
# add modules here
import re
import copy
# import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk

# sklearn utilities

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# sklearn models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [2]:
# Lookup table from four-letter MBTI type to its integer class label.
# Labels run 1..16 in the order the types are listed below.
mbti_dict = {
    mbti_type: label
    for label, mbti_type in enumerate(
        (
            "ISTJ", "ISFJ", "INFJ", "INTJ",
            "ISTP", "ISFP", "INFP", "INTP",
            "ESTP", "ESFP", "ENFP", "ENTP",
            "ESTJ", "ESFJ", "ENFJ", "ENTJ",
        ),
        start=1,
    )
}

In [3]:
# Read every line of the CSV into memory. The context manager closes the
# file handle as soon as the lines have been read.
# NOTE(review): no header row is skipped here; a header line would become a
# bogus record with an unknown label -- confirm the file has none.
with open("mbti_1.csv", "r", encoding="utf-8") as mbti_file:
    mbti_rows = mbti_file.readlines()

# Split each CSV line into [label, posts]: the first four characters are the
# type label, index 4 (presumably the separating comma) is skipped, and the
# remainder of the line is the raw post text.
mbti_data_list = [[row[0:4], row[5:]] for row in mbti_rows]

#Process posts

# Anything that is not an ASCII letter, a space or an apostrophe.
alpha_match = re.compile('[^a-zA-Z \']')
# Any whitespace-delimited token containing "http" (links), for removal.
# Raw string: the same pattern as before, without the invalid "\s" escape
# that a normal string literal warns about.
link_match = re.compile(r"\b[^\s]*(http)[^\s]+\b")
# Runs of one or more spaces, collapsed to a single space.
multispace_match = re.compile(r" +")


def clean_posts(posts_raw):
    """Normalise one record's raw post text into space-separated lowercase words.

    Steps, in order: casefold; replace the '|||' separator with a space; pad
    with a space on each side; blank out link tokens; blank out every
    character that is not an ASCII letter, space or apostrophe; delete
    apostrophes (so "don't" becomes "dont"); collapse repeated spaces.

    Parameters
    ----------
    posts_raw : str
        The raw text portion of one CSV line.

    Returns
    -------
    str
        The cleaned text.
    """
    posts = posts_raw.casefold().replace('|||', ' ')
    posts = " " + posts + " "
    posts = link_match.sub(' ', posts)
    posts = alpha_match.sub(' ', posts)
    posts = posts.replace("'", '')
    return multispace_match.sub(' ', posts)


# Build fresh [label, cleaned_posts] rows; mbti_data_list is left untouched.
mbti_list_processed = [
    [label, clean_posts(posts_raw)] for label, posts_raw in mbti_data_list
]

# Transpose rows into columns: element 0 holds every label, element 1 holds
# every cleaned post string.
mbti_zipped = list(zip(*mbti_list_processed)) #Swap rows, columns.






In [4]:
# Translate each text label in the first column of mbti_zipped into its
# integer class via mbti_dict.
target_nums = [mbti_dict[label] for label in mbti_zipped[0]]

print(len(target_nums))

8675


In [5]:
# Unigram TF-IDF over the cleaned posts, with English stop words removed.
# A float min_df is a document-frequency proportion, so terms present in
# fewer than 0.5% of the documents are dropped.
tf_idf_vec = TfidfVectorizer(
    use_idf=True,
    smooth_idf=False,
    ngram_range=(1, 1),
    stop_words='english',
    min_df=.005,
)
#transform
tf_idf_data = tf_idf_vec.fit_transform(mbti_zipped[1])
tf_arr = tf_idf_data.toarray()  # densify the sparse matrix for StandardScaler

# Targets as an (n_samples, 1) column so they can be appended to the features.
target_num_arr = np.array(target_nums)
target_num_arr = np.reshape(target_num_arr, (target_num_arr.shape[0], 1))

# NOTE(review): the vectorizer and the scaler are both fit on every row
# before the train/test split, so test-set statistics leak into the
# features. Consider fitting them on the training portion only.
data_scaler = StandardScaler()
tf_arr = data_scaler.fit_transform(tf_arr)

# Last column is the target; appending to the float feature matrix turns the
# integer labels into floats (1.0, 2.0, ...).
full_data = np.append(tf_arr, target_num_arr, 1)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method on releases that predate it.
if hasattr(tf_idf_vec, "get_feature_names_out"):
    feature_names = tf_idf_vec.get_feature_names_out()
else:
    feature_names = tf_idf_vec.get_feature_names()

#create dataframe for display purposes mostly
tf_idf_dataframe = pd.DataFrame(full_data, columns=np.append(feature_names, 'target'))
tf_idf_dataframe.shape


(8675, 8002)

In [6]:
# Preview the first five rows of the scaled feature table (target is the last column).
tf_idf_dataframe.head(n=5)

Unnamed: 0,abandon,abandoned,abilities,ability,able,abnormal,aboard,abortion,abrasive,abroad,...,zealand,zelda,zen,zero,zodiac,zombie,zombies,zone,zoo,target
0,-0.099116,-0.105053,-0.153346,-0.354876,-0.599665,-0.095587,-0.073952,-0.093108,-0.079404,-0.074454,...,-0.070125,-0.083323,-0.084912,-0.169169,-0.098357,-0.125198,-0.107146,-0.195881,-0.070969,3.0
1,-0.099116,8.057097,-0.153346,-0.354876,0.7662,-0.095587,-0.073952,-0.093108,-0.079404,-0.074454,...,-0.070125,-0.083323,-0.084912,-0.169169,-0.098357,-0.125198,-0.107146,-0.195881,-0.070969,12.0
2,-0.099116,-0.105053,-0.153346,5.25814,1.017383,-0.095587,-0.073952,-0.093108,-0.079404,-0.074454,...,-0.070125,-0.083323,-0.084912,-0.169169,-0.098357,-0.125198,-0.107146,-0.195881,-0.070969,8.0
3,-0.099116,-0.105053,-0.153346,-0.354876,2.366856,-0.095587,-0.073952,-0.093108,-0.079404,-0.074454,...,-0.070125,-0.083323,-0.084912,-0.169169,-0.098357,-0.125198,-0.107146,-0.195881,-0.070969,4.0
4,-0.099116,-0.105053,-0.153346,-0.354876,-0.599665,-0.095587,-0.073952,-0.093108,-0.079404,-0.074454,...,-0.070125,-0.083323,-0.084912,-0.169169,-0.098357,-0.125198,-0.107146,-0.195881,-0.070969,16.0


SVM Testing
------------------

In [7]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    full_data[:, :-1],  # every column except the last: features
    full_data[:, -1],   # last column: target
    test_size=0.25,
    random_state=0,
)

In [8]:
# Support-vector classifier with the solver capped at 1000 iterations.
# fit() returns the estimator itself, so the construct-and-train chain
# leaves the fitted model in svm_1.
svm_1 = SVC(gamma='auto', max_iter=1000).fit(x_train, y_train)
svm_1



SVC(gamma='auto', max_iter=1000)

In [9]:
# Logistic regression with an iteration cap of 100000.
# fit() returns the estimator itself, so reg_1 holds the fitted model.
reg_1 = LogisticRegression(max_iter=100000).fit(x_train, y_train)
reg_1

LogisticRegression(max_iter=100000)

In [10]:
# Score the held-out rows with the logistic-regression model, then tabulate
# true labels (rows) against predicted labels (columns).
reg_1_preds = reg_1.predict(x_test)
reg_conf = confusion_matrix(y_true=y_test, y_pred=reg_1_preds)
print(reg_conf)

[[ 13   1   3   8   0   0   9   7   0   0   1   1   0   0   0   0]
 [  0  15   9   3   0   1  12   3   0   0   1   2   0   0   0   0]
 [  1   1 225  21   3   2  79  24   0   0   9   9   0   0   1   0]
 [  1   0  46 135   2   2  34  47   0   0  10  11   0   0   0   2]
 [  0   1   6   7  25   0  12  18   0   0   1   3   0   0   0   0]
 [  0   1  13   8   0  15  18   6   0   0   3   2   0   0   0   0]
 [  2   0  64  22   1   4 288  35   0   0  16   6   0   0   2   0]
 [  1   0  27  36   1   3  51 192   0   0   5  19   0   0   0   0]
 [  0   0   2   1   1   0   4   3   3   0   2   2   0   0   0   0]
 [  0   0   4   2   0   0   0   1   0   0   0   4   0   0   0   0]
 [  1   0  26  10   2   0  39  13   0   0  71   9   0   0   1   0]
 [  1   0  21  14   3   2  14  39   0   0  12  71   0   0   0   1]
 [  0   0   2   2   0   0   2   0   0   0   1   2   0   0   0   1]
 [  0   0   5   0   0   0   4   5   0   0   0   1   0   1   0   0]
 [  0   0  13   5   1   0  14   1   0   0   4   0   0   0   9 

In [11]:
# Per-class precision/recall/F1 for the logistic-regression predictions.
# Some classes receive no predictions at all; zero_division=0 reports their
# undefined precision as 0 explicitly rather than emitting a warning for
# each averaged metric.
print(classification_report(y_test, reg_1_preds, zero_division=0))

              precision    recall  f1-score   support

         1.0       0.65      0.30      0.41        43
         2.0       0.79      0.33      0.46        46
         3.0       0.47      0.60      0.53       375
         4.0       0.48      0.47      0.47       290
         5.0       0.62      0.34      0.44        73
         6.0       0.52      0.23      0.32        66
         7.0       0.49      0.65      0.56       440
         8.0       0.48      0.57      0.52       335
         9.0       1.00      0.17      0.29        18
        10.0       0.00      0.00      0.00        11
        11.0       0.52      0.41      0.46       172
        12.0       0.48      0.40      0.44       178
        13.0       0.00      0.00      0.00        10
        14.0       1.00      0.06      0.12        16
        15.0       0.69      0.19      0.30        47
        16.0       0.75      0.24      0.37        49

    accuracy                           0.50      2169
   macro avg       0.56   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
