In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [2]:
import pandas as pd
import joblib
import numpy as np
from sklearn.svm import SVC 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Read the CSV and Perform Basic Data Cleaning

In [2]:
mbti_df = pd.read_csv("Resources/mbti_final.csv")
mbti_df.head()

Unnamed: 0,type,posts,description,i_e,n_s,f_t,j_p,sentiment_score,words_per_comment,squared_total_words,...,adjectives,adjective_count,verbs,verb_count,determiners,determiner_count,interjections,interjection_count,prepositions,preposition_count
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,Introvert Intuition Feeling Judging,I,N,F,J,0.0471,11.12,22.24,...,"['intj', 'life-changing', 'most', 'last', 'nex...",51,"['top', 'has', 'been', 'posted', 'committing',...",90,"['the', 'the', 'the', 'a', 'the', 'every', 'th...",52,[],0,"['in', 'On', 'for', 'of', 'on', 'before', 'in'...",78
1,ENTP,'I'm finding the lack of me in these posts ver...,Extrovert Intuition Thinking Perceiving,E,N,T,P,0.388976,23.4,46.8,...,"['same', 'missionary', 'new', 'theory.Hello', ...",96,"[""'m"", 'finding', 'be', 'boring', ""'s"", 'are',...",257,"['the', 'these', 'the', 'an', 'all', 'the', 't...",90,[],0,"['of', 'in', 'if', 'in', 'For', 'in', 'Than', ...",136
2,INTP,'Good one _____ https://www.youtube.com/wat...,Introvert Intuition Thinking Perceiving,I,N,T,P,0.620244,16.72,33.44,...,"['positive', 'best', 'amazing', 'more', 'So-ca...",82,"['say', 'know', ""'s"", 'being', 'be', ""'s"", 'be...",166,"['that', 'an', 'a', 'any', 'All', 'the', 'that...",52,"['yes', 'No', 'Oh', 'Yessss', 'Oh']",5,"['that', 'If', 'than', 'in', 'in', 'at', 'for'...",91
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",Introvert Intuition Thinking Judging,I,N,T,J,0.807546,21.28,42.56,...,"[""'Dear"", 'other', 'social', 'arbitrary', 'oth...",93,"['enjoyed', 'gabbing', 'being', 'created', 'hu...",233,"['the', 'the', 'the', 'the', 'every', 'no', 'A...",94,[],0,"['about', 'of', 'of', 'in', 'on', 'like', 'in'...",124
4,ENTJ,'You're fired.|||That's another silly misconce...,Extrovert Intuition Thinking Judging,E,N,T,J,0.861824,19.34,38.68,...,"['silly', 'super-duper-long-ass', 'permanent',...",87,"[""'re"", ""'s"", 'approaching', 'is', 'is', 'goin...",229,"['another', 'the', 'a', 'the', 'that', 'that',...",84,"['Oh', 'Yes']",2,"['That', 'with', 'on', 'on', 'about', 'If', 'f...",84


# Select features (columns)

In [3]:
mbti_df.columns

Index(['type', 'posts', 'description', 'i_e', 'n_s', 'f_t', 'j_p',
       'sentiment_score', 'words_per_comment', 'squared_total_words',
       'word_count_variance_per_comment', 'interrobangs_per_comment',
       'Tagged Posts PosTag', 'nouns', 'noun_count', 'adjectives',
       'adjective_count', 'verbs', 'verb_count', 'determiners',
       'determiner_count', 'interjections', 'interjection_count',
       'prepositions', 'preposition_count'],
      dtype='object')

In [4]:
# Set features. This will also be used as your x values.
selected_features = mbti_df[['sentiment_score', 'words_per_comment', 'squared_total_words',
       'word_count_variance_per_comment', 'interrobangs_per_comment','noun_count', 
       'adjective_count', 'verb_count', 
       'determiner_count', 'interjection_count','preposition_count']]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [5]:
target = mbti_df["type"]
target_names = ["INFJ","INFP","INTJ",'INTP',"ISFJ","ISFP","ISTJ",'ISTP',"ENFJ","ENFP","ENTJ",'ENTP',"ESFJ","ESFP","ESTJ",'ESTP']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(selected_features, target, random_state=42)

In [7]:
X_train.head()

Unnamed: 0,sentiment_score,words_per_comment,squared_total_words,word_count_variance_per_comment,interrobangs_per_comment,noun_count,adjective_count,verb_count,determiner_count,interjection_count,preposition_count
2706,571.406128,28.22,56.44,127.84,1.14,310,123,305,99,0,124
2521,534.330686,20.92,41.84,186.37,0.24,296,80,202,81,0,115
4192,891.721612,25.9,51.8,113.7856,0.74,213,113,318,68,1,132
6296,1363.673841,30.04,60.08,110.109954,0.16,291,92,348,107,3,167
3399,717.557672,28.98,57.96,131.2784,0.6,267,100,334,126,7,152


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [8]:
# Scale your data
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Train the Model



In [10]:
model = SVC(kernel='linear')
model.fit(X_train_scaled, encoded_y_train)

print(f"Training Data Score: {model.score(X_train_scaled, encoded_y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, encoded_y_test)}")

Training Data Score: 0.2239471257300953
Testing Data Score: 0.23236514522821577


In [11]:
from sklearn.metrics import classification_report
predictions = model.predict(X_test_scaled)
print(classification_report(encoded_y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

        INFJ       0.00      0.00      0.00        49
        INFP       0.00      0.00      0.00       160
        INTJ       0.00      0.00      0.00        51
        INTP       0.00      0.00      0.00       165
        ISFJ       0.00      0.00      0.00         8
        ISFP       0.00      0.00      0.00         8
        ISTJ       0.00      0.00      0.00        12
        ISTP       0.00      0.00      0.00        20
        ENFJ       0.09      0.00      0.01       361
        ENFP       0.24      0.89      0.37       475
        ENTJ       0.00      0.00      0.00       251
        ENTP       0.22      0.22      0.22       350
        ESFJ       0.00      0.00      0.00        54
        ESFP       0.00      0.00      0.00        64
        ESTJ       0.00      0.00      0.00        56
        ESTP       0.00      0.00      0.00        85

    accuracy                           0.23      2169
   macro avg       0.03   

  'precision', 'predicted', average, warn_for)


# Re-test for I vs. E

In [12]:
target1 = mbti_df["i_e"]
target_names1 = ["Introvert","Extrovert"]

X_train, X_test, y_train, y_test = train_test_split(selected_features, target1, random_state=42)

X_scaler = StandardScaler().fit(X_train)
X_train_scaled1 = X_scaler.transform(X_train)
X_test_scaled1 = X_scaler.transform(X_test)

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train1 = label_encoder.transform(y_train)
encoded_y_test1 = label_encoder.transform(y_test)

model1 = SVC(kernel='linear')
model1.fit(X_train_scaled1, encoded_y_train1)

print(f"Training Data I_E Score: {model1.score(X_train_scaled1, encoded_y_train1)}")
print(f"Testing Data I_E Score: {model1.score(X_test_scaled1, encoded_y_test1)}")

predictions1 = model1.predict(X_test_scaled1)
print(classification_report(encoded_y_test1, predictions1,
                            target_names=target_names1))

Training Data I_E Score: 0.7654472794343683
Testing Data I_E Score: 0.7819271553711388
              precision    recall  f1-score   support

   Introvert       0.00      0.00      0.00       473
   Extrovert       0.78      1.00      0.88      1696

    accuracy                           0.78      2169
   macro avg       0.39      0.50      0.44      2169
weighted avg       0.61      0.78      0.69      2169



  'precision', 'predicted', average, warn_for)


# Hypertune

In [20]:
# Create the GridSearchCV model
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01],
              'kernel':['linear','rbf','poly']}
grid1 = GridSearchCV(model1, param_grid, verbose=3)

# Train the model with GridSearch
grid1.fit(X_train_scaled1, encoded_y_train1)

print(grid1.best_params_)
print(grid1.best_score_)

predictions1 = grid1.predict(X_test_scaled1)

print(f"Training Data I_E Score: {grid1.score(X_train_scaled1, encoded_y_train1)}")
print(f"Testing Data I_E Score: {grid1.score(X_test_scaled1, encoded_y_test1)}")

print(classification_report(encoded_y_test1, predictions1, target_names=target_names1))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.765, total=   0.2s
[CV] C=1, gamma=0.0001, kernel=linear ................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.765, total=   0.2s
[CV] C=1, gamma=0.0001, kernel=linear ................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.766, total=   0.2s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.765, total=   0.4s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.765, total=   0.3s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.766, total=   0.3s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.765, total=   0.2s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.765, total=   0.2s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.766, total=   0.2s
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV] .

[CV] ..... C=10, gamma=0.0001, kernel=poly, score=0.765, total=   0.2s
[CV] C=10, gamma=0.0001, kernel=poly .................................
[CV] ..... C=10, gamma=0.0001, kernel=poly, score=0.765, total=   0.2s
[CV] C=10, gamma=0.0001, kernel=poly .................................
[CV] ..... C=10, gamma=0.0001, kernel=poly, score=0.766, total=   0.2s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.765, total=   0.6s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.765, total=   0.6s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.766, total=   0.6s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.765, total=   0.6s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:   44.8s finished


{'C': 1, 'gamma': 0.0001, 'kernel': 'linear'}
0.7654472794343683
Training Data I_E Score: 0.7654472794343683
Testing Data I_E Score: 0.7819271553711388
              precision    recall  f1-score   support

   Introvert       0.00      0.00      0.00       473
   Extrovert       0.78      1.00      0.88      1696

    accuracy                           0.78      2169
   macro avg       0.39      0.50      0.44      2169
weighted avg       0.61      0.78      0.69      2169



  'precision', 'predicted', average, warn_for)


In [28]:
joblib.dump(grid1.best_estimator_, 'Models/svm_ie.sav')

['Models/svm_ie.sav']

# Re-test for N vs. S

In [13]:
target2 = mbti_df["n_s"]
target_names2 = ["Intuition","Sensing"]

X_train, X_test, y_train, y_test = train_test_split(selected_features, target2, random_state=42)

X_scaler = StandardScaler().fit(X_train)
X_train_scaled2 = X_scaler.transform(X_train)
X_test_scaled2 = X_scaler.transform(X_test)

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train2 = label_encoder.transform(y_train)
encoded_y_test2 = label_encoder.transform(y_test)

model2 = SVC(kernel='linear')
model2.fit(X_train_scaled2, encoded_y_train2)

print(f"Training Data N_S Score: {model2.score(X_train_scaled2, encoded_y_train2)}")
print(f"Testing Data N_S Score: {model2.score(X_test_scaled2, encoded_y_test2)}")

predictions2 = model2.predict(X_test_scaled2)
print(classification_report(encoded_y_test2, predictions2,
                            target_names=target_names2))

Training Data N_S Score: 0.8632031970488779
Testing Data N_S Score: 0.8584601198709082
              precision    recall  f1-score   support

   Intuition       0.86      1.00      0.92      1862
     Sensing       0.00      0.00      0.00       307

    accuracy                           0.86      2169
   macro avg       0.43      0.50      0.46      2169
weighted avg       0.74      0.86      0.79      2169



  'precision', 'predicted', average, warn_for)


# Hypertune

In [29]:
# Create the GridSearchCV model
grid2 = GridSearchCV(model2, param_grid, verbose=3)

# Train the model with GridSearch
grid2.fit(X_train_scaled2, encoded_y_train2)

print(grid2.best_params_)
print(grid2.best_score_)

predictions2 = grid2.predict(X_test_scaled2)

print(f"Training Data N_S Score: {grid2.score(X_train_scaled2, encoded_y_train2)}")
print(f"Testing Data N_S Score: {grid2.score(X_test_scaled2, encoded_y_test2)}")

print(classification_report(encoded_y_test2, predictions2, target_names=target_names2))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.863, total=   0.2s
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.863, total=   0.2s
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.863, total=   0.2s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.863, total=   0.2s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.863, total=   0.2s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.863, total=   0.2s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.863, total=   0.1s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.863, total=   0.1s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.863, total=   0.1s
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV] ..... C=1, gamma=0.001, kernel=linear, score=0.863, total=   0.2s
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV] .

[CV] ..... C=10, gamma=0.0001, kernel=poly, score=0.863, total=   0.1s
[CV] C=10, gamma=0.0001, kernel=poly .................................
[CV] ..... C=10, gamma=0.0001, kernel=poly, score=0.863, total=   0.1s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.863, total=   0.6s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.863, total=   0.4s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.863, total=   0.7s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.863, total=   0.4s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.863, total=   0.4s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:   32.4s finished


{'C': 1, 'gamma': 0.0001, 'kernel': 'linear'}
0.8632031970488779
Training Data N_S Score: 0.8632031970488779
Testing Data N_S Score: 0.8584601198709082
              precision    recall  f1-score   support

   Intuition       0.86      1.00      0.92      1862
     Sensing       0.00      0.00      0.00       307

    accuracy                           0.86      2169
   macro avg       0.43      0.50      0.46      2169
weighted avg       0.74      0.86      0.79      2169



  'precision', 'predicted', average, warn_for)


In [30]:
joblib.dump(grid2.best_estimator_, 'Models/svm_ns.sav')

['Models/svm_ns.sav']

# Re-test for F vs. T

In [14]:
target3 = mbti_df["f_t"]
target_names3 = ["Feeling","Thinking"]

X_train, X_test, y_train, y_test = train_test_split(selected_features, target3, random_state=42)

X_scaler = StandardScaler().fit(X_train)
X_train_scaled3 = X_scaler.transform(X_train)
X_test_scaled3 = X_scaler.transform(X_test)

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train3 = label_encoder.transform(y_train)
encoded_y_test3 = label_encoder.transform(y_test)

model3 = SVC(kernel='linear')
model3.fit(X_train_scaled3, encoded_y_train3)

print(f"Training Data F_T Score: {model3.score(X_train_scaled3, encoded_y_train3)}")
print(f"Testing Data F_T Score: {model3.score(X_test_scaled3, encoded_y_test3)}")

predictions3 = model3.predict(X_test_scaled3)
print(classification_report(encoded_y_test3, predictions3,
                            target_names=target_names3))

Training Data F_T Score: 0.6034429757147248
Testing Data F_T Score: 0.6154910096818811
              precision    recall  f1-score   support

     Feeling       0.63      0.71      0.67      1179
    Thinking       0.59      0.51      0.55       990

    accuracy                           0.62      2169
   macro avg       0.61      0.61      0.61      2169
weighted avg       0.61      0.62      0.61      2169



# Hypertune

In [31]:
grid3 = GridSearchCV(model3, param_grid, verbose=3)

# Train the model with GridSearch
grid3.fit(X_train_scaled3, encoded_y_train3)

print(grid3.best_params_)
print(grid3.best_score_)

predictions3 = grid3.predict(X_test_scaled3)

print(f"Training Data F_T Score: {grid3.score(X_train_scaled3, encoded_y_train3)}")
print(f"Testing Data F_T Score: {grid3.score(X_test_scaled3, encoded_y_test3)}")

print(classification_report(encoded_y_test3, predictions3, target_names=target_names3))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.596, total=   0.9s
[CV] C=1, gamma=0.0001, kernel=linear ................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.621, total=   0.9s
[CV] C=1, gamma=0.0001, kernel=linear ................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.8s remaining:    0.0s


[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.599, total=   0.8s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.540, total=   0.7s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.540, total=   0.7s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.540, total=   0.7s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.540, total=   0.5s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.540, total=   0.5s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.540, total=   0.5s
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV] .

[CV] ..... C=10, gamma=0.0001, kernel=poly, score=0.540, total=   0.4s
[CV] C=10, gamma=0.0001, kernel=poly .................................
[CV] ..... C=10, gamma=0.0001, kernel=poly, score=0.540, total=   0.4s
[CV] C=10, gamma=0.0001, kernel=poly .................................
[CV] ..... C=10, gamma=0.0001, kernel=poly, score=0.540, total=   0.5s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.595, total=   2.5s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.622, total=   2.6s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.597, total=   2.5s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.599, total=   0.7s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:  1.3min finished


{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.6114355979096219
Training Data F_T Score: 0.6163541346449432
Testing Data F_T Score: 0.6168741355463347
              precision    recall  f1-score   support

     Feeling       0.63      0.70      0.67      1179
    Thinking       0.59      0.51      0.55       990

    accuracy                           0.62      2169
   macro avg       0.61      0.61      0.61      2169
weighted avg       0.61      0.62      0.61      2169



In [32]:
joblib.dump(grid3.best_estimator_, 'Models/svm_ft.sav')

['Models/svm_ft.sav']

# Re-test for J vs. P

In [33]:
target4 = mbti_df["j_p"]
target_names4 = ["Judging","Perceiving"]

X_train, X_test, y_train, y_test = train_test_split(selected_features, target4, random_state=42)

X_scaler = StandardScaler().fit(X_train)
X_train_scaled4 = X_scaler.transform(X_train)
X_test_scaled4 = X_scaler.transform(X_test)

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train4 = label_encoder.transform(y_train)
encoded_y_test4 = label_encoder.transform(y_test)

model4 = SVC(kernel='linear')
model4.fit(X_train_scaled4, encoded_y_train4)

print(f"Training Data J_P Score: {model4.score(X_train_scaled4, encoded_y_train4)}")
print(f"Testing Data J_P Score: {model4.score(X_test_scaled4, encoded_y_test4)}")

predictions4 = model4.predict(X_test_scaled4)
print(classification_report(encoded_y_test4, predictions4,
                            target_names=target_names4))

Training Data J_P Score: 0.6015985244389794
Testing Data J_P Score: 0.611802674043338
              precision    recall  f1-score   support

     Judging       0.00      0.00      0.00       842
  Perceiving       0.61      1.00      0.76      1327

    accuracy                           0.61      2169
   macro avg       0.31      0.50      0.38      2169
weighted avg       0.37      0.61      0.46      2169



  'precision', 'predicted', average, warn_for)


# Hypertune

In [34]:
grid4 = GridSearchCV(model4, param_grid, verbose=3)

# Train the model with GridSearch
grid4.fit(X_train_scaled4, encoded_y_train4)

print(grid4.best_params_)
print(grid4.best_score_)

predictions4 = grid4.predict(X_test_scaled4)

print(f"Training Data J_P Score: {grid4.score(X_train_scaled2, encoded_y_train4)}")
print(f"Testing Data J_P Score: {grid4.score(X_test_scaled2, encoded_y_test4)}")

print(classification_report(encoded_y_test4, predictions4, target_names=target_names4))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] C=1, gamma=0.0001, kernel=linear ................................
[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.602, total=   0.4s
[CV] C=1, gamma=0.0001, kernel=linear ................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.602, total=   0.4s
[CV] C=1, gamma=0.0001, kernel=linear ................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV] .... C=1, gamma=0.0001, kernel=linear, score=0.601, total=   0.4s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.602, total=   0.6s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.602, total=   0.6s
[CV] C=1, gamma=0.0001, kernel=rbf ...................................
[CV] ....... C=1, gamma=0.0001, kernel=rbf, score=0.601, total=   0.6s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.602, total=   0.4s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.602, total=   0.4s
[CV] C=1, gamma=0.0001, kernel=poly ..................................
[CV] ...... C=1, gamma=0.0001, kernel=poly, score=0.601, total=   0.4s
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV] .

[CV] ..... C=10, gamma=0.0001, kernel=poly, score=0.602, total=   0.4s
[CV] C=10, gamma=0.0001, kernel=poly .................................
[CV] ..... C=10, gamma=0.0001, kernel=poly, score=0.602, total=   0.4s
[CV] C=10, gamma=0.0001, kernel=poly .................................
[CV] ..... C=10, gamma=0.0001, kernel=poly, score=0.601, total=   0.4s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.602, total=   2.3s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.602, total=   1.2s
[CV] C=10, gamma=0.001, kernel=linear ................................
[CV] .... C=10, gamma=0.001, kernel=linear, score=0.601, total=   1.9s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.602, total=   0.9s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:  1.1min finished


{'C': 1, 'gamma': 0.0001, 'kernel': 'linear'}
0.6015985244389794
Training Data J_P Score: 0.6015985244389794
Testing Data J_P Score: 0.611802674043338
              precision    recall  f1-score   support

     Judging       0.00      0.00      0.00       842
  Perceiving       0.61      1.00      0.76      1327

    accuracy                           0.61      2169
   macro avg       0.31      0.50      0.38      2169
weighted avg       0.37      0.61      0.46      2169



  'precision', 'predicted', average, warn_for)


In [35]:
joblib.dump(grid4.best_estimator_, 'Models/svm_jp.sav')

['Models/svm_jp.sav']

# Test

In [3]:
test = joblib.load('Models/svm_jp.sav')



In [4]:
test

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [45]:
pred_test = test.predict(X_test_scaled4[:5])
pred_test

array([1, 1, 1, 1, 1])

In [47]:
label_encoder.inverse_transform(pred_test)

array(['P', 'P', 'P', 'P', 'P'], dtype=object)

In [42]:
encoded_y_train4[:5]

array([1, 1, 1, 0, 0])