In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [11]:
import pandas as pd
import joblib
import numpy as np

# Read the CSV and Perform Basic Data Cleaning

In [8]:
mbti_df = pd.read_csv("Resources/mbti_1.csv")
# Drop the null columns where all values are null
mbti_df = mbti_df.dropna(axis='columns', how='all')
# Drop the null rows
mbti_df = mbti_df.dropna()
mbti_df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [9]:
def var_row(row):
    l = []
    for i in row.split('|||'):
        l.append(len(i.split()))
    return np.var(l)

In [12]:
mbti = {'I':'Introvert', 'E':'Extrovert', 'N':'Intuition', 
        'S':'Sensing', 'T':'Thinking', 'F': 'Feeling', 
        'J':'Judging', 'P': 'Perceiving'}
#description of the type 
mbti_df['description'] = mbti_df['type'].apply(lambda x: ' '.join([mbti[l] for l in list(x)]))
#words per comment
mbti_df['words_per_comment'] = mbti_df['posts'].apply(lambda x: len(x.split())/50)
#squared totals
mbti_df['squared_total_words'] = mbti_df['words_per_comment']*2
#word count variance
mbti_df['word_count_variance_per_comment'] = mbti_df['posts'].apply(lambda x: var_row(x))
#interrobangs per comment = 
mbti_df['interrobangs_per_comment']=mbti_df['posts'].apply(lambda x: x.count('?')/50) + mbti_df['posts'].apply(lambda x: x.count('!')/50)
#preview
mbti_df.head()

Unnamed: 0,type,posts,description,words_per_comment,squared_total_words,word_count_variance_per_comment,interrobangs_per_comment
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,Introvert Intuition Feeling Judging,11.12,22.24,135.29,0.42
1,ENTP,'I'm finding the lack of me in these posts ver...,Extrovert Intuition Thinking Perceiving,23.4,46.8,187.4756,0.1
2,INTP,'Good one _____ https://www.youtube.com/wat...,Introvert Intuition Thinking Perceiving,16.72,33.44,180.69,0.32
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",Introvert Intuition Thinking Judging,21.28,42.56,181.8324,0.28
4,ENTJ,'You're fired.|||That's another silly misconce...,Extrovert Intuition Thinking Judging,19.34,38.68,196.4576,0.22


In [45]:
mbti_df['i_e']= mbti_df['type'].astype(str).str[0]
mbti_df['n_s']= mbti_df['type'].astype(str).str[1]
mbti_df['f_t']= mbti_df['type'].astype(str).str[2]
mbti_df['j_p']= mbti_df['type'].astype(str).str[3]
mbti_df.head()

Unnamed: 0,type,posts,description,words_per_comment,squared_total_words,word_count_variance_per_comment,interrobangs_per_comment,i_e,n_s,f_t,j_p
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,Introvert Intuition Feeling Judging,11.12,22.24,135.29,0.42,I,N,F,J
1,ENTP,'I'm finding the lack of me in these posts ver...,Extrovert Intuition Thinking Perceiving,23.4,46.8,187.4756,0.1,E,N,T,P
2,INTP,'Good one _____ https://www.youtube.com/wat...,Introvert Intuition Thinking Perceiving,16.72,33.44,180.69,0.32,I,N,T,P
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",Introvert Intuition Thinking Judging,21.28,42.56,181.8324,0.28,I,N,T,J
4,ENTJ,'You're fired.|||That's another silly misconce...,Extrovert Intuition Thinking Judging,19.34,38.68,196.4576,0.22,E,N,T,J


In [55]:
mbti_df.to_csv('Resources/mbti.csv',index=False)

In [26]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Target String Setting
target_string = mbti_df['posts'][0]

In [25]:
compound = analyzer.polarity_scores(target_string)["compound"]
print(compound)

0.9877


# Select features (columns)

In [16]:
mbti_df.columns

Index(['type', 'posts', 'description', 'words_per_comment',
       'squared_total_words', 'word_count_variance_per_comment',
       'interrobangs_per_comment'],
      dtype='object')

In [21]:
# Set features. This will also be used as your x values.
selected_features = mbti_df[['words_per_comment','squared_total_words','squared_total_words','interrobangs_per_comment']]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [40]:
target = mbti_df["type"]
target_names = ["INFJ","INFP","INTJ",'INTP',"ISFJ","ISFP","ISTJ",'ISTP',"ENFJ","ENFP","ENTJ",'ENTP',"ESFJ","ESFP","ESTJ",'ESTP']

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(selected_features, target, random_state=42)

In [29]:
X_train.head()

Unnamed: 0,words_per_comment,squared_total_words,squared_total_words.1,interrobangs_per_comment
2706,28.22,56.44,56.44,1.14
2521,20.92,41.84,41.84,0.24
4192,25.9,51.8,51.8,0.74
6296,30.04,60.08,60.08,0.16
3399,28.98,57.96,57.96,0.6


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [33]:
# Scale your data
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [36]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Train the Model



In [37]:
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_scaled, encoded_y_train)

print(f"Training Data Score: {model.score(X_train_scaled, encoded_y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, encoded_y_test)}")

Training Data Score: 0.2085766984322164
Testing Data Score: 0.218994928538497


In [41]:
from sklearn.metrics import classification_report
predictions = model.predict(X_test_scaled)
print(classification_report(encoded_y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

        INFJ       0.00      0.00      0.00        49
        INFP       0.00      0.00      0.00       160
        INTJ       0.00      0.00      0.00        51
        INTP       0.00      0.00      0.00       165
        ISFJ       0.00      0.00      0.00         8
        ISFP       0.00      0.00      0.00         8
        ISTJ       0.00      0.00      0.00        12
        ISTP       0.00      0.00      0.00        20
        ENFJ       0.00      0.00      0.00       361
        ENFP       0.22      1.00      0.36       475
        ENTJ       0.00      0.00      0.00       251
        ENTP       0.00      0.00      0.00       350
        ESFJ       0.00      0.00      0.00        54
        ESFP       0.00      0.00      0.00        64
        ESTJ       0.00      0.00      0.00        56
        ESTP       0.00      0.00      0.00        85

    accuracy                           0.22      2169
   macro avg       0.01   

  'precision', 'predicted', average, warn_for)


# Re-test for I vs. E

In [47]:
target = mbti_df["i_e"]
target_names = ["Introvert","Extrovert"]

X_train, X_test, y_train, y_test = train_test_split(selected_features, target, random_state=42)
X_train.head()

X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

model = SVC(kernel='linear')
model.fit(X_train_scaled, encoded_y_train)

print(f"Training Data I_E Score: {model.score(X_train_scaled, encoded_y_train)}")
print(f"Testing Data I_E Score: {model.score(X_test_scaled, encoded_y_test)}")

predictions = model.predict(X_test_scaled)
print(classification_report(encoded_y_test, predictions,
                            target_names=target_names))

Training Data I_E Score: 0.7654472794343683
Testing Data I_E Score: 0.7819271553711388
              precision    recall  f1-score   support

   Introvert       0.00      0.00      0.00       473
   Extrovert       0.78      1.00      0.88      1696

    accuracy                           0.78      2169
   macro avg       0.39      0.50      0.44      2169
weighted avg       0.61      0.78      0.69      2169



  'precision', 'predicted', average, warn_for)


# Re-test for N vs. S

In [48]:
target = mbti_df["n_s"]
target_names = ["Intuition","Sensing"]

X_train, X_test, y_train, y_test = train_test_split(selected_features, target, random_state=42)
X_train.head()

X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

model = SVC(kernel='linear')
model.fit(X_train_scaled, encoded_y_train)

print(f"Training Data N_S Score: {model.score(X_train_scaled, encoded_y_train)}")
print(f"Testing Data N_S Score: {model.score(X_test_scaled, encoded_y_test)}")

predictions = model.predict(X_test_scaled)
print(classification_report(encoded_y_test, predictions,
                            target_names=target_names))

Training Data N_S Score: 0.8632031970488779
Testing Data N_S Score: 0.8584601198709082
              precision    recall  f1-score   support

   Intuition       0.86      1.00      0.92      1862
     Sensing       0.00      0.00      0.00       307

    accuracy                           0.86      2169
   macro avg       0.43      0.50      0.46      2169
weighted avg       0.74      0.86      0.79      2169



  'precision', 'predicted', average, warn_for)


# Re-test for F vs. T

In [49]:
target = mbti_df["f_t"]
target_names = ["Feeling","Thinking"]

X_train, X_test, y_train, y_test = train_test_split(selected_features, target, random_state=42)
X_train.head()

X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

model = SVC(kernel='linear')
model.fit(X_train_scaled, encoded_y_train)

print(f"Training Data F_T Score: {model.score(X_train_scaled, encoded_y_train)}")
print(f"Testing Data F_T Score: {model.score(X_test_scaled, encoded_y_test)}")

predictions = model.predict(X_test_scaled)
print(classification_report(encoded_y_test, predictions,
                            target_names=target_names))

Training Data F_T Score: 0.5579465109130034
Testing Data F_T Score: 0.553711387736284
              precision    recall  f1-score   support

     Feeling       0.56      0.86      0.68      1179
    Thinking       0.53      0.19      0.28       990

    accuracy                           0.55      2169
   macro avg       0.54      0.52      0.48      2169
weighted avg       0.55      0.55      0.49      2169



# Re-test for J vs. P

In [50]:
target = mbti_df["j_p"]
target_names = ["Judging","Perceiving"]

X_train, X_test, y_train, y_test = train_test_split(selected_features, target, random_state=42)
X_train.head()

X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

model = SVC(kernel='linear')
model.fit(X_train_scaled, encoded_y_train)

print(f"Training Data J_P Score: {model.score(X_train_scaled, encoded_y_train)}")
print(f"Testing Data J_P Score: {model.score(X_test_scaled, encoded_y_test)}")

predictions = model.predict(X_test_scaled)
print(classification_report(encoded_y_test, predictions,
                            target_names=target_names))

Training Data J_P Score: 0.6015985244389794
Testing Data J_P Score: 0.611802674043338
              precision    recall  f1-score   support

     Judging       0.00      0.00      0.00       842
  Perceiving       0.61      1.00      0.76      1327

    accuracy                           0.61      2169
   macro avg       0.31      0.50      0.38      2169
weighted avg       0.37      0.61      0.46      2169



  'precision', 'predicted', average, warn_for)


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [51]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [52]:
# Train the model with GridSearch
grid.fit(X_train_scaled, encoded_y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.602, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.602, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.601, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.602, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.602, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.601, total=   0.3s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.602, total=   0.3s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.602, total=   0.3s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.601, total=   0.3s
[CV] C=5, gamma=0.0001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    9.7s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [53]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 1, 'gamma': 0.0001}
0.6015985244389794


In [54]:
predictions = grid.predict(X_test_scaled)
predictions=to_categorical(predictions)

from sklearn.metrics import classification_report
print(classification_report(y_test_categorical, predictions, target_names=target_names))

ValueError: Multi-label binary indicator input with different numbers of labels

# Save the Model

In [36]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
filename = 'SVM.sav'
joblib.dump(model, 'SVM.sav')

['SVM.sav']