## Load Data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np

In [8]:
folderPath = '/content/drive/My Drive/Colab Notebooks/文字探勘初論/'

In [118]:
tfidf_corpus = pd.read_csv(folderPath+'mbti_tfidf.csv')
word2vec_corpus = pd.read_csv(folderPath+'MBTI_w2v.csv')
bert_corpus = pd.read_csv(folderPath+'mbti_bert_CLS.csv')

In [119]:
tfidf_corpus.head()

Unnamed: 0,label_type,00,000,00100000,01,01100101,02,03,05,06,...,zodiac,zoe,zombi,zon,zone,zoo,zooey,zoom,zuko,är
0,INFJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ENTP,0.0,0.049496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,INTP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,INTJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ENTJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
label_type = tfidf_corpus['label_type']
label_type

0       INFJ
1       ENTP
2       INTP
3       INTJ
4       ENTJ
        ... 
8670    ISFP
8671    ENFP
8672    INTP
8673    INFP
8674    INFP
Name: label_type, Length: 8675, dtype: object

In [121]:
# drop label column
tfidf_corpus = tfidf_corpus.drop('label_type', axis=1)
word2vec_corpus = word2vec_corpus.drop('label_type', axis=1)
bert_corpus = bert_corpus.drop('label_type', axis=1)

In [122]:
from sklearn.model_selection import train_test_split

tfidf_x_train, tfidf_x_test, tfidf_y_train, tfidf_y_test = train_test_split(tfidf_corpus, label_type, test_size=0.2, random_state=1024)
word2vec_x_train, word2vec_x_test, word2vec_y_train, word2vec_y_test = train_test_split(word2vec_corpus, label_type, test_size=0.2, random_state=1024)
bert_x_train, bert_x_test, bert_y_train, bert_y_test = train_test_split(bert_corpus, label_type, test_size=0.2, random_state=1024)

## Training with different corpus

### TFIDF

In [123]:
print(tfidf_x_train.shape)
print(tfidf_x_test.shape)
print(tfidf_y_train.shape)
print(tfidf_y_test.shape)

(6940, 10000)
(1735, 10000)
(6940,)
(1735,)


In [124]:
tfidf_x_train.head()

Unnamed: 0,00,000,00100000,01,01100101,02,03,05,06,07,...,zodiac,zoe,zombi,zon,zone,zoo,zooey,zoom,zuko,är
4751,0.0,0.0,0.0,0.0,0.0,0.062019,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5724,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### KNN

In [125]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore') 

expected_results = []
expected_results.extend(tfidf_y_test)
tfidf_accuracies = {}

k_with_best_accuracy = 0
best_accuracy = 0

# try to find the best k
for k in range(1, 15, 2):
  KNN_model = KNeighborsClassifier(n_neighbors = k)
  KNN_model.fit(tfidf_x_train, tfidf_y_train)
  accuracy = metrics.accuracy_score(expected_results, KNN_model.predict(tfidf_x_test))
  if accuracy > best_accuracy:
    k_with_best_accuracy = k
    best_accuracy = accuracy
print('Besk k and its accuracy: {}, {}'.format(k, round(accuracy, 4)))

Besk k and its accuracy: 13, 0.4548


In [126]:
KNN_model = KNeighborsClassifier(n_neighbors = k_with_best_accuracy)
KNN_model.fit(tfidf_x_train, tfidf_y_train)
predicted_results = KNN_model.predict(tfidf_x_test)
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
tfidf_accuracies['KNN'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, predicted_results))

Accuracy:  0.4548
              precision    recall  f1-score   support

        ENFJ       0.47      0.37      0.41        38
        ENFP       0.34      0.51      0.41       123
        ENTJ       0.72      0.32      0.44        41
        ENTP       0.58      0.39      0.47       127
        ESFJ       0.50      0.43      0.46         7
        ESFP       0.00      0.00      0.00        11
        ESTJ       0.50      0.20      0.29         5
        ESTP       0.64      0.39      0.48        18
        INFJ       0.36      0.72      0.48       302
        INFP       0.46      0.65      0.54       379
        INTJ       0.48      0.21      0.29       213
        INTP       0.84      0.31      0.45       293
        ISFJ       0.63      0.41      0.50        29
        ISFP       0.88      0.23      0.36        61
        ISTJ       0.50      0.18      0.26        28
        ISTP       0.91      0.17      0.28        60

    accuracy                           0.45      1735
   macro

#### Naïve Bayes

In [127]:
from sklearn. naive_bayes import MultinomialNB

NB_model = MultinomialNB()
NB_model.fit(tfidf_x_train, tfidf_y_train)
predicted_results = NB_model.predict(tfidf_x_test)
# print(metrics.accuracy_score(expected_results, predicted_results))
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
tfidf_accuracies['NB'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, predicted_results))

Accuracy:  0.2824
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        38
        ENFP       0.00      0.00      0.00       123
        ENTJ       0.00      0.00      0.00        41
        ENTP       0.00      0.00      0.00       127
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00        11
        ESTJ       0.00      0.00      0.00         5
        ESTP       0.00      0.00      0.00        18
        INFJ       0.42      0.23      0.29       302
        INFP       0.25      0.97      0.40       379
        INTJ       0.79      0.05      0.10       213
        INTP       0.50      0.15      0.23       293
        ISFJ       0.00      0.00      0.00        29
        ISFP       0.00      0.00      0.00        61
        ISTJ       0.00      0.00      0.00        28
        ISTP       0.00      0.00      0.00        60

    accuracy                           0.28      1735
   macro

#### SVM

In [128]:
from sklearn.svm import SVC # about 9mins

SVM_model = SVC(kernel='linear')
SVM_model.fit(tfidf_x_train, tfidf_y_train)
predicted_results = SVM_model.predict(tfidf_x_test)
# print(metrics.accuracy_score(expected_results, predicted_results))
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
tfidf_accuracies['SVM'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, predicted_results))

Accuracy:  0.6847
              precision    recall  f1-score   support

        ENFJ       0.65      0.39      0.49        38
        ENFP       0.67      0.58      0.62       123
        ENTJ       0.53      0.46      0.49        41
        ENTP       0.70      0.66      0.68       127
        ESFJ       0.60      0.43      0.50         7
        ESFP       0.00      0.00      0.00        11
        ESTJ       0.00      0.00      0.00         5
        ESTP       0.90      0.50      0.64        18
        INFJ       0.71      0.70      0.71       302
        INFP       0.65      0.84      0.73       379
        INTJ       0.68      0.68      0.68       213
        INTP       0.70      0.74      0.72       293
        ISFJ       0.79      0.38      0.51        29
        ISFP       0.83      0.49      0.62        61
        ISTJ       0.60      0.54      0.57        28
        ISTP       0.78      0.67      0.72        60

    accuracy                           0.68      1735
   macro

#### Random Forest

In [129]:
from sklearn.ensemble import RandomForestClassifier

RandomForest_model = RandomForestClassifier() #n_estimators=20, max_depth=4)
RandomForest_model.fit(tfidf_x_train, tfidf_y_train)
predicted_results = RandomForest_model.predict(tfidf_x_test)
# print(metrics.accuracy_score(expected_results, predicted_results))
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
tfidf_accuracies['Random Forest'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, predicted_results))

Accuracy:  0.4876
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        38
        ENFP       0.80      0.16      0.27       123
        ENTJ       0.00      0.00      0.00        41
        ENTP       0.71      0.35      0.47       127
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00        11
        ESTJ       0.00      0.00      0.00         5
        ESTP       0.00      0.00      0.00        18
        INFJ       0.52      0.61      0.56       302
        INFP       0.40      0.90      0.55       379
        INTJ       0.57      0.36      0.44       213
        INTP       0.59      0.61      0.60       293
        ISFJ       0.00      0.00      0.00        29
        ISFP       0.00      0.00      0.00        61
        ISTJ       0.00      0.00      0.00        28
        ISTP       1.00      0.05      0.10        60

    accuracy                           0.49      1735
   macro

#### XGBoost

In [130]:
from xgboost import XGBClassifier # About 1min

XGBoost_model = XGBClassifier(tree_method='gpu_hist')
XGBoost_model.fit(tfidf_x_train, tfidf_y_train)
predicted_results = XGBoost_model.predict(tfidf_x_test)
# print(metrics.accuracy_score(expected_results, predicted_results))
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
tfidf_accuracies['XGBoost'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, predicted_results))

Accuracy:  0.6916
              precision    recall  f1-score   support

        ENFJ       0.62      0.26      0.37        38
        ENFP       0.64      0.57      0.60       123
        ENTJ       0.61      0.54      0.57        41
        ENTP       0.60      0.65      0.62       127
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00        11
        ESTJ       0.50      0.20      0.29         5
        ESTP       0.80      0.67      0.73        18
        INFJ       0.69      0.74      0.71       302
        INFP       0.68      0.80      0.74       379
        INTJ       0.73      0.75      0.74       213
        INTP       0.73      0.73      0.73       293
        ISFJ       0.71      0.52      0.60        29
        ISFP       0.80      0.54      0.65        61
        ISTJ       0.59      0.57      0.58        28
        ISTP       0.78      0.65      0.71        60

    accuracy                           0.69      1735
   macro

#### Accuracy Summary

In [131]:
sorted_temp = sorted(tfidf_accuracies.items(), key=lambda x:x[1], reverse=True)
sorted_dict = dict(sorted_temp)

for model, accuracy in sorted_dict.items():
  print('{:14s}: {}'. format(model, accuracy))

XGBoost       : 0.6916
SVM           : 0.6847
Random Forest : 0.4876
KNN           : 0.4548
NB            : 0.2824


### Word2Vec

In [132]:
print(word2vec_x_train.shape)
print(word2vec_x_test.shape)
print(word2vec_y_train.shape)
print(word2vec_y_test.shape)

(6940, 300)
(1735, 300)
(6940,)
(1735,)


In [133]:
word2vec_x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
4751,0.033608,0.019713,-0.006406,0.103992,-0.073113,0.01243,0.060293,-0.06119,0.07318,0.052323,...,-0.062131,0.07653,-0.070884,0.025346,-0.049229,-0.014008,0.018575,-0.052992,0.026174,-0.003254
6586,0.015971,0.018738,-0.008684,0.095841,-0.059922,0.020814,0.055605,-0.104802,0.081261,0.052042,...,-0.044834,0.06264,-0.089133,-0.001836,-0.0556,-0.050004,0.001039,-0.058356,-0.002797,-0.010239
5535,0.003274,0.032183,0.012234,0.131873,-0.091715,0.023427,0.02211,-0.055153,0.06378,0.053258,...,0.001829,0.057937,-0.095537,0.022092,-0.058073,-0.046735,-0.023682,-0.073307,-0.008851,0.033536
8375,0.017993,0.022159,-0.00182,0.127087,-0.051276,0.010008,0.064386,-0.053379,0.06331,0.036754,...,-0.0455,0.060219,-0.068598,0.012662,-0.03691,-0.034133,0.035755,-0.062563,0.004587,-0.008409
5724,0.058109,0.018364,0.019445,0.120743,-0.082491,0.043969,0.091466,-0.050806,0.084152,0.039735,...,-0.039496,0.059131,-0.08317,0.012914,-0.062171,-0.039137,0.025268,-0.054501,0.026521,-0.005303


#### KNN

In [134]:
word2vec_accuracies = {}

k_with_best_accuracy = 0
best_accuracy = 0

# try to find the best k
for k in range(1, 15, 2):
  KNN_model = KNeighborsClassifier(n_neighbors = k)
  KNN_model.fit(word2vec_x_train, word2vec_y_train)
  accuracy = metrics.accuracy_score(expected_results, KNN_model.predict(word2vec_x_test))
  if accuracy > best_accuracy:
    k_with_best_accuracy = k
    best_accuracy = accuracy
print('Besk k and its accuracy: {}, {}'.format(k, round(accuracy, 4)))

Besk k and its accuracy: 13, 0.2692


In [141]:
KNN_model = KNeighborsClassifier(n_neighbors = k_with_best_accuracy)
KNN_model.fit(word2vec_x_train, word2vec_y_train)
predicted_results = KNN_model.predict(word2vec_x_test)
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
word2vec_accuracies['KNN'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, KNN_model.predict(word2vec_x_test)))

Accuracy:  0.2697
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        38
        ENFP       0.15      0.20      0.17       123
        ENTJ       0.06      0.02      0.03        41
        ENTP       0.15      0.14      0.15       127
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00        11
        ESTJ       0.00      0.00      0.00         5
        ESTP       0.00      0.00      0.00        18
        INFJ       0.27      0.36      0.31       302
        INFP       0.32      0.52      0.40       379
        INTJ       0.27      0.20      0.23       213
        INTP       0.31      0.25      0.28       293
        ISFJ       0.00      0.00      0.00        29
        ISFP       0.00      0.00      0.00        61
        ISTJ       1.00      0.04      0.07        28
        ISTP       0.40      0.03      0.06        60

    accuracy                           0.27      1735
   macro

#### Naïve Bayes

In [136]:
NB_model = MultinomialNB()
NB_model.fit(word2vec_x_train, word2vec_y_train)
predicted_results = NB_model.predict(word2vec_x_test)
# print(metrics.accuracy_score(expected_results, predicted_results))
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
word2vec_accuracies['NB'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, predicted_results))

ValueError: ignored

#### SVM

In [142]:
# about 16seconds

SVM_model = SVC(kernel='linear')
SVM_model.fit(word2vec_x_train, word2vec_y_train)
predicted_results = SVM_model.predict(word2vec_x_test)
# print(metrics.accuracy_score(expected_results, predicted_results))
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
word2vec_accuracies['SVM'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, predicted_results))

Accuracy:  0.3061
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        38
        ENFP       0.00      0.00      0.00       123
        ENTJ       0.00      0.00      0.00        41
        ENTP       0.00      0.00      0.00       127
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00        11
        ESTJ       0.00      0.00      0.00         5
        ESTP       0.00      0.00      0.00        18
        INFJ       0.29      0.09      0.13       302
        INFP       0.29      0.88      0.43       379
        INTJ       0.37      0.07      0.12       213
        INTP       0.35      0.54      0.42       293
        ISFJ       0.00      0.00      0.00        29
        ISFP       0.00      0.00      0.00        61
        ISTJ       0.00      0.00      0.00        28
        ISTP       0.00      0.00      0.00        60

    accuracy                           0.31      1735
   macro

#### Random Forest

In [143]:
RandomForest_model = RandomForestClassifier() #n_estimators=20, max_depth=4)
RandomForest_model.fit(word2vec_x_train, word2vec_y_train)
predicted_results = RandomForest_model.predict(word2vec_x_test)
# print(metrics.accuracy_score(expected_results, predicted_results))
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
word2vec_accuracies['Random Forest'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, predicted_results))

Accuracy:  0.3159
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        38
        ENFP       0.30      0.11      0.16       123
        ENTJ       0.00      0.00      0.00        41
        ENTP       0.33      0.09      0.15       127
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00        11
        ESTJ       0.00      0.00      0.00         5
        ESTP       0.00      0.00      0.00        18
        INFJ       0.28      0.31      0.30       302
        INFP       0.32      0.64      0.42       379
        INTJ       0.26      0.28      0.27       213
        INTP       0.40      0.42      0.41       293
        ISFJ       0.00      0.00      0.00        29
        ISFP       0.00      0.00      0.00        61
        ISTJ       0.00      0.00      0.00        28
        ISTP       0.00      0.00      0.00        60

    accuracy                           0.32      1735
   macro

#### XGBoost

In [144]:
XGBoost_model = XGBClassifier(tree_method='gpu_hist') #'gpu_hist')
XGBoost_model.fit(word2vec_x_train, word2vec_y_train)
predicted_results = XGBoost_model.predict(word2vec_x_test)
# print(metrics.accuracy_score(expected_results, predicted_results))
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
word2vec_accuracies['XGBoost'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, predicted_results))

Accuracy:  0.3308
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        38
        ENFP       0.24      0.20      0.22       123
        ENTJ       0.17      0.02      0.04        41
        ENTP       0.27      0.23      0.25       127
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00        11
        ESTJ       0.00      0.00      0.00         5
        ESTP       0.00      0.00      0.00        18
        INFJ       0.31      0.33      0.32       302
        INFP       0.38      0.62      0.47       379
        INTJ       0.28      0.29      0.28       213
        INTP       0.35      0.39      0.37       293
        ISFJ       0.00      0.00      0.00        29
        ISFP       0.00      0.00      0.00        61
        ISTJ       0.00      0.00      0.00        28
        ISTP       0.43      0.17      0.24        60

    accuracy                           0.33      1735
   macro

#### Accuracy Summary

In [145]:
sorted_temp = sorted(word2vec_accuracies.items(), key=lambda x:x[1], reverse=True)
sorted_dict = dict(sorted_temp)

for model, accuracy in sorted_dict.items():
  print('{:14s}: {}'. format(model, accuracy))

XGBoost       : 0.3308
Random Forest : 0.3159
SVM           : 0.3061
KNN           : 0.2697


### BERT

In [146]:
print(bert_x_train.shape)
print(bert_x_test.shape)
print(bert_y_train.shape)
print(bert_y_test.shape)

(6940, 768)
(1735, 768)
(6940,)
(1735,)


In [147]:
bert_x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
4751,0.402858,0.318011,0.254947,0.049771,0.03918,0.214207,-0.144508,0.075794,-0.048228,0.030871,...,0.046557,0.290464,0.113117,0.501106,0.252866,-0.410454,-0.224503,-0.508595,0.45688,0.423962
6586,0.46781,-0.137879,0.409783,-0.099868,-0.181403,-0.205526,-0.179753,-0.084543,-0.116036,0.242073,...,-0.32275,0.179296,0.214621,0.570998,0.179275,-0.625468,-0.577467,-0.475954,0.302637,0.228944
5535,0.31256,0.241212,-0.036759,-0.181224,0.096001,0.31416,-0.038363,-0.210804,0.064968,0.177095,...,0.129015,0.210811,0.036526,0.80326,-0.088156,-0.715251,-0.775918,-0.498049,-0.003452,-0.112318
8375,0.063326,0.151152,0.184636,-0.009521,0.226647,0.2809,-0.027736,-0.210657,-0.118354,0.025944,...,0.199705,0.260579,-0.069651,0.320997,0.092945,-0.829492,-0.820121,-0.412903,0.38041,0.053854
5724,0.223298,0.120789,0.024652,0.165891,0.5394,-0.097522,-0.257311,0.019716,-0.019101,-0.006455,...,0.133186,0.247489,0.146129,0.622041,0.150296,-0.596775,-0.279438,-0.197555,0.252011,0.23048


#### KNN

In [148]:
bert_accuracies = {}

k_with_best_accuracy = 0
best_accuracy = 0

# try to find the best k
for k in range(1, 15, 2):
  KNN_model = KNeighborsClassifier(n_neighbors = k)
  KNN_model.fit(bert_x_train, bert_y_train)
  accuracy = metrics.accuracy_score(expected_results, KNN_model.predict(bert_x_test))
  if accuracy > best_accuracy:
    k_with_best_accuracy = k
    best_accuracy = accuracy
print('Besk k and its accuracy: {}, {}'.format(k, round(accuracy, 4)))

Besk k and its accuracy: 13, 0.2571


In [149]:
KNN_model = KNeighborsClassifier(n_neighbors = k_with_best_accuracy)
KNN_model.fit(bert_x_train, bert_y_train)
predicted_results = KNN_model.predict(bert_x_test)
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
bert_accuracies['KNN'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, KNN_model.predict(bert_x_test)))

Accuracy:  0.2571
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        38
        ENFP       0.23      0.31      0.26       123
        ENTJ       0.00      0.00      0.00        41
        ENTP       0.17      0.17      0.17       127
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00        11
        ESTJ       0.00      0.00      0.00         5
        ESTP       0.00      0.00      0.00        18
        INFJ       0.23      0.37      0.28       302
        INFP       0.32      0.48      0.38       379
        INTJ       0.26      0.17      0.21       213
        INTP       0.30      0.18      0.23       293
        ISFJ       0.00      0.00      0.00        29
        ISFP       0.00      0.00      0.00        61
        ISTJ       0.00      0.00      0.00        28
        ISTP       0.33      0.03      0.06        60

    accuracy                           0.26      1735
   macro

#### Naïve Bayes

In [150]:
NB_model = MultinomialNB()
NB_model.fit(bert_x_train, bert_y_train)
predicted_results = NB_model.predict(bert_x_test)
# print(metrics.accuracy_score(expected_results, predicted_results))
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
bert_accuracies['NB'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, predicted_results))

ValueError: ignored

#### SVM

In [151]:
# about 40seconds

SVM_model = SVC(kernel='linear')
SVM_model.fit(bert_x_train, bert_y_train)
predicted_results = SVM_model.predict(bert_x_test)
# print(metrics.accuracy_score(expected_results, predicted_results))
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
bert_accuracies['SVM'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, predicted_results))

Accuracy:  0.3378
              precision    recall  f1-score   support

        ENFJ       0.13      0.16      0.14        38
        ENFP       0.24      0.27      0.25       123
        ENTJ       0.13      0.15      0.14        41
        ENTP       0.22      0.23      0.23       127
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.08      0.09      0.08        11
        ESTJ       0.00      0.00      0.00         5
        ESTP       0.14      0.22      0.17        18
        INFJ       0.37      0.41      0.39       302
        INFP       0.45      0.46      0.46       379
        INTJ       0.35      0.34      0.35       213
        INTP       0.43      0.35      0.39       293
        ISFJ       0.28      0.24      0.26        29
        ISFP       0.19      0.10      0.13        61
        ISTJ       0.13      0.14      0.14        28
        ISTP       0.27      0.25      0.26        60

    accuracy                           0.34      1735
   macro

#### Random Forest

In [152]:
RandomForest_model = RandomForestClassifier() #n_estimators=20, max_depth=4)
RandomForest_model.fit(bert_x_train, bert_y_train)
predicted_results = RandomForest_model.predict(bert_x_test)
# print(metrics.accuracy_score(expected_results, predicted_results))
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
bert_accuracies['Random Forest'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, predicted_results))

Accuracy:  0.298
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        38
        ENFP       0.36      0.08      0.13       123
        ENTJ       0.00      0.00      0.00        41
        ENTP       0.22      0.06      0.09       127
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00        11
        ESTJ       0.00      0.00      0.00         5
        ESTP       0.00      0.00      0.00        18
        INFJ       0.29      0.36      0.32       302
        INFP       0.31      0.68      0.43       379
        INTJ       0.24      0.18      0.20       213
        INTP       0.30      0.32      0.31       293
        ISFJ       0.00      0.00      0.00        29
        ISFP       0.00      0.00      0.00        61
        ISTJ       0.00      0.00      0.00        28
        ISTP       0.00      0.00      0.00        60

    accuracy                           0.30      1735
   macro 

#### XGBoost

In [153]:
XGBoost_model = XGBClassifier(tree_method='gpu_hist')
XGBoost_model.fit(bert_x_train, bert_y_train)
predicted_results = XGBoost_model.predict(bert_x_test)
# print(metrics.accuracy_score(expected_results, predicted_results))
accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
bert_accuracies['XGBoost'] = accuracy
print('Accuracy: ', accuracy)
print(metrics.classification_report(expected_results, predicted_results))

Accuracy:  0.3314
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00        38
        ENFP       0.33      0.18      0.23       123
        ENTJ       0.00      0.00      0.00        41
        ENTP       0.27      0.20      0.23       127
        ESFJ       0.00      0.00      0.00         7
        ESFP       0.00      0.00      0.00        11
        ESTJ       0.00      0.00      0.00         5
        ESTP       0.00      0.00      0.00        18
        INFJ       0.32      0.35      0.34       302
        INFP       0.35      0.64      0.45       379
        INTJ       0.27      0.25      0.26       213
        INTP       0.37      0.43      0.40       293
        ISFJ       0.00      0.00      0.00        29
        ISFP       0.00      0.00      0.00        61
        ISTJ       0.00      0.00      0.00        28
        ISTP       0.20      0.05      0.08        60

    accuracy                           0.33      1735
   macro

#### Accuracy Summary

In [154]:
sorted_temp = sorted(bert_accuracies.items(), key=lambda x:x[1], reverse=True)
sorted_dict = dict(sorted_temp)

for model, accuracy in sorted_dict.items():
  print('{:14s}: {}'. format(model, accuracy))

SVM           : 0.3378
XGBoost       : 0.3314
Random Forest : 0.298
KNN           : 0.2571


## Predict with 4 classfiers
嘗試用四個classfier分別對user的E/I, N/S, F/T, J/P進行預測

In [155]:
tfidf_corpus = pd.read_csv(folderPath+'mbti_tfidf.csv')
word2vec_corpus = pd.read_csv(folderPath+'MBTI_w2v.csv')
bert_corpus = pd.read_csv(folderPath+'mbti_bert_CLS.csv')

In [156]:
# 0 for E, 1 for I
def extractEI(row):  
  if row['label_type'][0] == "E":
    return 'E'
  elif row['label_type'][0] == 'I':
    return 'I'
# 0 for N, 1 for S
def extractNS(row):  
  if row['label_type'][1] == "N":
    return 'N'
  elif row['label_type'][1] == 'S':
    return 'S'
# 0 for F, 1 for T
def extractFT(row):  
  if row['label_type'][2] == "F":
    return 'F'
  elif row['label_type'][2] == 'T':
    return 'T'
# 0 for J, 1 for P
def extractJP(row):  
  if row['label_type'][3] == "J":
    return 'J'
  elif row['label_type'][3] == 'P':
    return 'P'

In [157]:
def KNNClassifier(x_train, y_train, x_test, y_test):
  predicted_result_list = []
  for label_category in label_categories:
    expected_results = []
    expected_results.extend(tfidf_y_test[label_category])
    KNN_model = KNeighborsClassifier(n_neighbors = k_with_best_accuracy)
    KNN_model.fit(x_train, y_train[label_category])
    predicted_results = KNN_model.predict(x_test)
    accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
    print('Accuracy for {} classfier: {}'.format(label_category, accuracy))
    # print(metrics.classification_report(expected_results, predicted_results))
    predicted_result_list.append(predicted_results)
  predicted_mbti_list = ['']*len(y_test)
  for predicted_result in predicted_result_list:
    for idx, ele in enumerate(predicted_result):
      # print(idx, ele)
      predicted_mbti_list[idx] += ele

  # calculate accuracy
  count = 0
  for predicted, expected in zip(predicted_mbti_list, y_test['label_type']):
    if predicted == expected:
      count += 1
  accuracy = round(count/len(predicted_mbti_list), 4)
  print('Accuracy of KNN: {}'.format(accuracy))

  return predicted_mbti_list, accuracy 

In [158]:
from sklearn. naive_bayes import MultinomialNB

def NBClassifier(x_train, y_train, x_test, y_test):
  predicted_result_list = []
  for label_category in label_categories:
    expected_results = []
    expected_results.extend(tfidf_y_test[label_category])
    NB_model = MultinomialNB()
    NB_model.fit(x_train, y_train[label_category])
    predicted_results = NB_model.predict(x_test)
    accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
    print('Accuracy for {} classfier: {}'.format(label_category, accuracy))
    # print(metrics.classification_report(expected_results, predicted_results))
    predicted_result_list.append(predicted_results)
  predicted_mbti_list = ['']*len(y_test)
  for predicted_result in predicted_result_list:
    for idx, ele in enumerate(predicted_result):
      # print(idx, ele)
      predicted_mbti_list[idx] += ele

  # calculate accuracy
  count = 0
  for predicted, expected in zip(predicted_mbti_list, y_test['label_type']):
    if predicted == expected:
      count += 1
  accuracy = round(count/len(predicted_mbti_list), 4)
  print('Accuracy of NB: {}'.format(accuracy))

  return predicted_mbti_list, accuracy

In [159]:
from sklearn.svm import SVC

def SVMClassifier(x_train, y_train, x_test, y_test):
  predicted_result_list = []
  for label_category in label_categories:
    expected_results = []
    expected_results.extend(tfidf_y_test[label_category])
    SVM_model = SVC(kernel='linear')
    SVM_model.fit(x_train, y_train[label_category])
    predicted_results = SVM_model.predict(x_test)
    accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
    print('Accuracy for {} classfier: {}'.format(label_category, accuracy))
    # print(metrics.classification_report(expected_results, predicted_results))
    predicted_result_list.append(predicted_results)
  predicted_mbti_list = ['']*len(y_test)
  for predicted_result in predicted_result_list:
    for idx, ele in enumerate(predicted_result):
      # print(idx, ele)
      predicted_mbti_list[idx] += ele

  # calculate accuracy
  count = 0
  for predicted, expected in zip(predicted_mbti_list, y_test['label_type']):
    if predicted == expected:
      count += 1
  accuracy = round(count/len(predicted_mbti_list), 4)
  print('Accuracy of SVM: {}'.format(accuracy))

  return predicted_mbti_list, accuracy

In [160]:
from sklearn.ensemble import RandomForestClassifier

def MyRandomForestClassifier(x_train, y_train, x_test, y_test):
  predicted_result_list = []
  for label_category in label_categories:
    expected_results = []
    expected_results.extend(tfidf_y_test[label_category])
    RandomForest_model = RandomForestClassifier()
    RandomForest_model.fit(x_train, y_train[label_category])
    predicted_results = RandomForest_model.predict(x_test)
    accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
    print('Accuracy for {} classfier: {}'.format(label_category, accuracy))
    # print(metrics.classification_report(expected_results, predicted_results))
    predicted_result_list.append(predicted_results)
  predicted_mbti_list = ['']*len(y_test)
  for predicted_result in predicted_result_list:
    for idx, ele in enumerate(predicted_result):
      # print(idx, ele)
      predicted_mbti_list[idx] += ele

  # calculate accuracy
  count = 0
  for predicted, expected in zip(predicted_mbti_list, y_test['label_type']):
    if predicted == expected:
      count += 1
  accuracy = round(count/len(predicted_mbti_list), 4)
  print('Accuracy of RandomForest: {}'.format(accuracy))

  return predicted_mbti_list, accuracy 

In [161]:
from xgboost import XGBClassifier

def MyXGBoostClassifier(x_train, y_train, x_test, y_test):
  predicted_result_list = []
  for label_category in label_categories:
    expected_results = []
    expected_results.extend(tfidf_y_test[label_category])
    XGBoost_model = XGBClassifier(tree_method='gpu_hist')
    XGBoost_model.fit(x_train, y_train[label_category])
    predicted_results = XGBoost_model.predict(x_test)
    accuracy = round(metrics.accuracy_score(expected_results, predicted_results), 4)
    print('Accuracy for {} classfier: {}'.format(label_category, accuracy))
    # print(metrics.classification_report(expected_results, predicted_results))
    predicted_result_list.append(predicted_results)
  predicted_mbti_list = ['']*len(y_test)
  for predicted_result in predicted_result_list:
    for idx, ele in enumerate(predicted_result):
      # print(idx, ele)
      predicted_mbti_list[idx] += ele

  # calculate accuracy
  count = 0
  for predicted, expected in zip(predicted_mbti_list, y_test['label_type']):
    if predicted == expected:
      count += 1
  accuracy = round(count/len(predicted_mbti_list), 4)
  print('Accuracy of XGBoost: {}'.format(accuracy))

  return predicted_mbti_list, accuracy

### TFIDF

In [162]:
tfidf_df = tfidf_corpus.copy()
tfidf_df['E/I'] = tfidf_df.apply(lambda row: extractEI(row), axis=1)
tfidf_df['N/S'] = tfidf_df.apply(lambda row: extractNS(row), axis=1)
tfidf_df['F/T'] = tfidf_df.apply(lambda row: extractFT(row), axis=1)
tfidf_df['J/P'] = tfidf_df.apply(lambda row: extractJP(row), axis=1)

In [163]:
label_df = tfidf_df[['label_type', 'E/I', 'N/S', 'F/T', 'J/P']]
label_categories = ['E/I', 'N/S', 'F/T', 'J/P']

tfidf_df = tfidf_df.drop(['label_type', 'E/I', 'N/S', 'F/T', 'J/P'], axis=1)

In [164]:
from sklearn.model_selection import train_test_split

tfidf_x_train, tfidf_x_test, tfidf_y_train, tfidf_y_test = train_test_split(tfidf_df, label_df, test_size=0.1, random_state=1024)

In [165]:
tfidf_accuracies_list = {}

In [166]:
KNN_result, accuracy = KNNClassifier(tfidf_x_train, tfidf_y_train, tfidf_x_test, tfidf_y_test)
tfidf_accuracies_list['KNN'] = accuracy

Accuracy for E/I classfier: 0.8376
Accuracy for N/S classfier: 0.894
Accuracy for F/T classfier: 0.6947
Accuracy for J/P classfier: 0.7304
Accuracy of KNN: 0.4366


In [167]:
NB_result, accuracy = NBClassifier(tfidf_x_train, tfidf_y_train, tfidf_x_test, tfidf_y_test)
tfidf_accuracies_list['NB'] = accuracy

Accuracy for E/I classfier: 0.7984
Accuracy for N/S classfier: 0.879
Accuracy for F/T classfier: 0.7903
Accuracy for J/P classfier: 0.6429
Accuracy of NB: 0.3399


In [168]:
SVM_result, accuracy = SVMClassifier(tfidf_x_train, tfidf_y_train, tfidf_x_test, tfidf_y_test) # About 15mins
tfidf_accuracies_list['SVM'] = accuracy

Accuracy for E/I classfier: 0.8629
Accuracy for N/S classfier: 0.9217
Accuracy for F/T classfier: 0.8698
Accuracy for J/P classfier: 0.7915
Accuracy of SVM: 0.591


In [169]:
RandomForest_result, accuracy = MyRandomForestClassifier(tfidf_x_train, tfidf_y_train, tfidf_x_test, tfidf_y_test)
tfidf_accuracies_list['RandomForest'] = accuracy

Accuracy for E/I classfier: 0.7995
Accuracy for N/S classfier: 0.879
Accuracy for F/T classfier: 0.7972
Accuracy for J/P classfier: 0.7166
Accuracy of RandomForest: 0.4078


In [170]:
XGBoost_result, accuracy = MyXGBoostClassifier(tfidf_x_train, tfidf_y_train, tfidf_x_test, tfidf_y_test)
tfidf_accuracies_list['XGBoost'] = accuracy

Accuracy for E/I classfier: 0.8664
Accuracy for N/S classfier: 0.924
Accuracy for F/T classfier: 0.8606
Accuracy for J/P classfier: 0.7961
Accuracy of XGBoost: 0.5979


In [171]:
sorted_temp = sorted(tfidf_accuracies_list.items(), key=lambda x:x[1], reverse=True)
sorted_dict = dict(sorted_temp)

for model, accuracy in sorted_dict.items():
  print('{:14s}: {}'. format(model, accuracy))

XGBoost       : 0.5979
SVM           : 0.591
KNN           : 0.4366
RandomForest  : 0.4078
NB            : 0.3399


### Word2Vec

In [172]:
word2vec_df = word2vec_corpus.copy()
word2vec_df['E/I'] = word2vec_df.apply(lambda row: extractEI(row), axis=1)
word2vec_df['N/S'] = word2vec_df.apply(lambda row: extractNS(row), axis=1)
word2vec_df['F/T'] = word2vec_df.apply(lambda row: extractFT(row), axis=1)
word2vec_df['J/P'] = word2vec_df.apply(lambda row: extractJP(row), axis=1)

In [173]:
label_df = word2vec_df[['label_type', 'E/I', 'N/S', 'F/T', 'J/P']]
label_categories = ['E/I', 'N/S', 'F/T', 'J/P']

word2vec_df = word2vec_df.drop(['label_type', 'E/I', 'N/S', 'F/T', 'J/P'], axis=1)

In [174]:
word2vec_x_train, word2vec_x_test, word2vec_y_train, word2vec_y_test = train_test_split(word2vec_df, label_df, test_size=0.1, random_state=1024)

In [175]:
word2vec_accuracies_list = {}

In [176]:
KNN_result, accuracy = KNNClassifier(word2vec_x_train, word2vec_y_train, word2vec_x_test, word2vec_y_test)
word2vec_accuracies_list['KNN'] = accuracy

Accuracy for E/I classfier: 0.7915
Accuracy for N/S classfier: 0.8767
Accuracy for F/T classfier: 0.6947
Accuracy for J/P classfier: 0.5922
Accuracy of KNN: 0.2788


In [177]:
NB_result, accuracy = NBClassifier(word2vec_x_train, word2vec_y_train, word2vec_x_test, word2vec_y_test)
word2vec_accuracies_list['NB'] = accuracy

ValueError: ignored

In [178]:
SVM_result, accuracy = SVMClassifier(word2vec_x_train, word2vec_y_train, word2vec_x_test, word2vec_y_test)
word2vec_accuracies_list['SVM'] = accuracy

Accuracy for E/I classfier: 0.7972
Accuracy for N/S classfier: 0.8779
Accuracy for F/T classfier: 0.7592
Accuracy for J/P classfier: 0.6106
Accuracy of SVM: 0.3041


In [179]:
RandomForest_result, accuracy = MyRandomForestClassifier(word2vec_x_train, word2vec_y_train, word2vec_x_test, word2vec_y_test)
word2vec_accuracies_list['RandomForest'] = accuracy

Accuracy for E/I classfier: 0.7903
Accuracy for N/S classfier: 0.8744
Accuracy for F/T classfier: 0.7304
Accuracy for J/P classfier: 0.5945
Accuracy of RandomForest: 0.2972


In [180]:
XGBoost_result, accuracy = MyXGBoostClassifier(word2vec_x_train, word2vec_y_train, word2vec_x_test, word2vec_y_test)
word2vec_accuracies_list['XGBoost'] = accuracy

Accuracy for E/I classfier: 0.7938
Accuracy for N/S classfier: 0.8767
Accuracy for F/T classfier: 0.7465
Accuracy for J/P classfier: 0.6221
Accuracy of XGBoost: 0.3007


In [181]:
sorted_temp = sorted(word2vec_accuracies_list.items(), key=lambda x:x[1], reverse=True)
sorted_dict = dict(sorted_temp)

for model, accuracy in sorted_dict.items():
  print('{:14s}: {}'. format(model, accuracy))

SVM           : 0.3041
XGBoost       : 0.3007
RandomForest  : 0.2972
KNN           : 0.2788


### BERT

In [182]:
bert_df = bert_corpus.copy()
bert_df['E/I'] = bert_df.apply(lambda row: extractEI(row), axis=1)
bert_df['N/S'] = bert_df.apply(lambda row: extractNS(row), axis=1)
bert_df['F/T'] = bert_df.apply(lambda row: extractFT(row), axis=1)
bert_df['J/P'] = bert_df.apply(lambda row: extractJP(row), axis=1)

In [183]:
label_df = bert_df[['label_type', 'E/I', 'N/S', 'F/T', 'J/P']]
label_categories = ['E/I', 'N/S', 'F/T', 'J/P']

bert_df = bert_df.drop(['label_type', 'E/I', 'N/S', 'F/T', 'J/P'], axis=1)

In [184]:
bert_x_train, bert_x_test, bert_y_train, bert_y_test = train_test_split(bert_df, label_df, test_size=0.1, random_state=1024)

In [185]:
bert_accuracies_list = {}

In [186]:
KNN_result, accuracy = KNNClassifier(bert_x_train, bert_y_train, bert_x_test, bert_y_test)
bert_accuracies_list['KNN'] = accuracy

Accuracy for E/I classfier: 0.7926
Accuracy for N/S classfier: 0.8779
Accuracy for F/T classfier: 0.6613
Accuracy for J/P classfier: 0.6164
Accuracy of KNN: 0.2995


In [187]:
NB_result, accuracy = NBClassifier(bert_x_train, bert_y_train, bert_x_test, bert_y_test)
bert_accuracies_list['NB'] = accuracy

ValueError: ignored

In [188]:
SVM_result, accuracy = SVMClassifier(bert_x_train, bert_y_train, bert_x_test, bert_y_test)
bert_accuracies_list['SVM'] = accuracy

Accuracy for E/I classfier: 0.7984
Accuracy for N/S classfier: 0.8744
Accuracy for F/T classfier: 0.7465
Accuracy for J/P classfier: 0.6774
Accuracy of SVM: 0.3641


In [189]:
RandomForest_result, accuracy = MyRandomForestClassifier(bert_x_train, bert_y_train, bert_x_test, bert_y_test)
bert_accuracies_list['RandomForest'] = accuracy

Accuracy for E/I classfier: 0.8007
Accuracy for N/S classfier: 0.8767
Accuracy for F/T classfier: 0.6993
Accuracy for J/P classfier: 0.6452
Accuracy of RandomForest: 0.318


In [190]:
XGBoost_result, accuracy = MyXGBoostClassifier(bert_x_train, bert_y_train, bert_x_test, bert_y_test)
bert_accuracies_list['XGBoost'] = accuracy

Accuracy for E/I classfier: 0.8099
Accuracy for N/S classfier: 0.8767
Accuracy for F/T classfier: 0.7074
Accuracy for J/P classfier: 0.6636
Accuracy of XGBoost: 0.3237


In [191]:
sorted_temp = sorted(bert_accuracies_list.items(), key=lambda x:x[1], reverse=True)
sorted_dict = dict(sorted_temp)

for model, accuracy in sorted_dict.items():
  print('{:14s}: {}'. format(model, accuracy))

SVM           : 0.3641
XGBoost       : 0.3237
RandomForest  : 0.318
KNN           : 0.2995
