# 텍스트 데이터로 MBTI 예측하기
- 과제(task) : Text Classification
- 데이터셋 : MBTI 500 [kaggle](https://www.kaggle.com/mercurio117/mbti-500/data)
  - 전처리된 텍스트 데이터와 MBTI 유형으로 이루어짐
- 주요 참고 코드 : [MBTI 500 - 84% Accuracy](https://www.kaggle.com/clebermarques/mbti-500-84-accuracy)


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score
import pickle
import os.path
import plotly.offline as pyo
import plotly.graph_objs as go
import spacy
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

## 1. 데이터 로드 및 확인
- kaggle에서 데이터 다운로드 후 `read_csv` 사용하여 데이터 로드
- 훈련 데이터 : 74357개, 테스트 데이터 : 9337개 (원본 베이스라인 기준 수치로 보이며, 아래 셀의 실제 출력 shape과는 다를 수 있음)
- 훈련 데이터에만 MBTI `type` 컬럼 존재
- 다른 MBTI 데이터셋([(MBTI) Myers-Briggs Personality Type Dataset](https://www.kaggle.com/datasnaek/mbti-type)) 과의 차이점 
  - 데이터 크기가 크기 때문에 모델링 시에 연산량을 신경써야함
  - 텍스트 데이터가 이미 전처리(tokenization / Stemming 또는 Lemmatization)가 되어 있음


In [92]:
data_dir = './MBTI_dataset/'

# Labeled training data: the file is read without a header row and its two
# columns are named `type` and `posts`.
train = pd.read_csv(
    os.path.join(data_dir, 'MBTI_train_v7.csv'),
    encoding='ISO 8859-1',
    header=None,
    names=['type', 'posts'],
)
# Unlabeled test data: read without a header row, single column named `posts`.
test = pd.read_csv(
    os.path.join(data_dir, 'test_16_noanswer.csv'),
    encoding='utf-8-sig',
    header=None,
    names=['posts'],
)

# Report the (rows, columns) shape of both frames, then preview the training data.
print(train.shape, test.shape)
train.head()

(154706, 2) (10484, 1)


Unnamed: 0,type,posts
0,ENFP,find something good anyone even people even me...
1,ENFP,freshman year high school spanish speak class ...
2,ENFP,pretty sure pretend fall asleep see boy would ...
3,ENFP,image relate fi emphasis self understand versu...
4,ENFP,clumsiness good get use sometimes pay attentio...


In [46]:
# Preview the first rows of the test DataFrame.
test.head()

Unnamed: 0,posts
0,Principles Somewhat impulsively What is possib...
1,Emotions Somewhat impulsively What is actual C...
2,Emotions Somewhat impulsively What is possible...
3,Principles Rather carefully What is actual Com...
4,Emotions Somewhat impulsively What is possible...


## 2. 모델 로드 또는 재생성 후 학습
- 이미 저장된 모델이 있는 경우 해당 모델을 불러오고 그렇지 않다면 재생성하여 새로 학습시킴

In [93]:
# Whether the model must be (re)trained instead of being loaded from disk.
recreate_model = False

In [94]:
# Path of the pickled model. When this file exists and `recreate_model` is
# False, training is skipped and the saved model is loaded instead.
filename = 'mbti_svm_v2.sav'

In [95]:
# Force retraining when no saved model file is present; an explicit
# `recreate_model = True` set earlier is left untouched.
recreate_model = recreate_model or not os.path.isfile(filename)

In [96]:
# Features are the raw post texts, labels are the MBTI type strings.
X, y = train['posts'], train['type']

# Hold out 20% of the labeled data for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [97]:
# Train a new model when requested (or when no saved model exists);
# otherwise restore the previously pickled pipeline.
if recreate_model:
    # A single Pipeline handles both vectorization and classification, so raw
    # text can be passed straight to fit()/predict(). The former stand-alone
    # TfidfVectorizer + LinearSVC fit was removed: it trained an equivalent
    # model a second time and its results were never used.
    text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])
    text_clf.fit(X_train, y_train)

    # Persist the fitted pipeline. The context manager guarantees the file is
    # flushed and closed even if pickling raises.
    with open(filename, 'wb') as model_file:
        pickle.dump(text_clf, model_file)
else:
    # NOTE(security): unpickling can execute arbitrary code, so only load
    # model files that were produced by this notebook or another trusted source.
    with open(filename, 'rb') as model_file:
        text_clf = pickle.load(model_file)

In [98]:
# Predict labels for the held-out split (X_test) with the fitted pipeline.
predictions = text_clf.predict(X_test)

In [99]:
# Print the per-class precision / recall / F1 report for the held-out split.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        ENFJ       0.92      0.73      0.81       284
        ENFP       0.94      0.91      0.92      1622
        ENTJ       0.94      0.90      0.92      1047
        ENTP       0.96      0.93      0.94      4077
        ESFJ       0.76      0.62      0.68       140
        ESFP       0.81      0.66      0.73       188
        ESTJ       0.71      0.83      0.76       248
        ESTP       0.91      0.95      0.93       733
        INFJ       0.95      0.94      0.95      5099
        INFP       0.94      0.94      0.94      4272
        INTJ       0.96      0.95      0.95      7518
        INTP       0.90      0.95      0.92      4324
        ISFJ       0.63      0.78      0.70       187
        ISFP       0.79      0.70      0.74       234
        ISTJ       0.78      0.78      0.78       292
        ISTP       0.88      0.89      0.89       677

    accuracy                           0.93     30942
   macro avg       0.86   

In [100]:
# Overall accuracy of `predictions` against the held-out labels, rounded to
# 10 decimal places.
print(
    "Overall accuracy of the model: "
    f"{round(metrics.accuracy_score(y_test, predictions), 10)}"
)

Overall accuracy of the model: 0.9297395126


In [102]:
# Predict labels for the unlabeled test posts and show them.
# Note: this rebinds `predictions`, so it no longer lines up with `y_test`.
predictions = text_clf.predict(test['posts'])
print(predictions)

['INTP' 'INFP' 'INTP' ... 'INTP' 'INFP' 'ISTJ']


In [105]:
import csv

# Write all predictions as ONE comma-separated row of 'test_16_pred.csv'.
# The `with` block closes the file even if writing fails (the previous
# open()/close() pair leaked the handle on error).
# NOTE(review): mode 'a' appends, so every re-run adds another row to the
# file - confirm this is intended rather than overwriting with 'w'.
with open('test_16_pred.csv', 'a', newline='') as wf:
    wr = csv.writer(wf)
    wr.writerow(predictions)

## ver8
- train: 70문항
- test: 16문항
- (557916, 2) (10484, 1)

In [73]:
# Hold-out accuracy recorded for this data version.
# `predictions` must be the output of text_clf.predict(X_test) here, not the
# predictions for the unlabeled test set.
print(
    "Overall accuracy of the model: "
    f"{round(metrics.accuracy_score(y_test, predictions), 10)}"
)

Overall accuracy of the model: 0.9339152567


## ver7
- train, test: 16문항
- (692946, 2) (10484, 1)

In [63]:
# Hold-out accuracy recorded for this data version.
# `predictions` must be the output of text_clf.predict(X_test) here, not the
# predictions for the unlabeled test set.
print(
    "Overall accuracy of the model: "
    f"{round(metrics.accuracy_score(y_test, predictions), 10)}"
)

Overall accuracy of the model: 0.5090699185


## ver6
- train, test 모두 16문항
- (230982, 2) (5242, 1)

In [54]:
# Hold-out accuracy recorded for this data version.
# `predictions` must be the output of text_clf.predict(X_test) here, not the
# predictions for the unlabeled test set.
print(
    "Overall accuracy of the model: "
    f"{round(metrics.accuracy_score(y_test, predictions), 10)}"
)

Overall accuracy of the model: 0.5121328225


## ver5
- train data 16문항

In [44]:
# Hold-out accuracy recorded for this data version.
# `predictions` must be the output of text_clf.predict(X_test) here, not the
# predictions for the unlabeled test set.
print(
    "Overall accuracy of the model: "
    f"{round(metrics.accuracy_score(y_test, predictions), 10)}"
)

Overall accuracy of the model: 0.5121328225


In [25]:
# Predict labels for the unlabeled test posts and show them.
# Note: this rebinds `predictions`, so it no longer lines up with `y_test`.
predictions = text_clf.predict(test['posts'])
print(predictions)

['INTP' 'INFP' 'INTP' ... 'INTP' 'INFP' 'ISTJ']


In [26]:
import csv

# Write all predictions as ONE comma-separated row of 'test_16_pred.csv'.
# The `with` block closes the file even if writing fails (the previous
# open()/close() pair leaked the handle on error).
# NOTE(review): mode 'a' appends, so every re-run adds another row to the
# file - confirm this is intended rather than overwriting with 'w'.
with open('test_16_pred.csv', 'a', newline='') as wf:
    wr = csv.writer(wf)
    wr.writerow(predictions)

## ver4

In [52]:
# Hold-out accuracy recorded for this data version.
# `predictions` must be the output of text_clf.predict(X_test) here, not the
# predictions for the unlabeled test set.
print(
    "Overall accuracy of the model: "
    f"{round(metrics.accuracy_score(y_test, predictions), 10)}"
)

Overall accuracy of the model: 0.9339152567


## ver3.1

In [40]:
# Hold-out accuracy recorded for this data version.
# `predictions` must be the output of text_clf.predict(X_test) here, not the
# predictions for the unlabeled test set.
print(
    "Overall accuracy of the model: "
    f"{round(metrics.accuracy_score(y_test, predictions), 10)}"
)

Overall accuracy of the model: 0.8976018272


## ver3

In [21]:
# Hold-out accuracy recorded for this data version.
# `predictions` must be the output of text_clf.predict(X_test) here, not the
# predictions for the unlabeled test set.
print(
    "Overall accuracy of the model: "
    f"{round(metrics.accuracy_score(y_test, predictions), 10)}"
)

Overall accuracy of the model: 0.3711781847


## ver2

In [13]:
# Hold-out accuracy recorded for this data version.
# `predictions` must be the output of text_clf.predict(X_test) here, not the
# predictions for the unlabeled test set.
print(
    "Overall accuracy of the model: "
    f"{round(metrics.accuracy_score(y_test, predictions), 10)}"
)

Overall accuracy of the model: 0.8976018272


In [None]:
# Predict labels for the unlabeled test posts (rebinds `predictions`).
predictions = text_clf.predict(test['posts'])

In [None]:
# Show the predicted labels.
print(predictions)

['ENFP' 'ENTP' 'INTJ' ... 'INTP' 'ENFP' 'INFP']


In [None]:
# submission: write one predicted label per row to ./sub/baseline.csv
sub_dir = './sub'
# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(sub_dir, exist_ok=True)

sub = pd.DataFrame(predictions)
# Sanity check: the submission must have the same shape as the test frame.
assert sub.shape == test.shape, (
    f"prediction shape {sub.shape} does not match test shape {test.shape}"
)
# header=False is the documented way to omit the header row.
sub.to_csv(os.path.join(sub_dir, 'baseline.csv'), index=False, header=False)  # no header
sub.head()

Unnamed: 0,0
0,ENFP
1,ENTP
2,INTJ
3,INTJ
4,INTJ
