# 텍스트 데이터로 MBTI 예측하기
- 과제(task) : Text Classification
- 데이터셋 : MBTI 500 [kaggle](https://www.kaggle.com/mercurio117/mbti-500/data)
  - 전처리된 텍스트 데이터와 MBTI 유형으로 이루어짐
- 주요 참고 코드 : [MBTI 500 - 84% Accuracy](https://www.kaggle.com/clebermarques/mbti-500-84-accuracy)


In [2]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.4.0-cp39-cp39-win_amd64.whl (11.8 MB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.3-py3-none-any.whl (9.3 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.2-py3-none-any.whl (42 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.6-cp39-cp39-win_amd64.whl (36 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.7-py3-none-any.whl (17 kB)
Collecting thinc<8.2.0,>=8.1.0
  Downloading thinc-8.1.0-cp39-cp39-win_amd64.whl (1.3 MB)
Collecting wasabi<1.1.0,>=0.9.1
  Downloading wasabi-0.9.1-py3-none-any.whl (26 kB)
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.7-cp39-cp39-win_amd64.whl (18 kB)
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.6-cp39-cp39-win_amd64.whl (112 kB)
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.3-cp39-cp39-win_amd64.whl (448 kB)
Collecting spacy-lega

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score
import pickle
import os.path
import plotly.offline as pyo
import plotly.graph_objs as go
import spacy
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

## 1. 데이터 로드 및 확인
- kaggle에서 데이터 다운로드 후 `read_csv` 사용하여 데이터 로드
- 훈련 데이터 : 74357개, 테스트 데이터 : 9337개
- 훈련 데이터에만 MBTI `type` 컬럼 존재
- 다른 MBTI 데이터셋([(MBTI) Myers-Briggs Personality Type Dataset](https://www.kaggle.com/datasnaek/mbti-type)) 과의 차이점 
  - 데이터 크기가 크기 때문에 모델링 시에 연산량을 신경써야함
  - 텍스트 데이터가 이미 전처리(tokenization / Stemming 또는 Lemmatization)가 되어 있음


In [3]:
import sys  # not used by this cell; retained so the name stays in scope
from io import StringIO

import pandas as pd

# One line of whitespace-separated placeholder tokens used as a single test post.
value = 'typeestj1 typeestj2 typeestj3 typeestj4 typeestj5 typeestj6 typeestj7 typeestj8 typeestj9 typeestj10 typeestj11 typeestj12 typeestj13 typeestj14 typeestj15 typeestj16'

# In-memory CSV buffer: a leading newline, the four-space-indented token line,
# and a trailing indented line, all built from `value` so the text lives in one place.
TESTDATA = StringIO(f"\n    {value}\n    ")

# The buffer has no header row; its single column is named `posts`.
test = pd.read_csv(TESTDATA, header=None, names=['posts'])

In [35]:
# Directory that holds the MBTI CSV files.
data_dir = './MBTI_dataset/'

# Both files are read without a header row, so column names are supplied explicitly.
train_path = os.path.join(data_dir, 'MBTI_train.csv')
test_path = os.path.join(data_dir, 'MBTI_test_7.csv')
train = pd.read_csv(train_path, header=None, names=['type', 'posts'], encoding='ISO 8859-1')
test = pd.read_csv(test_path, header=None, names=['posts'], encoding='utf-8-sig')

# Report (rows, columns) for both frames, then preview the training data.
print(train.shape, test.shape)
train.head()

(91942, 2) (1, 1)


Unnamed: 0,type,posts
0,INTJ,find article interest well actually religious ...
1,INTJ,say fully consciously aware big picture try st...
2,INTJ,torture eternity kill pretty quickly ever get ...
3,INTJ,twice discussion awhile ago nobody say regret ...
4,INTJ,consciously think material even tho intj prett...


In [36]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,posts
0,Principles Somewhat impulsively What is actual...


## 2. 모델 로드 또는 재생성 후 학습
- 이미 저장된 모델이 있는 경우 해당 모델을 불러오고 그렇지 않다면 재생성하여 새로 학습시킴

In [37]:
# Flag that says whether the model has to be rebuilt; False means a saved model may be reused.
recreate_model = False

In [38]:
# Path of the pickled model file; when this file exists, model training is not performed.
filename = 'mbti_svm_v10.sav'

In [39]:
# Force a rebuild when no saved model file is present; otherwise keep the current flag.
recreate_model = recreate_model or not os.path.isfile(filename)

In [40]:
# Features are the post texts; labels are the MBTI types.
X, y = train['posts'], train['type']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [41]:
# Train and save a new model when a rebuild is requested; otherwise load the saved one.
if recreate_model:
    # The pipeline fits the TF-IDF vectorizer and the linear SVM together on the
    # raw texts, so a separate vectorizer/classifier fit is not needed.
    text_clf = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])
    text_clf.fit(X_train, y_train)

    # Persist the fitted pipeline; the context manager closes the file handle.
    with open(filename, 'wb') as model_file:
        pickle.dump(text_clf, model_file)
else:
    # NOTE(review): unpickling can execute arbitrary code -- only load model
    # files that come from a trusted source.
    with open(filename, 'rb') as model_file:
        text_clf = pickle.load(model_file)

In [42]:
# Predict labels for the held-out split produced by train_test_split.
predictions = text_clf.predict(X_test)

In [12]:
import numpy

# Hard-coded single-label prediction list; print its container type.
predictions = ['INFP']
print(type(predictions))

<class 'list'>
['INFP']


## ver10
- 70문항 train
- ESTJ

In [43]:
# Classify the posts in `test` with text_clf and print the predicted labels.
predictions = text_clf.predict(test['posts'])
print(predictions)

['ESTJ']


## ver9
- train 16문항 숫자 맞춰서(16_v2)
- ESTP

In [34]:
# Classify the posts in `test` with text_clf and print the predicted labels.
predictions = text_clf.predict(test['posts'])
print(predictions)

['INFP']


## ver8
- ESTP

In [26]:
# Classify the posts in `test` with text_clf and print the predicted labels.
predictions = text_clf.predict(test['posts'])
print(predictions)

['INFP']


## ver7
- 70문항으로 train
- ENTP

In [18]:
# Classify the posts in `test` with text_clf and print the predicted labels.
predictions = text_clf.predict(test['posts'])
print(predictions)

['ESTJ']


## ver6
- 16문항으로 train
- 원래 정답: ENTP

In [10]:
# Classify the posts in `test` with text_clf and print the predicted labels.
predictions = text_clf.predict(test['posts'])
print(predictions)

['ESTJ']


## ver5
- train data 기존 70문항
- test data 16문항
- 원래 정답: ENTP

In [9]:
# Classify the posts in `test` with text_clf and print the predicted labels.
predictions = text_clf.predict(test['posts'])
print(predictions)

['ESTJ']


## ver4
- 원래 정답: ISTP

In [33]:
# Classify the posts in `test` with text_clf and print the predicted labels.
predictions = text_clf.predict(test['posts'])
print(predictions)

['INTP']


## ver3.1

In [27]:
# Classify the posts in `test` with text_clf and print the predicted labels.
predictions = text_clf.predict(test['posts'])
print(predictions)


['INTP']


## ver3
- 원래 정답: ENFP
- 모델 답: INTP

In [17]:
# Classify the posts in `test` with text_clf and print the predicted labels.
predictions = text_clf.predict(test['posts'])
print(predictions)


['INTP']


## ver2
- 원래 정답: ESFJ
- 모델 답: ESFJ
    

In [10]:
# Classify the posts in `test` with text_clf and print the predicted labels.
predictions = text_clf.predict(test['posts'])
print(predictions)

['ESFJ']


## ver 1 
 - 원래 정답: ISFP
 - 모델 답: ISFJ

In [41]:
# Classify the posts in `test` with text_clf and print the predicted labels.
predictions = text_clf.predict(test['posts'])
print(predictions)

['ISFJ']


## ver 0

In [25]:
# Print the per-class precision / recall / f1-score / support table for y_test vs. predictions.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

        ENFJ       0.84      0.49      0.62       253
        ENFP       0.81      0.73      0.77      1049
        ENTJ       0.87      0.75      0.80       552
        ENTP       0.84      0.82      0.83      2017
        ESFJ       0.55      0.28      0.37        39
        ESFP       0.81      0.47      0.59        92
        ESTJ       0.77      0.76      0.76        90
        ESTP       0.91      0.91      0.91       336
        INFJ       0.81      0.83      0.82      2491
        INFP       0.79      0.82      0.81      2058
        INTJ       0.82      0.86      0.84      3761
        INTP       0.84      0.88      0.86      4346
        ISFJ       0.65      0.50      0.57       117
        ISFP       0.72      0.57      0.63       174
        ISTJ       0.77      0.56      0.65       230
        ISTP       0.83      0.77      0.80       562

    accuracy                           0.82     18167
   macro avg       0.79   

In [26]:
# Accuracy of the predictions against y_test, rounded to 10 decimal places.
overall_accuracy = round(metrics.accuracy_score(y_test, predictions), 10)
print(f"Overall accuracy of the model: {overall_accuracy}")

Overall accuracy of the model: 0.8225353663


In [27]:
# Predict labels for the posts in the test frame.
predictions = text_clf.predict(test['posts'])

In [28]:
# Show the predicted labels.
print(predictions)

['INFJ' 'ENFJ' 'INTP' ... 'ENTP' 'ISTP' 'ISTP']


In [29]:
# Write the predictions to a submission CSV.
sub_dir = './sub'
# exist_ok=True makes this a no-op when the directory already exists and avoids
# the check-then-create race of a separate existence test.
os.makedirs(sub_dir, exist_ok=True)

sub = pd.DataFrame(predictions)
# The submission must have the same shape as the test frame. An explicit raise
# is used because assert statements are stripped under `python -O`.
if sub.shape != test.shape:
    raise ValueError(
        f"submission shape {sub.shape} does not match test shape {test.shape}"
    )
# index=False and header=False: the file contains only the predicted labels.
sub.to_csv(os.path.join(sub_dir, 'baseline.csv'), index=False, header=False)
sub.head()

Unnamed: 0,0
0,INFJ
1,ENFJ
2,INTP
3,ENFJ
4,ENFJ
