## 타이타닉 생존율 구하기

In [3]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
t_df = pd.read_pickle('./dataset/t_df.pkl')
t_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   survived  1309 non-null   int64  
 2   sex       1309 non-null   int32  
 3   age       1309 non-null   float64
 4   sibsp     1309 non-null   int64  
 5   parch     1309 non-null   int64  
 6   fare      1309 non-null   float64
 7   cabin     1309 non-null   int32  
 8   embarked  1309 non-null   int32  
dtypes: float64(2), int32(3), int64(4)
memory usage: 76.8 KB


### Decision Tree

In [5]:
y_df = t_df.survived
x_df = t_df.drop('survived', axis = 1)

x_train, x_test, y_train, y_test = train_test_split(x_df
                                                    , y_df
                                                    , test_size=0.2
                                                    , random_state=0)

dt_clf = DecisionTreeClassifier(random_state=0)
dt_clf.fit(x_train, y_train)
dt_pred = dt_clf.predict(x_test)

print('Score: {}'.format(dt_clf.score(x_train, y_train)))
print()

df_accuracy = accuracy_score(y_test, dt_pred)
print('예측 정확도', df_accuracy)

Score: 0.9751671442215855

예측 정확도 0.7595419847328244


### Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(random_state=0)

lr_clf.fit(x_train, y_train)

lr_pred = lr_clf.predict(x_test)

lr_accuracy = accuracy_score(y_test, lr_pred)
print('lr 예측 정확도: ', lr_accuracy)

# 아래와 같은 값임
print('lr_clf.score(x_test, y_test)): ', lr_clf.score(x_test, y_test)); print()

print('[train_score]: ', lr_clf.score(x_train, y_train))
print('[test_score]: ', lr_clf.score(x_test, y_test))

lr 예측 정확도:  0.7938931297709924
lr_clf.score(x_test, y_test)):  0.7938931297709924

[train_score]:  0.789875835721108
[test_score]:  0.7938931297709924


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### 평가 사용자 함수

In [7]:
# 평가 사용자 함수
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    print('오차 행렬')
    print(confusion)
    print()
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}'.format(accuracy, precision, recall, f1))

get_clf_eval(y_test, dt_pred)

"""
오차 행렬 (생존자 예측이므로 1을 기준으로 하기 때문에 TP와 TN이 뒤집어짐)
예측---N---------P
실N[[135(TN)  27(FP)]
제P [ 36(FN)  64(TP)]]

정밀도 = TP / (TP + FP)
재현율 = TP / (TP + FN)
f1 = 2 * {정밀도 * 재현율 / (정밀도 + 재현율)}

정확도: 0.7595, 정밀도: 0.7033, 재현율: 0.6400, F1: 0.6702
"""

오차 행렬
[[135  27]
 [ 36  64]]

정확도: 0.7595, 정밀도: 0.7033, 재현율: 0.6400, F1: 0.6702


'\n오차 행렬 (생존자 예측이므로 1을 기준으로 하기 때문에 TP와 TN이 뒤집어짐)\n예측---N---------P\n실N[[135(TN)  27(FP)]\n제P [ 36(FN)  64(TP)]]\n\n정밀도 = TP / (TP + FP)\n재현율 = TP / (TP + FN)\nf1 = 2 * {정밀도 * 재현율 / (정밀도 + 재현율)}\n\n정확도: 0.7595, 정밀도: 0.7033, 재현율: 0.6400, F1: 0.6702\n'

----

### 교차 검증 - KFold
- 문제가 많아서 제한적으로 사용

![screenshot](./images/grid_search_workflow.png)

출처: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation

A solution to this problem is a procedure called cross-validation (CV for short). A test set should still be held out for final evaluation, but the validation set is no longer needed when doing CV. In the basic approach, called k-fold CV, the training set is split into k smaller sets (other approaches are described below, but generally follow the same principles). The following procedure is followed for each of the k “folds”:

A model is trained using  of the folds as training data;

the resulting model is validated on the remaining part of the data (i.e., it is used as a test set to compute a performance measure such as accuracy).

The performance measure reported by k-fold cross-validation is then the average of the values computed in the loop. This approach can be computationally expensive, but does not waste too much data (as is the case when fixing an arbitrary validation set), which is a major advantage in problems such as inverse inference where the number of samples is very small.

The simplest way to use cross-validation is to call the cross_val_score helper function on the estimator and the dataset.

![screenshot](./images/grid_search_cross_validation.png)

In [8]:
# cross_val_scores
# KFold의 일련 과정을 한꺼번에 수행해주는 API

from sklearn.model_selection import cross_val_score

scores = cross_val_score(dt_clf, x_df, y_df, cv=10)
for iter_count, accuracy in enumerate(scores):
    print('교차 검증 {0}, 정확도: {1:.4f}'.format(iter_count, accuracy))

print()
print('평균 정확도: {0:.4f}'.format(np.mean(scores)))

교차 검증 0, 정확도: 0.5802
교차 검증 1, 정확도: 0.7863
교차 검증 2, 정확도: 0.8092
교차 검증 3, 정확도: 0.7710
교차 검증 4, 정확도: 0.7252
교차 검증 5, 정확도: 0.7176
교차 검증 6, 정확도: 0.6641
교차 검증 7, 정확도: 0.6031
교차 검증 8, 정확도: 0.6947
교차 검증 9, 정확도: 0.7231

평균 정확도: 0.7074


### GridSearchCV
- CV 종합편

In [9]:
#GridSearchCV
# DT 파라미터
# max_depth: 트리의 최대 깊이
# max_features: 최적의 분할을 위해 고려할 최대 피처 개수


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

parameters = {'max_depth': [2, 3, 5, 10]
              , 'min_samples_split' : [2, 3, 5]
              , 'min_samples_leaf': [1, 5, 8]}

grid_dclf = GridSearchCV(dt_clf
                         # 매개변수들
                         , param_grid=parameters
                         # 판정하기 위한 기준
                         , scoring = 'accuracy'
                         # 교차 검증 횟수
                         , cv=5
                         # 하이퍼 파라미터 적용 여부
                         , refit=True)

grid_dclf.fit(x_train, y_train)

print(grid_dclf); print()
# 교차 검증을 기반으로 최적의 하이퍼 파라미터를 찾아줌('max_depth', 'min_samples_split' , 'min_samples_leaf')
print('GridSearchCV 최적 하이퍼 파라미터: ', grid_dclf.best_params_); print()
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_dclf.best_score_)); print()

# 최적의 하이퍼 파라미터 적용
best_dclf = grid_dclf.best_estimator_
print(best_dclf); print()

dt_pred = best_dclf.predict(x_test)
accuracy = accuracy_score(y_test, dt_pred)
print('dt 예측 정확도: ', accuracy); print()

# 평가 사용자 함수(재사용)
get_clf_eval(y_test, dt_pred)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=0, splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [2, 3, 5, 10],
                         'min_samples_leaf': [1, 5, 8],
                  

----
## 한글 텍스트 처리 - 감성분석(네이버 영화평점)

In [10]:
import pandas as pd
news_df = pd.read_csv('./dataset/nsmc/ratings_train.txt', sep='\t')
news_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [11]:
news_df.label.value_counts()

0    75173
1    74827
Name: label, dtype: int64

In [12]:
x = news_df.iloc[:, :-1]
y = news_df.iloc[:, -1]

print(x.shape)
print(y.shape)

(150000, 2)
(150000,)


In [13]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y
    , test_size=0.2
    , random_state=11
)

In [14]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120000 entries, 94561 to 141209
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        120000 non-null  int64 
 1   document  119996 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.7+ MB


In [15]:
# Null, 숫자를 공백으로 처리

import re
x_train = x_train.fillna(" ")
# 숫자로 된 거는 공백으로 채우기
x_train.document = x_train.document.apply(lambda x : re.sub(r"\d+", " ", x))
x_test = x_test.fillna(" ")
x_test.document = x_test.document.apply(lambda x : re.sub(r"\d+", " ", x))

In [16]:
x_train.info()
print()
x_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120000 entries, 94561 to 141209
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        120000 non-null  int64 
 1   document  120000 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.7+ MB

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 77509 to 36912
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        30000 non-null  int64 
 1   document  30000 non-null  object
dtypes: int64(1), object(1)
memory usage: 703.1+ KB


In [17]:
# morphs() 메소드는 입력 인자로 들어온 문장을 형태소 단어 형태로 토큰화하여 list로 변환

import warnings

warnings.filterwarnings('ignore')


from konlpy.tag import Okt

okt = Okt()

def tw_tokenizer(text):
    tokens_ko = okt.morphs(text)
    return tokens_ko

In [18]:
# 사이킷런의 TfidfVectorizer를 이용, TF-IDF 피처 모델을 생성(10분 소요)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# 위에서 만든 tw_tokenizer() 함수를 tokenizer로 사용, ngram_range는 (1,2)
tfidf_vect = TfidfVectorizer(
    tokenizer=tw_tokenizer
    , ngram_range=(1,2)
    , min_df=3
    # 상위 10% 피처로 추출하지 않음
    , max_df=0.9
)

tfidf_vect.fit(x_train.document)
tfidf_train = tfidf_vect.transform(x_train.document)

In [19]:
# 교차검증 및 하이퍼 파라미터 튜닝
# Logistic Regression을 이용하여 감성 분석 Classification 수행
# alpha 값의 역수(값이 작을수록 규제가 강한 것)
# 로지스틱 회귀의 하이퍼 파라미터 C를 설정
# C는 규제 강도를 조절하는  alpha 값의 역수로 작을수록 규제강도가 크며
# Penalty는 규제의 유형을 설정하며 11 규제와 12규제가 있으며 기본은 12임
lr_clf = LogisticRegression(random_state=0)
params = {'C': [1,3.5,4.5,5.5,10]}

gcv_lr = GridSearchCV(
    lr_clf
    , param_grid=params
    , cv=3
    , scoring='accuracy'
    , verbose=0
)

gcv_lr.fit(tfidf_train, y_train)

print(gcv_lr.best_params_, round(gcv_lr.best_score_, 4))

{'C': 3.5} 0.8555


In [20]:
# 테스트 검증을 가지고 최종 검증
# 테스트 세트를 이용 예측 시 학습할 때 적용한 TfidfVectorizer를 그대로 사용해야 함
# 그래야 학습 시 설정한 피처 개수와 테스트 데이터를 TfidfVectorizer로 변경할 피처 개수가 같아짐

from sklearn.metrics import accuracy_score

tfidf_test = tfidf_vect.transform(x_test.document)

best_estimator = gcv_lr.best_estimator_

lr_preds = best_estimator.predict(tfidf_test)

print('Logistic Regression 정확도: ', accuracy_score(y_test, lr_preds))

Logistic Regression 정확도:  0.859


In [21]:
print(y_test.values[:10])
print(lr_preds[:10])

[0 0 1 0 0 0 0 1 1 1]
[0 0 1 0 0 0 0 1 1 1]


### Q. RF, DT를 이용하여 네이버 영화 평점 감성 분석 Classification 수행
- GridSearchCV를 이용 교차검증(cv=3)과 하이퍼 파라미터 튜닝
 - RF params = {'n_estimators':[50,100,200], 'max_depth':[2,3,5], 'min_samples_leaf':[1,5,8]}
 - DT params = {'max_depth':[2,3,5,10], 'min_samples_split':[2,3,5], 'min_samples_leaf':[1,5,8]}

In [31]:
# rf를 이용하여 감성 분석 Classification 수행
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_clf = RandomForestClassifier(random_state=0)

rf_params = {
    # 나무 개수
    'n_estimators':[50, 100, 200]
    , 'max_depth':[2, 3, 5]
    , 'min_samples_leaf':[1, 5, 8]}

gcv_rf = GridSearchCV(rf_clf
                      # 매개변수들
                      , param_grid=rf_params
                      # 판정하기 위한 기준
                      , scoring = 'accuracy'
                      # 교차 검증 횟수
                      , cv=3
                      , verbose=1)

gcv_rf.fit(tfidf_train, y_train)

print(gcv_rf.best_params_, round(gcv_rf.best_score_, 4))

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:  4.1min finished


{'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 200} 0.7538


In [32]:
# 테스트 검증을 가지고 최종 검증
# 테스트 세트를 이용 예측 시 학습할 때 적용한 TfidfVectorizer를 그대로 사용해야 함
# 그래야 학습 시 설정한 피처 개수와 테스트 데이터를 TfidfVectorizer로 변경할 피처 개수가 같아짐

from sklearn.metrics import accuracy_score

tfidf_test = tfidf_vect.transform(x_test.document)
best_estimator = gcv_rf.best_estimator_
rf_preds = best_estimator.predict(tfidf_test)

print('RandomForest 정확도: ', accuracy_score(y_test, rf_preds))

print(y_test.values[:10])
print(rf_preds[:10])

RandomForest 정확도:  0.7545
[0 0 1 0 0 0 0 1 1 1]
[0 0 0 0 0 0 0 1 1 1]


In [None]:
# dt를 이용하여 감성 분석 Classification 수행
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt_clf = DecisionTreeClassifier(random_state=0)

dt_params = {
    'max_depth':[2,3,5,10]
    , 'max_depth':[2, 3, 5]
    , 'min_samples_leaf':[1, 5, 8]}

gcv_dt = GridSearchCV(dt_clf
                      # 매개변수들
                      , param_grid=dt_params
                      # 판정하기 위한 기준
                      , scoring = 'accuracy'
                      # 교차 검증 횟수
                      , cv=3
                      , verbose=0)

gcv_dt.fit(tfidf_train, y_train)

print(gcv_dt.best_params_, round(gcv_dt.best_score_, 4))

tfidf_test = tfidf_vect.transform(x_test.document)
best_estimator = gcv_dt.best_estimator_
dt_preds = best_estimator.predict(tfidf_test)

print('Decision Tree 정확도: ', accuracy_score(y_test, dt_preds))