## 1. 가우시안 나이브 베이지안

#### 데이터 준비

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
# Plot configuration: use the 'Malgun Gothic' font family for text (the
# notebook's labels are Korean) and disable the Unicode minus sign on axes.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

In [None]:
# Load the raw dataset from CSV. As the name suggests it still contains
# missing values; they are inspected and dropped in the following cells.
data_w_null = pd.read_csv('../data/size_korea_w_null.csv')

In [None]:
# Preview the first five rows of the raw data.
data_w_null.head(n=5)

In [None]:
# Number of missing values in each column of the raw data.
data_w_null.isna().sum()

In [None]:
# Keep only the columns used for modelling: height, weight, body-fat ratio,
# sex (the label) and age.
selected_columns = ['키', '몸무게', '체지방율', '성별', '나이']
data_small_w_null = data_w_null[selected_columns]

In [None]:
# Preview the first five rows of the reduced data.
data_small_w_null.head(n=5)

In [None]:
# Number of missing values in each selected column.
data_small_w_null.isna().sum()

In [None]:
# Show the rows that contain at least one missing value.
has_null = data_small_w_null.isna().any(axis=1)
data_small_w_null[has_null]

In [None]:
# Drop every row with a missing value, then renumber the index from 0
# (the old index is discarded rather than kept as a column).
data_small = data_small_w_null.dropna()
data_small = data_small.reset_index(drop=True)

In [None]:
# Features: every column except the sex label.
X = data_small.drop('성별', axis=1)
# Target: encode sex as 0 ('남') / 1 ('여').
# Series.map is used instead of Series.replace: replace() on an object column
# relies on implicit downcasting to end up numeric, which pandas 2.2
# deprecates (FutureWarning); without it y would stay object dtype.
# NOTE: any label other than '남'/'여' becomes NaN here instead of being kept
# as a string, so unexpected labels surface as missing values.
y = data_small['성별'].map({'남': 0, '여': 1})

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. The split is stratified on y and
# seeded so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

#### 파이썬 코드

클래스와 알고리즘 선택, 적합

In [None]:
from sklearn.naive_bayes import GaussianNB

# Create the Gaussian naive Bayes classifier and fit it on the training split
# (fit returns the estimator itself, so the call can be chained).
clf = GaussianNB().fit(X_train, y_train)
clf

예측 확률 분포 그리기

In [None]:
# Predicted class and per-class probabilities for the test set.
y_predict = clf.predict(X_test)
y_probas = clf.predict_proba(X_test)

# Table of rounded probabilities (one column per class) plus the predicted
# class in the '분류' column.
proba_table = pd.DataFrame(y_probas, columns=['남', '여']).round(2)
data = proba_table.assign(**{'분류': y_predict})
data

In [None]:
# Side-by-side histograms of the predicted probability for each class.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
ax_male, ax_female = axs
ax_male.hist(y_probas[:, 0], color='blue', label='남')
ax_male.legend()
ax_female.hist(y_probas[:, 1], color='red', label='여')
ax_female.legend()

성능 평가

In [None]:
# Confusion matrix and classification report on the test set.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2,
# so importing it fails on current releases;
# ConfusionMatrixDisplay.from_estimator is its documented replacement.
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
print(classification_report(y_test, y_predict))

In [None]:
# ROC and precision-recall curves drawn with the third-party scikit-plot
# package, from the true test labels and the predicted class probabilities.
import scikitplot as skplt
skplt.metrics.plot_roc(y_test, y_probas)
skplt.metrics.plot_precision_recall(y_test, y_probas)

모델 저장

In [None]:
from joblib import dump, load
# Serialize the fitted Gaussian naive Bayes classifier to disk.
dump(clf, '../working/gnb.joblib')

In [None]:
# Reference: using RandomizedSearchCV purely as a cross-validation harness.
# The search space is empty, so only the classifier's current settings are
# scored, with 5-fold cross-validation on accuracy.
from sklearn.model_selection import RandomizedSearchCV
hyperparameters = dict()
randomized_search_lr = RandomizedSearchCV(
    estimator=clf,
    param_distributions=hyperparameters,
    cv=5,
    scoring='accuracy',
)
randomized_search_lr.fit(X_train, y_train)

In [None]:
# Inspect the cross-validation results recorded by the search above.
randomized_search_lr.cv_results_

In [None]:
# Reference: another way to draw the ROC and precision-recall curves, using
# scikit-learn's own display classes.
from sklearn.metrics import roc_curve, precision_recall_curve, RocCurveDisplay, PrecisionRecallDisplay

# The second entry of clf.classes_ is treated as the positive class; its
# predicted probability is the ranking score.
positive_label = clf.classes_[1]
y_score = clf.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=positive_label)
prec, recall, _ = precision_recall_curve(y_test, y_score, pos_label=positive_label)

fig, axs = plt.subplots(1, 2, figsize=(12, 4))
roc_ax, pr_ax = axs
RocCurveDisplay(fpr=fpr, tpr=tpr).plot(ax=roc_ax)
PrecisionRecallDisplay(precision=prec, recall=recall).plot(ax=pr_ax)
plt.show()

## 2. 로지스틱 회귀

#### 파이썬 코드

클래스와 알고리즘 선택

In [None]:
from sklearn.linear_model import LogisticRegression

# Base logistic-regression estimator (liblinear solver, fixed seed).
# It is only configured here, not fitted.
clf = LogisticRegression(
    random_state=0,
    solver='liblinear',
)

Hyperparameters 설정과 적합

In [None]:
# List the names of the hyperparameters this estimator exposes.
clf.get_params(deep=True).keys()

In [None]:
# Two sub-spaces: an L2 penalty with ten log-spaced values of C between 1e-5
# and 1e5, and an L1 penalty that leaves C at the estimator's current value.
hyperparameters = [{'C': np.logspace(-5, 5, 10),
                    'penalty': ['l2']},
                   {'penalty': ['l1']}]

# random_state seeds the candidate sampling. The space holds 11 candidates and
# only a subset is drawn, so without a seed the chosen candidates (and possibly
# the best model) change between runs. 0 matches the seed used elsewhere.
randomized_search = RandomizedSearchCV(clf, hyperparameters, cv=5, scoring='accuracy',
                                       random_state=0)
randomized_search.fit(X_train, y_train)

In [None]:
# Inspect the per-candidate cross-validation results of the search.
randomized_search.cv_results_

In [None]:
# Coefficients and intercept of the best model found by the search.
best_lr = randomized_search.best_estimator_
best_lr.coef_, best_lr.intercept_

모델 저장

In [None]:
from joblib import dump, load
# Persist the tuned, fitted model. `clf` itself is never fitted:
# RandomizedSearchCV fits clones of the estimator it is given, so dumping
# `clf` would store an untrained model that cannot predict after loading.
dump(randomized_search.best_estimator_, '../working/lr.joblib')

저장한 모델 불러오기

In [None]:
# Reload the saved logistic-regression model and check one of its parameters.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model = load('../working/lr.joblib')
model.get_params()['solver']

## 3. SVC

#### 파이썬 코드

클래스와 알고리즘 선택, 적합

In [None]:
from sklearn.svm import SVC

svc = SVC(random_state=0)
# Two sub-spaces: a linear kernel over C, and an RBF kernel over C and gamma.
hyperparameters = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                   {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}]
# random_state seeds the candidate sampling. The space holds 12 candidates and
# only a subset is drawn, so without a seed the result changes between runs.
# 0 matches the seed used elsewhere. n_jobs=-1 uses all available cores.
randomized_search = RandomizedSearchCV(svc, hyperparameters, cv=5, n_jobs=-1,
                                       scoring='accuracy', verbose=2,
                                       random_state=0)
randomized_search.fit(X_train, y_train)

성능 평가

In [None]:
# Imported in this cell so the evaluation does not depend on an earlier
# metrics import. plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its documented replacement.
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# Evaluate the best SVC found by the search on the test set.
model = randomized_search.best_estimator_
y_predict = model.predict(X_test)
y_scores = model.decision_function(X_test)
print(classification_report(y_test, y_predict))
# normalize='true' scales each row (true class) of the matrix to sum to 1.
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, normalize='true')

모델 저장

In [None]:
from joblib import dump, load
# Serialize the best SVC found by the search to disk.
dump(model, '../working/svc.joblib')

## 4. 의사결정 트리

#### 파이썬 코드

클래스와 알고리즘 선택, 적합

In [None]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(random_state=0)

# Search space for the tree. The fractional min_samples_* values are read by
# scikit-learn as fractions of the number of training samples.
# 'auto' is no longer listed under max_features: for classification trees it
# was an alias of 'sqrt', deprecated in scikit-learn 1.1 and removed in 1.3,
# where sampling it makes the candidate's fits fail.
hyperparameters = {'criterion': ['gini', 'entropy'],
                   'splitter': ['best', 'random'],
                   'max_depth': np.arange(1, 10),
                   'min_samples_split': [0.005, 0.01, 0.05, 0.1],
                   'min_samples_leaf': [0.005, 0.01, 0.05, 0.1],
                   'max_features': ['sqrt', 'log2']}

# random_state seeds the candidate sampling so the search is reproducible;
# 0 matches the seed used elsewhere.
randomized_search = RandomizedSearchCV(model, hyperparameters, cv=5, scoring='accuracy',
                                       random_state=0)
randomized_search.fit(X_train, y_train)

성능 평가

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict with the refitted best estimator (the search object delegates
# predict to it) and report test-set metrics.
y_predict = randomized_search.predict(X_test)
print(classification_report(y_test, y_predict))

# Raw counts first, then the row-normalized matrix expressed in percent.
cm_counts = confusion_matrix(y_test, y_predict)
cm_row_pct = confusion_matrix(y_test, y_predict, normalize='true').round(4) * 100
print(cm_counts)
print(cm_row_pct)

변수 중요도 알아보기

In [None]:
# Configure a fresh tree with the hyperparameters found by RandomizedSearchCV.
# random_state is 0, the same seed as the estimator used in the search: with
# splitter='random' or a restricted max_features the fitted tree depends on
# the seed, so a different seed (previously 11) would inspect and save a
# different tree from the one evaluated above.
tuned_params = randomized_search.best_params_
model = DecisionTreeClassifier(random_state=0).set_params(**tuned_params)
model.get_params()

In [None]:
# Fit the tuned tree on the training split and print its feature importances.
model.fit(X_train, y_train)
importances = model.feature_importances_
print(f"Feature importances:\n{importances}")

변수 중요도 그리기

In [None]:
# Horizontal bar chart of feature importances, one bar per training column,
# with the column names as y-axis tick labels.
n_features = X_train.shape[1]
bar_positions = np.arange(n_features)
plt.barh(bar_positions, model.feature_importances_, align='center')
plt.yticks(bar_positions, X_train.columns)
plt.xlabel('변수의 중요도')
plt.ylabel('변수')

모델 저장

In [None]:
from joblib import dump, load
# Serialize the fitted, tuned decision tree to disk.
dump(model, '../working/dt.joblib')