In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [7]:
fetal = pd.read_csv("fetal_health.csv")
X = fetal.drop(["fetal_health"],axis=1).values
y = fetal["fetal_health"].values.ravel().astype(int) - 1

In [3]:
fetal[fetal.duplicated()]
fetal_dup = fetal.drop_duplicates(subset = None , keep = 'first', inplace = False)
corr = fetal.corr()

In [4]:
corr[abs(corr['fetal_health']) > 0.15]['fetal_health']
X_remove = fetal[corr[abs(corr['fetal_health']) > 0.10]['fetal_health'].index]
X_cor = X_remove.drop(['fetal_health'], axis=1)
y_cor = fetal['fetal_health']

s_scaler = preprocessing.StandardScaler()
X_trans= s_scaler.fit_transform(X)
X_cor_train, X_cor_test, y_cor_train,y_cor_test = train_test_split(X_cor,y_cor,test_size=0.3,random_state=123, stratify = y)

In [16]:
def get_results_simple(model, prediction): # 각 지표들 리턴해주는 함수
    result_dict = dict()
    
    test_accuracy = round(accuracy_score(y_cor_test, prediction),4)
    recall = round(recall_score(y_cor_test, prediction, average = "weighted", labels = np.unique(prediction)), 3)
    precision = round(precision_score(y_cor_test, prediction, average = "weighted", labels = np.unique(prediction)), 3)
    f1 = round(f1_score(y_cor_test, prediction, average = "micro", labels = np.unique(prediction)), 3)

    result_dict["test_accuracy"] = test_accuracy
    result_dict["recall"] = recall
    result_dict["f1_score"] = f1
    result_dict["precision"] = precision
    
    return result_dict

In [8]:
clf = DecisionTreeClassifier(random_state=123)

params =  {
    'min_samples_split': [2, 3, 4],
    'max_depth': [6, 16, None]
}

grid = GridSearchCV(estimator=clf,
                    param_grid=params,
                    cv=100,
                    n_jobs=1,
                    verbose=2)

grid.fit(X_cor_train, y_cor_train)

dt_grid = DecisionTreeClassifier(random_state=123, max_depth = grid.best_params_['max_depth'], min_samples_split = grid.best_params_['min_samples_split'])
dt_grid.fit(X_cor_train, y_cor_train)

Fitting 100 folds for each of 9 candidates, totalling 900 fits
[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=6, min_samples_split=2; total time=   0.0s
[CV] END ...................max_depth=6, min_s

DecisionTreeClassifier(max_depth=16, random_state=123)

In [9]:
print(f"Training Accuracy: {dt_grid.score(X_cor_train, y_cor_train):0.3f}")
print(f"Test Accuracy: {dt_grid.score(X_cor_test, y_cor_test):0.3f}")

Training Accuracy: 1.000
Test Accuracy: 0.929


In [17]:
prediction = dt_grid.predict(X_cor_test)
dt_grid_result = get_results_simple(dt_grid, prediction)
dt_grid_result['train_accuracy'] = round(dt_grid.score(X_cor_train, y_cor_train),3)
dt_grid_result 

{'test_accuracy': 0.9295,
 'recall': 0.929,
 'f1_score': 0.929,
 'precision': 0.929,
 'train_accuracy': 1.0}