# 결정트리

## 라이브러리 Import 및 설정

In [1]:
%matplotlib inline

import graphviz
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import warnings

rcParams['figure.figsize'] = (16,8)
plt.style.use('fivethirtyeight')
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)
warnings.simplefilter('ignore')

In [2]:
# Load the data

data_dir = Path(r'C:\Users\789\Desktop\github_local\Machine Learning\machine-learning-projects\data\astronomical-object')

trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'

target_col = 'type'
seed = 42

algo_name = 'dt' #decision tree
feature_name = 'j1' #
model_name = f'{algo_name}_{feature_name}'
# model_name은 알고리즘과 피쳐링의 조합으로 만들어짐.
# 디시젼 트리와 j1 피쳐 조합으로 모델 한 개.

sub_file = data_dir / f'{model_name}.csv'
# sub_file을 저장하고 싶은 path를 생성
# 나중에 이 곳에 pd.to_csv를 통해 df를 반환할 것임

In [7]:
trn = pd.read_csv(trn_file, index_col = 0)
tst = pd.read_csv(tst_file, index_col = 0)
y = trn[target_col] # tst에는 y 컬럼이 없음. 예측해야 하는 대상이므로.
trn.drop(target_col, axis = 1, inplace = True) # trn을 피쳐만으로 구성

print(y.shape, trn.shape, tst.shape)
trn.head()

#trn.shape은 y 컬럼 하나 삭제한 21이 나와야 하고
#tst.shape 또한 y가 원래 없었으므로 21이 나와줘야함

(199991,) (199991, 21) (10009, 21)


Unnamed: 0_level_0,fiberID,psfMag_u,psfMag_g,psfMag_r,psfMag_i,psfMag_z,fiberMag_u,fiberMag_g,fiberMag_r,fiberMag_i,fiberMag_z,petroMag_u,petroMag_g,petroMag_r,petroMag_i,petroMag_z,modelMag_u,modelMag_g,modelMag_r,modelMag_i,modelMag_z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,601,23.1982,21.432,21.3141,21.1766,21.1714,22.5813,21.6445,21.6576,21.3877,21.5728,22.5043,21.4316,21.4783,21.1454,20.4224,22.7492,21.4655,21.3642,21.0206,21.1473
1,788,21.4314,20.7081,20.6789,20.7034,20.4732,21.8688,21.0298,20.9671,20.9377,21.0636,21.3607,20.779,20.8897,20.6398,20.6467,21.493,20.7585,20.7539,20.6934,20.5123
2,427,17.8515,16.7279,16.6797,16.6946,16.6418,18.1719,17.0331,16.9997,17.096,17.0764,17.8673,16.7388,16.6889,16.7442,16.808,17.8181,16.6974,16.6412,16.6602,16.6889
3,864,20.7899,20.0404,19.9269,19.8438,19.4633,21.039,20.3172,20.2179,20.0739,19.7945,20.4339,19.9937,19.9855,19.7509,19.4551,20.7707,20.0017,19.8898,19.7581,19.5529
4,612,26.455,23.0588,21.4714,19.505,18.3891,25.7006,23.6291,21.7428,19.8617,18.8104,25.8592,22.4269,21.6736,19.61,18.3761,24.8771,23.148,21.4753,19.4873,18.3757


## Train Decision Tree

In [8]:
clf = DecisionTreeClassifier(max_depth = 5, 
                             min_samples_leaf= 10,
                             random_state= 42)
# 인수가 많이 필요 없음
# 단일 트리의 단점 : 하이 배리언스. CV에서 낮은 성능을 보일 확률이 높음.
clf.fit(trn, y)

In [9]:
print(f'{accuracy_score(y, clf.predict(trn))*100:.4f}%')

76.9380%


## 결정트리 시각화

In [10]:
dot_data = export_graphviz(clf, out_file = None,
                           feature_names= trn.columns,
                           filled = True,
                           rounded = True,
                           special_characters= True)

graph = graphviz.Source(dot_data)
graph

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x16733a00e90>

## 시험 데이터 예측

In [11]:
sub = pd.read_csv(sample_file, index_col = 0)
print(sub.shape)
sub.head()

(10009, 19)


Unnamed: 0_level_0,STAR_WHITE_DWARF,STAR_CATY_VAR,STAR_BROWN_DWARF,SERENDIPITY_RED,REDDEN_STD,STAR_BHB,GALAXY,SERENDIPITY_DISTANT,QSO,SKY,STAR_RED_DWARF,ROSAT_D,STAR_PN,SERENDIPITY_FIRST,STAR_CARBON,SPECTROPHOTO_STD,STAR_SUB_DWARF,SERENDIPITY_MANUAL,SERENDIPITY_BLUE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
199991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# 확률 어레이 구하기
probs = clf.predict_proba(tst)

# 서브미션 파일의 컬럼 구하기
class_cols = [col for col in sub.columns if col!= 'id']

# 각각의 컬럼에 대해 어레이 배정해주기
sub[class_cols] = probs

sub.head()

Unnamed: 0_level_0,STAR_WHITE_DWARF,STAR_CATY_VAR,STAR_BROWN_DWARF,SERENDIPITY_RED,REDDEN_STD,STAR_BHB,GALAXY,SERENDIPITY_DISTANT,QSO,SKY,STAR_RED_DWARF,ROSAT_D,STAR_PN,SERENDIPITY_FIRST,STAR_CARBON,SPECTROPHOTO_STD,STAR_SUB_DWARF,SERENDIPITY_MANUAL,SERENDIPITY_BLUE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
199991,0.0026,0.011,0.0,0.0173,0.001,0.0005,0.0005,0.00047015,0.024134,0.0,0.0,0.0,0.0011,0.0004,7.8358e-05,0.0,0.94037,0.0006,0.0
199992,0.0067,0.7184,0.0,0.1196,0.0247,0.003,0.1263,0.0,0.0,0.00026874,0.0,0.00026874,0.0,0.0003,0.0,0.0,0.00053749,0.0,0.0
199993,0.9692,0.0083,0.0003,0.0062,0.0002,0.0,0.0018,6.5117e-05,0.0,0.0,0.0,3.2558e-05,0.0,0.0131,0.0,0.0,3.2558e-05,0.0,0.0009
199994,0.0007,0.1371,0.0,0.0344,0.6928,0.0846,0.038,0.00050869,3.913e-05,7.826e-05,0.0,0.004813,0.0,0.0,0.0037956,0.0002,7.826e-05,0.0,0.003
199995,0.0026,0.011,0.0,0.0173,0.001,0.0005,0.0005,0.00047015,0.024134,0.0,0.0,0.0,0.0011,0.0004,7.8358e-05,0.0,0.94037,0.0006,0.0


In [16]:
sub.iloc[:5,1:].sum(axis=1)

id
199991    0.9974
199992    0.9933
199993    0.0308
199994    0.9993
199995    0.9974
dtype: float64

In [18]:
sub.to_csv(sub_file)

In [19]:
submission = pd.read_csv(sub_file, index_col = 0)
submission.head()


Unnamed: 0_level_0,STAR_WHITE_DWARF,STAR_CATY_VAR,STAR_BROWN_DWARF,SERENDIPITY_RED,REDDEN_STD,STAR_BHB,GALAXY,SERENDIPITY_DISTANT,QSO,SKY,STAR_RED_DWARF,ROSAT_D,STAR_PN,SERENDIPITY_FIRST,STAR_CARBON,SPECTROPHOTO_STD,STAR_SUB_DWARF,SERENDIPITY_MANUAL,SERENDIPITY_BLUE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
199991,0.0026,0.011,0.0,0.0173,0.001,0.0005,0.0005,0.00047015,0.024134,0.0,0.0,0.0,0.0011,0.0004,7.8358e-05,0.0,0.94037,0.0006,0.0
199992,0.0067,0.7184,0.0,0.1196,0.0247,0.003,0.1263,0.0,0.0,0.00026874,0.0,0.00026874,0.0,0.0003,0.0,0.0,0.00053749,0.0,0.0
199993,0.9692,0.0083,0.0003,0.0062,0.0002,0.0,0.0018,6.5117e-05,0.0,0.0,0.0,3.2558e-05,0.0,0.0131,0.0,0.0,3.2558e-05,0.0,0.0009
199994,0.0007,0.1371,0.0,0.0344,0.6928,0.0846,0.038,0.00050869,3.913e-05,7.826e-05,0.0,0.004813,0.0,0.0,0.0037956,0.0002,7.826e-05,0.0,0.003
199995,0.0026,0.011,0.0,0.0173,0.001,0.0005,0.0005,0.00047015,0.024134,0.0,0.0,0.0,0.0011,0.0004,7.8358e-05,0.0,0.94037,0.0006,0.0
