- 파라미터를 주고 전부 실행

------

# 1. 환경준비

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
path = 'https://raw.githubusercontent.com/jangrae/csv/master/titanic.csv'
data = pd.read_csv(path)

# 2. 데이터 이해

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [6]:
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# 3. 데이터 준비

#### 1) title 변수 추가

In [7]:
# 호칭 추출
data['Title'] = data['Name'].str.extract('([A-Za-z]+)\.', expand=False)

In [8]:
# 대표 호칭 이외는 Others로 변경
main_tit_list = ['Mr', 'Miss', 'Mrs']
data.loc[data['Title'].isin(main_tit_list)==False, ['Title']] = 'Others'

#### 2) 결측치 처리

In [9]:
# Age 중앙값으로 채우기
data['Age'].fillna(data.groupby('Title')['Age'].transform('median'), inplace=True)

In [10]:
# Embarked 최빈값으로 채우기
data['Embarked'] = data['Embarked'].fillna('S')

**3) AgeGroup 변수 추가**

In [11]:
# 변수 추가
bins = [0, 16, 25, 35, 60, 100]
data['AgeGroup'] = pd.cut(data['Age'], 
                          bins=bins, 
                          labels=['0_16', '17_25', '26_35', '36_60', '61_100'])

**4) 가변수화**

In [12]:
# 가변수화
dummy_vars = ['Title', 'AgeGroup', 'Pclass', 'Sex', 'Embarked']

for v in dummy_vars:
    dummies = pd.get_dummies(data[v], prefix=v, drop_first=True)
    data = pd.concat([data, dummies], axis=1)
    data.drop(v, axis=1, inplace=True)

**5) 불필요한 변수 제거**

In [13]:
# 불필요한 열 제거
drop_cols = ['PassengerId', 'Name', 'Age', 'Ticket', 'Cabin']
data.drop(drop_cols, axis=1, inplace=True)

**6) x, y 분리**

In [14]:
# target 확인
target = 'Survived'

# 데이터 분리
x = data.drop(target, axis = 1)
y = data[target] 

**7) 정규화**

In [15]:
# 정규화
cols = ['SibSp', 'Parch', 'Fare']
x[cols] = (x[cols] - x[cols].min()) / (x[cols].max() - x[cols].min())

# 확인
x.head()

Unnamed: 0,SibSp,Parch,Fare,Title_Mr,Title_Mrs,Title_Others,AgeGroup_17_25,AgeGroup_26_35,AgeGroup_36_60,AgeGroup_61_100,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,0.125,0.0,0.014151,1,0,0,1,0,0,0,0,1,1,0,1
1,0.125,0.0,0.139136,0,1,0,0,0,1,0,0,0,0,0,0
2,0.0,0.0,0.015469,0,0,0,0,1,0,0,0,1,0,0,1
3,0.125,0.0,0.103644,0,1,0,0,1,0,0,0,0,0,0,1
4,0.0,0.0,0.015713,1,0,0,0,1,0,0,0,1,1,0,1


**8) 학습용, 평가용 데이터 분리**

In [16]:
# 모듈 불러오기
from sklearn.model_selection import train_test_split

# 7:3으로 분리
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2022)

# 4. 모델링

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [36]:
model = DecisionTreeClassifier(max_depth=5)

In [37]:
model.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=5)

In [38]:
y_pred = model.predict(x_test)

In [39]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[156  12]
 [ 36  64]]
              precision    recall  f1-score   support

           0       0.81      0.93      0.87       168
           1       0.84      0.64      0.73       100

    accuracy                           0.82       268
   macro avg       0.83      0.78      0.80       268
weighted avg       0.82      0.82      0.81       268



# 5. 성능튜닝

#### GridSearch

#### 1) 모델링 & 튜닝

In [41]:
# 파라미터 선언

param = {'max_depth' : range(3,21)}
param

{'max_depth': range(3, 21)}

In [42]:
# 불러오기

from sklearn.model_selection import GridSearchCV

In [43]:
# 선언하기

model_tree = DecisionTreeClassifier()

model = GridSearchCV(model_tree,             # 기본 모델
                     param,                  # 파라미터
                     cv=10,                  # k-Fold k개수
                     scoring='accuracy')     # 비교 지수

In [44]:
# 학습하기

model.fit(x_train,y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(3, 21)}, scoring='accuracy')

#### 2) 결과 확인

In [45]:
model.cv_results_

{'mean_fit_time': array([0.00376544, 0.00308812, 0.00368559, 0.00358472, 0.00391893,
        0.00490065, 0.00389187, 0.00425267, 0.00332997, 0.00326355,
        0.00351653, 0.00344095, 0.00339055, 0.00340967, 0.00342119,
        0.00377541, 0.00335493, 0.00300436]),
 'std_fit_time': array([1.05791207e-03, 5.10966640e-04, 4.54401109e-04, 5.04696894e-04,
        7.16348606e-04, 7.12923721e-04, 5.79397148e-04, 9.50246043e-04,
        6.32640251e-04, 4.25567774e-04, 5.02509441e-04, 4.31025372e-04,
        4.99385309e-04, 1.56400759e-03, 4.91992463e-04, 3.82786517e-04,
        6.40184956e-04, 7.83703158e-05]),
 'mean_score_time': array([0.00203867, 0.00163975, 0.00208824, 0.00229967, 0.0021122 ,
        0.00280693, 0.0020865 , 0.00333302, 0.00185666, 0.00175436,
        0.00210454, 0.0015588 , 0.00179048, 0.00189662, 0.00183864,
        0.00173199, 0.00210073, 0.00155718]),
 'std_score_time': array([0.00076957, 0.00054285, 0.00030222, 0.00045464, 0.00068367,
        0.00075048, 0.00031248, 

In [28]:
# 최적 파라미터
model.best_params_

{'max_depth': 5}

In [29]:
# 최고 성능
model.best_score_

0.8235535074244751

In [32]:
# 최적 모델 확인
model.best_estimator_.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 5,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

#### 3) 예측과 평가

In [33]:
y_pred = model.predict(x_test)

In [35]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[156  12]
 [ 36  64]]
              precision    recall  f1-score   support

           0       0.81      0.93      0.87       168
           1       0.84      0.64      0.73       100

    accuracy                           0.82       268
   macro avg       0.83      0.78      0.80       268
weighted avg       0.82      0.82      0.81       268

