<a href="https://colab.research.google.com/github/hyunicecream/ML-DL/blob/main/GBM(for%20Classification)(Titanic).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load the Titanic passenger data from a CSV file in the working directory.
DATA_PATH = 'titanic.csv'
df = pd.read_csv(DATA_PATH)
# Leave the frame as the cell's last expression so the notebook renders it.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Replace missing ages with the mean of the known ages.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['Age']: that in-place call acts on an
# intermediate object, which pandas 2.x warns about and which does not
# update df when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C


In [None]:
# Pull the honorific (Mr, Miss, Mrs, ...) out of each name: the run of
# letters that follows a space and is terminated by a period.
# The pattern is a raw string so that `\.` reaches the regex engine as
# written; in a plain string literal `\.` is an invalid escape sequence,
# which newer Python versions warn about.
name = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Show how often each honorific occurs.
name.value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Lady          1
Mme           1
Don           1
Capt          1
Countess      1
Ms            1
Jonkheer      1
Sir           1
Name: Name, dtype: int64

In [None]:
# Keep only these four honorifics; every other value is bucketed as 'Other'.
# The new Title column is meant to be used in place of the Name column.
title = ['Mr', 'Miss', 'Mrs', 'Master']
df['Title'] = name.where(name.isin(title), 'Other')

In [None]:
# Label-encode the categorical columns, keeping one fitted encoder per
# column in the `le` dict.
# Values are cast to str before encoding (missing entries presumably become
# an ordinary string label such as 'nan' -- confirm on your pandas version).
categorical_features = ['Cabin', 'Sex', 'Embarked', 'Title']
le = {feat: LabelEncoder() for feat in categorical_features}
for feat, encoder in le.items():
    df[feat] = encoder.fit_transform(df[feat].astype(str))
# Print the honorific whitelist and the encoded Title column for inspection.
print(title)
print(df['Title'])

['Mr', 'Miss', 'Mrs', 'Master']
0      2
1      3
2      1
3      3
4      2
      ..
886    4
887    1
888    1
889    2
890    2
Name: Title, Length: 891, dtype: int64


In [None]:
# Drop the columns that are not used as model features.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket'])
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Title
0,0,3,1,22.0,1,0,7.25,147,2,2
1,1,1,0,38.0,1,0,71.2833,81,0,3
2,1,3,0,26.0,0,0,7.925,147,2,1
3,1,1,0,35.0,1,0,53.1,55,2,3
4,0,3,1,35.0,0,0,8.05,147,2,2


In [None]:
# Separate the frame into the feature columns and the 'Survived' label.
feature_data = df.drop(columns='Survived')
target_data = df['Survived']

In [None]:
# Hold out 20% of the rows as a test set.
# random_state pins the shuffle so the split -- and therefore the reported
# accuracies -- are the same every time the notebook is re-run.
trainX, testX, trainY, testY = train_test_split(
    feature_data, target_data, test_size=0.2, random_state=42
)

In [None]:
# Gradient-boosted trees for binary classification.
# The loss argument is intentionally omitted: the old spelling 'deviance' was
# deprecated and later removed from scikit-learn, while its replacement
# 'log_loss' is not accepted by older releases. The default loss is that same
# loss under either name, so leaving it out works on both.
# random_state makes the fitted model repeatable between runs.
model = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    random_state=42,
)
model.fit(trainX, trainY)

GradientBoostingClassifier()

In [None]:
# Accuracy on the held-out test split.
test_predY = model.predict(testX)
test_accuracy = accuracy_score(testY, test_predY)
print()
print("* 시험용 데이터로 측정한 정확도 = %.2f" % test_accuracy)

# Accuracy on the training split; comparing it with the test accuracy shows
# how much the model overfits.
train_predY = model.predict(trainX)
train_accuracy = accuracy_score(trainY, train_predY)
print()
print("* 학습용 데이터로 측정한 정확도 = %.2f" % train_accuracy)

# For a classifier, model.score returns mean accuracy (not an error value and
# not an R2 score), so the label says so. The value matches the test accuracy
# above, printed with more decimals.
print('시험 데이터 평균 정확도 (model.score) = %.4f' % model.score(testX, testY))


* 시험용 데이터로 측정한 정확도 = 0.87

* 학습용 데이터로 측정한 정확도 = 0.91
시험 데이터 전체 오류 (R2-score) = 0.8659
