## 모듈 import

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt


from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import xgboost as xgb


## 데이터 준비

In [2]:
df = pd.read_csv("data/titanic.csv")

In [3]:
## 데이터 개요
df.head()
## 데이터 변수 확인
df.columns # 변수 이름들 확인
df.columns.size # 몇개의 변수
df.shape # 행의 개수, 열의 개수
df.dtypes # 변수의 데이터 타입 (수치형으로 변환을 위해)
## 특징(변수)는 반드시 na가 있으면 안되므로 na 확인
df.isna().sum()
## 필요한 컬럼만 셀렉트
selected_cols = ["Pclass","Name","Sex","Age","SibSp","Parch","Ticket","Fare","Cabin","Embarked","Survived"]
data = df[selected_cols]

In [4]:
## 분석데이터 개요
data.head()
## 데이터 변수 확인
data.columns # 변수 이름들 확인
data.columns.size # 몇개의 변수
data.shape # 행의 개수, 열의 개수
data.dtypes # 변수의 데이터 타입 (수치형으로 변환을 위해)
## 특징(변수)는 반드시 na가 있으면 안되므로 na 확인
data.isna().sum()
#data.info()
## Na 정리
#1. drop데이터 선별 (컬럼)
# Cabin 
#2. drop데이터 선별 (로우)
#Embarked
#data.drop(data.Embarked.isna(),axis=0)
data = data.loc[~data.Embarked.isna(),:]

## 데이터 전처리

In [5]:
## Cabin imputation
# 수치형 --> 세심한 주의(평균 사용시 주의 요망)
# 범주형데이터 --> real Na? 범주외의 대체값
## Cabin EDA 
data.Cabin.str[:1].value_counts()
data.Cabin.fillna("N",inplace=True)
data.isna().sum()
data.Cabin.str[:1].value_counts()
newCabin = data.Cabin.str[:1]
data.drop("Cabin",axis=1,inplace=True)
data["Cabin"] = newCabin

In [6]:
## Age imputaion
## Age EDA
data.Age.describe()
data.loc[data.Age <= 18,:] ##
data.loc[data.Age.isna(),["Pclass","Fare"]] ## Age값이 NUll인 사람의 선실과 요금
data.groupby(["Pclass","Sex"])["Fare"].mean()

## Age 예측을 위한 회귀모델 생성
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
df_age = data.loc[~data.Age.isna(),["Pclass","Sex","Fare","Embarked","Age"]]
pred_age = data.loc[data.Age.isna(),["Pclass","Sex","Fare","Embarked"]]
le_sex = LabelEncoder()
le_sex.fit(df_age.Sex)
newSex = le_sex.transform(df_age.Sex)
le_embarked = LabelEncoder()
le_embarked.fit(df_age.Embarked)
newEmbarked = le_embarked.transform(df_age.Embarked)
df_age.drop(["Sex","Embarked"],axis=1,inplace=True)
newCols = np.c_[newSex,newEmbarked]
df_age = pd.concat([pd.DataFrame(newCols,index=df_age.index),df_age],axis=1)
age_model = RandomForestRegressor()
age_model.fit(df_age.iloc[:,:-1],df_age.iloc[:,-1])

## Null 예측실행
newSex = le_sex.transform(pred_age.Sex)
newEmbarked = le_embarked.transform(pred_age.Embarked)
pred_age.drop(["Sex","Embarked"],axis=1,inplace=True)
newCols = np.c_[newSex,newEmbarked]
pred_age = pd.concat([pd.DataFrame(newCols,index=pred_age.index),pred_age],axis=1)
newAge = age_model.predict(pred_age)
pred_age["Age"] = newAge

## 
newAge = pd.merge(df_age,pred_age,how="outer")[["Age"]]
newAge.reset_index(inplace=True)
newAge.drop("index",axis=1,inplace=True)

## data에 newAge컬럼 추가
data.reset_index(inplace=True)
data.drop(["Age","index"],axis=1,inplace=True)
data["Age"] = newAge


In [7]:
## 파생변수 생성 (Name) 결혼유무판단
## data.Name에서 Mrs. 가 존재하면 기혼
## data.Name에서 Mr. 가족(SibSp)이 있으면 기혼 혹은 25세 이상이면 기혼
married = np.where((data.Name.str.contains("Mrs.")) | ((data.Name.str.contains("Mr.")) & (data.SibSp >0)),1,0)
data["Married"] = married

In [8]:
data

Unnamed: 0,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked,Survived,Cabin,Age,Married
0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.2500,S,0,N,22.000000,1
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C,1,C,22.000000,1
2,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.9250,S,1,N,22.000000,0
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1000,S,1,C,38.000000,1
4,3,"Allen, Mr. William Henry",male,0,0,373450,8.0500,S,0,N,26.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...
884,2,"Montvila, Rev. Juozas",male,0,0,211536,13.0000,S,0,N,25.258694,0
885,1,"Graham, Miss. Margaret Edith",female,0,0,112053,30.0000,S,1,B,44.448500,0
886,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,1,2,W./C. 6607,23.4500,S,0,N,31.956833,0
887,1,"Behr, Mr. Karl Howell",male,0,0,111369,30.0000,C,1,C,26.270145,0


In [9]:
## Name, Ticket컬럼 제거
data.drop(["Name","Ticket"],axis=1,inplace=True)
data.head()
## data 컬럼 타입확인
data.dtypes

Pclass        int64
Sex          object
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
Survived      int64
Cabin        object
Age         float64
Married       int64
dtype: object

In [10]:
## preprocessing 
# categorical 변수에서 종류(unique)가 2개 이면 정수인코딩, 나머지는 원핫인코딩
# obj = data.dtypes[data.dtypes == "object"].index
# data[obj[0]].unique().size
# 수치형(연속형)변수일 경우 정규화(MinMax, Robust, Standard)
# mmScale = (X-np.min(X))/(np.max(X)-np.min(x))
# rbScale = (X - np.percetile(X,50))/(np.percentile(X,75) - np.percentile(X,25))
# stScale = (X - np.mean(x))/(np.sd(X))

In [11]:
newAge = np.where(data.Age > np.percentile(data.Age,90),0,
                  np.where(data.Age > np.percentile(data.Age,75),1,
                           np.where(data.Age > np.percentile(data.Age,25),2,3)))
data.drop("Age",axis=1,inplace=True)
data["Age"] = newAge

In [12]:
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# newPclass = OneHotEncoder().fit_transform(data[["Pclass"]]).toarray()
# newSex = OneHotEncoder().fit_transform(data[["Sex"]]).toarray()
# newSibSp = OneHotEncoder().fit_transform(data[["SibSp"]]).toarray()
# newCols = np.c_[newPclass,newSex,newSibSp]
# newCols.shape

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
selected_cols = ["Pclass","Sex","Parch","SibSp","Age","Embarked","Cabin","Married"]
ct = make_column_transformer(
    (OneHotEncoder(),selected_cols)
)
newCols = ct.fit_transform(data).toarray()
data.drop(selected_cols,axis=1,inplace=True)
data = pd.concat([pd.DataFrame(newCols),data],axis=1)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,Fare,Survived
0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,7.2500,0
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,71.2833,1
2,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,7.9250,1
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,53.1000,1
4,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,8.0500,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,13.0000,0
885,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,30.0000,1
886,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,23.4500,0
887,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,30.0000,1


In [13]:
data.dtypes

0           float64
1           float64
2           float64
3           float64
4           float64
5           float64
6           float64
7           float64
8           float64
9           float64
10          float64
11          float64
12          float64
13          float64
14          float64
15          float64
16          float64
17          float64
18          float64
19          float64
20          float64
21          float64
22          float64
23          float64
24          float64
25          float64
26          float64
27          float64
28          float64
29          float64
30          float64
31          float64
32          float64
33          float64
34          float64
35          float64
36          float64
Fare        float64
Survived      int64
dtype: object

In [14]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.2)

In [15]:
y

0      0
1      1
2      1
3      1
4      0
      ..
884    0
885    1
886    0
887    1
888    0
Name: Survived, Length: 889, dtype: int64

## 모델 학습

In [16]:
# XGboost 모델을 위한 파라미터 설정
param = { 'n_estimators':100,
         'learning_rate':0.1,
         'gamma':1,
        'max_depth':10,
         'subsample':0.75,
        'colsample_bytree':1,
        'verbosity':0}



ada_model = AdaBoostClassifier()
rf_model = RandomForestClassifier()
lr_model = LogisticRegression()
svc_model = SVC(probability=True)
knn_model = KNeighborsClassifier()
bst_model = xgb.XGBClassifier(**param)


In [17]:
ada_model.fit(X_train,y_train)
rf_model.fit(X_train,y_train) 
lr_model.fit(X_train,y_train) 
svc_model.fit(X_train,y_train) 
knn_model.fit(X_train,y_train)

X_meta = np.c_[rf_model.predict(X_train),ada_model.predict(X_train),lr_model.predict(X_train),svc_model.predict(X_train),knn_model.predict(X_train)]

bst_model.fit(X_meta,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=1, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=10, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=2,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.75,
              tree_method='exact', validate_parameters=1, verbosity=0)

In [18]:
X_meta_test = np.c_[rf_model.predict(X_test),ada_model.predict(X_test),lr_model.predict(X_test),svc_model.predict(X_test),knn_model.predict(X_test)]

print('rf_model : ', rf_model.score(X_test,y_test))
print('ada_model : ', ada_model.score(X_test,y_test))
print('lr_model : ', lr_model.score(X_test,y_test))
print('svc_model : ', svc_model.score(X_test,y_test))
print('knn_model : ', knn_model.score(X_test,y_test))
print('bst_model : ', bst_model.score(X_meta_test,y_test))

rf_model :  0.8089887640449438
ada_model :  0.7865168539325843
lr_model :  0.7808988764044944
svc_model :  0.6741573033707865
knn_model :  0.7303370786516854
bst_model :  0.8089887640449438


In [19]:
hard_voting_result = (rf_model.predict(X_test) + ada_model.predict(X_test) + lr_model.predict(X_test) + svc_model.predict(X_test) + knn_model.predict(X_test)+bst_model.predict(X_meta_test))
hard_acc = sum(np.where(hard_voting_result > 2,1,0) == y_test)/ hard_voting_result.size
print('hard_voting_result : ',hard_acc)

hard_voting_result :  0.8089887640449438


In [20]:
soft_voting_result = (rf_model.predict_proba(X_test) + ada_model.predict_proba(X_test) + lr_model.predict_proba(X_test) + svc_model.predict_proba(X_test) + bst_model.predict_proba(X_meta_test))
soft_acc = sum(np.array(soft_voting_result[:,0]<soft_voting_result[:,1],dtype='int') == y_test)/ y_test.size
print('soft_voting_result : ',soft_acc)

soft_voting_result :  0.8146067415730337


In [21]:
v1 = rf_model.predict(X_train)
v2 = ada_model.predict(X_train)
v3 = lr_model.predict(X_train)
v4 = svc_model.predict(X_train)
v5 = bst_model.predict(X_meta)
X_train_meta = np.c_[v1,v2,v3,v4,v5]

In [22]:
meta_model = KNeighborsClassifier()

In [23]:
meta_model.fit(X_train_meta,y_train)

KNeighborsClassifier()

In [24]:
v1 = rf_model.predict(X_test)
v2 = ada_model.predict(X_test)
v3 = lr_model.predict(X_test)
v4 = svc_model.predict(X_test)
v5 = bst_model.predict(X_meta_test)
X_test_meta = np.c_[v1,v2,v3,v4,v5]

In [25]:
meta_model.score(X_test_meta,y_test)

0.8089887640449438

In [26]:
print('rf_model : ', rf_model.score(X_test,y_test))
print('ada_model : ', ada_model.score(X_test,y_test))
print('lr_model : ', lr_model.score(X_test,y_test))
print('svc_model : ', svc_model.score(X_test,y_test))
print('knn_model : ', knn_model.score(X_test,y_test))
print('*--------------------------------------------*')
print('hard_acc : ', hard_acc)
print('soft_acc : ', soft_acc)
print('*--------------------------------------------*')
print('meta_model : ', meta_model.score(X_test_meta,y_test))

rf_model :  0.8089887640449438
ada_model :  0.7865168539325843
lr_model :  0.7808988764044944
svc_model :  0.6741573033707865
knn_model :  0.7303370786516854
*--------------------------------------------*
hard_acc :  0.8089887640449438
soft_acc :  0.8146067415730337
*--------------------------------------------*
meta_model :  0.8089887640449438
