In [218]:
import warnings
warnings.filterwarnings(action='ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost

In [197]:
df = pd.read_csv("data/titanic.csv")

In [198]:
## 데이터 개요
df.head()
## 데이터 변수 확인
df.columns # 변수 이름들 확인
df.columns.size # 몇개의 변수
df.shape # 행의 개수, 열의 개수
df.dtypes # 변수의 데이터 타입 (수치형으로 변환을 위해)
## 특징(변수)는 반드시 na가 있으면 안되므로 na 확인
df.isna().sum()
## 필요한 컬럼만 셀렉트
selected_cols = ["Pclass","Name","Sex","Age","SibSp","Parch","Ticket","Fare","Cabin","Embarked","Survived"]
data = df[selected_cols]

In [199]:
## 분석데이터 개요
data.head()
## 데이터 변수 확인
data.columns # 변수 이름들 확인
data.columns.size # 몇개의 변수
data.shape # 행의 개수, 열의 개수
data.dtypes # 변수의 데이터 타입 (수치형으로 변환을 위해)
## 특징(변수)는 반드시 na가 있으면 안되므로 na 확인
data.isna().sum()
#data.info()
## Na 정리
#1. drop데이터 선별 (컬럼)
# Cabin 
#2. drop데이터 선별 (로우)
#Embarked
#data.drop(data.Embarked.isna(),axis=0)
data = data.loc[~data.Embarked.isna(),:]

In [200]:
## Cabin imputation
# 수치형 --> 세심한 주의(평균 사용시 주의 요망)
# 범주형데이터 --> real Na? 범주외의 대체값
## Cabin EDA 
data.Cabin.str[:1].value_counts()
data.Cabin.fillna("N",inplace=True)
data.isna().sum()
data.Cabin.str[:1].value_counts()
newCabin = data.Cabin.str[:1]
data.drop("Cabin",axis=1,inplace=True)
data["Cabin"] = newCabin

In [201]:
## Age imputaion
## Age EDA
data.Age.describe()
data.loc[data.Age <= 18,:] ##
data.loc[data.Age.isna(),["Pclass","Fare"]] ## Age값이 NUll인 사람의 선실과 요금
data.groupby(["Pclass","Sex"])["Fare"].mean()

## Age 예측을 위한 회귀모델 생성

df_age = data.loc[~data.Age.isna(),["Pclass","Sex","Fare","Embarked","Age"]]
pred_age = data.loc[data.Age.isna(),["Pclass","Sex","Fare","Embarked"]]
le_sex = LabelEncoder()
le_sex.fit(df_age.Sex)
newSex = le_sex.transform(df_age.Sex)
le_embarked = LabelEncoder()
le_embarked.fit(df_age.Embarked)
newEmbarked = le_embarked.transform(df_age.Embarked)
df_age.drop(["Sex","Embarked"],axis=1,inplace=True)
newCols = np.c_[newSex,newEmbarked]
df_age = pd.concat([pd.DataFrame(newCols,index=df_age.index),df_age],axis=1)
age_model = RandomForestRegressor()
age_model.fit(df_age.iloc[:,:-1],df_age.iloc[:,-1])

## Null 예측실행
newSex = le_sex.transform(pred_age.Sex)
newEmbarked = le_embarked.transform(pred_age.Embarked)
pred_age.drop(["Sex","Embarked"],axis=1,inplace=True)
newCols = np.c_[newSex,newEmbarked]
pred_age = pd.concat([pd.DataFrame(newCols,index=pred_age.index),pred_age],axis=1)
newAge = age_model.predict(pred_age)
pred_age["Age"] = newAge

## 
newAge = pd.merge(df_age,pred_age,how="outer")[["Age"]]
newAge.reset_index(inplace=True)
newAge.drop("index",axis=1,inplace=True)

## data에 newAge컬럼 추가
data.reset_index(inplace=True)
data.drop(["Age","index"],axis=1,inplace=True)
data["Age"] = newAge


In [202]:
## 파생변수 생성 (Name) 결혼유무판단
## data.Name에서 Mrs. 가 존재하면 기혼
## data.Name에서 Mr. 가족(SibSp)이 있으면 기혼 혹은 25세 이상이면 기혼
married = np.where((data.Name.str.contains("Mrs.")) | ((data.Name.str.contains("Mr.")) & (data.SibSp >0)),1,0)
data["Married"] = married

In [203]:
## Name, Ticket컬럼 제거
data.drop(["Name","Ticket"],axis=1,inplace=True)
data.head()
## data 컬럼 타입확인
data.dtypes

Pclass        int64
Sex          object
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
Survived      int64
Cabin        object
Age         float64
Married       int64
dtype: object

In [185]:
## preprocessing 
# categorical 변수에서 종류(unique)가 2개 이면 정수인코딩, 나머지는 원핫인코딩
# obj = data.dtypes[data.dtypes == "object"].index
# data[obj[0]].unique().size
# 수치형(연속형)변수일 경우 정규화(MinMax, Robust, Standard)
# mmScale = (X-np.min(X))/(np.max(X)-np.min(x))
# rbScale = (X - np.percetile(X,50))/(np.percentile(X,75) - np.percentile(X,25))
# stScale = (X - np.mean(x))/(np.sd(X))

In [204]:
newAge = np.where(data.Age > np.percentile(data.Age,90),0,
                  np.where(data.Age > np.percentile(data.Age,75),1,
                           np.where(data.Age > np.percentile(data.Age,25),2,3)))
data.drop("Age",axis=1,inplace=True)
data["Age"] = newAge

In [205]:
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# newPclass = OneHotEncoder().fit_transform(data[["Pclass"]]).toarray()
# newSex = OneHotEncoder().fit_transform(data[["Sex"]]).toarray()
# newSibSp = OneHotEncoder().fit_transform(data[["SibSp"]]).toarray()
# newCols = np.c_[newPclass,newSex,newSibSp]
# newCols.shape


selected_cols = ["Pclass","Sex","Parch","SibSp","Age","Embarked","Cabin","Married"]
ct = make_column_transformer(
    (OneHotEncoder(),selected_cols)
)
newCols = ct.fit_transform(data).toarray()
data.drop(selected_cols,axis=1,inplace=True)
data = pd.concat([pd.DataFrame(newCols),data],axis=1)
data.shape

(889, 39)

In [258]:
## Train, test 분리
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [266]:
## Model Instance 생성
rf_model = RandomForestClassifier() 
ada_model = AdaBoostClassifier()
lr_model = LogisticRegression()
svc_model = SVC(probability=True)
meta_model = KNeighborsClassifier()
xgb_param = {
    "n_estimators":100, 
     "learning_rate":0.08, 
     "gamma":0, 
      "subsample":0.75,
      "colsample_bytree":1, 
      "max_depth":7,
    "verbosity":0
}
xgb_model = xgboost.XGBClassifier(**xgb_param)

In [267]:
## Model Training
rf_model.fit(X_train,y_train)
ada_model.fit(X_train,y_train)
lr_model.fit(X_train,y_train)
svc_model.fit(X_train,y_train)
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.08, max_delta_step=0,
              max_depth=7, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=1,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.75,
              tree_method='exact', validate_parameters=1, verbosity=0)

In [268]:
rf_result = rf_model.predict_proba(X_test)
ada_result = ada_model.predict_proba(X_test)
lr_result = lr_model.predict_proba(X_test)
svc_result = svc_model.predict_proba(X_test)
xgb_result = xgb_model.predict_proba(X_test)
hard_voting_result = rf_model.predict(X_test)+ada_model.predict(X_test)+lr_model.predict(X_test)+svc_model.predict(X_test)+xgb_model.predict(X_test)
hard_acc = sum(np.where(hard_voting_result > 2,1,0) == y_test)/hard_voting_result.size

In [269]:
print("rf_model :",rf_model.score(X_test,y_test))
print("ada_model :",ada_model.score(X_test,y_test))
print("lr_model :",lr_model.score(X_test,y_test))
print("svc_model :",svc_model.score(X_test,y_test))
print("xgb_model :",xgb_model.score(X_test,y_test))
print("hard_voting_model :",hard_acc)

rf_model : 0.8258426966292135
ada_model : 0.8146067415730337
lr_model : 0.8370786516853933
svc_model : 0.7303370786516854
xgb_model : 0.8202247191011236
hard_voting_model : 0.8258426966292135


In [270]:
## Meta Model의 train x 생성
v1 = rf_model.predict(X_train)
v2 = ada_model.predict(X_train)
v3 = lr_model.predict(X_train)
v4 = svc_model.predict(X_train)
v5 = xgb_model.predict(X_train)
X_train_meta = np.c_[v1,v2,v3,v4,v5]

In [271]:
## Meta Model 학습
meta_model.fit(X_train_meta,y_train)

KNeighborsClassifier()

In [274]:
## Meta Model test x 생성
v1 = rf_model.predict(X_test)
v2 = ada_model.predict(X_test)
v3 = lr_model.predict(X_test)
v4 = svc_model.predict(X_test)
v5 = xgb_model.predict(X_test)
X_test_meta = np.c_[v1,v2,v3,v4,v5]

In [275]:
## Meta Model 검증
meta_model.score(X_test_meta,y_test)

0.8258426966292135

In [256]:
X_meta_test = np.c_[rf_model.predict(X_test),ada_model.predict(X_test)]

In [257]:
sum(xgb_model.predict(X_meta_test) == y_test)/y_test.size

0.7921348314606742