In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score

In [2]:
train=pd.read_csv('./titanic_train.csv')
test=pd.read_csv('./titanic_test.csv')

print(train.shape,test.shape)

(891, 12) (418, 11)


In [3]:
'''
PassengerId : 각 승객의 고유 번호
Survived : 생존 여부 -> 0 = 사망 / 1 = 생존
Pclass : 객실 등급
Name : 이름
Sex : 성별
Age : 나이
SibSp : 동반한 형제자매와 배우자 수
Parch : 동반한 부모, 자식 수
Ticket : 티켓의 고유넘부
Fare : 티켓의 요금
Cabin : 객실 번호
Embarked : 승선한 항

'''
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# PassengerId를 비롯해 모델을 돌리는 작업에 필요없는 컬럼 삭제하기
x_train = train.drop(columns = ['PassengerId','Survived','Name','Ticket','Cabin'])

x_test_id = test['PassengerId'] # 결과표에 붙일 예정
x_test = test.drop(columns = ['PassengerId','Name','Ticket','Cabin'])

# train에 있는 종속변수 survived y_train에 저장하기
y_train = train[['Survived']]

In [5]:
x_train.isnull().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [6]:
x_test.isnull().sum()

Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [7]:
x_train['Age'] = x_train['Age'].fillna(x_train['Age'].median())
x_test['Age'] = x_test['Age'].fillna(x_test['Age'].median())

In [8]:
'''
method의 종류로는 결측치 바로 뒤에값으로 채워주는'bfill'과 앞에 값으로 채워주는 'ffill'이 있다.
method를 적용하고나면 꼭 isnull().sum()으로 확인해야한다.
왜냐하면 맨 앞과 맨 마지막 컬럼이 nan값이면 채워질 값이 없어 두개의 method중 하나는 여전히 nan값으로 남는다.
위와같은 이유로 해당 csv파일에서 method를 'ffill'을 적용시킨다면 결측치가 다 채워지지 않는다.
'''

nan = ['Embarked','Fare']

for i in nan:
    x_train[i] = x_train[i].fillna(method = 'bfill') # 결측치를 바로 뒤에 값으로 채운다
    x_test[i] = x_test[i].fillna(method='bfill')

In [9]:
x_train.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [10]:
x_test.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [11]:
# label encoding작업 전 dytpe이 object , category인 즉, 숫자가 아닌 문자열인 컬럼을 찾는다.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB


In [12]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       418 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      418 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [13]:
encoder = LabelEncoder()
label = ['Sex','Embarked']

x_train[label] = x_train[label].apply(encoder.fit_transform)
x_test[label] = x_test[label].apply(encoder.fit_transform)

In [14]:
# 확인하기
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


In [15]:
x_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,1
1,3,0,47.0,1,0,7.0,2
2,2,1,62.0,0,0,9.6875,1
3,3,1,27.0,0,0,8.6625,2
4,3,0,22.0,1,1,12.2875,2


In [16]:
x_train.describe() # min, max 값들을 기준으로 연속형 값들을 minmaxscaling 한다.

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,0.647587,29.361582,0.523008,0.381594,32.204208,1.534231
std,0.836071,0.47799,13.019697,1.102743,0.806057,49.693429,0.793021
min,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,2.0,0.0,22.0,0.0,0.0,7.9104,1.0
50%,3.0,1.0,28.0,0.0,0.0,14.4542,2.0
75%,3.0,1.0,35.0,1.0,0.0,31.0,2.0
max,3.0,1.0,80.0,8.0,6.0,512.3292,2.0


In [17]:
scaler = MinMaxScaler()
minmax = ['Age','Fare']

scaler.fit(x_train[minmax])
x_train[minmax] = scaler.transform(x_train[minmax])

scaler.fit(x_test[minmax])
x_test[minmax] = scaler.transform(x_test[minmax])

In [18]:
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,0.271174,1,0,0.014151,2
1,1,0,0.472229,1,0,0.139136,0
2,3,0,0.321438,0,0,0.015469,2
3,1,0,0.434531,1,0,0.103644,2
4,3,1,0.434531,0,0,0.015713,2


In [19]:
y_train

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [20]:
# train, validation 나누기
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, stratify = y_train['Survived'],
                                                   test_size = 0.2, random_state = 2022)

x_tr.shape, x_val.shape, y_tr.shape, y_val.shape

((712, 7), (179, 7), (712, 1), (179, 1))

In [21]:
rfc = RandomForestClassifier(n_estimators = 100 ,max_depth = 3)
rfc.fit(x_tr, y_tr)
pred = rfc.predict_proba(x_val)[:,1] # predict_proba를 한다면 뒤에 [:,1]을 해줘야됨
pred

  rfc.fit(x_tr, y_tr)


array([0.366989  , 0.13951046, 0.44109757, 0.12864528, 0.3699501 ,
       0.14072171, 0.12551596, 0.41027996, 0.40028939, 0.70457891,
       0.14530648, 0.42962993, 0.17666355, 0.23042026, 0.31249967,
       0.25695938, 0.72527504, 0.71966101, 0.77316692, 0.16254172,
       0.48104176, 0.49877867, 0.77995317, 0.76812974, 0.42166646,
       0.14101588, 0.36356592, 0.12864528, 0.25695938, 0.1474505 ,
       0.14289806, 0.49082429, 0.24941496, 0.13647822, 0.2194721 ,
       0.12430471, 0.36419202, 0.49932184, 0.74727935, 0.26027028,
       0.12430471, 0.23381938, 0.18307712, 0.21345656, 0.50143154,
       0.43043819, 0.33660222, 0.14538208, 0.45177714, 0.36652703,
       0.78948878, 0.70908735, 0.50663769, 0.22785851, 0.1311299 ,
       0.12430471, 0.44293445, 0.81478855, 0.32380345, 0.40428172,
       0.57763881, 0.39985158, 0.36652703, 0.2170769 , 0.14072171,
       0.22068335, 0.18829898, 0.7617287 , 0.31249967, 0.48719469,
       0.46051502, 0.72078748, 0.53463618, 0.18829898, 0.50173

In [22]:
print(roc_auc_score(y_val, pred)) # 1에 가까울수록 좋음

0.8590250329380764


In [23]:
xg = XGBClassifier(n_estimators = 100 ,max_depth = 3)
xg.fit(x_tr, y_tr)
pred = xg.predict_proba(x_val)[:,1]
pred

array([0.75679016, 0.01780268, 0.7346408 , 0.06768073, 0.6628271 ,
       0.03069706, 0.02254575, 0.51992667, 0.13194044, 0.866137  ,
       0.12582506, 0.91558146, 0.00806837, 0.03535495, 0.08022736,
       0.16813965, 0.8641747 , 0.9385185 , 0.99546945, 0.0319758 ,
       0.26312244, 0.74468386, 0.9678441 , 0.93586385, 0.3787995 ,
       0.09236454, 0.10327721, 0.06768073, 0.2029866 , 0.13244137,
       0.25253728, 0.6335468 , 0.07155852, 0.01617529, 0.07483833,
       0.06849186, 0.32429352, 0.65175885, 0.7973817 , 0.18136977,
       0.06075306, 0.10470828, 0.02016026, 0.06258717, 0.7772227 ,
       0.38091323, 0.94139576, 0.08039083, 0.02346894, 0.2640474 ,
       0.9707695 , 0.83620095, 0.78087187, 0.11712567, 0.01423784,
       0.06075306, 0.668325  , 0.9802862 , 0.9550367 , 0.4132798 ,
       0.9302484 , 0.03262201, 0.4835772 , 0.17781436, 0.05192732,
       0.0858712 , 0.19895728, 0.97378236, 0.06821967, 0.17094964,
       0.651435  , 0.91933054, 0.98074603, 0.19895728, 0.25358

In [24]:
print(roc_auc_score(y_val, pred))

0.8548089591567852


#### modeling

In [25]:
xg = XGBClassifier(n_estimators = 100 ,max_depth = 3)
xg.fit(x_train, y_train)
pred = xg.predict_proba(x_test)[:,1]
pred

array([0.0309699 , 0.06284906, 0.07626989, 0.0686041 , 0.35415214,
       0.09917002, 0.25619215, 0.06572887, 0.9622808 , 0.02547976,
       0.03773899, 0.3586566 , 0.93959254, 0.03410101, 0.98324955,
       0.84265953, 0.0835636 , 0.3855232 , 0.5257023 , 0.11988548,
       0.42215273, 0.18054527, 0.9919035 , 0.60333145, 0.990956  ,
       0.03588508, 0.99554473, 0.3635637 , 0.6158621 , 0.16159686,
       0.04234828, 0.16878077, 0.41083756, 0.11773786, 0.88465923,
       0.40745133, 0.34653994, 0.2242139 , 0.11335222, 0.55214894,
       0.10073981, 0.60402185, 0.05044821, 0.9558653 , 0.9628    ,
       0.2758551 , 0.11825985, 0.13821585, 0.99351853, 0.58542097,
       0.53387904, 0.13655984, 0.88408977, 0.9194474 , 0.35290697,
       0.01246351, 0.02640134, 0.13013779, 0.09247188, 0.9966274 ,
       0.06228453, 0.18963687, 0.16907573, 0.8459778 , 0.67084336,
       0.88436806, 0.8458378 , 0.06544142, 0.2767714 , 0.9766159 ,
       0.74345356, 0.03235615, 0.69489646, 0.17098066, 0.99004

In [26]:
result_pred = pd.DataFrame(pred)
result_pred.columns = ['predict']
result_pred

Unnamed: 0,predict
0,0.030970
1,0.062849
2,0.076270
3,0.068604
4,0.354152
...,...
413,0.097092
414,0.990508
415,0.003759
416,0.097092


In [27]:
result = pd.concat([x_test_id, result_pred],axis=1)
result

Unnamed: 0,PassengerId,predict
0,892,0.030970
1,893,0.062849
2,894,0.076270
3,895,0.068604
4,896,0.354152
...,...,...
413,1305,0.097092
414,1306,0.990508
415,1307,0.003759
416,1308,0.097092


In [None]:
# result.to_csv('수험번호.csv', index=False) #index=False **필수**