In [130]:
from sklearn.linear_model import LinearRegression  # 선형회귀
from sklearn.preprocessing import PolynomialFeatures # 다항특성을 만들어주는 라이브러리
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier # 분류
from sklearn.neighbors import KNeighborsRegressor  # 회귀-예측
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['axes.unicode_minus'] = False  # 마이너스 표시 해결
# 한글설정
matplotlib.rcParams['font.family'] = 'AppleGothic' # Mac사용자
matplotlib.rcParams['font.size'] = '10' # 글자크기

분류 : 로지스틱 회귀, 결정트리, 랜덤포레스트

In [131]:
# 타이타닉의 생존자 분류 모델을 구현하시오.
# target : survived
# data : 그외

# 데이터 확인 : nan,0.타입,....
# 데이터 전처리 - nan,0
# train,test 세트

# 불필요한 피처를 제거 - 제거 피처는 체크해 둘 것
# name ticket cabin embarked home.dest - name, cabin

# 각각 모델을 적용

# 정답률 체크

df_train = pd.read_csv('titanic_train.csv')
df_train.head()
df_test = pd.read_csv('titanic_test.csv')
df_test.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,body,home.dest
0,3,0,"Rekic, Mr. Tido",male,38.0,0,0,349249,7.8958,,S,,
1,2,1,"Phillips, Miss. Alice Frances Louisa",female,21.0,0,1,S.O./P.P. 2,21.0,,S,,"Ilfracombe, Devon"
2,2,0,"Jacobsohn, Mr. Sidney Samuel",male,42.0,1,0,243847,27.0,,S,,London
3,3,0,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S,,
4,2,0,"Denbury, Mr. Herbert",male,25.0,0,0,C.A. 31029,31.5,,S,,"Guernsey / Elizabeth, NJ"


##### 데이터 피처 설명
- pclass : Passenger Class, 승객 등급
- survived : 생존 여부 : target 값이 됨.
- name : 승객 이름
- sex : 승객 성별
- age : 승객 나이
- sibsp : 탑승 한 형제/배우자 수
- parch : 탑승 한 부모/자녀 수
- ticket : 티켓 번호
- fare : 승객 지불 요금
- cabin : 선실 이름
- embarked : 승선항 (C = 쉘 부르그, Q = 퀸즈타운, S = 사우스 햄튼)
- body : 사망자 확인 번호 - 분석과정에서 제외해야 함. 머신러닝에 100% 영향을 미침.
- home.dest : 고향/목적지

In [132]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     916 non-null    int64  
 1   survived   916 non-null    int64  
 2   name       916 non-null    object 
 3   sex        916 non-null    object 
 4   age        741 non-null    float64
 5   sibsp      916 non-null    int64  
 6   parch      916 non-null    int64  
 7   ticket     916 non-null    object 
 8   fare       916 non-null    float64
 9   cabin      214 non-null    object 
 10  embarked   914 non-null    object 
 11  body       85 non-null     float64
 12  home.dest  527 non-null    object 
dtypes: float64(3), int64(4), object(6)
memory usage: 93.2+ KB


In [133]:
df_train.isnull().sum()

pclass         0
survived       0
name           0
sex            0
age          175
sibsp          0
parch          0
ticket         0
fare           0
cabin        702
embarked       2
body         831
home.dest    389
dtype: int64

In [134]:
head_name = [i.split(',')[1].split('.')[0].strip() for i in df_train['name']]
df_train['head_name'] = pd.Series(head_name)
df_train['head_name'].head()


head_name = [i.split(',')[1].split('.')[0].strip() for i in df_test['name']]
df_test['head_name'] = pd.Series(head_name)
df_test['head_name'].value_counts()

head_name
Mr        234
Miss       75
Mrs        55
Master     21
Major       2
Rev         2
Don         1
Col         1
Ms          1
Capt        1
Name: count, dtype: int64

In [135]:
pd.crosstab(df_train['head_name'], df_train['sex'])

pd.crosstab(df_test['head_name'], df_test['sex'])

sex,female,male
head_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,1
Don,0,1
Major,0,2
Master,0,21
Miss,75,0
Mr,0,234
Mrs,55,0
Ms,1,0
Rev,0,2


In [136]:
# train data
df_train['head_name'] = df_train['head_name'].replace(['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir'], 'Mr')
df_train['head_name'] = df_train['head_name'].replace(['Dona', 'Lady', 'the Countess'], 'Mrs')
df_train['head_name'] = df_train['head_name'].replace(['Mlle', 'Mme', 'Ms'], 'Miss')

# test data
df_test['head_name'] = df_test['head_name'].replace(['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir'], 'Mr')
df_test['head_name'] = df_test['head_name'].replace(['Dona', 'Lady', 'the Countess'], 'Mrs')
df_test['head_name'] = df_test['head_name'].replace(['Mlle', 'Mme', 'Ms'], 'Miss')


In [137]:
# train Dr은 남자 여자가 따로 있어서 나눔
for i in df_train[df_train['head_name']=='Dr'].index:
    if df_train.loc[i]['sex'] == 'male':
        df_train['head_name'] = df_train['head_name'].replace('Dr', 'Mr')
    else:
        df_train['head_name'] = df_train['head_name'].replace('Dr', 'Mrs')
        
# test Dr은 남자 여자가 따로 있어서 나눔
for i in df_test[df_test['head_name']=='Dr'].index:
    if df_test.loc[i]['sex'] == 'male':
        df_test['head_name'] = df_test['head_name'].replace('Dr', 'Mr')
    else:
        df_test['head_name'] = df_test['head_name'].replace('Dr', 'Mrs')

In [138]:
df_train['cabin'].value_counts(dropna=False)

cabin
NaN            702
C23 C25 C27      5
C78              4
G6               4
C22 C26          3
              ... 
A9               1
E68              1
C30              1
E60              1
B78              1
Name: count, Length: 151, dtype: int64

In [139]:
# train data cabin
df_train['cabin_category'] = df_train['cabin'].str[0]
df_train['cabin_category'].fillna('U', inplace=True)

new_data = pd.get_dummies(df_train, columns=['cabin_category'])

df_train = pd.concat([df_train, new_data[new_data.columns.difference(df_train.columns)]], axis=1)

# test data cabin
df_test['cabin_category'] = df_test['cabin'].str[0]
df_test['cabin_category'].fillna('U', inplace=True)

new_data = pd.get_dummies(df_test, columns=['cabin_category'])

df_test = pd.concat([df_test, new_data[new_data.columns.difference(df_test.columns)]], axis=1)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['cabin_category'].fillna('U', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['cabin_category'].fillna('U', inplace=True)


In [140]:
df_train['cabin_category_T'].value_counts()

cabin_category_T
False    915
True       1
Name: count, dtype: int64

In [141]:
embarked_mode = df_train['embarked'].value_counts().index[0]
df_train['embarked'] = df_train['embarked'].fillna(embarked_mode)
df_test['embarked'] = df_test['embarked'].fillna(embarked_mode)

In [142]:
df_train.drop(['name','ticket','cabin','body','home.dest','cabin_category','cabin_category_T'],axis=1,inplace=True)
df_test.drop(['name','ticket','cabin','body','home.dest','cabin_category'],axis=1,inplace=True)

In [143]:
# train 성별 변경
df_train['sex'] = df_train['sex'].apply(lambda x: 1 if x == 'female' else 0)

# train 성별 변경
df_test['sex'] = df_test['sex'].apply(lambda x: 1 if x == 'female' else 0)

In [144]:
# train 선박 변경
df_train['embarked'] = df_train['embarked'].apply(lambda x: 1 if x == 'S' else (2 if x == 'C' else 0))
# test 선박 변경
df_test['embarked'] = df_test['embarked'].apply(lambda x: 1 if x == 'S' else (2 if x == 'C' else 0))

In [145]:
# head_name 변경
df_train['head_name'] = df_train['head_name'].apply(lambda x: 1 if x == 'Mrs' else (2 if x == 'Miss' else 0))
# head_name 변경
df_test['head_name'] = df_test['head_name'].apply(lambda x: 1 if x == 'Mrs' else (2 if x == 'Miss' else 0))

In [146]:
# train age null 값 평균으로 대치
replace_mean = df_train[df_train['age']>0]['age'].mean()
df_train['age'] = df_train['age'].fillna(replace_mean)
df_test['age'] = df_test['age'].fillna(replace_mean)
replace_mean

30.23144399460189

In [147]:
# data = df_train[df_train.columns.difference(['survived'])]
# test = df_train['survived']

In [148]:
# train_input,test_input,train_target,test_target = train_test_split(
#     data,test,test_size=0.2,random_state=42
# )

train_input = df_train[df_train.columns.difference(['survived'])]
train_target = df_train['survived']
test_input = df_test[df_test.columns.difference(['survived'])]
test_target = df_test['survived']

In [149]:
train_input

Unnamed: 0,age,cabin_category_A,cabin_category_B,cabin_category_C,cabin_category_D,cabin_category_E,cabin_category_F,cabin_category_G,cabin_category_U,embarked,fare,head_name,parch,pclass,sex,sibsp
0,13.000000,False,False,False,False,False,False,False,True,1,19.5000,2,1,2,1,0
1,4.000000,False,False,False,False,False,False,False,True,1,23.0000,2,1,2,1,1
2,30.000000,False,False,False,False,False,False,False,True,2,13.8583,2,0,2,1,1
3,30.231444,False,False,False,False,False,False,False,True,0,7.7250,0,0,3,0,0
4,22.000000,False,False,False,False,False,False,False,True,0,7.7250,2,0,3,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
911,0.170000,False,False,False,False,False,False,False,True,1,20.5750,2,2,3,1,1
912,30.231444,False,False,False,False,False,False,False,True,1,8.0500,0,0,3,0,0
913,30.231444,False,False,False,False,False,False,False,True,0,7.7333,2,0,3,1,0
914,20.000000,False,False,False,False,False,False,False,True,1,36.7500,2,0,2,1,0


In [150]:
test_input

Unnamed: 0,age,cabin_category_A,cabin_category_B,cabin_category_C,cabin_category_D,cabin_category_E,cabin_category_F,cabin_category_G,cabin_category_U,embarked,fare,head_name,parch,pclass,sex,sibsp
0,38.000000,False,False,False,False,False,False,False,True,1,7.8958,0,0,3,0,0
1,21.000000,False,False,False,False,False,False,False,True,1,21.0000,2,1,2,1,0
2,42.000000,False,False,False,False,False,False,False,True,1,27.0000,0,0,2,0,1
3,30.231444,False,False,False,False,False,False,False,True,1,14.5000,0,0,3,0,0
4,25.000000,False,False,False,False,False,False,False,True,1,31.5000,0,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388,33.000000,False,False,False,False,False,False,False,True,1,7.8542,0,0,3,0,0
389,31.000000,False,False,False,False,False,False,False,True,1,21.0000,1,0,2,1,0
390,30.231444,False,False,False,False,False,False,False,True,0,7.7500,0,0,3,0,0
391,30.231444,False,False,False,False,False,True,False,False,0,7.7500,0,0,3,0,0


In [151]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(train_input)

train_scaled = ss.transform(train_input)
test_scaled = ss.transform(test_input)


In [152]:
lr = LogisticRegression()
lr.fit(train_scaled,train_target)

In [153]:
print(lr.score(train_scaled,train_target))
print(lr.score(test_scaled,test_target))

0.8013100436681223
0.8015267175572519


In [154]:
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(train_scaled,train_target)

In [155]:
print(dt.score(train_scaled,train_target))
print(dt.score(test_scaled,test_target))

0.8111353711790393
0.7989821882951654


In [156]:
rf = RandomForestClassifier()
rf.fit(train_scaled,train_target)

In [157]:
print(rf.score(train_scaled,train_target))
print(rf.score(test_scaled,test_target))

0.980349344978166
0.7862595419847328


In [158]:
rfcv = RandomForestClassifier(n_jobs=-1, random_state=42)
scores = cross_validate(rfcv,train_scaled,train_target,return_train_score=True,n_jobs=-1)
print(np.mean(scores['train_score']),np.mean(scores['test_score']))

0.9825326713334676 0.805648610121169


In [159]:
df_train.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'head_name', 'cabin_category_A', 'cabin_category_B',
       'cabin_category_C', 'cabin_category_D', 'cabin_category_E',
       'cabin_category_F', 'cabin_category_G', 'cabin_category_U'],
      dtype='object')

In [160]:
rfcv.fit(train_scaled,train_target)
print(rfcv.feature_importances_)

[0.23486304 0.00464325 0.00545756 0.00722211 0.00532782 0.00797016
 0.00357157 0.00127041 0.03426087 0.03500445 0.2355385  0.14893895
 0.03358546 0.06505335 0.13114156 0.04615093]
