In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
import re
from collections import Counter
from itertools import combinations, permutations, chain
from statistics import median

In [4]:
gender_submission = pd.read_csv("../input/titanic/gender_submission.csv")
test_df = pd.read_csv("../input/titanic/test.csv")
train_df = pd.read_csv("../input/titanic/train.csv")




train_df['Set'] = [0]*len(train_df)
test_df['Set'] = [1]*len(test_df)
df = pd.merge(train_df, test_df, how='outer').set_index('PassengerId')
df.index.name = 'Id'

In [5]:
df.shape

(1309, 12)

In [6]:
df['Set'].value_counts()

0    891
1    418
Name: Set, dtype: int64

In [7]:
df['Survived'].mean()

0.3838383838383838

# ---------------- [ Preprocessing ] 결측치, 클렌징, dummy, 영향력 ----------------

In [8]:
df['NanAge'] = df['Age'].apply(lambda x: 1 if np.isnan(x)==True else 0)
df[['NanAge', 'Survived']].groupby(['NanAge']).mean()

Unnamed: 0_level_0,Survived
NanAge,Unnamed: 1_level_1
0,0.406162
1,0.293785


### Pclass

In [9]:
## 수치형으로 사용

### Name

In [10]:
## 규칙 'Last Name, Title. First Name (Miss Name)'
df.loc[557, 'Name'] = 'Duff Gordon, Mrs. Morgan'
df.loc[600, 'Name'] = 'Duff Gordon, Mr. Morgan'
df.loc[711, 'Name'] = 'Mayne, Mrs. Berthe'
df.loc[760, 'Name'] = 'Rothes, Countess. Lucy'

In [11]:
## Title
f = lambda x: x.split(',')[1].split('.')[0].strip()
df['Ttl'] = df['Name'].apply(f)

In [12]:
## Last Name
## 하이픈 있으면 아버지 이름 추출
## 특수문자 및 스페이스 기준으로 [-1] 추출 (조사 제거)
f = lambda x: re.sub('[\W]', ' ', x.split(',')[0].split('-')[0]).strip().split(' ')[-1].title()
df['LNm'] = df['Name'].apply(f)

In [13]:
## First Name
## 특수문자 및 스페이스 기준으로 [0] 추출 (배우자 구분)
f = lambda x: re.findall('[\w]+', x.split('.')[1])[0].title()
df['FNm'] = df['Name'].apply(f)

In [14]:
## Miss Name
## 결혼 전 이름이 명시되어 있으면 Last Name 추출
f = (lambda x:
     re.sub('[\W]', ' ', x['Name'].strip().split(' ')[-1].split('-')[0]).strip().split(' ')[-1].title()
     if (x['Ttl']=='Mrs') & ('(' in x['Name'])
     else np.nan)
df['MNm'] = df.apply(f, axis=1)
df['MNm'] = df.apply(lambda x: x['LNm'] if x['Ttl']!='Mrs' else x['MNm'], axis=1)

### Sex

In [15]:
## encoding
df['SexF'] = df['Sex'].map({'male': 0, 'female': 1})
df = df.drop(['Sex'], axis=1)

### Ticket

In [16]:
## Prefix
f = lambda x: re.sub('[\d\s.]', '', x.strip().rsplit(' ', 1)[0].upper())
df['TPx'] = df['Ticket'].apply(f)
df['TPx'] = df['TPx'].replace(['A/', 'A/S', 'AQ/'], 'A')
df['TPx'] = df['TPx'].replace({'SC/AHBASLE': 'SC/AH', 'SOC': 'SO/C', 'WEP': 'WE/P'})
df['TPx'] = df['TPx'].replace({'': 'Z'})

In [17]:
## Suffix
f = lambda x: x.strip().rsplit(' ', 1)[-1]
df['TSx'] = df['Ticket'].apply(f)
df['TSx'] = df['TSx'].replace({'LINE': '9999999'})

In [18]:
## 규칙 '문자열/숫자열'
f = lambda x: '/'.join([x['TPx'], x['TSx']])
df['Ticket'] = df.apply(f, axis=1)
df = df.drop(['TPx', 'TSx'], axis=1)

### Fare

In [19]:
{k: list(set(v)) for k, v in df.groupby(['Ticket'])['Fare'] if len(set(v))>1}

{'Z/7534': [9.2167, 9.8458]}

In [20]:
df.loc[df['Ticket']=='Z/7534', 'Fare'] = sum(df.loc[df['Ticket']=='Z/7534', 'Fare'])

In [21]:
## 개인 가격으로 변환
df['Fare'] = df.apply(lambda x: x['Fare']/df['Ticket'].value_counts()[x['Ticket']], axis=1)

In [22]:
## 결측치 처리
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

### Embarked

In [23]:
## 결측치 처리
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode())

In [24]:
## encoding
df = pd.get_dummies(df, columns=['Embarked'], prefix=['Emb'], prefix_sep='')

# -------------------- [ Feature Engineering ] 파생변수 -------------------

### Family (가족 구분 키값 생성)

In [25]:
## 가족이면 함께 탑승하기 때문에 티켓이 같다
df['SSPC'] = df['SibSp'] + df['Parch'] + 1
Counter({k: len(set(v["Ticket"])) for k, v in df[df['SSPC']>1].groupby(['LNm'])}.values())

Counter({1: 178, 3: 4, 2: 25})

In [26]:
## 가족을 구분하는 데에 티켓을 활용하자
f = lambda x: np.nan if x['SSPC']==1 else x['Ticket']
df['Fmly'] = df.apply(f, axis=1)

In [27]:
## 추정되는 가족 구성원 수 vs. 티켓을 공유하는 가족 구성원 수
g = df.groupby(['LNm', 'Fmly'])['SSPC'].agg([np.max, np.size])
g[g['amax']!=g['size']].index.tolist()[:5]

[('Ahlin', 'Z/7546'),
 ('Andersen', 'Z/350046'),
 ('Andersson', 'Z/3101281'),
 ('Andersson', 'Z/347091'),
 ('Andrews', 'Z/13502')]

In [28]:
ix = [1, 478]
df.loc[ix, 'Fmly'] = 'A/21171'

In [29]:
ix = [19, 39, 334, 1037]
df.loc[ix, 'Fmly'] = 'Z/345763'

In [30]:
ix = [70, 185, 1057, 1268, 1286]
df.loc[ix, 'Fmly'] = 'Z/315151'

In [31]:
ix = [86, 105, 207, 393]
df.loc[ix, 'Fmly'] = 'Z/3101278'

In [32]:
ix = [114, 403]
df.loc[ix, 'Fmly'] = 'Z/4136'

In [33]:
ix = [119, 300, 1076]
df.loc[ix, 'Fmly'] = 'PC/17558'
df.loc[119, ['SibSp', 'Parch']] = [1, 1]
df.loc[300, ['SibSp', 'Parch']] = [0, 2]

In [34]:
ix = [167, 357]
df.loc[ix, 'Fmly'] = 'Z/113505'
df.loc[167, ['SibSp', 'Parch']] = [1, 0]
df.loc[357, ['SibSp', 'Parch']] = [1, 0]

In [35]:
ix = [176, 1045, 1155]
df.loc[ix, 'Fmly'] = 'Z/350404'

In [36]:
ix = [189, 594, 658]
df.loc[ix, 'Fmly'] = 'Z/364849'

In [37]:
ix = [198, 913]
df.loc[ix, 'Fmly'] = 'Z/4579'

In [38]:
ix = [206, 252, 268]
df.loc[ix, 'Fmly'] = 'Z/347054'

In [39]:
ix = [215, 1013]
df.loc[ix, 'Fmly'] = 'Z/367229'

In [40]:
ix = [218, 581, 601, 1133]
df.loc[ix, 'Fmly'] = 'Z/243847'

In [41]:
ix = [248, 756]
df.loc[ix, 'Fmly'] = 'Z/250649'
df.loc[248, ['SibSp', 'Parch']] = [0, 1]
df.loc[756, ['SibSp', 'Parch']] = [0, 1]

In [42]:
ix = [249, 872]
df.loc[ix, 'Fmly'] = 'Z/11751'
df.loc[249, ['SibSp', 'Parch']] = [1, 0]
df.loc[872, ['SibSp', 'Parch']] = [1, 0]

In [43]:
ix = [276, 766]
df.loc[ix, 'Fmly'] = 'Z/13502'

In [44]:
ix = [313, 1041]
df.loc[ix, 'Fmly'] = 'Z/250651'
df.loc[313, ['SibSp', 'Parch']] = [1, 0]
df.loc[1041, ['SibSp', 'Parch']] = [1, 0]

In [45]:
ix = [335, 661, 1296]
df.loc[ix, 'Fmly'] = 'PC/17611'

In [46]:
ix = [353, 533, 1229]
df.loc[ix, 'Fmly'] = 'Z/2695'

In [47]:
ix = [372, 1124]
df.loc[ix, 'Fmly'] = 'Z/3101267'

In [48]:
ix = [408, 438, 530, 775, 832, 893, 944]
df.loc[ix, 'Fmly'] = 'Z/29106'

In [49]:
ix = [452, 491]
df.loc[ix, 'Fmly'] = 'Z/65303'

In [50]:
ix = [477, 727, 923, 1211]
df.loc[ix, 'Fmly'] = 'Z/31027'

In [51]:
ix = [480, 665, 896]
df.loc[ix, 'Fmly'] = 'Z/3101298'

In [52]:
ix = [497, 592]
df.loc[ix, 'Fmly'] = 'Z/36947'

In [53]:
ix = [539, 1274]
df.loc[ix, 'Fmly'] = 'Z/364498'
df.loc[539, ['SibSp', 'Parch']] = [1, 0]
df.loc[1274, ['SibSp', 'Parch']] = [1, 0]

In [54]:
ix = [540, 588, 1289]
df.loc[ix, 'Fmly'] = 'Z/13568'

In [55]:
ix = [541, 746, 1197]
df.loc[ix, 'Fmly'] = 'WE/P/5735'

In [56]:
ix = [550, 1222]
df.loc[ix, 'Fmly'] = 'CA/33112'
df.loc[550, ['SibSp', 'Parch']] = [0, 1]
df.loc[1222, ['SibSp', 'Parch']] = [0, 1]

In [57]:
ix = [557, 600]
df.loc[ix, 'Fmly'] = 'Z/11755'

In [58]:
ix = [566, 901, 1079]
df.loc[ix, 'Fmly'] = 'A/48871'

In [59]:
ix = [572, 969, 1248]
df.loc[ix, 'Fmly'] = 'Z/11769'

In [60]:
ix = [672, 821, 984, 1200]
df.loc[ix, 'Fmly'] = 'FC/12750'

In [61]:
ix = [705, 861, 1201]
df.loc[ix, 'Fmly'] = 'Z/350025'

In [62]:
ix = [730, 910]
df.loc[ix, 'Fmly'] = 'STON/O/3101271'

In [63]:
ix = [804, 996]
df.loc[ix, 'Fmly'] = 'Z/2625'
df.loc[996, ['SibSp', 'Parch']] = [0, 1]

In [64]:
ix = [862, 1262]
df.loc[ix, 'Fmly'] = 'Z/28134'

In [65]:
ix = [867, 1112]
df.loc[ix, 'Fmly'] = 'SC/PARIS/2149'

In [66]:
ix = [880, 1042]
df.loc[ix, 'Fmly'] = 'Z/11767'

In [67]:
ix = [926, 1014]
df.loc[ix, 'Fmly'] = 'Z/13236'

In [68]:
ix = [1170, 1254, 1298]
df.loc[ix, 'Fmly'] = 'CA/31352'
df.loc[1170, ['SibSp', 'Parch']] = [2, 0]
df.loc[1254, ['SibSp', 'Parch']] = [1, 0]

In [69]:
ix = [41, 69, 137, 146, 193, 260, 274, 418, 443, 690, 722, 780, 881, 1025, 1106, 1130]
df.loc[ix, 'Fmly'] = np.nan
df.loc[ix, ['SibSp', 'Parch']] = [0, 0]

In [70]:
## 데이터 변경에 따라 업데이트

In [71]:
df = df.drop(['SSPC'], axis=1)
df['SSPC'] = df['SibSp'] + df['Parch'] + 1
df['NFmly'] = df['Fmly'].apply(lambda x: 1 if pd.isnull(x)==True else df['Fmly'].tolist().count(x))

### Family2 (SpSib)

In [72]:
## spouse
## Last Name, Mr/Mrs. First Name

In [73]:
df['Ttl'] = df['Ttl'].replace(['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Major', 'Rev'], 'Mr')
df['Ttl'] = df['Ttl'].replace(['Countess', 'Dona', 'Mlle', 'Mme'], 'Ms')

In [74]:
## 기혼 남녀 조건이 필요하다
{k: v.index.tolist() for k, v in df.groupby(['Fmly', 'LNm', 'FNm']) if len(v.index)>2}

{('CA/2315', 'Dean', 'Bertram'): [94, 789, 924],
 ('Z/113760', 'Carter', 'William'): [391, 764, 803],
 ('Z/113781', 'Allison', 'Hudson'): [306, 499, 1198],
 ('Z/17421', 'Thayer', 'John'): [551, 582, 699],
 ('Z/33638', 'Dodge', 'Washington'): [446, 1185, 1266],
 ('Z/347077', 'Asplund', 'Carl'): [26, 1066, 1271],
 ('Z/363291', 'Goldsmith', 'Frank'): [166, 329, 549]}

In [75]:
sp = {k: v.index.tolist() for k, v in df[(df['Ttl'].apply(lambda x: x in ['Mr', 'Mrs'])) & (df['SibSp']>0)].groupby(['Fmly', 'LNm', 'FNm']) if len(v.index)==2}
sp = {k: v for k, v in sp.items() if df['SexF'][v[0]]^df['SexF'][v[1]]==1}

In [76]:
sp = {p[0]: p[1] for v in sp.values() for p in permutations(v, 2)}

In [77]:
sp = pd.DataFrame.from_dict(sp, orient='index', columns=['Sp'])
df = pd.merge(df, sp, how='outer', left_index=True, right_index=True)
df.index.name = 'Id'

In [78]:
df['NSp'] = df['Sp'].apply(lambda x: 0 if pd.isnull(x)==True else 1)
df['NSib'] = df['SibSp'] - df['NSp']

In [79]:
## sibling
## Miss Name, 형제자매 숫자 동일

In [80]:
## 실제 형제자매 숫자 vs. 파악된 형제자매 숫자
g = df[df['NSib']>0].groupby(['Fmly', 'MNm'])['NSib'].agg([np.max, np.size])
g[(g['amax']+1)!=g['size']].index.tolist()[:5]

[('CA/2144', 'Goodwin'),
 ('CA/2144', 'Tyler'),
 ('CA/2673', 'Abbott'),
 ('CA/2673', 'Hunt'),
 ('W/C/6608', 'Ford')]

In [81]:
df.loc[[54, 1169], 'FNm'] = 'Harry'
df.loc[[168, 361], 'FNm'] = 'Wilhelm'
df.loc[[623, 1225], 'FNm'] = 'Sahid'
df.loc[[679, 1031], 'FNm'] = 'Frederick'

In [82]:
df.loc[[87, 148, 437, 1059], ['SibSp', 'Parch']] = [3, 1]
df.loc[737, ['SibSp', 'Parch']] = [0, 4]

In [83]:
df.loc[280, ['SibSp', 'Parch']] = [0, 2]
df.loc[1284, ['SibSp', 'Parch']] = [1, 1]

In [84]:
## 데이터 변경에 따라 업데이트
## spouse, sibling 다시 생성

In [85]:
df = df.drop(['SSPC', 'Sp', 'NSp', 'NSib'], axis=1)
df['SSPC'] = df['SibSp'] + df['Parch'] + 1

In [86]:
sp = {k: v.index.tolist() for k, v in df[(df['Ttl'].apply(lambda x: x in ['Mr', 'Mrs'])) & (df['SibSp']>0)].groupby(['Fmly', 'LNm', 'FNm']) if len(v.index)==2}
sp = {k: v for k, v in sp.items() if df['SexF'][v[0]]^df['SexF'][v[1]]==1}

In [87]:
sp_net = [sorted([c[0], c[1]]) for v in sp.values() for c in combinations(v, 2)]

In [88]:
sp_net[:5]

[[254, 618], [133, 917], [679, 1031], [94, 924], [1234, 1257]]

In [89]:
sp = {p[0]: p[1] for v in sp.values() for p in permutations(v, 2)}

In [90]:
sp = pd.DataFrame.from_dict(sp, orient='index', columns=['Sp'])
df = pd.merge(df, sp, how='outer', left_index=True, right_index=True)
df.index.name = 'Id'

In [91]:
df['NSp'] = df['Sp'].apply(lambda x: 0 if pd.isnull(x)==True else 1)
df['NSib'] = df['SibSp'] - df['NSp']

In [92]:
sib = {k: v.index.tolist() for k, v in df[df['NSib']>0].groupby(['Fmly', 'MNm'])}

In [93]:
sib_net = [sorted([c[0], c[1]]) for v in sib.values() for c in combinations(v, 2)]

In [94]:
sib_net[:5]

[[1, 478], [566, 901], [566, 1079], [901, 1079], [1084, 1236]]

### Family3 (Parch)

In [95]:
## 1) 가족 내 parch 존재하는 구성원 중 2명 조합
## 2) sib sp 관계 아니어야 함
## 3) LNm MNm 같은 케이스 존재
## 4) 나이가 많으면 부모 (여자이면 엄마, 남자이면 아빠)

In [96]:
## [부모, 자녀]
parch_net = []
chck = []
for fmly in list(set(df[df['Fmly'].isnull()==False]['Fmly'])):
    for c in combinations(df[(df['Parch']>0)&(df['Fmly']==fmly)].index, 2):
        if (sorted(c) not in sp_net) & (sorted(c) not in sib_net):
            if (df.loc[c[0],'LNm']==df.loc[c[1],'MNm'])|(df.loc[c[0],'MNm']==df.loc[c[1],'LNm']):
                if df.loc[c[0],'Age']>df.loc[c[1],'Age']:
                    parch_net.append([c[0], c[1]])
                elif df.loc[c[1],'Age']>df.loc[c[0],'Age']:
                    parch_net.append([c[1], c[0]])
                else:
                    chck.append([c[0], c[1]])

In [97]:
chck[:5]

[[177, 1024], [230, 1024], [410, 1024], [486, 1024], [160, 1234]]

In [98]:
parch_net.extend([
    [534, 129], [534, 1309], [784, 889], [784, 1136], [925, 889], [925, 1136], [189, 594], [658, 594], [1117, 66], [1117, 710],
    [154, 1236], [1024, 177], [1024, 230], [1024, 410], [1024, 486], [1234, 160], [1257, 160], [1234, 181], [1257, 181],
    [1234, 202], [1257, 202], [1234, 325], [1257, 325], [1234, 793], [1257, 793], [1234, 847], [1257, 847], [1234, 864],
    [1257, 864], [1234, 1080], [1257, 1080], [1234, 1252], [1257, 1252], [141, 853], [141, 972]])

In [99]:
## NPar, NCh 생성
df['NPar'] = [[c[1] for c in parch_net].count(ix) for ix in df.index]
df['NCh'] = [[c[0] for c in parch_net].count(ix) for ix in df.index]

In [100]:
df[(df.NPar+df.NCh)!=df.Parch].index

Int64Index([660], dtype='int64', name='Id')

In [101]:
df.loc[[216, 394], ['SibSp', 'Parch']] = [1, 1]

In [102]:
## 데이터 변경에 따라 업데이트
## parents, children 다시 생성

In [103]:
df = df.drop(['SSPC', 'NPar', 'NCh'], axis=1)
df['SSPC'] = df['SibSp'] + df['Parch'] + 1

In [104]:
parch_net = []
chck = []
for fmly in list(set(df[df['Fmly'].isnull()==False]['Fmly'])):
    for c in combinations(df[(df['Parch']>0)&(df['Fmly']==fmly)].index, 2):
        if (sorted(c) not in sp_net) & (sorted(c) not in sib_net):
            if (df.loc[c[0],'LNm']==df.loc[c[1],'MNm'])|(df.loc[c[0],'MNm']==df.loc[c[1],'LNm']):
                if df.loc[c[0],'Age']>df.loc[c[1],'Age']:
                    parch_net.append([c[0], c[1]])
                elif df.loc[c[1],'Age']>df.loc[c[0],'Age']:
                    parch_net.append([c[1], c[0]])
                else:
                    chck.append([c[0], c[1]])

In [105]:
parch_net.extend([
    [534, 129], [534, 1309], [784, 889], [784, 1136], [925, 889], [925, 1136], [189, 594], [658, 594], [1117, 66], [1117, 710],
    [154, 1236], [1024, 177], [1024, 230], [1024, 410], [1024, 486], [1234, 160], [1257, 160], [1234, 181], [1257, 181],
    [1234, 202], [1257, 202], [1234, 325], [1257, 325], [1234, 793], [1257, 793], [1234, 847], [1257, 847], [1234, 864],
    [1257, 864], [1234, 1080], [1257, 1080], [1234, 1252], [1257, 1252], [141, 853], [141, 972]])

In [106]:
parch_net[:5]

[[671, 1067], [685, 1067], [856, 1199], [916, 312], [1034, 312]]

In [107]:
df['NPar'] = [[c[1] for c in parch_net].count(ix) for ix in df.index]
df['NCh'] = [[c[0] for c in parch_net].count(ix) for ix in df.index]

# -------------------- [ Preprocessing ] 추가 --------------------

### Age (결측치 처리)

In [108]:
## 타이틀이 마스터이면 평균 6세
df.loc[df['Ttl']=='Master', 'Age'].median()

4.0

In [109]:
## 남편이 아내보다 3.5세 많다
sp_age = [df.loc[v[0], 'Age']-df.loc[v[1], 'Age'] if df.loc[v[0], 'SexF']==0 else df.loc[v[1], 'Age']-df.loc[v[0], 'Age'] for v in sp_net]
median([v for v in sp_age if np.isnan(v)==False])

3.5

In [110]:
## 부모가 자식보다 28세 많다
parch_age = [df.loc[v[0], 'Age']-df.loc[v[1], 'Age'] for v in parch_net]
median([v for v in parch_age if np.isnan(v)==False])

28.0

In [111]:
df.loc[(df['Ttl']=='Master')&(df['Age'].isnull()==True), 'Age'] = 6

In [112]:
df_age = pd.DataFrame(index=df[(df['Age'].isnull()==True)].index, columns=['SpAge', 'SibAge', 'ParAge', 'ChAge', 'TtlAge'])

In [113]:
## 배우자 나이 활용
age = []
for ix in df_age.index:
    if df.loc[ix, 'NSp']==1:
        if df.loc[ix, 'SexF']==1:
            age.append(df.loc[df.loc[ix, 'Sp'], 'Age']-3.5)
        else:
            age.append(df.loc[df.loc[ix, 'Sp'], 'Age']+3.5)
    else:
        age.append(np.nan)
df_age['SpAge'] = age

In [114]:
## 자녀이면 부모 나이 활용
age = []
for ix in df_age.index:
    if df.loc[ix, 'NPar']>0:
        age.append(df.loc[list(set([x[0] for x in parch_net if ix==x[1]])), 'Age'].mean()-28.0)
    else:
        age.append(np.nan)
df_age['ParAge'] = age

In [115]:
## 부모이면 자녀 나이 활용
age = []
for ix in df_age.index:
    if df.loc[ix, 'NCh']>0:
        age.append(df.loc[list(set([x[1] for x in parch_net if ix==x[0]])), 'Age'].mean()+28.0)
    else:
        age.append(np.nan)
df_age['ChAge'] = age

In [116]:
## sibling 나이 활용
age = []
for ix in df_age.index:
    if df.loc[ix, 'NSib']>0:
        age.append(df.loc[list(set(chain(*[x for x in sib_net if ix in x]))), 'Age'].mean())
    else:
        age.append(np.nan)
df_age['SibAge'] = age

In [117]:
## title 나이 활용
age = []
for ix in df_age.index:
    age.append(df.groupby(['Ttl'])['Age'].median()[df.loc[ix, 'Ttl']])
df_age['TtlAge'] = age

In [118]:
df.loc[df_age.index, 'Age'] = df_age.mean(axis=1)

### 파생변수

In [119]:
## 가족 최소 나이
age = []
for ix in df.index:
    if df.loc[ix, 'NFmly']>1:
        age.append(df.groupby(['Fmly'])['Age'].min()[df.loc[ix, 'Fmly']])
    else:
        age.append(999)
df['MinAge'] = age

In [120]:
## 자녀 최소 나이
age = []
for ix in df.index:
    if df.loc[ix, 'NCh']>0:
        age.append(df.loc[list(set([x[1] for x in parch_net if ix==x[0]])), 'Age'].min())
    else:
        age.append(999)
df['ChMinAge'] = age

In [121]:
df['NFmlyBn'] = df['NFmly'].apply(lambda x: 1 if x>4 else x)
df[['NFmlyBn', 'Survived']].groupby(['NFmlyBn']).mean()

Unnamed: 0_level_0,Survived
NFmlyBn,Unnamed: 1_level_1
1,0.297561
2,0.566434
3,0.585106
4,0.589744


In [122]:
df['NChBn'] = df['NCh'].apply(lambda x: 0 if x>2 else x)
df[['NChBn', 'Survived']].groupby(['NChBn']).mean()

Unnamed: 0_level_0,Survived
NChBn,Unnamed: 1_level_1
0,0.373333
1,0.488372
2,0.565217


# -------------------- [ Model ] Random Forest --------------------

In [123]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [124]:
## feature selection
cols = ['Pclass', 'Age', 'Fare', 'SexF', 'EmbC', 'EmbS', 'NFmly', 'NSp', 'NSib', 'NPar', 'NCh', 'MinAge']
X_train = df[df['Set']==0][cols]
y_train = df[df['Set']==0]['Survived']

In [125]:
rfe = RFECV(estimator=RandomForestClassifier(random_state=999, n_estimators=100), step=1, cv=5, scoring='accuracy')
rfe.fit(X_train, y_train)

RFECV(cv=5,
      estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                       criterion='gini', max_depth=None,
                                       max_features='auto', max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=100, n_jobs=None,
                                       oob_score=False, random_state=999,
                                       verbose=0, warm_start=False),
      min_features_to_select=1, n_jobs=None, scoring='accuracy', step=1,
      verbose=0)

In [126]:
cols = [v[0] for v in zip(cols, rfe.support_) if v[1]==True]
X_train = df[df['Set']==0][cols]
y_train = df[df['Set']==0]['Survived']
X_test = df[df['Set']==1][cols]

In [127]:
cols

['Pclass', 'Age', 'Fare', 'SexF', 'NFmly', 'NSib', 'MinAge']

In [128]:
params = {'n_estimators': range(10, 100, 10), 'max_depth': range(3, 15), 'min_samples_leaf': range(3, 10)}
clf = GridSearchCV(estimator=RandomForestClassifier(random_state=999), param_grid=params, cv=5, scoring='accuracy')
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=999,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={

In [129]:
clf.best_params_, clf.best_score_

({'max_depth': 14, 'min_samples_leaf': 3, 'n_estimators': 20},
 0.8484848484848485)

In [130]:
clf = RandomForestClassifier(random_state=999,
                             n_estimators=clf.best_params_['n_estimators'],
                             max_depth=clf.best_params_['max_depth'],
                             min_samples_leaf=clf.best_params_['min_samples_leaf']).fit(X_train, y_train)
pd.DataFrame(zip(X_train.columns, clf.feature_importances_), columns=['Col', 'Imp']).sort_values(by=['Imp'], ascending=False)

Unnamed: 0,Col,Imp
3,SexF,0.372416
2,Fare,0.209628
1,Age,0.176522
0,Pclass,0.092216
6,MinAge,0.0706
4,NFmly,0.042429
5,NSib,0.036188


In [131]:
y_pred = clf.predict(X_test)
result = pd.DataFrame({'PassengerId': df[df['Set']==1].index, 'Survived': y_pred.astype('int32')})
result.to_csv('result_rf.csv', index=False)

# -------------------- [ Model ] MLP Classifier --------------------

In [136]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [137]:
cols = ['Pclass', 'Age', 'Fare', 'SexF', 'MinAge', 'NFmlyBn']
df.loc[:, cols] = df.loc[:, cols].replace(999, -1)

In [138]:
X_train = df[df['Set']==0][cols]
y_train = df[df['Set']==0]['Survived']
X_test = df[df['Set']==1][cols]

In [139]:
params2 = {'hidden_layer_sizes': [(100,), (150,), (200,)],
          'activation': ['tanh', 'relu'],
          'alpha': [0.0001, 0.001, 0.01]}

In [140]:
clf2 = GridSearchCV(estimator=MLPClassifier(random_state=999, max_iter=1000, solver='lbfgs'), param_grid=params2, cv=5, scoring='accuracy')
clf2.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_iter=1000,
                                     momentum=0.9, n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state=999, shuffle=True,
                                     solver='lbfgs', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'activation': ['tanh', 'relu'],
        

In [141]:
clf2.best_params_, clf2.best_score_

({'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (200,)},
 0.8114478114478114)

In [142]:
y_pred2 = clf2.predict(X_test)
result2 = pd.DataFrame({'PassengerId': df[df['Set']==1].index, 'Survived': y_pred2.astype('int32')})
result2.to_csv('result_mlp.csv', index=False)

In [144]:
result2.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [145]:
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [155]:
#result3 = pd.DataFrame({'PassengerId': df[df['Set']==1].index, 'Survived1': y_pred.astype('int32'), 'Survived2': y_pred2.astype('int32')})

In [156]:
#result3['Survived3'] = result3['Survived1'] + result3['Survived2']

In [217]:
y_pred3 = y_pred + y_pred2

In [218]:
print(y_pred3)

[0. 0. 0. 0. 2. 0. 0. 1. 2. 0. 0. 0. 2. 0. 2. 2. 0. 0. 2. 1. 1. 2. 2. 1.
 2. 0. 2. 0. 1. 0. 0. 0. 0. 0. 2. 0. 1. 1. 0. 0. 0. 1. 0. 2. 2. 0. 0. 0.
 2. 0. 2. 0. 2. 2. 0. 0. 0. 0. 0. 2. 0. 0. 0. 2. 2. 2. 2. 0. 0. 2. 2. 0.
 0. 1. 2. 0. 0. 2. 0. 2. 2. 0. 0. 0. 0. 0. 1. 1. 0. 2. 2. 0. 2. 0. 0. 0.
 2. 0. 2. 0. 2. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 2. 2. 2. 2. 0. 0. 2. 1. 2.
 2. 0. 2. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 2. 0. 0.
 2. 0. 0. 0. 1. 0. 2. 0. 0. 1. 0. 0. 2. 2. 2. 1. 2. 2. 2. 0. 0. 0. 1. 0.
 2. 1. 0. 0. 0. 0. 0. 2. 2. 1. 2. 2. 0. 0. 2. 0. 2. 0. 2. 0. 0. 0. 0. 0.
 1. 0. 2. 0. 2. 1. 0. 0. 2. 2. 1. 2. 0. 0. 1. 0. 2. 0. 0. 0. 0. 1. 1. 1.
 2. 0. 2. 0. 2. 0. 2. 0. 2. 0. 0. 2. 0. 0. 0. 2. 0. 0. 1. 0. 1. 0. 2. 2.
 2. 2. 0. 0. 2. 1. 2. 0. 2. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 2. 2.
 0. 0. 0. 0. 1. 0. 0. 0. 2. 2. 0. 2. 0. 0. 0. 0. 1. 2. 2. 1. 2. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 2. 0. 2. 1. 0. 0. 0.
 0. 1. 2. 2. 1. 0. 0. 0. 0. 0. 0. 1. 2. 0. 2. 0. 0.

In [219]:
list_tmp = np.where(y_pred3 == 1)

In [221]:
print(list_tmp[0])

[  7  19  20  23  28  36  37  41  73  86  87 118 138 148 153 159 166 169
 177 192 197 202 206 213 214 215 234 236 245 268 280 283 291 308 313 316
 323 331 339 365 367 382 391 407]


In [222]:
# 50 random 1/2
import random
k = len(list_tmp[0]) * (50) // 100
n = int(k)
indicies = random.sample(range(len(list_tmp[0])), n)
#new_list1 = [list1[i] for i in indicies]
#new_list2 = [list2[i] for i in indicies]
#new_list = [pid_to_list[i] for i in indicies]
#for i in indices

In [223]:
print(indicies)

[20, 8, 24, 40, 32, 23, 38, 2, 17, 5, 34, 18, 31, 7, 41, 9, 15, 16, 27, 25, 14, 43]


In [224]:
new_list = [list_tmp[0][i] for i in indicies]
print(new_list)

[197, 73, 214, 367, 291, 213, 339, 20, 169, 36, 313, 177, 283, 41, 382, 86, 159, 166, 236, 215, 153, 407]


In [225]:
y_pred3[new_list] = 2

In [226]:
print(y_pred3)

[0. 0. 0. 0. 2. 0. 0. 1. 2. 0. 0. 0. 2. 0. 2. 2. 0. 0. 2. 1. 2. 2. 2. 1.
 2. 0. 2. 0. 1. 0. 0. 0. 0. 0. 2. 0. 2. 1. 0. 0. 0. 2. 0. 2. 2. 0. 0. 0.
 2. 0. 2. 0. 2. 2. 0. 0. 0. 0. 0. 2. 0. 0. 0. 2. 2. 2. 2. 0. 0. 2. 2. 0.
 0. 2. 2. 0. 0. 2. 0. 2. 2. 0. 0. 0. 0. 0. 2. 1. 0. 2. 2. 0. 2. 0. 0. 0.
 2. 0. 2. 0. 2. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 2. 2. 2. 2. 0. 0. 2. 1. 2.
 2. 0. 2. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 2. 0. 0.
 2. 0. 0. 0. 1. 0. 2. 0. 0. 2. 0. 0. 2. 2. 2. 2. 2. 2. 2. 0. 0. 0. 2. 0.
 2. 2. 0. 0. 0. 0. 0. 2. 2. 2. 2. 2. 0. 0. 2. 0. 2. 0. 2. 0. 0. 0. 0. 0.
 1. 0. 2. 0. 2. 2. 0. 0. 2. 2. 1. 2. 0. 0. 1. 0. 2. 0. 0. 0. 0. 2. 2. 2.
 2. 0. 2. 0. 2. 0. 2. 0. 2. 0. 0. 2. 0. 0. 0. 2. 0. 0. 1. 0. 2. 0. 2. 2.
 2. 2. 0. 0. 2. 1. 2. 0. 2. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 2. 2.
 0. 0. 0. 0. 1. 0. 0. 0. 2. 2. 0. 2. 0. 0. 0. 0. 1. 2. 2. 2. 2. 0. 0. 0.
 0. 0. 0. 2. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 2. 0. 2. 1. 0. 0. 0.
 0. 2. 2. 2. 1. 0. 0. 0. 0. 0. 0. 1. 2. 0. 2. 0. 0.

In [227]:
list_tmp = np.where(y_pred3 == 1)
print(list_tmp[0])

[  7  19  23  28  37  87 118 138 148 192 202 206 234 245 268 280 308 316
 323 331 365 391]


In [228]:
y_pred3[list_tmp[0]] = 0

In [229]:
print(y_pred3)

[0. 0. 0. 0. 2. 0. 0. 0. 2. 0. 0. 0. 2. 0. 2. 2. 0. 0. 2. 0. 2. 2. 2. 0.
 2. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 0. 2. 0. 0. 0. 0. 2. 0. 2. 2. 0. 0. 0.
 2. 0. 2. 0. 2. 2. 0. 0. 0. 0. 0. 2. 0. 0. 0. 2. 2. 2. 2. 0. 0. 2. 2. 0.
 0. 2. 2. 0. 0. 2. 0. 2. 2. 0. 0. 0. 0. 0. 2. 0. 0. 2. 2. 0. 2. 0. 0. 0.
 2. 0. 2. 0. 2. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 2. 2. 2. 2. 0. 0. 2. 0. 2.
 2. 0. 2. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0.
 2. 0. 0. 0. 0. 0. 2. 0. 0. 2. 0. 0. 2. 2. 2. 2. 2. 2. 2. 0. 0. 0. 2. 0.
 2. 2. 0. 0. 0. 0. 0. 2. 2. 2. 2. 2. 0. 0. 2. 0. 2. 0. 2. 0. 0. 0. 0. 0.
 0. 0. 2. 0. 2. 2. 0. 0. 2. 2. 0. 2. 0. 0. 0. 0. 2. 0. 0. 0. 0. 2. 2. 2.
 2. 0. 2. 0. 2. 0. 2. 0. 2. 0. 0. 2. 0. 0. 0. 2. 0. 0. 0. 0. 2. 0. 2. 2.
 2. 2. 0. 0. 2. 0. 2. 0. 2. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 2. 2.
 0. 0. 0. 0. 0. 0. 0. 0. 2. 2. 0. 2. 0. 0. 0. 0. 0. 2. 2. 2. 2. 0. 0. 0.
 0. 0. 0. 2. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 2. 0. 2. 0. 0. 0. 0.
 0. 2. 2. 2. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 2. 0. 0.

In [231]:
list_tmp = np.where(y_pred3 == 2)
print(list_tmp[0])
y_pred3[list_tmp[0]] = 1
print(y_pred3)

[  4   8  12  14  15  18  20  21  22  24  26  34  36  41  43  44  48  50
  52  53  59  63  64  65  66  69  70  73  74  77  79  80  86  89  90  92
  96  98 100 104 111 112 113 114 117 119 120 122 127 141 144 150 153 156
 157 158 159 160 161 162 166 168 169 175 176 177 178 179 182 184 186 194
 196 197 200 201 203 208 213 214 215 216 218 220 222 224 227 231 236 238
 239 240 241 244 246 248 250 258 262 263 272 273 275 281 282 283 284 291
 296 304 305 307 313 314 315 324 326 330 333 339 343 344 345 347 349 350
 354 356 359 361 362 364 367 368 371 374 375 382 385 395 397 400 402 407
 408 409 410 411 414 417]
[0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0.
 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0.
 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 0.
 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0.
 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1.
 1. 0. 1. 0. 0. 0. 0. 1. 

In [232]:
result3 = pd.DataFrame({'PassengerId': df[df['Set']==1].index, 'Survived': y_pred3.astype('int32')})


In [233]:
result3.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [234]:
result3.to_csv('result_ensemble_r50p_rf_mlp.csv', index=False)

In [158]:
#result3.groupby('Survived3').agg('mean').reset_index()

Unnamed: 0,Survived3,PassengerId,Survived1,Survived2
0,0,1103.686992,0.0,0.0
1,1,1092.068182,0.454545,0.545455
2,2,1097.273438,1.0,1.0


In [162]:
# 50 random 1/2
#result4 = result3[result3['Survived3'] == 1]
#result4.head(50)

Unnamed: 0,PassengerId,Survived1,Survived2,Survived3
7,899,0,1,1
19,911,0,1,1
20,912,0,1,1
23,915,1,0,1
28,920,1,0,1
36,928,0,1,1
37,929,0,1,1
41,933,1,0,1
73,965,1,0,1
86,978,1,0,1


In [163]:
pid_to_list = result4.PassengerId.to_list()

In [164]:
print(pid_to_list)

[899, 911, 912, 915, 920, 928, 929, 933, 965, 978, 979, 1010, 1030, 1040, 1045, 1051, 1058, 1061, 1069, 1084, 1089, 1094, 1098, 1105, 1106, 1107, 1126, 1128, 1137, 1160, 1172, 1175, 1183, 1200, 1205, 1208, 1215, 1223, 1231, 1257, 1259, 1274, 1283, 1299]


In [178]:
import random
k = len(pid_to_list) * (50) // 100
n = int(k)
indicies = random.sample(range(len(pid_to_list)), n)
#new_list1 = [list1[i] for i in indicies]
#new_list2 = [list2[i] for i in indicies]
new_list = [pid_to_list[i] for i in indicies]

In [213]:
print(new_list)

[899, 933, 1040, 965, 920, 979, 1137, 1160, 1061, 1231, 929, 1107, 1259, 1215, 1205, 1030, 1175, 1098, 1051, 1089, 915, 1183]


In [None]:
result5 = result3[result3['Survived3'] == 2]
result6 = result3[result3['Survived3'] == 0]

In [214]:
result7 = result3
cols = ['Survived3']
#result7['Survived3'] = result7[(result7['PassengerId'].apply(lambda x: x in new_list))].loc[:, cols].replace(1, 2)

In [216]:
result7.head()

Unnamed: 0,PassengerId,Survived1,Survived2,Survived3
0,892,0,0,
1,893,0,0,
2,894,0,0,
3,895,0,0,
4,896,1,1,


In [215]:
result7[(result7['PassengerId'].apply(lambda x: x in new_list))].head(10)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [207]:
result8 = result7[['PassengerId', 'Survived3']]

In [205]:
result8[['Survived3']] = result7['Survived3'].round(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [208]:
result8.head()

Unnamed: 0,PassengerId,Survived3
0,892,
1,893,
2,894,
3,895,
4,896,
