# Import

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
seed=2023

# Load Data

In [5]:
df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [6]:
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


# 0, NaN 피처 어떻게 처리할까?

### SibSp : 0이 68.1%

In [14]:
df.groupby(by='SibSp').count()[['PassengerId']]

Unnamed: 0_level_0,PassengerId
SibSp,Unnamed: 1_level_1
0,608
1,209
2,28
3,16
4,18
5,5
8,7


### Parch 0이 76.5%

In [15]:
df.groupby(by='Parch').count()[['PassengerId']]

Unnamed: 0_level_0,PassengerId
Parch,Unnamed: 1_level_1
0,678
1,118
2,80
3,5
4,4
5,5
6,1


### Ticket : 종류가 매우 많음

In [16]:
df.groupby(by='Ticket').count()[['PassengerId']] # categorical로 규정될 수 없음

Unnamed: 0_level_0,PassengerId
Ticket,Unnamed: 1_level_1
110152,3
110413,3
110465,2
110564,1
110813,1
...,...
W./C. 6608,4
W./C. 6609,1
W.E.P. 5734,1
W/C 14208,1


In [18]:
set(df['Ticket']) # 그냥 숫자와 문자가 붙은 경우로 구분됨

{'110152',
 '110413',
 '110465',
 '110564',
 '110813',
 '111240',
 '111320',
 '111361',
 '111369',
 '111426',
 '111427',
 '111428',
 '112050',
 '112052',
 '112053',
 '112058',
 '112059',
 '112277',
 '112379',
 '113028',
 '113043',
 '113050',
 '113051',
 '113055',
 '113056',
 '113059',
 '113501',
 '113503',
 '113505',
 '113509',
 '113510',
 '113514',
 '113572',
 '113760',
 '113767',
 '113773',
 '113776',
 '113781',
 '113783',
 '113784',
 '113786',
 '113787',
 '113788',
 '113789',
 '113792',
 '113794',
 '113796',
 '113798',
 '113800',
 '113803',
 '113804',
 '113806',
 '113807',
 '11668',
 '11751',
 '11752',
 '11753',
 '11755',
 '11765',
 '11767',
 '11769',
 '11771',
 '11774',
 '11813',
 '11967',
 '12233',
 '12460',
 '12749',
 '13049',
 '13213',
 '13214',
 '13502',
 '13507',
 '13509',
 '13567',
 '13568',
 '14311',
 '14312',
 '14313',
 '14973',
 '1601',
 '16966',
 '16988',
 '17421',
 '17453',
 '17463',
 '17464',
 '17465',
 '17466',
 '17474',
 '17764',
 '19877',
 '19928',
 '19943',
 '19947'

In [36]:
tickets = {}
tickets['just_number'] = 0

for ticket in list(df['Ticket']) + list(test_df['Ticket']):
    try:
        int(ticket.strip())
        tickets['just_number'] += 1
            
    except Exception:
        key = ticket.split(' ')[0]
        if key in tickets:
            tickets[key] += 1
        else:
            tickets[key] = 1
        
sorted(tickets.items())
# 이 버전, 맨 앞 글자만 있는 버전 등 다양하게 실험해볼 수 있을 듯

[('A.', 1),
 ('A./5.', 3),
 ('A.5.', 3),
 ('A/4', 6),
 ('A/4.', 3),
 ('A/5', 12),
 ('A/5.', 10),
 ('A/S', 1),
 ('A4.', 1),
 ('AQ/3.', 1),
 ('AQ/4', 1),
 ('C', 8),
 ('C.A.', 46),
 ('C.A./SOTON', 1),
 ('CA', 10),
 ('CA.', 12),
 ('F.C.', 3),
 ('F.C.C.', 9),
 ('Fa', 1),
 ('LINE', 4),
 ('LP', 1),
 ('P/PP', 2),
 ('PC', 92),
 ('PP', 4),
 ('S.C./A.4.', 1),
 ('S.C./PARIS', 3),
 ('S.O./P.P.', 7),
 ('S.O.C.', 7),
 ('S.O.P.', 1),
 ('S.P.', 1),
 ('S.W./PP', 1),
 ('SC', 2),
 ('SC/A.3', 1),
 ('SC/A4', 1),
 ('SC/AH', 5),
 ('SC/PARIS', 11),
 ('SC/Paris', 5),
 ('SCO/W', 1),
 ('SO/C', 1),
 ('SOTON/O.Q.', 16),
 ('SOTON/O2', 3),
 ('SOTON/OQ', 8),
 ('STON/O', 14),
 ('STON/O2.', 7),
 ('STON/OQ.', 1),
 ('SW/PP', 1),
 ('W./C.', 14),
 ('W.E.P.', 2),
 ('W/C', 1),
 ('WE/P', 2),
 ('just_number', 957)]

### Cabin : NaN이 19.2%, 값이 매우 다양함

In [36]:
df.groupby(by='Cabin').count()[['PassengerId']]

Unnamed: 0_level_0,PassengerId
Cabin,Unnamed: 1_level_1
A10,1
A14,1
A16,1
A19,1
A20,1
...,...
F33,3
F38,1
F4,2
G6,4


In [39]:
cabins = {}
cabins['just_number'] = 0

for cabin in list(df['Cabin']) + list(test_df['Cabin']):
    try:
        int(cabin.strip())
        cabins['just_number'] += 1
            
    except Exception:
        if cabin != cabin:
            continue
        
        key = cabin[0]
        if key in cabins:
            cabins[key] += 1
        else:
            cabins[key] = 1
        
sorted(cabins.items())

[('A', 22),
 ('B', 65),
 ('C', 94),
 ('D', 46),
 ('E', 41),
 ('F', 21),
 ('G', 5),
 ('T', 1),
 ('just_number', 0)]

# 학습해보자
- SibSp, Parch는 일단 그대로 두고
- ticket과 cabin은 위 방법으로 카테고리를 바꿔서
- gbc, lgbm 해보자
- random 5 fold로 학습

### feature engineering

In [6]:
df['type'] = 'train'
test_df['type'] = 'test'
all_df = pd.concat([df, test_df])

In [31]:
all_df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,type
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train


In [7]:
def transform_ticket(df):
    tr_tickets = []
    ticket_class = {'number': 0}
    last_num = 0
    
    for ticket in list(df['Ticket']):
        try:
            int(ticket.strip())
            tr_tickets.append(ticket_class['number'])
        except:
            if 'STON' in ticket:
                key = 'STON'
            else:
                key = ticket[0]
            if key not in ticket_class.keys():
                last_num += 1
                ticket_class[key] = last_num
            tr_tickets.append(ticket_class[key])
    
    df['tr_Ticket'] = tr_tickets
    return df, ticket_class

In [8]:
all_df, ticket_class = transform_ticket(all_df)
print(ticket_class)
all_df.head()

{'number': 0, 'A': 1, 'P': 2, 'STON': 3, 'C': 4, 'S': 5, 'W': 6, 'F': 7, 'L': 8}


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,type,tr_Ticket
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,2
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,3
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,0
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,0


In [9]:
def transform_cabin(df):
    tr_cabins = []
    cabin_class = {'NaN': 0}
    last_num = 0
    
    for cabin in list(df['Cabin']):
        if cabin != cabin:
            tr_cabins.append(cabin_class['NaN'])
        else:
            key = cabin[0]
            if key not in cabin_class.keys():
                last_num += 1
                cabin_class[key] = last_num
            tr_cabins.append(cabin_class[key])
    
    df['tr_Cabin'] = tr_cabins
    return df, cabin_class

In [10]:
all_df, cabin_class = transform_cabin(all_df)
print(cabin_class)
all_df.head()

{'NaN': 0, 'C': 1, 'E': 2, 'G': 3, 'D': 4, 'A': 5, 'B': 6, 'F': 7, 'T': 8}


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,type,tr_Ticket,tr_Cabin
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,1,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,2,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,3,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,0,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,0,0


In [11]:
all_df['tr_Sex'] = pd.Categorical(all_df['Sex']).codes
all_df['tr_Embarked'] = pd.Categorical(all_df['Embarked']).codes
all_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,type,tr_Ticket,tr_Cabin,tr_Sex,tr_Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,1,0,1,2
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,2,1,0,0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,3,0,0,2
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,0,1,0,2
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,0,0,1,2


In [12]:
# NaN이 있는 Age, Fare는 일단 평균값 대치하고 나중에 처리한다.
all_df['Age'] = all_df['Age'].fillna(all_df['Age'].mean())
all_df['Fare'] = all_df['Fare'].fillna(all_df['Fare'].mean())

In [13]:
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'tr_Ticket', 'tr_Cabin', 'tr_Sex', 'tr_Embarked']
label = ['Survived']

In [14]:
train_df = all_df[all_df['type']=='train']
train_df['Survived'] = train_df['Survived'].astype(int)

test_df = all_df[all_df['type']=='test']
test_df = test_df.drop(columns=['Survived'])

### GBC

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [16]:
gbc_model = GradientBoostingClassifier(random_state=seed)

In [17]:
gbc_fold = StratifiedKFold(n_splits=5, random_state=seed)

In [18]:
n_iter = 0
score_list = []
for train_idx, valid_idx in gbc_fold.split(train_df[features], train_df[label]):
    n_iter += 1
    
    X_train, X_valid = train_df[features].iloc[train_idx, :], train_df[features].iloc[valid_idx, :]
    y_train, y_valid = train_df[label].iloc[train_idx,:], train_df[label].iloc[valid_idx,:]
    
    gbc_model.fit(X_train, y_train)
    
    preds = gbc_model.predict(X_valid)
    score = accuracy_score(y_valid, preds)
    
    score_list.append(score)
    
    print(f'iter {n_iter} score : ', round(score, 4))
    
print('total score', round(sum(score_list)/len(score_list), 2))

iter 1 score :  0.8156
iter 2 score :  0.8146
iter 3 score :  0.8539
iter 4 score :  0.7753
iter 5 score :  0.8539
total score 0.82


In [20]:
predictions = gbc_model.predict(test_df[features])

In [22]:
submissions = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': predictions})
submissions.to_csv('submission.csv', index=False)

### LGBM

In [24]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [26]:
X_train, X_valid, y_train, y_valid = train_test_split(train_df[features], 
                                                      train_df[label], 
                                                      test_size=0.2, 
                                                      random_state=seed)

In [28]:
lgbm_model = lgb.LGBMClassifier(random_state=seed)
lgbm_model.fit(X_train, y_train, 
        eval_set=(X_valid, y_valid),
        early_stopping_rounds=10)

[1]	valid_0's binary_logloss: 0.619132
[2]	valid_0's binary_logloss: 0.587725
[3]	valid_0's binary_logloss: 0.561511
[4]	valid_0's binary_logloss: 0.541057
[5]	valid_0's binary_logloss: 0.523913
[6]	valid_0's binary_logloss: 0.509778
[7]	valid_0's binary_logloss: 0.498971
[8]	valid_0's binary_logloss: 0.487396
[9]	valid_0's binary_logloss: 0.479098
[10]	valid_0's binary_logloss: 0.468845
[11]	valid_0's binary_logloss: 0.462668
[12]	valid_0's binary_logloss: 0.452696
[13]	valid_0's binary_logloss: 0.445107
[14]	valid_0's binary_logloss: 0.438309
[15]	valid_0's binary_logloss: 0.432328
[16]	valid_0's binary_logloss: 0.429358
[17]	valid_0's binary_logloss: 0.427486
[18]	valid_0's binary_logloss: 0.425655
[19]	valid_0's binary_logloss: 0.423954
[20]	valid_0's binary_logloss: 0.423045
[21]	valid_0's binary_logloss: 0.422747
[22]	valid_0's binary_logloss: 0.421783
[23]	valid_0's binary_logloss: 0.420251
[24]	valid_0's binary_logloss: 0.419632
[25]	valid_0's binary_logloss: 0.419551
[26]	vali

LGBMClassifier(random_state=2023)

In [30]:
predictions = lgbm_model.predict(test_df[features], num_iteration=lgbm_model.best_iteration_)

In [31]:
submissions = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': predictions})
submissions.to_csv('submission.csv', index=False)