In [1]:
import numpy as np
import pandas as pd

In [2]:
ds_train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
ds_test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
dataset = pd.concat([ds_train, ds_test], axis=0)

PassengerId = ds_test['PassengerId'].values

dataset = dataset.drop(['PassengerId', 'Name'], axis=1)

print(dataset.info())
dataset.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   Survived  100000 non-null  float64
 1   Pclass    200000 non-null  int64  
 2   Sex       200000 non-null  object 
 3   Age       193221 non-null  float64
 4   SibSp     200000 non-null  int64  
 5   Parch     200000 non-null  int64  
 6   Ticket    190196 non-null  object 
 7   Fare      199733 non-null  float64
 8   Cabin     61303 non-null   object 
 9   Embarked  199473 non-null  object 
dtypes: float64(3), int64(3), object(4)
memory usage: 16.8+ MB
None


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,1,male,,2,0,209245,27.14,C12239,S
1,0.0,3,male,,0,0,27323,13.35,,S
2,0.0,3,male,0.33,1,2,CA 457703,71.29,,S
3,0.0,3,male,19.0,0,0,A. 10866,13.04,,S
4,1.0,3,male,25.0,0,0,427635,7.76,,S


# Feature Engginering

In [3]:
## Cabin
# NaN means they didn't have a private cabin
dataset['Cabin'] = dataset['Cabin'].fillna('N')

# Retrive top alphabet
dataset['Cabin'] = dataset['Cabin'].map(lambda x: x[0])
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,1,male,,2,0,209245,27.14,C,S
1,0.0,3,male,,0,0,27323,13.35,N,S
2,0.0,3,male,0.33,1,2,CA 457703,71.29,N,S
3,0.0,3,male,19.0,0,0,A. 10866,13.04,N,S
4,1.0,3,male,25.0,0,0,427635,7.76,N,S


In [4]:
## Ticket
import re

'''
### strip_key_word ###
### remove string based on regrex key and "." or "(space)" etc..
def strip_key_word(key='', target=''):
    
    return re.sub(key, '', target).replace('.', '').replace(' ', '').replace('/', '')
    

ticket = dataset[['Ticket']].copy()
ticket['Ticket_cat'] = ticket['Ticket'].apply(lambda x: strip_key_word('[0-9]+', str(x)) 
                                              if strip_key_word('[0-9]+', str(x)) != '' else strip_key_word('[a-zA-Z]+', str(x))[0])
dataset['Ticket'] = ticket['Ticket_cat']

'''
# Don't use Ticket in prediction.
dataset = dataset.drop(['Ticket'], axis=1)
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1.0,1,male,,2,0,27.14,C,S
1,0.0,3,male,,0,0,13.35,N,S
2,0.0,3,male,0.33,1,2,71.29,N,S
3,0.0,3,male,19.0,0,0,13.04,N,S
4,1.0,3,male,25.0,0,0,7.76,N,S


In [5]:
## SibSp and Parch
dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1 #Add one, it's ownself
#dataset['Alone'] = dataset['Family'].apply(lambda x: 0 if x >= 1 else 0)

#dataset = dataset.drop(['SibSp', 'Parch'], axis=1)
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,FamilySize
0,1.0,1,male,,2,0,27.14,C,S,3
1,0.0,3,male,,0,0,13.35,N,S,1
2,0.0,3,male,0.33,1,2,71.29,N,S,4
3,0.0,3,male,19.0,0,0,13.04,N,S,1
4,1.0,3,male,25.0,0,0,7.76,N,S,1


# Missing Value Engineering

### Original dataset info()
 0   Survived  100000 non-null  float64  
 1   Pclass    200000 non-null  int64    
 2   Sex       200000 non-null  object   
 3   Age       193221 non-null  float64  
 4   SibSp     200000 non-null  int64    
 5   Parch     200000 non-null  int64    
 6   Ticket    190196 non-null  object   
 7   Fare      199733 non-null  float64  
 8   Cabin     61303 non-null   object   
 9   Embarked  199473 non-null  object   

In [6]:
## Show missing value information
print(dataset.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Survived    100000 non-null  float64
 1   Pclass      200000 non-null  int64  
 2   Sex         200000 non-null  object 
 3   Age         193221 non-null  float64
 4   SibSp       200000 non-null  int64  
 5   Parch       200000 non-null  int64  
 6   Fare        199733 non-null  float64
 7   Cabin       200000 non-null  object 
 8   Embarked    199473 non-null  object 
 9   FamilySize  200000 non-null  int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 16.8+ MB
None


In [7]:
## use MICE for numerical columns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(max_iter=10, random_state=42)
dataset[['Age', 'Fare']] = imputer.fit_transform(dataset[['Age', 'Fare']])

## Fare has a large skew
dataset['Fare'] = np.log(np.clip(dataset['Fare'], 1e-100, 1e+100))

dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,FamilySize
0,1.0,1,male,33.951978,2,0,3.301009,C,S,3
1,0.0,3,male,33.548728,0,0,2.591516,N,S,1
2,0.0,3,male,0.33,1,2,4.266756,N,S,4
3,0.0,3,male,19.0,0,0,2.568022,N,S,1
4,1.0,3,male,25.0,0,0,2.048982,N,S,1


In [8]:
## Fillna with mode in Embarked
dataset.fillna({'Embarked': dataset['Embarked'].mode().values[0]}, inplace=True)

print(dataset.info())
dataset.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Survived    100000 non-null  float64
 1   Pclass      200000 non-null  int64  
 2   Sex         200000 non-null  object 
 3   Age         200000 non-null  float64
 4   SibSp       200000 non-null  int64  
 5   Parch       200000 non-null  int64  
 6   Fare        200000 non-null  float64
 7   Cabin       200000 non-null  object 
 8   Embarked    200000 non-null  object 
 9   FamilySize  200000 non-null  int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 16.8+ MB
None


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,FamilySize
0,1.0,1,male,33.951978,2,0,3.301009,C,S,3
1,0.0,3,male,33.548728,0,0,2.591516,N,S,1
2,0.0,3,male,0.33,1,2,4.266756,N,S,4
3,0.0,3,male,19.0,0,0,2.568022,N,S,1
4,1.0,3,male,25.0,0,0,2.048982,N,S,1


# OneHot Encoding

In [9]:
dataset = pd.get_dummies(data=dataset, columns=['Pclass', 'Sex', 'Cabin', 'Embarked'], drop_first=True)
dataset.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,FamilySize,Pclass_2,Pclass_3,Sex_male,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_N,Cabin_T,Embarked_Q,Embarked_S
0,1.0,33.951978,2,0,3.301009,3,0,0,1,0,1,0,0,0,0,0,0,0,1
1,0.0,33.548728,0,0,2.591516,1,0,1,1,0,0,0,0,0,0,1,0,0,1
2,0.0,0.33,1,2,4.266756,4,0,1,1,0,0,0,0,0,0,1,0,0,1
3,0.0,19.0,0,0,2.568022,1,0,1,1,0,0,0,0,0,0,1,0,0,1
4,1.0,25.0,0,0,2.048982,1,0,1,1,0,0,0,0,0,0,1,0,0,1


# Modeling

In [10]:
ds_train = dataset[dataset['Survived'].notnull()]
ds_test = dataset[dataset['Survived'].isnull()]
ds_test = ds_test.drop(columns=['Survived'], axis=1)

y = ds_train['Survived']
X = ds_train.drop(columns=['Survived'], axis=1)

print(ds_train.shape)
print(ds_test.shape)

(100000, 19)
(100000, 18)


In [None]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def stratified_lgb(X,y, params):
    kf = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)                  
    accuracy=[]   # list contains AUC for each fold  
    for tr_idx, te_idx in kf.split(X, y):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]
        lgb_classifier = lgb.LGBMClassifier(**params)
        lgb_classifier.fit(X_tr, y_tr, eval_set=[(X_te, y_te)], verbose=False, early_stopping_rounds=200)
        
        accuracy.append(accuracy_score(y_te, lgb_classifier.predict(X_te))) 
    return np.mean(accuracy)

## LightGBM Classification

def objective(trial):
    params = {
            'objective': 'binary',
            'metric': 'auc',
            'n_estimators': 500,
            'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
            'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
            'num_leaves': trial.suggest_int('num_leaves', 2, 256),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-6, 1e-2)
        }

    return stratified_lgb(X, y, params)
    
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=-1)
lgb_best = study.best_params
print(lgb_best)

[32m[I 2021-04-05 07:40:12,674][0m A new study created in memory with name: no-name-bab466c5-88f4-4aa2-92cc-ea9769a0e0a7[0m
[32m[I 2021-04-05 07:43:41,957][0m Trial 2 finished with value: 0.57226 and parameters: {'lambda_l1': 1.333424495856013e-06, 'lambda_l2': 7.368841046521172e-08, 'num_leaves': 199, 'feature_fraction': 0.568261181193204, 'bagging_fraction': 0.8588093166028761, 'bagging_freq': 5, 'min_child_samples': 61, 'learning_rate': 1.2773419903703142e-06}. Best is trial 2 with value: 0.57226.[0m
[32m[I 2021-04-05 07:44:06,017][0m Trial 1 finished with value: 0.57226 and parameters: {'lambda_l1': 0.8321803170666052, 'lambda_l2': 0.0006307504333340671, 'num_leaves': 205, 'feature_fraction': 0.8204861188563525, 'bagging_fraction': 0.6384581513988342, 'bagging_freq': 2, 'min_child_samples': 28, 'learning_rate': 6.8761190932587065e-06}. Best is trial 2 with value: 0.57226.[0m
[32m[I 2021-04-05 07:44:17,228][0m Trial 0 finished with value: 0.75079 and parameters: {'lambda_

In [None]:
## Predict
lgb_best['n_estimators'] = 10000
kf = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)                  
predict = pd.DataFrame()
n=0   
for tr_idx, te_idx in kf.split(X, y):
    X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
    y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]
    lgb_classifier = lgb.LGBMClassifier(**lgb_best)
    lgb_classifier.fit(X_tr, y_tr, eval_set=[(X_te, y_te)], verbose=False, early_stopping_rounds=2000)
    y_pred = lgb_classifier.predict(ds_test)
    predict[n] = y_pred
    n+=1

result = np.round(predict.mode(axis=1).loc[:,0].values).astype(int)

output = pd.DataFrame({'PassengerId': PassengerId, 'Survived': result})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")