In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the train and test CSVs and stack them so the feature engineering in the
# following cells is applied to both in one pass.
ds_train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
ds_test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
# ignore_index=True: each CSV is read with its own default 0..n-1 index, so a
# plain concat would leave every row label duplicated in the combined frame.
dataset = pd.concat([ds_train, ds_test], axis=0, ignore_index=True)

# Keep the test ids for the submission file before the column is dropped.
PassengerId = ds_test['PassengerId'].values

dataset = dataset.drop(['PassengerId', 'Name'], axis=1)

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would append a stray "None" line).
dataset.info()
dataset.head()

# Feature Engineering

In [None]:
## Cabin
# A missing Cabin becomes its own category 'N'
# (the author's assumption: NaN means no private cabin).
cabin_filled = dataset['Cabin'].fillna('N')

# Keep only the leading letter of each cabin code.
dataset['Cabin'] = cabin_filled.map(lambda cabin: cabin[0])
dataset.head()

In [None]:
## Ticket
import re

def strip_key_word(target=''):
    """Return ``target`` with every '.', ' ' and '/' character removed."""
    unwanted = str.maketrans('', '', './ ')
    return target.translate(unwanted)

def _ticket_prefix(raw_ticket):
    """Return the cleaned first token of a multi-token ticket, otherwise 'X'."""
    tokens = str(raw_ticket).split()
    if len(tokens) <= 1:
        # Single-token tickets (and the 'X' placeholder for missing ones) carry no prefix.
        return 'X'
    return strip_key_word(tokens[0])

# Missing tickets are filled with 'X' first so they end up in the no-prefix bucket.
dataset['Ticket'] = dataset['Ticket'].fillna('X').apply(_ticket_prefix)

dataset.head()

In [None]:
## SibSp and Parch
# FamilySize is SibSp plus Parch, plus one for the passenger themself.
dataset['FamilySize'] = dataset['SibSp'].add(dataset['Parch']).add(1)

# SibSp and Parch are kept in the frame next to the derived FamilySize column.
dataset.head()

# Missing Value Engineering

### Original dataset info()
 0   Survived  100000 non-null  float64  
 1   Pclass    200000 non-null  int64    
 2   Sex       200000 non-null  object   
 3   Age       193221 non-null  float64  
 4   SibSp     200000 non-null  int64    
 5   Parch     200000 non-null  int64    
 6   Ticket    190196 non-null  object   
 7   Fare      199733 non-null  float64  
 8   Cabin     61303 non-null   object   
 9   Embarked  199473 non-null  object   

In [None]:
## Show missing value information
# info() prints the report itself and returns None; wrapping it in print()
# would add a stray "None" line to the cell output.
dataset.info()

In [None]:
## Don't use MICE for numerical columns
#  MICE uses a linear-regression model, and there might not be a linear relation between Age and Fare

## Age: fill the gaps with the mean Age of the passenger's Pclass.
pclass_age = dataset[['Age', 'Pclass']].dropna().groupby('Pclass').mean().to_dict()
# pclass_age has the shape {'Age': {pclass: mean_age}}; Series.map does the
# per-row lookup without a Python-level lambda.
dataset['Age'] = dataset['Age'].fillna(dataset['Pclass'].map(pclass_age['Age']))

## Fare has a large skew: fill the gaps with the mean, then log-transform.
dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].mean())
# The clip floors values at 1e-100 so np.log never receives 0.
# NOTE: this line is not idempotent; re-running the cell applies the log again.
dataset['Fare'] = np.log(np.clip(dataset['Fare'], 1e-100, 1e+100))

## Embarked: fill the gaps with the most frequent value.
dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode().values[0])

# info() prints its own report and returns None, so it is not wrapped in print().
dataset.info()
dataset.head()

# Encoding

In [None]:
# One-hot encode these columns; drop_first=True drops the first level of each.
one_hot_columns = ['Pclass', 'Sex', 'Cabin', 'Embarked']
dataset = pd.get_dummies(dataset, columns=one_hot_columns, drop_first=True)

from sklearn.preprocessing import LabelEncoder
# The remaining listed columns get integer codes from a LabelEncoder.
label_columns = ['Ticket']
encoder = LabelEncoder()
for column_name in label_columns:
    dataset[column_name] = encoder.fit_transform(dataset[column_name])

dataset.head()

# Modeling

In [None]:
# Rows with a Survived label form the training set; rows without one are the
# ones to predict.
is_labelled = dataset['Survived'].notnull()
ds_train = dataset[is_labelled]
ds_test = dataset[~is_labelled].drop(columns=['Survived'])

# Separate the features from the target for model fitting.
X = ds_train.drop(columns=['Survived'])
y = ds_train['Survived']

for split in (ds_train, ds_test):
    print(split.shape)

In [None]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def stratified_lgb(X, y, params):
    """Return the mean accuracy of an LGBMClassifier over 5 stratified folds.

    Parameters
    ----------
    X : feature table; rows are selected positionally with ``.iloc``.
    y : target labels aligned with ``X``; selected positionally with ``.iloc``.
    params : dict
        Keyword arguments forwarded unchanged to ``lgb.LGBMClassifier``.

    Returns
    -------
    The mean of the per-fold accuracy scores.

    Notes
    -----
    Early stopping monitors AUC on the same held-out part that is then used
    for the accuracy score.
    NOTE(review): that makes the reported accuracy somewhat optimistic.

    Early stopping is passed as a callback because the ``early_stopping_rounds``
    and ``verbose`` keyword arguments of ``fit`` were removed in LightGBM 4.0.
    """
    kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    fold_accuracy = []  # accuracy on the held-out part of each fold
    for tr_idx, te_idx in kf.split(X, y):
        X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
        y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]
        lgb_classifier = lgb.LGBMClassifier(**params)
        lgb_classifier.fit(
            X_tr, y_tr,
            eval_set=[(X_te, y_te)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)],
        )

        fold_accuracy.append(accuracy_score(y_te, lgb_classifier.predict(X_te)))
    return np.mean(fold_accuracy)

## LightGBM Classification

def objective(trial):
    """Optuna objective: sample one LightGBM configuration and score it.

    Parameters
    ----------
    trial : the Optuna trial used to sample the hyper-parameters.

    Returns
    -------
    The value returned by ``stratified_lgb(X, y, params)`` for the sampled
    configuration, using the module-level ``X`` and ``y``.

    ``suggest_float(..., log=True)`` and ``suggest_float(...)`` replace the
    deprecated ``suggest_loguniform`` and ``suggest_uniform``; the sampled
    ranges are unchanged.
    """
    params = {
        # Fixed settings.
        'objective': 'binary',
        'metric': 'auc',
        'n_estimators': 500,
        # Searched settings.
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'learning_rate': trial.suggest_float('learning_rate', 1e-7, 1e-2, log=True),
    }

    return stratified_lgb(X, y, params)
    
# Run 100 trials, maximizing the value returned by `objective`;
# n_jobs=-1 lets Optuna run trials in parallel.
study = optuna.create_study(direction='maximize')
study.optimize(
    func=objective,
    n_trials=100,
    n_jobs=-1,
)

# Best hyper-parameters found; reused by the prediction cell.
lgb_best = study.best_params
print(lgb_best)

In [None]:
## Predict
# Reuse the tuned parameters with a much larger tree budget; early stopping on
# each fold's held-out part decides how many trees are actually used.
lgb_best['n_estimators'] = 10000
lgb_best['objective'] = 'binary'
lgb_best['metric'] = 'auc'

kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
fold_predictions = {}  # fold number -> predicted labels for ds_test
for n, (tr_idx, te_idx) in enumerate(kf.split(X, y)):
    X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
    y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]
    lgb_classifier = lgb.LGBMClassifier(**lgb_best)
    # Early stopping is passed as a callback: the early_stopping_rounds and
    # verbose keyword arguments of fit() were removed in LightGBM 4.0.
    lgb_classifier.fit(
        X_tr, y_tr,
        eval_set=[(X_te, y_te)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=2000, verbose=False)],
    )
    y_pred = lgb_classifier.predict(ds_test)
    fold_predictions[n] = y_pred

# One column per fold, built in a single step instead of inserting columns
# into an empty frame.
predict = pd.DataFrame(fold_predictions)

# Majority vote across the fold models: take the row-wise mode
# (first column of the mode frame) and cast it to int.
result = np.round(predict.mode(axis=1).loc[:,0].values).astype(int)

output = pd.DataFrame({'PassengerId': PassengerId, 'Survived': result})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")