In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the competition's training and test splits from the local data folder.
train_df = pd.read_csv('./spaceship-titanic/train.csv')
test_df = pd.read_csv('./spaceship-titanic/test.csv')

# Show (rows, columns) of each split, one per line.
print(train_df.shape, test_df.shape, sep = '\n')

In [None]:
def function(lastname, passengerid):
    """Return True when `lastname` is the placeholder 'Unknown' and
    `passengerid` is anything other than '01'; otherwise return False."""
    name_is_placeholder = lastname == 'Unknown'
    suffix_is_01 = passengerid == '01'
    return bool(name_is_placeholder and not suffix_is_01)

def setNaN(lastname, test):
    """Return NaN when `test` compares equal to True, else return `lastname` unchanged."""
    return np.nan if test == True else lastname

def MissingValue_1(dataframe, age_median = None):
    """Fill the simple missing values and derive name / passenger-id parts.

    The frame is modified in place and also returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns HomePlanet, CryoSleep, Destination, VIP,
        RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, Age, Name and
        PassengerId.
    age_median : float, optional
        Value used to fill missing ages. When omitted, the median of the
        module-level ``train_df['Age']`` is used, so every frame is filled
        with the training median.

    Returns
    -------
    pandas.DataFrame
        The same frame with the columns above filled and the new columns
        FirstName, LastName, PassengerId_1 and PassengerId_2 added.
    """
    # Constant fills. Plain assignment is used instead of
    # `dataframe[col].fillna(..., inplace=True)`, which acts on a temporary
    # and no longer updates the frame under pandas Copy-on-Write.
    constant_fills = {
        'HomePlanet': 'Earth',
        'Destination': 'TRAPPIST-1e',
        'RoomService': 0,
        'FoodCourt': 0,
        'ShoppingMall': 0,
        'Spa': 0,
        'VRDeck': 0,
    }
    for column, fill_value in constant_fills.items():
        dataframe[column] = dataframe[column].fillna(fill_value)

    # Flags: missing means False; cast so the dtype is bool on every pandas version.
    for column in ('CryoSleep', 'VIP'):
        dataframe[column] = dataframe[column].fillna(False).astype(bool)

    if age_median is None:
        age_median = train_df['Age'].median()
    dataframe['Age'] = dataframe['Age'].fillna(age_median)

    # Split the frame's OWN Name / PassengerId columns. Reading them from the
    # global train_df would copy training-set names and ids into other frames.
    # reindex guarantees two columns even if no value contains the separator.
    name_parts = dataframe['Name'].str.split(' ', n = 1, expand = True).reindex(columns = range(2))
    dataframe['FirstName'] = name_parts[0].fillna('Unknown')
    dataframe['LastName'] = name_parts[1].fillna('Unknown')

    id_parts = dataframe['PassengerId'].str.split('_', n = 1, expand = True).reindex(columns = range(2))
    dataframe['PassengerId_1'] = id_parts[0]
    dataframe['PassengerId_2'] = id_parts[1]

    # Rows whose surname is unknown and whose id suffix is not '01' take the
    # surname of the closest preceding row that has one (forward fill).
    # NOTE(review): this only borrows from the same id prefix if the rows are
    # ordered by PassengerId - confirm for the input files.
    borrow_surname = dataframe['LastName'].eq('Unknown') & dataframe['PassengerId_2'].ne('01')
    dataframe['LastName'] = dataframe['LastName'].mask(borrow_surname).ffill()

    return dataframe

def Cabin_x_y(x, y):
    """Return `y` when `x` is the placeholder 'Unknown', otherwise return `x`."""
    return y if x == 'Unknown' else x

def gen_reference_df(dataframe):
    """Build a (PassengerId_1, Cabin) lookup from the rows whose cabin is known.

    Side effect: missing values in ``dataframe['Cabin']`` are replaced by the
    placeholder 'Unknown' on the frame that was passed in.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns PassengerId_1 and Cabin.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (PassengerId_1, Cabin) pair with a known cabin,
        ordered by how often the pair occurs (most frequent first).
    """
    # Assign the result back: `dataframe['Cabin'].fillna(..., inplace=True)`
    # works on a temporary and leaves the frame untouched under pandas
    # Copy-on-Write, which would silently drop the documented side effect.
    dataframe['Cabin'] = dataframe['Cabin'].fillna('Unknown')

    known_cabins = dataframe.loc[dataframe['Cabin'] != 'Unknown', ['PassengerId_1', 'Cabin']]
    # value_counts sorts the pairs by descending frequency; only the pair
    # columns are kept, the count column is discarded.
    PassengerId1_Cabin_df = known_cabins.value_counts().reset_index()
    return PassengerId1_Cabin_df[['PassengerId_1', 'Cabin']]

def MissingValue_2(dataframe, PassengerId1_Cabin_df):
    """Fill unknown cabins from the group lookup and split Cabin into parts.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns PassengerId, PassengerId_1 and Cabin. A cabin
        counts as unknown when it is missing or equals 'Unknown'.
    PassengerId1_Cabin_df : pandas.DataFrame
        Lookup with the columns PassengerId_1 and Cabin. When a PassengerId_1
        value appears more than once, the first row listed for it is used.

    Returns
    -------
    pandas.DataFrame
        A new frame (fresh 0..n-1 index) with the same columns as the input,
        Cabin filled where the lookup has a match ('Unknown' otherwise), and
        the new columns Cabin_deck, Cabin_num and Cabin_side. Rows repeating
        an earlier PassengerId are dropped.
    """
    # Reduce the lookup to one cabin per PassengerId_1 before merging, so the
    # merge cannot multiply rows of the left frame.
    group_cabin = (
        PassengerId1_Cabin_df[['PassengerId_1', 'Cabin']]
        .drop_duplicates(subset = 'PassengerId_1')
        .rename(columns = {'Cabin': 'Cabin_group'})
    )
    merged = dataframe.merge(group_cabin, how = 'left', on = 'PassengerId_1')

    # Keep a passenger's own cabin when it is known; otherwise take the
    # group's cabin, and fall back to the placeholder if there is none.
    cabin_known = merged['Cabin'].notna() & (merged['Cabin'] != 'Unknown')
    merged['Cabin'] = merged['Cabin'].where(cabin_known, merged['Cabin_group']).fillna('Unknown')

    merged = (
        merged.drop(columns = 'Cabin_group')
        .drop_duplicates(subset = ['PassengerId'])
        .reset_index(drop = True)
    )

    # 'deck/num/side' -> three columns. reindex guarantees all three exist
    # even when no cabin contains a '/', e.g. when every cabin is 'Unknown'.
    cabin_parts = (
        merged['Cabin'].str.split('/', n = 2, expand = True)
        .reindex(columns = range(3))
        .fillna('Unknown')
    )
    for position, column in enumerate(['Cabin_deck', 'Cabin_num', 'Cabin_side']):
        merged[column] = cabin_parts[position]

    return merged

def MissingValuePipe(dataframe):
    """Run the missing-value steps in order and return the resulting frame.

    MissingValue_1 is applied first, a cabin lookup is then built from its
    output with gen_reference_df, and MissingValue_2 uses that lookup.
    """
    filled = MissingValue_1(dataframe)
    cabin_lookup = gen_reference_df(filled)
    return MissingValue_2(filled, cabin_lookup)

In [None]:
# Run the missing-value pipeline on each split. The results overwrite
# train_df / test_df, so this cell is not idempotent: re-running it feeds the
# already-processed frames back into the pipeline.
train_df = MissingValuePipe(train_df)
test_df = MissingValuePipe(test_df)

### Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

def LabelEncoder_method(dataframe):
    """Label-encode every object and bool column of `dataframe`.

    Bool columns are cast to object first; columns of any other dtype are left
    untouched. The frame is modified in place and also returned.

    NOTE: the encoder is re-fitted on each column of the frame passed in, so
    the integer given to a category is specific to that call.
    """
    encoder = LabelEncoder()
    for column in dataframe.columns:
        column_dtype = dataframe[column].dtype
        if column_dtype != 'O' and column_dtype != 'bool':
            continue
        # astype('O') is a no-op for object columns and converts bool ones.
        dataframe[column] = encoder.fit_transform(dataframe[column].astype('O'))

    return dataframe

In [None]:
# Encode the object / bool columns of each split as integers.
# NOTE(review): every call fits its own encoders, so the integer assigned to a
# given category is not guaranteed to be the same in train_df and test_df.
train_df = LabelEncoder_method(train_df)
test_df = LabelEncoder_method(test_df)

### Feature Engineering

### Feature Selection and Train/Validation Split

In [None]:
# Inspect column dtypes and non-null counts of the processed training frame.
train_df.info()

In [None]:
# Columns excluded from the feature matrix of either split.
drop2_list = ['PassengerId', 'Cabin', 'Name', 'PassengerId_1', 'PassengerId_2']
# Same columns plus the target, for frames that still contain it.
drop_list = [*drop2_list, 'Transported']

In [None]:
# Features: everything except the identifier-like columns and the target.
# `columns=` replaces the positional axis argument (`drop(drop_list, 1)`),
# which pandas 2.0 no longer accepts.
X_train = train_df.drop(columns = drop_list)
# Target column.
y_train = train_df.Transported

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for validation (fixed random_state for a repeatable split).
# X_train / y_train are rebound to the training portion, so re-running this
# cell on its own would split the already-reduced data again.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.33, random_state = 42)

In [None]:
# Remove the non-feature columns from the test split. `columns=` replaces the
# positional axis argument (`drop(drop2_list, 1)`), which pandas 2.0 no longer accepts.
test_df = test_df.drop(columns = drop2_list)

### Model Building

In [None]:
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from xgboost import XGBClassifier
import optuna
from xgboost import plot_importance

In [None]:
def objective(trial, X_train = X_train, y_train = y_train):
    """Optuna objective: fit an XGBClassifier with sampled hyper-parameters
    and return its accuracy on the validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X_train, y_train :
        Training features / target. The defaults are bound to the module-level
        objects that exist when this function is defined.

    Returns
    -------
    float
        Accuracy of the fitted model on the module-level X_valid / y_valid,
        which are also used as the early-stopping evaluation set.
    """
    params = {
        # "silence" is not an XGBoost parameter; `verbosity` is the supported switch.
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.9),
        'seed': 1000,
        'nthread': 4,
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log = True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log = True),
    }

    # Tree-specific parameters are only sampled for the tree boosters.
    if params["booster"] in ("gbtree", "dart"):
        params["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        params["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log = True)
        params["gamma"] = trial.suggest_float("gamma", 1e-8, 0.1, log = True)
        params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if params["booster"] == "dart":
        params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log = True)
        params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log = True)

    # Early stopping is configured on the estimator: passing
    # `early_stopping_rounds` to fit() was removed in xgboost 2.0
    # (the constructor argument requires xgboost >= 1.6).
    xgbc = XGBClassifier(early_stopping_rounds = 30, **params)
    xgbc.fit(X_train, y_train, eval_set = [(X_valid, y_valid)], verbose = False)
    y_hat = xgbc.predict(X_valid)

    return metrics.accuracy_score(y_valid, y_hat)

In [None]:
%%capture --no-display
%%time
import warnings
# Silence library warnings for the duration of the search.
warnings.filterwarnings('ignore')

# Seed the sampler so that re-running the search proposes the same sequence of
# hyper-parameters; an unseeded study gives a different result on every run.
study = optuna.create_study(
    direction = 'maximize',
    sampler = optuna.samplers.TPESampler(seed = 42),
)
study.optimize(objective, n_trials = 100)

In [None]:
# Summarise the search: trial count, best hyper-parameters and best objective value.
search_summary = (
    ('Number of finished trials:', len(study.trials)),
    ('Best trial parameters:', study.best_trial.params),
    ('Best score:', study.best_value),
)
for label, value in search_summary:
    print(label, value)

In [None]:
# study.best_trial.params only contains the values drawn through
# trial.suggest_*; the fixed settings used during the search (objective,
# metric, seed, thread count) are restated here so the refit uses the same
# configuration instead of silently falling back to library defaults.
params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "seed": 1000,
    "nthread": 4,
    **study.best_trial.params,
}
xgbc_opt = XGBClassifier(**params)
xgbc_opt.fit(X_train, y_train)
y_hat = xgbc_opt.predict(X_valid)

# Accuracy of the refitted model on the validation split.
accuracy = metrics.accuracy_score(y_valid, y_hat)
print(accuracy)

In [None]:
# Predict the target for the test split.
# NOTE(review): assumes test_df holds the same feature columns, in the same
# order, as the X_train the model was fitted on - confirm.
y_predict = xgbc_opt.predict(test_df)

In [None]:
# Start from the sample submission and overwrite its Transported column.
submit_file = pd.read_csv('./spaceship-titanic/sample_submission.csv')
# Predictions are assigned by position.
# NOTE(review): assumes the sample submission lists passengers in the same
# row order as the test split - confirm.
submit_file['Transported'] = y_predict
# Map the 1/0 labels to True/False.
submit_file['Transported'] = submit_file['Transported'].replace({1:True, 0:False})
# Use PassengerId as the index so it is not duplicated by a default row index.
submit_file.set_index('PassengerId', inplace = True)
submit_file.head()

In [None]:
# Write the submission to the working directory (the index is written as the first column).
submit_file.to_csv('submission_xgboost.csv')