In [1]:
# std python utility
from pathlib import Path
from collections import Counter
import os
import json
import re

# data wrangling and analysis
import pandas
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# stats
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# modelling 
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgbm

import warnings
warnings.filterwarnings(action="ignore")
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000
pd.set_option('display.max_columns', None)

In [2]:
def show_missing_report(df:pandas.DataFrame):
    print("Report of Missing Values")
    all_cols = df.isna().any()
    na_cols = [c for c in all_cols.index if all_cols[c] == True]
    if len(na_cols) == 0:
        print("There are no missing values...")
        return
    na_cols.remove("Survived") # random target feature column we DONT impute for
    display("All columns:",df.isna().any(), "Missing cols:", na_cols)
    print()
    for c in na_cols:
        na_count = df[c].isna().sum()
        total = len(df[c])
        na_percent =  na_count / total * 100
        print("{}: {} / {} instances missing --> {:1.5f}%".format(c, na_count, total, na_percent))
        

# Directory setup

In [3]:
ROOT = Path(os.getcwd())
DATA_DIR = ROOT / "input"
PLOT_DIR = ROOT / "plots"

display(ROOT, DATA_DIR)
os.listdir(DATA_DIR)

PosixPath('/home/hanz/github/kaggle-titanic')

PosixPath('/home/hanz/github/kaggle-titanic/input')

['train.csv', 'test.csv', 'gender_submission.csv']

# Data Loading

In [4]:
# Load datasets
train = pd.read_csv(DATA_DIR/"train.csv")
test = pd.read_csv(DATA_DIR/"test.csv")
# fix common lgbm error with not supporting JSON chars in feature name
# reference: https://stackoverflow.com/questions/60582050/lightgbmerror-do-not-support-special-json-characters-in-feature-name-the-same
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
train["Type"] = 1
test["Type"] = 0
## Join train and test sets in order to obtain the same number of features during categorical conversion
full_data = pd.concat(objs=[train,test], axis=0).reset_index(drop=True)
# Fill empty and NaNs values with NaN
full_data = full_data.fillna(np.nan)


# Impute "Age" Feature

In [5]:
# Filling missing value of Age 
## Fill Age with the median age of similar rows according to Pclass, Parch and SibSp
# Indices of NaN age rows
index_NaN_age = list(full_data["Age"][full_data["Age"].isnull()].index) 

for i in index_NaN_age:
    age_med = full_data["Age"].median()
    # search for an instance matching with the feature-criteria
    age_pred = full_data["Age"][(
        (full_data['SibSp'] == full_data.iloc[i]["SibSp"]) & 
        (full_data['Parch'] == full_data.iloc[i]["Parch"]) & 
        (full_data['Pclass'] == full_data.iloc[i]["Pclass"]))].median()
    # check if there exists an instance whose correlated features associate to an age
    if not np.isnan(age_pred): 
        print("Filled using predicted value:", age_pred) # DEBUG
        full_data['Age'].iloc[i] = age_pred
    # case in which there does not exist a matching instance of the feature criteria
    else:
        print("Filled using median value:", age_med) # DEBUG
        full_data['Age'].iloc[i] = age_med

Filled using predicted value: 25.0
Filled using predicted value: 30.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 38.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 23.0
Filled using predicted value: 39.0
Filled using predicted value: 39.0
Filled using predicted value: 16.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 25.0
Filled using predicted value: 16.0
Filled using predict

Filled using predicted value: 16.0


# Engineer "Deck" Feature

In [6]:
regex = r"([a-zA-Z])\d?"
test = 'F G73'
re.findall(regex,test)
full_data['Deck'] = full_data['Cabin'].apply(lambda s: s[0] if pd.notnull(s) else 'M')

df_all_decks = full_data.groupby(
    ['Deck','Pclass']).count().drop(
    columns=['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 
    'Fare', 'Embarked', 'Cabin', 'PassengerId', 'Ticket']).rename(
    columns={'Name': 'Count'}).transpose()

full_data = full_data.drop("Deck", axis=1)
for i, row in full_data.iterrows():
    deck_str = row["Cabin"]
    # verify value is not null 
    if not pd.notnull(deck_str):
        deck_str = "M" # M denoting missing
        cabin = [deck_str]
    else:
        cabins = re.findall(regex,deck_str)
    cabins = re.findall(regex,deck_str)
    deck_count = Counter(cabins)
    if deck_str != "M" and len(deck_count.keys()) > 1: # DEBUG
        print(cabins)
        print(deck_count)
    for k in deck_count.keys():
        # initialized case
        if "Deck_{}".format(k) in full_data.columns:
            pass
        # uninitializated case
        else:
            pass
        if "Deck" in full_data.columns:
            full_data.loc[i,"Deck"] = "".join(cabins)
        else:
            full_data["Deck"] = np.zeros(len(full_data))
            full_data.loc[i,"Deck"] = "".join(cabins)
    
df_all_decks = full_data.groupby(['Deck','Pclass']).count().drop(
columns=['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin',
         'Fare', 'Embarked', 'PassengerId', 'Ticket'] + 
        [c for c in full_data.columns.to_list() if "Deck_" in c]
).rename(columns={'Name': 'Count'}).transpose()

df_all_decks

['F', 'G']
Counter({'F': 1, 'G': 1})
['F', 'E']
Counter({'F': 1, 'E': 1})
['F', 'G']
Counter({'F': 1, 'G': 1})
['F', 'G']
Counter({'F': 1, 'G': 1})
['F', 'G']
Counter({'F': 1, 'G': 1})
['F', 'E']
Counter({'F': 1, 'E': 1})
['F', 'E']
Counter({'F': 1, 'E': 1})


Deck,A,B,BB,BBB,BBBB,C,CC,CCC,D,D,DD,E,E,E,EE,F,F,FE,FG,G,M,M,M,T
Pclass,1,1,1,1,1,1,1,1,1,2,1,1,2,3,1,2,3,3,3,3,1,2,3,1
Count,22,48,8,4,5,80,8,6,38,6,2,33,4,3,1,13,1,3,4,5,67,254,693,1
Type,22,48,8,4,5,80,8,6,38,6,2,33,4,3,1,13,1,3,4,5,67,254,693,1


In [7]:
df_all_decks = full_data.groupby(['Deck','Pclass']).count().drop(
    columns=['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin',
             'Fare', 'Embarked', 'PassengerId', 'Ticket'] + 
            [c for c in full_data.columns.to_list() if "Deck_" in c]
    ).rename(columns={'Name': 'Count'}).transpose()

df_all_decks

Deck,A,B,BB,BBB,BBBB,C,CC,CCC,D,D,DD,E,E,E,EE,F,F,FE,FG,G,M,M,M,T
Pclass,1,1,1,1,1,1,1,1,1,2,1,1,2,3,1,2,3,3,3,3,1,2,3,1
Count,22,48,8,4,5,80,8,6,38,6,2,33,4,3,1,13,1,3,4,5,67,254,693,1
Type,22,48,8,4,5,80,8,6,38,6,2,33,4,3,1,13,1,3,4,5,67,254,693,1


In [8]:
deck_names = sorted(full_data.Deck.unique())
deck_map = {}
for d in deck_names:
    if "A" in d:
        deck_map[d] = "ABC"
    elif "B" in d:
        deck_map[d] = "ABC"
    elif "C" in d:
        deck_map[d] = "ABC"
    elif "D" in d:
        deck_map[d] = "DE"
    elif "E" in d:
        deck_map[d] = "DE"
    elif "F" in d:
        deck_map[d] = "FG"
    elif "G" in d:
        deck_map[d] = "FG"
deck_map[np.nan] = "M"
        
deck_map

{'A': 'ABC',
 'B': 'ABC',
 'BB': 'ABC',
 'BBB': 'ABC',
 'BBBB': 'ABC',
 'C': 'ABC',
 'CC': 'ABC',
 'CCC': 'ABC',
 'D': 'DE',
 'DD': 'DE',
 'E': 'DE',
 'EE': 'DE',
 'F': 'FG',
 'FE': 'DE',
 'FG': 'FG',
 'G': 'FG',
 nan: 'M'}

In [9]:
full_data['Alone'] = (full_data['SibSp'] == 0) & (full_data['Parch'] == 0)

deck_keys = list(full_data.Deck.unique())

In [10]:
# apply map function for replacing exisitng labels with new label group
full_data["Deck"] = full_data["Deck"].map(deck_map)
full_data.Deck = full_data.Deck.replace(np.nan, "M")
full_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Type,Deck,Alone
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,M,False
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,ABC,False
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,M,True
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,ABC,False
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,M,True


In [11]:
df_all_decks = full_data.groupby(['Deck','Pclass']).count().drop(
    columns=['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin',
             'Fare', 'Embarked', 'PassengerId', 'Ticket'] + 
            [c for c in full_data.columns.to_list() if "Deck_" in c]
    ).rename(columns={'Name': 'Count'}).transpose()

df_all_decks

Deck,ABC,DE,DE,DE,FG,FG,M,M,M
Pclass,1,1,2,3,2,3,1,2,3
Count,181,74,10,6,13,10,68,254,693
Type,181,74,10,6,13,10,68,254,693
Alone,181,74,10,6,13,10,68,254,693


In [12]:
full_data = full_data.drop("Cabin", axis=1)

# Impute "Embarked" Feature

In [13]:
# select instance in which Embarked feature is empty to fill
full_data.Embarked = full_data.Embarked.fillna("S")

In [14]:
med_fare = full_data.groupby(['Pclass', 'Parch', 'SibSp']).Fare.median()[3][0][0]
med_fare

7.8542

In [15]:
full_data['Fare'] = full_data['Fare'].fillna(med_fare)

In [16]:
# split back into original sets 
df_train = full_data[full_data["Type"]==1]
df_test = full_data[full_data["Type"]==0]
# remove column denoting set association
df_train = df_train.drop(columns = ['Type'])
df_test = df_test.drop(columns = ['Type', 'Survived'])
# remove extra column

display(df_train.shape, df_test.shape)


(891, 13)

(418, 12)

# Engineer "Title" Feature

In [17]:
full_data['Title'] = full_data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
full_data['Title'] = full_data['Title'].replace(['Miss', 'Mrs','Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'], 'Miss/Mrs/Ms')
full_data['Title'] = full_data['Title'].replace(['Dr', 'Col', 'Major', 'Jonkheer', 'Capt', 'Sir', 'Don', 'Rev'], 'Dr/Military/Noble/Clergy')


# Engineer "Child" Feature

In [18]:
full_data.loc[:, "Child"] = 1
full_data.loc[full_data["Age"]>=18, "Child"] = 0

# Verify Non-Existence of Missing Values

In [19]:
show_missing_report(full_data)

Report of Missing Values


'All columns:'

PassengerId    False
Survived        True
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Embarked       False
Type           False
Deck           False
Alone          False
Title          False
Child          False
dtype: bool

'Missing cols:'

[]




In [20]:
show_missing_report(df_train)
df_train.shape
# df_train
# sorted(df_train.columns.to_list())

Report of Missing Values
There are no missing values...


(891, 13)

In [21]:
df_test.shape

(418, 12)

# Drop Features
Features dropped here are 

# Encode Features

In [25]:
target_col = ["Survived"]
id_full_dataset = ["Type"]
cat_cols   = full_data.nunique()[full_data.nunique() < 12].keys().tolist()
cat_cols   = [x for x in cat_cols ]
#numerical columns
num_cols   = [x for x in full_data.columns if x not in cat_cols + target_col + id_full_dataset]
#Binary columns with 2 values
bin_cols   = full_data.nunique()[full_data.nunique() == 2].keys().tolist()
#Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]

#Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    full_data[i] = le.fit_transform(full_data[i])
    
#Duplicating columns for multi value columns
full_data = pd.get_dummies(data = full_data,columns = multi_cols )

#Scaling Numerical columns
std = StandardScaler()
scaled = std.fit_transform(full_data[num_cols])
scaled = pd.full_dataFrame(scaled,columns=num_cols)

#dropping original values merging scaled values for numerical columns
df_full_data_og = full_data.copy()
full_data = full_data.drop(columns = num_cols,axis = 1)
full_data = full_data.merge(scaled,left_index=True,right_index=True,how = "left")

full_data = full_data.drop(columns = ['PassengerId'],axis = 1)


ValueError: could not convert string to float: 'Braund, Mr. Owen Harris'

# Create CV Dataset 

In [None]:
y = df_train_enco.pop("Survived")
X = df_train_enco
# Train_test split
random_state = 1337
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = random_state)

# Configure Grid Search Model Parameters

In [None]:
fit_params = {"early_stopping_rounds" : 100, 
             "eval_metric" : 'auc', 
             "eval_set" : [(X_train,y_train)],
             'eval_names': ['valid'],
             'verbose': 0,
             'categorical_feature': 'auto'}

param_test = {'learning_rate' : [0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.2, 0.3, 0.4],
              'n_estimators' : [100, 200, 300, 400, 500, 600, 800, 1000, 1500, 2000],
              'num_leaves': sp_randint(6, 50), 
              'min_child_samples': sp_randint(100, 500), 
              'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
              'subsample': sp_uniform(loc=0.2, scale=0.8), 
              'max_depth': [-1, 1, 2, 3, 4, 5, 6, 7],
              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

#number of combinations
n_iter = 500 

#intializing lgbm and lunching the search
lgbm_clf = lgbm.LGBMClassifier(
    random_state=random_state, 
    silent=True, 
    metric='None', n_jobs=4)

grid_search = RandomizedSearchCV(
    estimator=lgbm_clf, param_distributions=param_test, 
    n_iter=n_iter,
    scoring='accuracy',
    cv=5,
    refit=True,
    random_state=random_state,
    verbose=True)

grid_search.fit(X, y, **fit_params)
print('Best score reached: {} with params: {} '.format(grid_search.best_score_, grid_search.best_params_))

opt_parameters =  grid_search.best_params_


# CV Hyperparameter Found from Train Set

In [None]:
opt_parameters

In [None]:
with open("opt_parameters.json", "w") as fh:
    json.dump(opt_parameters, fh)

# Operate on Full Test Set

In [None]:
lgbm_clf = lgbm.LGBMClassifier(**opt_parameters)
lgbm_clf.fit(X, y)
y_pred = lgbm_clf.predict(df_test)

temp = pd.DataFrame(pd.read_csv( DATA_DIR / "test.csv")['PassengerId'])
# temp['Survived'] = y_pred
# temp.to_csv("../lgbm-submission.csv", index = False)
len(temp), len(y_pred)

In [None]:
y_pred.to_csv("lgbm-submission.csv")

# Discrimination Threshold

In [None]:
from yellowbrick.classifier import DiscriminationThreshold
visualizer = DiscriminationThreshold(lgbm_clf)

visualizer.fit(X, y)  
visualizer.poof()   


In [None]:
# from sklearn.externals 
import joblib
joblib.dump(lgbm_clf, 'lgbm_model.pkl')
# lgbm_clf.save_model("lgbm_model.txt")
# lgbm_clf.booster_.save_model("lgbm_model.txt")

In [None]:
# lgbm_loaded = joblib.load("lgbm_model.pkl")
# lgbm_loaded._Booster.save_model("test")

In [None]:
lgbm_clf._Booster.save_model("lgbm_model.txt")

In [None]:
""