# 1. Problem Definition

> Predict whether or not the person survived or not.

# 2. Data
This data was taken from kaggle's Monthly Tabular Playground  

>The dataset is synthetic but based on a real dataset (in this case, the actual Titanic data!) and generated using a CTGAN. The statistical properties of this dataset are very similar to the original Titanic dataset, but there's no way to "cheat" by using public labels for predictions. How well does your model perform on truly unseen data?  
The data has been split into two groups:  
◽ training set (train.csv)  
◽ test set (test.csv)  

https://www.kaggle.com/c/tabular-playground-series-apr-2021/

| Variable | Definition                                 | Key                                            |
|----------|--------------------------------------------|------------------------------------------------|
| survival | Survival                                   | 0 = No, 1 = Yes                                |
| pclass   | Ticket class                               | 1=1st, 2=2nd, 3=3rd                            |
| sex      | Sex                                        | 0=Female, 1=Male                               |
| Age      | Age in years                               |                                                |
| sibsp    | # of siblings / spouses aboard the Titanic |                                                |
| parch    | # of parents / children aboard the Titanic |                                                |
| ticket   | Ticket number                              |                                                |
| fare     | Passenger fare                             |                                                |
| cabin    | Cabin number                               |                                                |
| embarked | Port of Embarkation                        | C = Cherbourg, Q = Queenstown, S = Southampton |

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score
import pickle

In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission_df = pd.read_csv('sample_submission.csv')

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   PassengerId  100000 non-null  int64  
 1   Survived     100000 non-null  int64  
 2   Pclass       100000 non-null  int64  
 3   Name         100000 non-null  object 
 4   Sex          100000 non-null  object 
 5   Age          96708 non-null   float64
 6   SibSp        100000 non-null  int64  
 7   Parch        100000 non-null  int64  
 8   Ticket       95377 non-null   object 
 9   Fare         99866 non-null   float64
 10  Cabin        32134 non-null   object 
 11  Embarked     99750 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 9.2+ MB


In [5]:
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


Upon looking at the dataset, we can see there are a lot of missing values especially on the Cabin column. We're first going to clean the data and create a new version of the dataset where our machine learning algorithm can have no problem learning at.

In [6]:
np.random.seed(42)

In [7]:
# shuffle the dataset
train_df = train_df.sample(frac=1).reset_index(drop=True)

# dropping the unneeded columns in the dataset
unneeded_cols = ['Cabin', 'PassengerId', 'Name', 'Ticket']
for col in unneeded_cols:
    test_df = test_df.drop([col], axis=1)
    train_df = train_df.drop([col], axis=1)
    
train_df = train_df[train_df['Embarked'].notna()]
train_df = train_df[train_df['Fare'].notna()]
#test_df = test_df[test_df['Embarked'].notna()]

# converting categorical values to numeric    
cat_features = ['Pclass','Sex', 'Embarked', 'SibSp', 'Parch']

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

# filling missing values
num_features = ['Age']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

# preprocess and modeling pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat',categorical_transformer, cat_features),
        ('num', numeric_transformer, num_features),
    ])


# split the target column and features column
X = train_df.drop('Survived', axis=1)
y = train_df.Survived

# split using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)



In [8]:
model_alg = {
    'Logistic Regression' : LogisticRegression(solver='liblinear'),
    'Ridge Classifier' : RidgeClassifier(),
    'SGDClassifier' : SGDClassifier(),
    'RandomForest' : RandomForestClassifier(),
    'ExtraTreesClassifier' : ExtraTreesClassifier(),
    'KNC': KNeighborsClassifier(),
    'DecisionTreeClassifier':DecisionTreeClassifier(),
    'LGBMClassifier' : LGBMClassifier(verbose=-1),
    'XGBClassifier' : XGBClassifier(verbose=0, use_label_encoder=False),
    'CatBoostClassifier' : CatBoostClassifier(verbose=0),
}

score = {}
for name, _ in model_alg.items():
    model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('model', _)])
    model.fit(X_train, y_train)
    score[name] = model.score(X_test, y_test)

Parameters: { "verbose" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [9]:
score

{'Logistic Regression': 0.7662785250619019,
 'Ridge Classifier': 0.768955363715452,
 'SGDClassifier': 0.7519574382654085,
 'RandomForest': 0.7526935688951348,
 'ExtraTreesClassifier': 0.7508867028039885,
 'KNC': 0.7422204376631198,
 'DecisionTreeClassifier': 0.7509870842534966,
 'LGBMClassifier': 0.7700260991768721,
 'XGBClassifier': 0.7693903499966539,
 'CatBoostClassifier': 0.768687679850097}

In [10]:
import operator
max(score.items(), key=operator.itemgetter(1))[0]

'LGBMClassifier'

In [11]:
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('model', LGBMClassifier())])
model.fit(X_test, y_test)
model.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Pclass', 'Sex', 'Embarked',
                                                   'SibSp', 'Parch']),
                                                 ('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent'))]),
                                                  ['Age'])])),
                ('model', LGBMClassifie

In [12]:
pipe_grid = {
    'model__num_leaves' : [5,10,15],
    'model__min_data_in_leaf' : [900,920,940],
    'model__max_depth' : [3,5,7],
    'model__n_estimators' : [900,920,940]
}

In [13]:
{'model__max_depth': 5,
 'model__min_data_in_leaf': 920,
 'model__n_estimators': 900,
 'model__num_leaves': 10}

{'model__max_depth': 5,
 'model__min_data_in_leaf': 920,
 'model__n_estimators': 900,
 'model__num_leaves': 10}

In [14]:
gs_model = GridSearchCV(model, 
                        pipe_grid,
                        cv=5,
                        verbose=2,
                        n_jobs=-1,
                       )
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  3.1min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['Pclass',
                                                                          'Sex',
                                                                          'Embarked',
                                                                          'SibSp',
                                             

In [15]:
gs_model.score(X_test, y_test)

0.7706953088402596

In [16]:
# Write
pickle.dump(gs_model, open('gs_model.pkl', 'wb'))

In [17]:
# Read
load_pickle_model_gs = pickle.load(open('gs_model.pkl', 'rb'))

In [18]:
load_pickle_model_gs.best_params_

{'model__max_depth': 5,
 'model__min_data_in_leaf': 920,
 'model__n_estimators': 920,
 'model__num_leaves': 15}

In [19]:
y_preds = load_pickle_model_gs.predict(X_test)
score = roc_auc_score(y_test, y_preds)
print(f'{score:0.5f}')

0.76639


In [20]:
y_preds = load_pickle_model_gs.predict(test_df)


In [21]:
y_preds.shape

(100000,)

In [22]:
test_df.shape

(100000, 7)

In [23]:
test_df['Survived'] = y_preds


In [24]:
test_df.to_csv('psuedo_df.csv',index=False)

In [25]:
submission_df['Survived'] = y_preds
submission_df.to_csv('submission_base_models.csv', index = False)
pd.read_csv("submission_base_models.csv")

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,1
2,100002,1
3,100003,0
4,100004,1
...,...,...
99995,199995,1
99996,199996,0
99997,199997,0
99998,199998,1
