* Classification
* Evaluation : Accuracy

# Setting Up

In [1]:
# Basic Package
import pandas as pd                                     # Data analysis tool
import numpy as np                                      # Package for scientific computing


# Model Metrics
# from sklearn.metrics import mean_absolute_error         # One of many statistical measures of error


## Self-Define Functions

### Plot_stats_graph

In [2]:

TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'

train_data = pd.read_csv(TRAIN_FILE)
test_data = pd.read_csv(TEST_FILE)

# Understand the Data

In [3]:
train_data.head(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
#check for N/A
train_data.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
train_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [7]:
#### check categorical feature domain
train_data['CryoSleep'].value_counts(dropna=False)

CryoSleep
False    5439
True     3037
NaN       217
Name: count, dtype: int64

In [8]:
train_data['HomePlanet'].value_counts(dropna=False)

HomePlanet
Earth     4602
Europa    2131
Mars      1759
NaN        201
Name: count, dtype: int64

In [9]:
train_data['Cabin'].value_counts(dropna=False)

Cabin
NaN        199
G/734/S      8
C/137/S      7
B/201/P      7
G/109/P      7
          ... 
G/556/P      1
E/231/S      1
G/545/S      1
G/543/S      1
C/178/S      1
Name: count, Length: 6561, dtype: int64

In [10]:
train_data['VIP'].value_counts(dropna=False)

VIP
False    8291
NaN       203
True      199
Name: count, dtype: int64

In [11]:
train_data['Destination'].value_counts(dropna=False)

Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
NaN               182
Name: count, dtype: int64

# First appoarch, simple model
extract features from existing categorical features without producing derived features.


* Cabin --> colA/colB/colc

* drop : PassengerId, Name

### Feature Engineering

#### Cabin

In [12]:
train_data[['Cabin_A', 'Cabin_B', 'Cabin_C']] = train_data['Cabin'].str.split('/', expand=True)
train_data = train_data.drop('Cabin', axis=1)


In [13]:
train_data.head(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Cabin_A,Cabin_B,Cabin_C
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S
5,0005_01,Earth,False,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True,F,0,P
6,0006_01,Earth,False,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True,F,2,S
7,0006_02,Earth,True,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True,G,0,S
8,0007_01,Earth,False,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True,F,3,S
9,0008_01,Europa,True,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True,B,1,P


In [49]:
train_data['Cabin_B'] = train_data['Cabin_B'].astype('float')
train_data['CryoSleep'] = train_data['CryoSleep'].astype('str')
train_data['VIP'] = train_data['VIP'].astype('str')

In [50]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8693 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8693 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Name          8493 non-null   object 
 12  Transported   8693 non-null   bool   
 13  Cabin_A       8494 non-null   object 
 14  Cabin_B       8494 non-null   float64
 15  Cabin_C       8494 non-null   object 
dtypes: bool(1), float64(7), object(8)
memory usage: 1.0+ MB


# Data Preprocessing

## Training Validation Split

In [51]:
from sklearn.model_selection import train_test_split    # Splits arrays or matrices into random train and test subsets


In [52]:
from sklearn import preprocessing, linear_model, metrics, model_selection

feature = train_data.columns.drop(['PassengerId','Name','Transported'])

t_data = train_data[feature].copy()

numeric_features = t_data.select_dtypes(include='float').columns.to_list()
categorical_features = t_data.select_dtypes(exclude='float').columns.to_list()

t_labels = train_data['Transported'].copy()

X_train, X_val, y_train,y_val = model_selection.train_test_split(
    t_data
    , t_labels
    , test_size=0.2
    , random_state=2)

In [53]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6954 entries, 1475 to 7336
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    6792 non-null   object 
 1   CryoSleep     6954 non-null   object 
 2   Destination   6810 non-null   object 
 3   Age           6812 non-null   float64
 4   VIP           6954 non-null   object 
 5   RoomService   6817 non-null   float64
 6   FoodCourt     6802 non-null   float64
 7   ShoppingMall  6785 non-null   float64
 8   Spa           6815 non-null   float64
 9   VRDeck        6812 non-null   float64
 10  Cabin_A       6790 non-null   object 
 11  Cabin_B       6790 non-null   float64
 12  Cabin_C       6790 non-null   object 
dtypes: float64(7), object(6)
memory usage: 760.6+ KB


In [54]:
numeric_features

['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_B']

In [55]:
categorical_features

['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_A', 'Cabin_C']

## Create Pipeline

https://towardsdatascience.com/step-by-step-tutorial-of-sci-kit-learn-pipeline-62402d5629b6

Build a predefined data preprocess pipeline

### Loading Packages

In [56]:
# Data Preprocessing
from sklearn.compose import ColumnTransformer           # Applies transformers to columns of DataFrames
from sklearn.pipeline import Pipeline                   # Helps building a chain of transforms and estimators
from sklearn.impute import SimpleImputer                # Imputation transformer for completing missing values

# Numeric preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Categorical preprocessing
from sklearn.preprocessing import OrdinalEncoder        # Encode categorical features into ranked numerics. eg: S:1, M:2, L:3
from sklearn.preprocessing import OneHotEncoder         # Encode categorical features into 1/0

# create more features
from sklearn.preprocessing import PolynomialFeatures    # Create PolonomialFeatures

# Dimensional reduction
from sklearn.decomposition import PCA

### Setting up the pipeline

In [64]:
# setting up

numeric_transformer = Pipeline(
                              steps=[
                                       ('n_imputer', SimpleImputer(strategy='mean'))
                                       ,('scaler', MinMaxScaler())
                                       #,('poly', PolynomialFeatures(2)) #set to default
                              ])

categorical_transformer = Pipeline(
                              steps=[
                                   ('c_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                   ('onehot', OneHotEncoder())
])

#ColumnTransformer can perform different preprocessing steps due to different data types
preprocessor = ColumnTransformer(
                                 transformers=[
                                    ('numeric', numeric_transformer, numeric_features),
                                    ('categorical', categorical_transformer, categorical_features)
                                 ]
                              ) 

# Model Selection

## Comparing Base models

In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

regressors = [
   #LogisticRegression(),
   RandomForestClassifier(), 
   AdaBoostClassifier(), 
   BaggingClassifier(),
   GradientBoostingClassifier(),
   XGBClassifier()
   ]


for r in regressors:


   print(r)
   pipeline = Pipeline(steps = [
                  ('preprocessor', preprocessor)
                   ,('regressor',r)
            ])
   pipeline.fit(X_train, y_train)
   train_pre = pipeline.predict(X_train)
   val_pre = pipeline.predict(X_val)
   
   print("Training : ", metrics.accuracy_score(y_train, train_pre))
   print("Validation : ", metrics.accuracy_score(val_pre, y_val))


RandomForestClassifier()
Training :  0.999712395743457
Validation :  0.7964347326049454
AdaBoostClassifier()
Training :  0.8027034800115042
Validation :  0.7889591719378953
BaggingClassifier()
Training :  0.9811619211964338
Validation :  0.7866589994249569
GradientBoostingClassifier()
Training :  0.8229795800977855
Validation :  0.8136860264519838
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estim

## Hyper parameter tuning

In [60]:
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV        # Search over specified parameter values for an estimator
from sklearn.model_selection import KFold               # Cross-validator
from sklearn.model_selection import cross_validate      # Evaluate metrics by cross-validation
from sklearn.metrics import mean_absolute_error         # One of many statistical measures of error



*
xgb_param_grid = {'model__n_estimators': [10, 50, 100, 200, 400, 600],
              'model__max_depth': [2, 3, 5, 7, 10],
              'model__min_child_weight': [0.05, 0.10],
              'model__learning_rate': [0.01, 0.1],
              'model__lambda' : [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
              'model__gamma' : [0, 0.25, 0.5, 1.0],
              }

In [68]:
rf_param_grid = {'model__n_estimators': [10, 50, 100, 200],
              'model__max_depth': [2, 3, 5, 7, 10],
              'model__min_samples_split': [0.05, 0.10],

              }

fit_params={"model__early_stopping_rounds":10, 
            "model__eval_metric" : "mse", 
            "model__eval_set" : [[X_val, y_val]]
            }

clf = RandomForestClassifier()
# find the best parameter
kfold = KFold(shuffle=True, random_state=0)

pipeline = Pipeline(steps = [
                ('preprocessor', preprocessor)
            ,('model',clf)
        ])

# avilable scoring : print(metrics.SCORERS.keys())
grid_search = GridSearchCV(pipeline
                           , rf_param_grid
                           , scoring='accuracy'
                           , cv=kfold
                           , n_jobs=-1
                           )
#grid_search.set_params('model__early_stopping_rounds'=10)
grid_result = grid_search.fit(X_train, y_train)

In [69]:
grid_result.best_params_

{'model__max_depth': 10,
 'model__min_samples_split': 0.05,
 'model__n_estimators': 100}

In [82]:
# Define the model with default parameters
model = RandomForestClassifier(random_state=0
                     , max_depth=10
                     , min_samples_split = 0.05
                     , n_estimators= 10
                     )

# Pack preprocessing and modeling together in a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                              ])

# Preprocessing of training data, fit model 
pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
train_pre = pipeline.predict(X_train)
val_pre = pipeline.predict(X_val)

print("Training : ", metrics.accuracy_score(y_train, train_pre))
print("Validation : ", metrics.accuracy_score(val_pre, y_val))
 

Training :  0.7825711820534944
Validation :  0.7745830937320299


### Stacking

In [108]:

from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor, StackingClassifier
from sklearn.feature_selection import SelectFromModel


X_train, X_val, y_train,y_val = train_test_split(t_data, t_labels, test_size=0.2, random_state=2)


estimators = [
    ('xgb1', XGBClassifier()),
    ('rf',  RandomForestClassifier()),
    ('ada', AdaBoostClassifier()),
    ('bag', LogisticRegression())
]
clf = StackingClassifier(
    estimators=estimators,
    #final_estimator=RandomForestClassifier(random_state=0)
    final_estimator=AdaBoostClassifier(random_state=0)
)


stacked_pipeline = Pipeline([
            ('preprocessor', preprocessor)
            ,('reg',reg)
        ])

rf_model = stacked_pipeline.fit(X_train, y_train)
train_pre = rf_model.predict(X_train)
val_pre = rf_model.predict(X_val)

print("Training : ", metrics.accuracy_score(y_train, train_pre))
print("Validation : ", metrics.accuracy_score(val_pre, y_val))

Training :  0.909692263445499
Validation :  0.7935595169637722


In [102]:
param_grid = {
    #'reg__xgb__learning_rate': [0.1, 0.01],  # Learning rate
    #'reg__xgb__max_depth': [3, 5, 7],  # Maximum depth of a tree
    #'reg__xgb__n_estimators': [100, 200, 300],  # Number of trees
    #'reg__xgb__subsample': [0.8, 0.9, 1.0],  # Subsample ratio of the training instances
    #'reg__xgb__colsample_bytree': [0.8, 0.9, 1.0],  # Subsample ratio of columns when constructing each tree
    #'reg__xgb__gamma': [0, 0.1, 0.2],  # Minimum loss reduction required to make a further partition on a leaf node
    #'reg__xgb__reg_alpha': [0, 0.1, 0.5]  # L1 regularization term on weights
    #'reg__xgb__reg_lambda': [0, 0.1, 0.5]  # L2 regularization term on weights    
    'model__final_estimator__n_estimators': [10, 50, 100, 200],
    'model__final_estimator__max_depth': [2, 3, 5, 7, 10],
    'model__final_estimator__min_samples_split': [0.05, 0.10]
}
kfold = KFold(shuffle=True, random_state=0)

pipeline = Pipeline(steps = [
                ('preprocessor', preprocessor)
            ,('model',clf)
        ])

# avilable scoring : print(metrics.SCORERS.keys())
grid_search = GridSearchCV(pipeline
                           , param_grid
                           , scoring='accuracy'
                           , cv=kfold
                           , n_jobs=-1
                           )
#grid_search.set_params('model__early_stopping_rounds'=10)
grid_result = grid_search.fit(X_train, y_train)

In [103]:
grid_result.best_params_

{'model__final_estimator__max_depth': 5,
 'model__final_estimator__min_samples_split': 0.1,
 'model__final_estimator__n_estimators': 10}

In [104]:
estimators = [
    ('xgb1', XGBClassifier()),
    ('rf',  RandomForestClassifier()),
    ('ada', AdaBoostClassifier()),
    ('bag', BaggingClassifier())
]
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier(
        random_state=0
                     , max_depth=5
                     , min_samples_split = 0.1
                     , n_estimators= 10
        
    )
)


stacked_pipeline = Pipeline([
            ('preprocessor', preprocessor)
            ,('reg',reg)
        ])

rf_model = stacked_pipeline.fit(X_train, y_train)
train_pre = rf_model.predict(X_train)
val_pre = rf_model.predict(X_val)

print("Training : ", metrics.accuracy_score(y_train, train_pre))
print("Validation : ", metrics.accuracy_score(val_pre, y_val))

Training :  0.9017831463905666
Validation :  0.7987349051178838
