<a href="https://colab.research.google.com/github/josefstrnad/02_Spaceship_Titanic_Final/blob/main/Spaceship_Titanic_Final_GIT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Description

In [None]:
'''
File and Data Field Descriptions
train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
Destination - The planet the passenger will be debarking to.
Age - The age of the passenger.
VIP - Whether the passenger has paid for special VIP service during the voyage.
RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
Name - The first and last names of the passenger.
Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.
test.csv - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.
sample_submission.csv - A submission file in the correct format.
PassengerId - Id for each passenger in the test set.
Transported - The target. For each passenger, predict either True or False. 

Train: 
All except PassengerId, Name, Transported

Target:
Transported'''

# 1. Importing Modules (Libraries) and loading data

In [None]:
import pandas as pd
import numpy as np

In [None]:
#!git clone https://github.com/josefstrnad/02_Spaceship_Titanic_Final

In [None]:
train_data = pd.read_csv(f"/content/02_Spaceship_Titanic_Final/train.csv")
test_data = pd.read_csv(f"/content/02_Spaceship_Titanic_Final/test.csv")
sub_df = pd.read_csv(f"/content/02_Spaceship_Titanic_Final/sample_submission.csv")
df = pd.concat([train_data, test_data])

In [None]:
train_data.shape

In [None]:
test_data.shape

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
train_data.info()

In [None]:
test_data.info()

In [None]:
df.info()

In [None]:
df.Transported.value_counts()

In [None]:
import seaborn as sns
sns.countplot(df.Transported)

# Imputation

In [None]:
df.head()

In [None]:
df.isna().sum()

In [None]:
df=df.replace(to_replace=False,value=0)
df=df.replace(to_replace=True,value=1)

In [None]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(missing_values=np.nan,strategy='most_frequent')
imputer=imputer.fit(df.iloc[:,1:3])
df.iloc[:,1:3]=imputer.transform(df.iloc[:,1:3])   

imputer=SimpleImputer(missing_values=np.nan,strategy='most_frequent')
imputer=imputer.fit(df.iloc[:,4:5])
df.iloc[:,4:5]=imputer.transform(df.iloc[:,4:5])

imputer=SimpleImputer(missing_values=np.nan,strategy='mean')
imputer=imputer.fit(df.iloc[:,5:6])
df.iloc[:,5:6]=imputer.transform(df.iloc[:,5:6])

imputer=SimpleImputer(missing_values=np.nan,strategy='most_frequent')
imputer=imputer.fit(df.iloc[:,6:7])
df.iloc[:,6:7]=imputer.transform(df.iloc[:,6:7])  

imputer=SimpleImputer(missing_values=np.nan,strategy='mean')
imputer=imputer.fit(df.iloc[:,7:12])
df.iloc[:,7:12]=imputer.transform(df.iloc[:,7:12])

# Data Pre-processing

In [None]:
# Feature Engineering (information extract) from Cabin
# Dividing Cabin column
df['Deck'] = df['Cabin'].str[0]
df['Side'] = df['Cabin'].str[-1]
df.drop(['Cabin'], axis=1, inplace=True)

In [None]:
imputer=SimpleImputer(missing_values=np.nan,strategy='most_frequent')
imputer=imputer.fit(df.iloc[:,13:15])
df.iloc[:,13:15]=imputer.transform(df.iloc[:,13:15]) 

In [None]:
df.head()

In [None]:
# Correlation matrix

In [None]:
features_to_drop = ['PassengerId', 'HomePlanet', 'Destination', 'Name', 'Transported'] 
df_CM = df.drop(features_to_drop, axis=1)
df_CM.shape

In [None]:
df_CM.corr(method='pearson')

In [None]:
corr = df_CM.corr()
sns.heatmap(corr, annot=True, fmt=".2f", linewidth=.5)

In [None]:
dummies = pd.get_dummies(df, columns=['HomePlanet', 'Destination',	'Deck', 'Side'], drop_first=False)  

In [None]:
features_to_drop = ['PassengerId',	'HomePlanet',	'CryoSleep',	'Deck', 'Side',	'Destination',	'Age',	'VIP',	'RoomService',	'FoodCourt',	'ShoppingMall',	'Spa',	'VRDeck',	'Name',	'Transported']
df = df.drop(features_to_drop, axis=1)
df.shape

In [None]:
df_new = pd.concat([df, dummies], axis=1)

In [None]:
df_new.head()

In [None]:
features_to_drop = ['PassengerId','Name'] 
df_new = df_new.drop(features_to_drop, axis=1)
df_new.shape

In [None]:
sns.countplot(df_new.CryoSleep)

In [None]:
df_new.head()

# Preparing Train and Test dataset

In [None]:
train = df_new.iloc[:8693]
train.shape

In [None]:
X = train.drop(['Transported'], axis=1)
y = train.Transported

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)   

In [None]:
from sklearn.preprocessing import StandardScaler

standardscaler = StandardScaler()
X_train = standardscaler.fit_transform(X_train)
X_test = standardscaler.transform(X_test)  

In [None]:
# Select categorical predictors
cat_features = df_new.select_dtypes(include='object').columns.to_list()

In [None]:
df_new.info()

# Modelling - all algos together

In [None]:
!pip install xgboost

In [None]:
!pip install lightgbm

In [None]:
!pip install catboost

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

model = [
    DecisionTreeClassifier(max_depth=3, criterion='entropy'),
    RandomForestClassifier(n_estimators=3, criterion='entropy'),
    AdaBoostClassifier(n_estimators=3),     
    GradientBoostingClassifier(n_estimators=3, max_depth=3),
    XGBClassifier(n_estimators=3, max_depth=3),
    CatBoostClassifier(depth=6, silent=True)
]

In [100]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score

#Do some preperation for the loop
col = []
algorithms = pd.DataFrame(columns = col)
idx = 0

#Train and score algorithms
for a in model:
    
    a.fit(X_train, y_train)
    pred = a.predict(X_test)
    acc_train = accuracy_score(y_train, a.predict(X_train)) 
    acc_test = accuracy_score(y_test, pred) 
    f1 = f1_score(y_test, pred)
    cv = cross_val_score(a, X_test, y_test).mean()
    auc = roc_auc_score(y_test, pred)
    
    Alg = a.__class__.__name__
    
    algorithms.loc[idx, 'Algorithm'] = Alg
    algorithms.loc[idx, 'Accuracy [train]'] = round(acc_train * 100, 2)
    algorithms.loc[idx, 'Accuracy [test]'] = round(acc_test * 100, 2)
    algorithms.loc[idx, 'F1 Score'] = round(f1 * 100, 2)
    algorithms.loc[idx, 'CV Score'] = round(cv * 100, 2)
    algorithms.loc[idx, 'AUC Score'] = round(auc * 100, 2)

    idx+=1

In [101]:
algorithms.sort_values(by='AUC Score', ascending=False)

Unnamed: 0,Algorithm,Accuracy [train],Accuracy [test],F1 Score,CV Score,AUC Score
5,CatBoostClassifier,85.76,81.83,82.21,80.05,81.82
1,RandomForestClassifier,91.24,76.65,76.48,74.41,76.66
3,GradientBoostingClassifier,75.93,74.7,72.84,75.05,74.75
4,XGBClassifier,75.93,74.7,72.84,75.1,74.75
0,DecisionTreeClassifier,71.96,71.31,66.22,69.12,71.42
2,AdaBoostClassifier,70.75,68.32,69.1,72.0,68.3


# Hyper-parameter tunning

In [102]:
#Catboost

In [103]:
from sklearn.model_selection import GridSearchCV

param_grid = {
            'iterations':[100,500,1000],
            'learning_rate':[0.01,0.1,0.5],
            'depth':[3,6,10]
            }      

In [104]:
grid = GridSearchCV(estimator=CatBoostClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_test, y_test), verbose=False)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


KeyboardInterrupt: ignored

In [None]:
grid.best_params_

In [None]:
#{'depth': 6, 'iterations': 1000, 'learning_rate': 0.01}

In [None]:
grid.best_score_

In [None]:
#AdaBoostClassifier

In [None]:
param_grid = {
            'n_estimators':[10, 100, 1000],
            'learning_rate':[0.01, 0.1, 1.0]
            }    

In [None]:
grid = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

In [None]:
grid.best_params_

In [None]:
grid.best_score_

In [None]:
#GradientBoostingClassifier

In [None]:
param_grid = {
            'n_estimators':[10, 100, 1000],
            'max_depth':[2, 6, 8, 10, 12, 20]
            }  

In [None]:
grid = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

In [None]:
grid.best_params_

In [None]:
grid.best_score_

# Submission

In [107]:
# Save predictions
target = pd.Series(pred, name='target')
target.to_csv('sample_submission.csv', index=False, header=True)

In [106]:
# Save model
import joblib
joblib.dump(grid, filename='model.pkl')

['model.pkl']