# Imports

In [1]:
import string
import sys
from collections import deque
from pathlib import Path

import pandas as pd
import numpy as np
# import altair as alt
# alt.renderers.enable('mimetype')
# alt.data_transformers.enable('data_server')

from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import warnings

from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor

from sklearn.utils import shuffle

from sklearn.naive_bayes import MultinomialNB

# import kaggle

import matplotlib.pyplot as plt

from pycaret.classification import *

# Silence FutureWarning for the rest of the session. The filter is registered
# after the imports above, so warnings raised while importing are not affected.
warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
# Load the training data and use PassengerId as the row index (chained instead
# of inplace=True so the cell is a single, re-runnable expression).
train_df = pd.read_csv("data/train.csv").set_index("PassengerId")

# Shuffle the rows with a fixed seed: without random_state the row order (and
# everything derived from it downstream) changes on every Restart & Run All.
train_df = shuffle(train_df, random_state=42)

train_df

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0304_01,Europa,False,E/18/S,TRAPPIST-1e,28.0,False,0.0,422.0,0.0,1851.0,5166.0,Betena Ausivetpul,False
2584_01,Earth,,G/417/P,PSO J318.5-22,21.0,False,0.0,0.0,,0.0,0.0,Lilace Woodwardy,False
0212_01,Earth,False,E/9/P,TRAPPIST-1e,13.0,False,192.0,0.0,0.0,51.0,739.0,Inen Workmans,False
8819_01,Earth,False,G/1428/P,TRAPPIST-1e,44.0,False,857.0,0.0,1.0,8.0,17.0,Almary Sancasey,False
9034_03,Europa,False,D/288/P,55 Cancri e,27.0,False,0.0,984.0,0.0,13995.0,312.0,Phecca Headmish,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5093_01,Europa,False,C/158/P,55 Cancri e,49.0,True,9.0,179.0,0.0,3.0,7865.0,Wasalam Cluthosexy,False
7437_01,Europa,True,B/279/S,55 Cancri e,45.0,False,0.0,0.0,0.0,0.0,0.0,Mesatan Disgul,True
7973_01,,True,C/297/S,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,0.0,0.0,Batan Drelcate,True
9077_02,Earth,True,G/1475/P,PSO J318.5-22,22.0,False,0.0,0.0,0.0,0.0,0.0,Willy Sawyerson,False


# Application of PyCaret

In [3]:
# Initialise the PyCaret classification experiment on the training frame,
# predicting the `Transported` column.
experiment = setup(
    train_df,
    target='Transported',
    normalize=True,
    use_gpu=True,
    # Pin PyCaret's seed so repeated runs are reproducible. Without it a new
    # random session_id is drawn on every run; 5816 is the value that was drawn
    # in the recorded run of this notebook.
    session_id=5816,
)

Unnamed: 0,Description,Value
0,session_id,5816
1,Target,Transported
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(8693, 13)"
5,Missing Values,True
6,Numeric Features,6
7,Categorical Features,6
8,Ordinal Features,False
9,High Cardinality Features,False


In [4]:
# Model selection: cross-validate PyCaret's candidate classifiers and keep only
# the single top-ranked estimator (n_select=1) for the following steps.
model = compare_models(
    n_select=1,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.7959,0.8801,0.8239,0.7829,0.8026,0.5916,0.5929,19.919
lightgbm,Light Gradient Boosting Machine,0.7943,0.8741,0.8193,0.7829,0.8004,0.5883,0.5894,3.739
rf,Random Forest Classifier,0.7926,0.8627,0.7723,0.8084,0.7896,0.5853,0.5864,6.347
gbc,Gradient Boosting Classifier,0.7918,0.8759,0.8255,0.7762,0.7997,0.5833,0.5851,40.085
xgboost,Extreme Gradient Boosting,0.787,0.8699,0.8024,0.7813,0.7914,0.5739,0.5746,22.032
lr,Logistic Regression,0.7849,0.8729,0.7857,0.7875,0.7864,0.5698,0.5701,5.205
ada,Ada Boost Classifier,0.7828,0.8677,0.7883,0.7831,0.7853,0.5655,0.5661,12.249
svm,SVM - Linear Kernel,0.7821,0.0,0.7796,0.7882,0.7823,0.5642,0.5666,4.232
et,Extra Trees Classifier,0.7758,0.8532,0.7035,0.8267,0.7596,0.5522,0.5591,10.4
dt,Decision Tree Classifier,0.7734,0.7733,0.7877,0.7693,0.7779,0.5466,0.5475,1.147


In [5]:
# Model optimization: hyperparameter search on the selected estimator.
# choose_better=True makes tune_model hand back the original estimator when the
# search does not improve the cross-validated score, so `tuned_model` is never
# worse than `model`.
tuned_model = tune_model(model, choose_better=True)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8128,0.8862,0.8371,0.8006,0.8185,0.6254,0.6261
1,0.8374,0.8978,0.8502,0.8312,0.8406,0.6748,0.675
2,0.7734,0.8566,0.8143,0.7553,0.7837,0.5465,0.5482
3,0.7734,0.8655,0.8208,0.7522,0.785,0.5464,0.5488
4,0.78,0.8598,0.7883,0.7781,0.7832,0.5599,0.5599
5,0.8043,0.8957,0.8274,0.7938,0.8102,0.6083,0.6089
6,0.7911,0.8699,0.8301,0.772,0.8,0.582,0.5837
7,0.7895,0.8749,0.7843,0.7947,0.7895,0.579,0.579
8,0.8043,0.8878,0.8203,0.7968,0.8084,0.6085,0.6087
9,0.8026,0.8916,0.8725,0.7672,0.8165,0.6049,0.6108


In [6]:
# Finalize: hand the tuned estimator to PyCaret's finalize_model and keep the
# returned object for scoring the competition test set.
finalized_model = finalize_model(estimator=tuned_model)

# Test Set Preparation

In [7]:
# Read the unlabeled test passengers; PassengerId stays a regular column here
# because it is needed later when building the submission file.
X_test = pd.read_csv(filepath_or_buffer="data/test.csv")

X_test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [8]:
# Empty placeholder; `prediction` is rebound to predict_model's result in the next cell.
prediction = dict()

In [9]:
# Score the test passengers with the finalized model.
prediction = predict_model(
    estimator=finalized_model,
    data=X_test,
)

In [10]:
# Keep only the "Label" column of the scored frame; it is what gets written out
# as `Transported` in the submission.
predictions = prediction.loc[:, "Label"]

In [11]:
# Build the submission frame (PassengerId + predicted Transported) as a NEW
# object. The previous `temp_df = X_test` was only an alias, so adding the
# column silently modified X_test as well. `.assign` aligns `predictions` on
# the frame's index exactly like column assignment does.
temp_df = X_test.loc[:, ["PassengerId"]].assign(Transported=predictions)

# to_csv does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
submission_path = Path("results_pycaret/spaceship_titanic_imputation_221014_pycaret_full.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
temp_df.to_csv(submission_path, encoding='utf-8', index=False)

In [12]:
# Display the submission frame (PassengerId, Transported) built in the previous cell.
temp_df

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
