In [20]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
from hyperopt.pyll import scope
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV

from automl import AutoLGB, AutoXGB, AutoCatboost
import seaborn as sns
%matplotlib inline
sns.set(rc={'figure.figsize': [9, 9]}, font_scale=1.2)
from logging import getLogger
import logger_config
logger = getLogger("main_logger")
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
df = pd.read_csv("data/train.csv")

In [9]:
df["label"] = np.where(df["Transported"] == True, 1, 0)

In [10]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,label
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,1


In [11]:
label = "label"
Y = df[label]
selected_cols = [ 
    "HomePlanet",
    "CryoSleep",
    "Destination",
    "Age",
    "VIP",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck"
]
X = df[selected_cols]

# impute missing data for train data
encoder = {}
for col in X.columns:
    if X[col].dtypes == object:
        X[col].fillna(X[col].mode()[0], inplace=True)
        encoder[col] = LabelEncoder()
        X[col] = encoder[col].fit_transform(X[col])
    else:
        X[col].fillna(X[col].median(), inplace=True)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = encoder[col].fit_transform(X[col])


In [12]:
model = AutoLGB()
model.tune(X_train, Y_train)
model.fit(X_train, Y_train)

2022-09-16 16:59:52 - main_logger-INFO - [automl.py:tune:83] Starting optimization process for model LBM


100%|█████████| 10/10 [00:09<00:00,  1.08trial/s, best loss: 0.2077239112571898]

2022-09-16 17:00:02 - main_logger-INFO - [automl.py:tune:89] Best params for model: {'boosting_type': 'gbdt', 'colsample_bytree': 0.5, 'feature_pre_filter': False, 'learning_rate': 0.09623370933913766, 'max_bin': 240, 'max_depth': 4, 'metric': 'binary_error', 'min_child_samples': 118, 'n_estimators': 220, 'n_jobs': -1, 'num_leaves': 983, 'objective': 'binary', 'reg_alpha': 2.6163847925432866, 'reg_lambda': 0.1776854615018797, 'seed': 1, 'subsample': 0.6000000000000001, 'subsample_freq': 1, 'verbosity': -1}
2022-09-16 17:00:02 - main_logger-INFO - [automl.py:tune:90] Time taken to run for: 0.2(mins)
2022-09-16 17:00:02 - main_logger-INFO - [automl.py:fit:98] Fitting model with input data
2022-09-16 17:00:02 - main_logger-INFO - [automl.py:fit:102] Model training accuracy: 0.8139687756778965





In [15]:
model = AutoXGB()
model.tune(X_train, Y_train)
model.fit(X_train, Y_train)

2022-09-16 17:02:20 - main_logger-INFO - [automl.py:tune:155] Starting optimization process for model XgBoost Classifier


100%|████████| 10/10 [00:32<00:00,  3.29s/trial, best loss: 0.20723089564502872]

2022-09-16 17:02:53 - main_logger-INFO - [automl.py:tune:161] Best params for model: {'booster': 'gbtree', 'colsample_bytree': 0.9, 'eval_metric': 'error', 'gamma': 1.0, 'learning_rate': 0.006633209827570888, 'max_depth': 21, 'min_child_weight': 1.0, 'n_estimators': 527, 'nthread': -1, 'objective': 'binary:logistic', 'reg_alpha': 3.6567722778714224, 'reg_lambda': 3.2329291832316756, 'subsample': 0.9500000000000001, 'tree_method': 'exact'}
2022-09-16 17:02:53 - main_logger-INFO - [automl.py:tune:162] Time taken to run for: 0.5(mins)
2022-09-16 17:02:53 - main_logger-INFO - [automl.py:fit:170] Fitting model with input data





2022-09-16 17:02:55 - main_logger-INFO - [automl.py:fit:174] Model training accuracy: 0.8271158586688578


In [21]:
model = AutoCatboost()
model.tune(X_train, Y_train)
model.fit(X_train, Y_train)

2022-09-16 17:09:37 - main_logger-INFO - [automl.py:tune:233] Starting optimization process for model Catboost Classifier


  0%|                                    | 0/10 [00:00<?, ?trial/s, best loss=?]

job exception: name 'CatBoostClassifier' is not defined



  0%|                                    | 0/10 [00:00<?, ?trial/s, best loss=?]


NameError: name 'CatBoostClassifier' is not defined

In [16]:
prediction = model.predict(X_test)

In [17]:
y_pred = (model.predict(X_train) >= 0.5).astype(int)
train_acc = accuracy_score(Y_train, y_pred)
print(f"Accuracy on train set: {train_acc}")
y_pred = (model.predict(X_test) >= 0.5).astype(int)
test_acc = accuracy_score(Y_test, y_pred)
print(f"Accuracy on test set: {test_acc}")

Accuracy on train set: 0.8271158586688578
Accuracy on test set: 0.7956288343558282


In [9]:
# Inference for submission
test_df = pd.read_csv("data/test.csv")
for col in selected_cols:
    if test_df[col].dtypes != object:
        test_df[col].fillna(test_df[col].median(), inplace=True)
    else:
        test_df[col].fillna(test_df[col].mode()[0], inplace=True)
        test_df[col] = encoder[col].transform(test_df[col])

X_pred = test_df[selected_cols]
y_pred = model.predict(X_pred)
test_df["Transported"] = y_pred
test_df["Transported"] = np.where(test_df["Transported"] >= 0.5, True, False)
output_df = test_df[["PassengerId", "Transported"]]
output_df.to_csv("final_submission.csv", index=False)

In [10]:
test_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0013_01,0,1,G/3/S,2,27.0,0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,True
1,0018_01,0,0,F/4/S,2,19.0,0,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,False
2,0019_01,1,1,C/0/S,0,31.0,0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,True
3,0021_01,1,0,C/1/S,2,38.0,0,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,True
4,0023_01,0,0,F/5/S,2,20.0,0,10.0,0.0,635.0,0.0,0.0,Brence Harperez,True
