In [71]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval
from hyperopt.pyll import scope
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV

from automl import AutoLGB, AutoXGB, AutoCatboost
import seaborn as sns
%matplotlib inline
sns.set(rc={'figure.figsize': [9, 9]}, font_scale=1.2)
from logging import getLogger
import logger_config
logger = getLogger("main_logger")
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
df = pd.read_csv("data/train.csv")

In [13]:
df["label"] = np.where(df["Transported"] == True, 1, 0)

In [14]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,label
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,1


In [15]:
label = "label"
Y = df[label]
selected_cols = [ 
    "HomePlanet",
    "CryoSleep",
    "Destination",
    "Age",
    "VIP",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck"
]
X = df[selected_cols]

# impute missing data for train data
encoder = {}
for col in X.columns:
    if X[col].dtypes == object:
        X[col].fillna(X[col].mode()[0], inplace=True)
        encoder[col] = LabelEncoder()
        X[col] = encoder[col].fit_transform(X[col])
    else:
        X[col].fillna(X[col].median(), inplace=True)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = encoder[col].fit_transform(X[col])


In [78]:
model = AutoLGB(n_eval=500)
model.tune(X_train, Y_train)
model.fit(X_train, Y_train)

2022-09-18 20:55:32 - main_logger-INFO - [automl.py:tune:83] Starting optimization process for model LBM


100%|█████| 500/500 [02:23<00:00,  3.49trial/s, best loss: 0.18159408381265407]

2022-09-18 20:57:55 - main_logger-INFO - [automl.py:tune:89] Best params for model: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7000000000000001, 'learning_rate': 0.1926500666123323, 'max_depth': 10, 'metric': 'binary_error', 'min_child_samples': 193, 'n_estimators': 517, 'num_leaves': 15, 'objective': 'binary', 'reg_alpha': 4.3921982478537815, 'reg_lambda': 1.1336354645636761, 'subsample': 0.7000000000000001, 'verbosity': -1}
2022-09-18 20:57:55 - main_logger-INFO - [automl.py:tune:90] Time taken to run for: 2.4(mins)
2022-09-18 20:57:55 - main_logger-INFO - [automl.py:fit:98] Fitting model with input data
2022-09-18 20:57:56 - main_logger-INFO - [automl.py:fit:102] Model training accuracy: 0.8169268693508628





In [81]:
model = AutoXGB(n_eval=500)
model.tune(X_train, Y_train)
model.fit(X_train, Y_train)

2022-09-18 21:00:06 - main_logger-INFO - [automl.py:tune:156] Starting optimization process for model XgBoost Classifier


100%|█████| 500/500 [09:37<00:00,  1.16s/trial, best loss: 0.17584223500410845]

2022-09-18 21:09:44 - main_logger-INFO - [automl.py:tune:162] Best params for model: {'booster': 'gbtree', 'colsample_bytree': 0.9, 'eval_metric': 'error', 'gamma': 0.75, 'learning_rate': 0.012187650094396943, 'max_depth': 13, 'min_child_weight': 5.0, 'n_estimators': 734, 'nthread': -1, 'objective': 'binary:logistic', 'reg_alpha': 3.1445017917205225, 'reg_lambda': 1.8709604994525924, 'subsample': 0.9500000000000001, 'tree_method': 'exact'}
2022-09-18 21:09:44 - main_logger-INFO - [automl.py:tune:163] Time taken to run for: 9.6(mins)
2022-09-18 21:09:44 - main_logger-INFO - [automl.py:fit:171] Fitting model with input data





2022-09-18 21:09:46 - main_logger-INFO - [automl.py:fit:175] Model training accuracy: 0.83746918652424


In [86]:
model = AutoCatboost(n_eval=1000)
model.tune(X_train, Y_train)
model.fit(X_train, Y_train)

2022-09-18 21:11:55 - main_logger-INFO - [automl.py:tune:235] Starting optimization process for model Catboost Classifier


100%|███| 1000/1000 [02:47<00:00,  5.98trial/s, best loss: 0.17255546425636814]

2022-09-18 21:14:43 - main_logger-INFO - [automl.py:tune:241] Best params for model: {'bootstrap_type': 'Bayesian', 'depth': 4.0, 'eval_metric': 'Accuracy', 'fold_len_multiplier': 2.2254228007289374, 'l2_leaf_reg': 2.0001035180042783, 'learning_rate': 0.026935852406511772, 'max_bin': 190, 'min_data_in_leaf': 5.0, 'n_estimators': 100.0, 'od_type': 'Iter', 'od_wait': 25, 'random_strength': 0.7294738907715242, 'verbose': False}
2022-09-18 21:14:43 - main_logger-INFO - [automl.py:tune:242] Time taken to run for: 2.8(mins)





2022-09-18 21:14:43 - main_logger-INFO - [automl.py:fit:250] Fitting model with input data
2022-09-18 21:14:43 - main_logger-INFO - [automl.py:fit:254] Model training accuracy: 0.7942481511914544


In [87]:
prediction = model.predict(X_test)

In [88]:
y_pred = (model.predict(X_train) >= 0.5).astype(int)
train_acc = accuracy_score(Y_train, y_pred)
print(f"Accuracy on train set: {train_acc}")
y_pred = (model.predict(X_test) >= 0.5).astype(int)
test_acc = accuracy_score(Y_test, y_pred)
print(f"Accuracy on test set: {test_acc}")

Accuracy on train set: 0.7942481511914544
Accuracy on test set: 0.7944785276073619


In [89]:
# Inference for submission
test_df = pd.read_csv("data/test.csv")
for col in selected_cols:
    if test_df[col].dtypes != object:
        test_df[col].fillna(test_df[col].median(), inplace=True)
    else:
        test_df[col].fillna(test_df[col].mode()[0], inplace=True)
        test_df[col] = encoder[col].transform(test_df[col])

X_pred = test_df[selected_cols]
y_pred = model.predict(X_pred)
test_df["Transported"] = y_pred
test_df["Transported"] = np.where(test_df["Transported"] >= 0.5, True, False)
output_df = test_df[["PassengerId", "Transported"]]
output_df.to_csv("final_submission.csv", index=False)

In [90]:
test_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0013_01,0,1,G/3/S,2,27.0,0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,True
1,0018_01,0,0,F/4/S,2,19.0,0,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,False
2,0019_01,1,1,C/0/S,0,31.0,0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,True
3,0021_01,1,0,C/1/S,2,38.0,0,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,True
4,0023_01,0,0,F/5/S,2,20.0,0,10.0,0.0,635.0,0.0,0.0,Brence Harperez,True
