In [4]:
import pandas as pd
import joblib

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.svm import SVC

# Additional imports
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score , confusion_matrix , classification_report
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
    
import warnings
warnings.simplefilter(action='ignore')

In [5]:
data = pd.read_csv("../data/raw/customer_data/ecomm-data.csv")
data.drop('CustomerID' , axis = 1 , inplace = True)


In [6]:
round((data.isnull().sum()*100 / data.shape[0]),2)

Churn                          0.00
Tenure                         4.69
PreferredLoginDevice           0.00
CityTier                       0.00
WarehouseToHome                4.46
PreferredPaymentMode           0.00
Gender                         0.00
HourSpendOnApp                 4.53
NumberOfDeviceRegistered       0.00
PreferedOrderCat               0.00
SatisfactionScore              0.00
MaritalStatus                  0.00
NumberOfAddress                0.00
Complain                       0.00
OrderAmountHikeFromlastYear    4.71
CouponUsed                     4.55
OrderCount                     4.58
DaySinceLastOrder              5.45
CashbackAmount                 0.00
dtype: float64

In [7]:
# Function to fit and save encoders and imputer during training
def fit_and_save_preprocessors(train_data, categorical_columns, impute_strategy, scaler_type):
    encoder_dict = {}

    for col in categorical_columns:
        encoder = LabelEncoder()
        encoder.fit(train_data[col].astype(str))  # Ensure categorical is a string
        encoder_dict[col] = encoder
        train_data[col] = encoder.transform(train_data[col])

    imputer = SimpleImputer(strategy=impute_strategy)
    if scaler_type == 'MinMax':
        scaler = MinMaxScaler()
    elif scaler_type == 'Standard':
        scaler = StandardScaler()
    elif scaler_type == 'Robust':
        scaler = RobustScaler()
    else:
        raise ValueError("Invalid scaler_type. Choose from 'MinMax', 'Standard', or 'Robust'.")

    
    for col in train_data.columns:
        if col not in categorical_columns:
            train_data[col] = imputer.fit_transform(train_data[[col]])

    joblib.dump(encoder_dict, 'encoders.pkl')
    joblib.dump(imputer, 'imputer.pkl')

    # Apply scaler to the entire DataFrame and convert it back to a DataFrame
    train_data_scaled = pd.DataFrame(scaler.fit_transform(train_data), columns=train_data.columns)

    joblib.dump(scaler, 'scaler.pkl')


    return train_data_scaled

# Function to apply encoders and imputer during testing
def apply_preprocessors(test_data):
    loaded_encoders = joblib.load('encoders.pkl')
    loaded_imputer = joblib.load('imputer.pkl')
    loaded_scaler = joblib.load('scaler.pkl')

    for col in test_data.columns:
        if col in loaded_encoders:
            encoder = loaded_encoders[col]
            test_data[col] = encoder.transform(test_data[col])
        elif col in loaded_imputer:
            test_data[col] = loaded_imputer[col].transform(test_data[[col]])
        test_data[col] = loaded_scaler.transform(test_data[col])
    
    return test_data

# Detect outliers and remove for training data 
def handle_outliers(df , column_name):
  Q1 = df[column_name].quantile(0.25)
  Q3 = df[column_name].quantile(0.75)
  IQR = Q3 - Q1

  # Define Upper and lower boundaries
  Upper = Q3 + IQR * 1.5
  lower = Q1 - IQR * 1.5

  # lets make filter for col values
  new_df = df[ (df[column_name] > lower) & (df[column_name] < Upper) ]

  return new_df

In [8]:
round((data.isnull().sum()*100 / data.shape[0]),2)

Churn                          0.00
Tenure                         4.69
PreferredLoginDevice           0.00
CityTier                       0.00
WarehouseToHome                4.46
PreferredPaymentMode           0.00
Gender                         0.00
HourSpendOnApp                 4.53
NumberOfDeviceRegistered       0.00
PreferedOrderCat               0.00
SatisfactionScore              0.00
MaritalStatus                  0.00
NumberOfAddress                0.00
Complain                       0.00
OrderAmountHikeFromlastYear    4.71
CouponUsed                     4.55
OrderCount                     4.58
DaySinceLastOrder              5.45
CashbackAmount                 0.00
dtype: float64

In [9]:
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
data_processed = fit_and_save_preprocessors(data, categorical_columns=categorical_cols, impute_strategy="median", scaler_type="MinMax")

# lets Give our Functions columns contains outlier
cols_outliers = ['Tenure' , 'WarehouseToHome' , 'NumberOfAddress' , 'DaySinceLastOrder' , 'HourSpendOnApp' , 'NumberOfDeviceRegistered']
for col in cols_outliers:
    data_without_outliers = handle_outliers(data_processed , col)

In [10]:
def apply_data_balancing_technique(df):
    X = df.drop('Churn' , axis = 1)
    Y = df['Churn']

    smt = SMOTETomek(random_state=42)
    x_over , y_over = smt.fit_resample(X , Y)
    return x_over, y_over

In [11]:
X_balanced, y_balanced = apply_data_balancing_technique(data)

In [12]:
x_train , x_test , y_train , y_test = train_test_split(X_balanced , y_balanced , test_size = 0.30 , random_state = 42)

In [13]:
logisreg_clf = LogisticRegression()
svm_clf = SVC()
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
XGB_clf = XGBClassifier()
ada_clf = AdaBoostClassifier()

In [14]:
clf_list = [logisreg_clf, svm_clf, dt_clf, rf_clf, XGB_clf, ada_clf]
clf_name_list = ['Logistic Regression', 'Support Vector Machine', 'Decision Tree', 'Random Forest', 'XGBClassifier' , 'AdaBoostClassifier']

for clf in clf_list:
    clf.fit(x_train,y_train)

KeyboardInterrupt: 

In [None]:
train_acc_list = []
test_acc_list = []

for clf,name in zip(clf_list,clf_name_list):
    y_pred_train = clf.predict(x_train)
    y_pred_test = clf.predict(x_test)
    print(f'Using model: {name}')
    print(f'Trainning Score: {clf.score(x_train, y_train)}')
    print(f'Test Score: {clf.score(x_test, y_test)}')
    print(f'f1-score Train: {f1_score(y_train, y_pred_train)}')
    print(f'f1-score Test: {f1_score(y_test, y_pred_test)}')
    train_acc_list.append(accuracy_score(y_train, y_pred_train))
    test_acc_list.append(accuracy_score(y_test, y_pred_test))
    print(' ' * 60)
    print('*' * 60)
    print(' ' * 60)


Using model: Logistic Regression
Trainning Score: 0.7863899908452853
Test Score: 0.7893238434163701
f1-score Train: 0.7898528970279196
f1-score Test: 0.7943015983321752
                                                            
************************************************************
                                                            
Using model: Support Vector Machine
Trainning Score: 0.7703692401586817
Test Score: 0.7640569395017793
f1-score Train: 0.7841055802610816
f1-score Test: 0.7795144662454273
                                                            
************************************************************
                                                            
Using model: Decision Tree
Trainning Score: 1.0
Test Score: 0.9615658362989323
f1-score Train: 1.0
f1-score Test: 0.9617834394904459
                                                            
************************************************************
                                      

In [None]:
import mlflow


mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

2023/10/06 09:37:57 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/hiten/personal-projects/github/customer-churn-prediction/customer_churn_prediction/notebooks/mlruns/1', creation_time=1696577877827, experiment_id='1', last_update_time=1696577877827, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [None]:
def read_dataframe(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [None]:
df_train = read_dataframe('../../../../MLOpsEngineer/mlops-zoomcamp/data/yellow_tripdata_2023-05.parquet')
df_val = read_dataframe('../../../../MLOpsEngineer/mlops-zoomcamp//data/yellow_tripdata_2023-06.parquet')

In [None]:

from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso


In [None]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

with mlflow.start_run():

    mlflow.set_tag("developer", "Hitendra")

    mlflow.log_param("train-data-path", "../data/yellow_tripdata_2023-05.parquet")
    mlflow.log_param("valid-data-path", "../data/yellow_tripdata_2023-06.parquet")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    mlflow.log_artifact(local_path="../models/lin_reg.bin", artifact_path="models_pickle")

In [1]:
import pandas as pd
import joblib

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.svm import SVC

# Additional imports
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score , confusion_matrix , classification_report
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
    
import warnings
warnings.simplefilter(action='ignore')

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
from imblearn.combine import SMOTETomek
import joblib
import mlflow
import mlflow.sklearn
import logging

# Set up logging configuration
logging.basicConfig(level=logging.INFO)

mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Set MLflow experiment name
mlflow.set_experiment("customer_churn_model")



# Function to fit and save preprocessors during training
def fit_and_save_preprocessors(train_data, impute_strategy, scaler_type):
    logging.info(f"Starting preprocessing for impute strategy: {impute_strategy}, scaler type: {scaler_type}")

    # Categorical columns in the dataset (if any)
    categorical_columns = data.select_dtypes(include=['object']).columns.tolist()

    encoder_dict = {}
    imputer = SimpleImputer(strategy=impute_strategy)

    for col in categorical_columns:
        encoder = LabelEncoder()
        encoder.fit(train_data[col].astype(str))  # Ensure categorical is a string
        encoder_dict[col] = encoder
        train_data[col] = encoder.fit_transform(train_data[col])

    for col in train_data.columns:
        if col not in categorical_columns:
            train_data[col] = imputer.fit_transform(train_data[[col]])

    if scaler_type == 'MinMax':
        scaler = MinMaxScaler()
    elif scaler_type == 'Standard':
        scaler = StandardScaler()
    elif scaler_type == 'Robust':
        scaler = RobustScaler()
    else:
        raise ValueError("Invalid scaler_type. Choose from 'MinMax', 'Standard', or 'Robust'.")

    # Separate the target variable "Churn" from the features
    X_train = train_data.drop(columns=['Churn'])
    y_train = train_data['Churn']

    # Apply the scaler to the feature columns only
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

    # Combine the scaled features with the target variable
    train_data_scaled = pd.concat([X_train_scaled, y_train], axis=1)

    joblib.dump(encoder_dict, 'encoders.pkl')
    joblib.dump(imputer, 'imputer.pkl')
    joblib.dump(scaler, 'scaler.pkl')

    return train_data_scaled

# Function to apply encoders and imputer during testing
def apply_preprocessors(test_data):
    loaded_encoders = joblib.load('encoders.pkl')
    loaded_imputer = joblib.load('imputer.pkl')
    loaded_scaler = joblib.load('scaler.pkl')

    for col in test_data.columns:
        if col in loaded_encoders:
            encoder = loaded_encoders[col]
            test_data[col] = encoder.transform(test_data[col])
        elif col in loaded_imputer:
            test_data[col] = loaded_imputer[col].transform(test_data[[col]])

    test_data_scaled = pd.DataFrame(loaded_scaler.transform(test_data), columns=test_data.columns)

    return test_data_scaled

# Function to handle outliers for training data
def handle_outliers(df, column_name):
    Q1 = df[column_name].quantile(0.25)
    Q3 = df[column_name].quantile(0.75)
    IQR = Q3 - Q1
    Upper = Q3 + IQR * 1.5
    lower = Q1 - IQR * 1.5
    new_df = df[(df[column_name] > lower) & (df[column_name] < Upper)]
    return new_df

# Function to apply data balancing technique
def apply_data_balancing_technique(df):
    X = df.drop('Churn', axis=1)
    Y = df['Churn']
    smt = SMOTETomek(random_state=42)
    x_over, y_over = smt.fit_resample(X, Y)
    return x_over, y_over

# Create a list of classifiers
def get_classifiers():
    logisreg_clf = LogisticRegression()
    svm_clf = SVC()
    dt_clf = DecisionTreeClassifier()
    rf_clf = RandomForestClassifier()
    XGB_clf = XGBClassifier()
    ada_clf = AdaBoostClassifier()
    return [logisreg_clf, svm_clf, dt_clf, rf_clf, XGB_clf, ada_clf]

# Main function to run the ML pipeline
def run_ml_pipeline(data, classifier, classifier_name, impute_strategy, scaler_type):

    logging.info(f"Running ML pipeline for model: {classifier_name}, impute strategy: {impute_strategy}, scaler type: {scaler_type}")

    # Fit and save preprocessors
    data_processed = fit_and_save_preprocessors(data, impute_strategy, scaler_type)

    # Handle outliers
    cols_outliers = ['Tenure', 'WarehouseToHome', 'NumberOfAddress', 'DaySinceLastOrder', 'HourSpendOnApp', 'NumberOfDeviceRegistered']
    for col in cols_outliers:
        data_processed = handle_outliers(data_processed, col)

    # Apply data balancing technique
    X_balanced, y_balanced = apply_data_balancing_technique(data_processed)

    # Split the data into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.30, random_state=42)

    with mlflow.start_run() as run:
        classifier.fit(x_train, y_train)
        y_pred_train = classifier.predict(x_train)
        y_pred_test = classifier.predict(x_test)

        mlflow.log_params({'Impute Strategy': impute_strategy, 'Scaler Type': scaler_type, "Model":classifier_name})
        mlflow.log_metric('Training Accuracy', accuracy_score(y_train, y_pred_train))
        mlflow.log_metric('Test Accuracy', accuracy_score(y_test, y_pred_test))
        mlflow.log_metric('Training F1-Score', f1_score(y_train, y_pred_train))
        mlflow.log_metric('Test F1-Score', f1_score(y_test, y_pred_test))

        mlflow.sklearn.log_model(clf, name)

# Sample data
data = pd.read_csv("../data/raw/customer_data/ecomm-data.csv")
del data['CustomerID']

# List of impute strategies and scaler types to try
impute_strategies = ["mean", "median"]
scaler_types = ["MinMax", "Standard", "Robust"]

# Create a list of classifiers
classifiers = get_classifiers()
clf_name_list = ['Logistic Regression', 'Support Vector Machine', 'Decision Tree', 'Random Forest', 'XGBClassifier', 'AdaBoostClassifier']

# Run the ML pipeline for different combinations of impute strategies and scaler types
for clf, name in zip(classifiers, clf_name_list):
    for impute_strategy in impute_strategies:
        for scaler_type in scaler_types:
            run_ml_pipeline(data.copy(), classifier=clf, classifier_name=name, impute_strategy=impute_strategy, scaler_type=scaler_type)


2023/10/06 10:13:15 INFO mlflow.tracking.fluent: Experiment with name 'customer_churn_model' does not exist. Creating a new experiment.
INFO:root:Running ML pipeline for model: Logistic Regression, impute strategy: mean, scaler type: MinMax
INFO:root:Starting preprocessing for impute strategy: mean, scaler type: MinMax
INFO:root:Running ML pipeline for model: Logistic Regression, impute strategy: mean, scaler type: Standard
INFO:root:Starting preprocessing for impute strategy: mean, scaler type: Standard
INFO:root:Running ML pipeline for model: Logistic Regression, impute strategy: mean, scaler type: Robust
INFO:root:Starting preprocessing for impute strategy: mean, scaler type: Robust
INFO:root:Running ML pipeline for model: Logistic Regression, impute strategy: median, scaler type: MinMax
INFO:root:Starting preprocessing for impute strategy: median, scaler type: MinMax
INFO:root:Running ML pipeline for model: Logistic Regression, impute strategy: median, scaler type: Standard
INFO:ro

In [5]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Set MLflow experiment name
mlflow.set_experiment("customer_churn_xgboost_model_hyperparameter_tuning")


2023/10/06 10:59:16 INFO mlflow.tracking.fluent: Experiment with name 'customer_churn_xgboost_model_hyperparameter_tuning' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/hiten/personal-projects/github/customer-churn-prediction/customer_churn_prediction/notebooks/mlruns/5', creation_time=1696582756621, experiment_id='5', last_update_time=1696582756621, lifecycle_stage='active', name='customer_churn_xgboost_model_hyperparameter_tuning', tags={}>

In [8]:
from hyperopt import hp, tpe, fmin
from hyperopt.pyll import scope

# Define the search space for hyperparameters
space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 200, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -4, 0),
    'min_child_weight': scope.int(hp.quniform('min_child_weight', 1, 10, 1)),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'objective': 'reg:linear',
    'seed': 42
}

# Objective function to optimize
def objective(params):
    with mlflow.start_run():
        mlflow.log_params(params)
        model = XGBClassifier(**params)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        f1 = f1_score(y_test, y_pred)
        mlflow.log_metric('f1', f1)
        return -f1  # Hyperopt minimizes the objective function, so we use negative f1-score

# Fit and save preprocessors
data_processed = fit_and_save_preprocessors(data, impute_strategy, scaler_type)

# Handle outliers
cols_outliers = ['Tenure', 'WarehouseToHome', 'NumberOfAddress', 'DaySinceLastOrder', 'HourSpendOnApp', 'NumberOfDeviceRegistered']
for col in cols_outliers:
    data_processed = handle_outliers(data_processed, col)

# Apply data balancing technique
X_balanced, y_balanced = apply_data_balancing_technique(data_processed)

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.30, random_state=42)

# Use Hyperopt to search for the best hyperparameters
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50)

# Print the best hyperparameters
print("Best Hyperparameters:", best)

INFO:root:Starting preprocessing for impute strategy: median, scaler type: Robust


  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.001536 seconds
INFO:hyperopt.tpe:TPE using 0 trials


  2%|▏         | 1/50 [00:00<00:40,  1.20trial/s, best loss: -0.9812476081132798]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002361 seconds
INFO:hyperopt.tpe:TPE using 1/1 trials with best loss -0.981248


  4%|▍         | 2/50 [00:01<00:29,  1.65trial/s, best loss: -0.9812476081132798]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002103 seconds
INFO:hyperopt.tpe:TPE using 2/2 trials with best loss -0.981248


  6%|▌         | 3/50 [00:02<00:50,  1.07s/trial, best loss: -0.98468606431853]  

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002869 seconds
INFO:hyperopt.tpe:TPE using 3/3 trials with best loss -0.984686


  8%|▊         | 4/50 [00:03<00:39,  1.17trial/s, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002390 seconds
INFO:hyperopt.tpe:TPE using 4/4 trials with best loss -0.988109


 10%|█         | 5/50 [00:04<00:49,  1.09s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002140 seconds
INFO:hyperopt.tpe:TPE using 5/5 trials with best loss -0.988109


 12%|█▏        | 6/50 [00:06<00:52,  1.18s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002332 seconds
INFO:hyperopt.tpe:TPE using 6/6 trials with best loss -0.988109


 14%|█▍        | 7/50 [00:08<01:01,  1.43s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002697 seconds
INFO:hyperopt.tpe:TPE using 7/7 trials with best loss -0.988109


 16%|█▌        | 8/50 [00:09<00:55,  1.32s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003359 seconds
INFO:hyperopt.tpe:TPE using 8/8 trials with best loss -0.988109


 18%|█▊        | 9/50 [00:12<01:13,  1.79s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002847 seconds
INFO:hyperopt.tpe:TPE using 9/9 trials with best loss -0.988109


 20%|██        | 10/50 [00:15<01:32,  2.31s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002646 seconds
INFO:hyperopt.tpe:TPE using 10/10 trials with best loss -0.988109


 22%|██▏       | 11/50 [00:16<01:13,  1.90s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002589 seconds
INFO:hyperopt.tpe:TPE using 11/11 trials with best loss -0.988109


 24%|██▍       | 12/50 [00:16<00:51,  1.36s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002413 seconds
INFO:hyperopt.tpe:TPE using 12/12 trials with best loss -0.988109


 26%|██▌       | 13/50 [00:19<01:04,  1.74s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002907 seconds
INFO:hyperopt.tpe:TPE using 13/13 trials with best loss -0.988109


 28%|██▊       | 14/50 [00:21<01:02,  1.74s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002435 seconds
INFO:hyperopt.tpe:TPE using 14/14 trials with best loss -0.988109


 30%|███       | 15/50 [00:23<01:09,  1.99s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003894 seconds
INFO:hyperopt.tpe:TPE using 15/15 trials with best loss -0.988109


 32%|███▏      | 16/50 [00:25<01:11,  2.09s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002689 seconds
INFO:hyperopt.tpe:TPE using 16/16 trials with best loss -0.988109


 34%|███▍      | 17/50 [00:28<01:08,  2.09s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002616 seconds
INFO:hyperopt.tpe:TPE using 17/17 trials with best loss -0.988109


 36%|███▌      | 18/50 [00:29<00:58,  1.83s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.004062 seconds
INFO:hyperopt.tpe:TPE using 18/18 trials with best loss -0.988109


 38%|███▊      | 19/50 [00:30<00:49,  1.60s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002491 seconds
INFO:hyperopt.tpe:TPE using 19/19 trials with best loss -0.988109


 40%|████      | 20/50 [00:31<00:41,  1.39s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002754 seconds
INFO:hyperopt.tpe:TPE using 20/20 trials with best loss -0.988109


 42%|████▏     | 21/50 [00:31<00:29,  1.03s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002512 seconds
INFO:hyperopt.tpe:TPE using 21/21 trials with best loss -0.988109


 44%|████▍     | 22/50 [00:33<00:34,  1.23s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002744 seconds
INFO:hyperopt.tpe:TPE using 22/22 trials with best loss -0.988109


 46%|████▌     | 23/50 [00:34<00:34,  1.28s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002598 seconds
INFO:hyperopt.tpe:TPE using 23/23 trials with best loss -0.988109


 48%|████▊     | 24/50 [00:35<00:30,  1.18s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002644 seconds
INFO:hyperopt.tpe:TPE using 24/24 trials with best loss -0.988109


 50%|█████     | 25/50 [00:35<00:22,  1.11trial/s, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002637 seconds
INFO:hyperopt.tpe:TPE using 25/25 trials with best loss -0.988109


 52%|█████▏    | 26/50 [00:37<00:26,  1.11s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002749 seconds
INFO:hyperopt.tpe:TPE using 26/26 trials with best loss -0.988109


 54%|█████▍    | 27/50 [00:38<00:27,  1.20s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002637 seconds
INFO:hyperopt.tpe:TPE using 27/27 trials with best loss -0.988109


 56%|█████▌    | 28/50 [00:39<00:22,  1.04s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002457 seconds
INFO:hyperopt.tpe:TPE using 28/28 trials with best loss -0.988109


 58%|█████▊    | 29/50 [00:40<00:19,  1.05trial/s, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002477 seconds
INFO:hyperopt.tpe:TPE using 29/29 trials with best loss -0.988109


 60%|██████    | 30/50 [00:40<00:15,  1.28trial/s, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002466 seconds
INFO:hyperopt.tpe:TPE using 30/30 trials with best loss -0.988109


 62%|██████▏   | 31/50 [00:41<00:18,  1.03trial/s, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002929 seconds
INFO:hyperopt.tpe:TPE using 31/31 trials with best loss -0.988109


 64%|██████▍   | 32/50 [00:42<00:14,  1.27trial/s, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002425 seconds
INFO:hyperopt.tpe:TPE using 32/32 trials with best loss -0.988109


 66%|██████▌   | 33/50 [00:43<00:13,  1.30trial/s, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002518 seconds
INFO:hyperopt.tpe:TPE using 33/33 trials with best loss -0.988109


 68%|██████▊   | 34/50 [00:44<00:16,  1.04s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002729 seconds
INFO:hyperopt.tpe:TPE using 34/34 trials with best loss -0.988109


 70%|███████   | 35/50 [00:46<00:18,  1.26s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002597 seconds
INFO:hyperopt.tpe:TPE using 35/35 trials with best loss -0.988109


 72%|███████▏  | 36/50 [00:48<00:19,  1.40s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002521 seconds
INFO:hyperopt.tpe:TPE using 36/36 trials with best loss -0.988109


 74%|███████▍  | 37/50 [00:49<00:18,  1.39s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002417 seconds
INFO:hyperopt.tpe:TPE using 37/37 trials with best loss -0.988109


 76%|███████▌  | 38/50 [00:50<00:16,  1.36s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002525 seconds
INFO:hyperopt.tpe:TPE using 38/38 trials with best loss -0.988109


 78%|███████▊  | 39/50 [00:52<00:15,  1.44s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002599 seconds
INFO:hyperopt.tpe:TPE using 39/39 trials with best loss -0.988109


 80%|████████  | 40/50 [00:53<00:12,  1.26s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002425 seconds
INFO:hyperopt.tpe:TPE using 40/40 trials with best loss -0.988109


 82%|████████▏ | 41/50 [00:53<00:08,  1.08trial/s, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002530 seconds
INFO:hyperopt.tpe:TPE using 41/41 trials with best loss -0.988109


 84%|████████▍ | 42/50 [00:55<00:10,  1.35s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002501 seconds
INFO:hyperopt.tpe:TPE using 42/42 trials with best loss -0.988109


 86%|████████▌ | 43/50 [00:56<00:09,  1.30s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.003713 seconds
INFO:hyperopt.tpe:TPE using 43/43 trials with best loss -0.988109


 88%|████████▊ | 44/50 [00:58<00:07,  1.32s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002945 seconds
INFO:hyperopt.tpe:TPE using 44/44 trials with best loss -0.988109


 90%|█████████ | 45/50 [01:00<00:07,  1.56s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002908 seconds
INFO:hyperopt.tpe:TPE using 45/45 trials with best loss -0.988109


 92%|█████████▏| 46/50 [01:01<00:05,  1.47s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002554 seconds
INFO:hyperopt.tpe:TPE using 46/46 trials with best loss -0.988109


 94%|█████████▍| 47/50 [01:02<00:03,  1.24s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002684 seconds
INFO:hyperopt.tpe:TPE using 47/47 trials with best loss -0.988109


 96%|█████████▌| 48/50 [01:02<00:02,  1.03s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002600 seconds
INFO:hyperopt.tpe:TPE using 48/48 trials with best loss -0.988109


 98%|█████████▊| 49/50 [01:06<00:01,  1.70s/trial, best loss: -0.988108937476026]

INFO:hyperopt.tpe:build_posterior_wrapper took 0.002596 seconds
INFO:hyperopt.tpe:TPE using 49/49 trials with best loss -0.988109


100%|██████████| 50/50 [01:08<00:00,  1.37s/trial, best loss: -0.988108937476026]
Best Hyperparameters: {'colsample_bytree': 0.7906157383171843, 'learning_rate': 0.15465495515036473, 'max_depth': 10.0, 'min_child_weight': 9.0, 'n_estimators': 141.0, 'subsample': 0.5149313721037773}


In [None]:

# Handle outliers
cols_outliers = ['Tenure', 'WarehouseToHome', 'NumberOfAddress', 'DaySinceLastOrder', 'HourSpendOnApp', 'NumberOfDeviceRegistered']
for col in cols_outliers:
    data_processed = handle_outliers(data_processed, col)


In [None]:
data_processed

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
0,-1.731743,2.222345,-0.741002,0.100852,1.469771,-1.156091,0.325191,-1.227468,0.097069,-0.672900,-0.261904,-0.772992,1.251898,1.852616,1.584290,-1.312273,-0.405767,-0.699345,1.284833e-01,-0.350105
1,-1.731128,2.222345,0.000000,1.509782,-0.715286,-0.916235,1.764521,0.814685,0.097069,0.303750,0.446658,-0.048392,1.251898,1.078430,1.584290,-0.197324,-0.946053,-0.699345,-1.278752e+00,-1.142957
2,-1.730513,2.222345,0.000000,1.509782,-0.715286,1.722175,0.325191,0.814685,-1.320723,0.303750,0.446658,-0.048392,1.251898,0.691336,1.584290,-0.476062,-0.946053,-0.699345,-4.344109e-01,-1.163287
3,-1.729897,2.222345,-1.219847,1.509782,1.469771,-0.076741,0.325191,0.814685,-1.320723,0.303750,-0.261904,1.400807,1.251898,1.465523,-0.631198,2.032574,-0.946053,-0.699345,-4.344109e-01,-0.878673
4,-1.729282,2.222345,-1.219847,1.509782,-0.715286,-0.436525,-2.553468,0.814685,0.000000,-0.672900,0.446658,1.400807,1.251898,-0.469944,-0.631198,-1.312273,-0.405767,-0.699345,-4.344109e-01,-0.959991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5625,1.729282,-0.449975,-0.022733,-1.308078,-0.715286,1.722175,-0.394474,0.814685,0.097069,-1.649551,-0.261904,-1.497592,-0.253481,0.691336,-0.631198,0.638888,-0.405767,-0.351066,-1.529638e-01,-0.533071
5626,1.729897,-0.449975,0.336401,0.100852,-0.715286,-0.316597,-0.394474,0.814685,0.097069,1.280401,-1.679027,1.400807,-0.253481,0.691336,-0.631198,0.081413,-0.405767,-0.351066,-2.499752e-16,0.971315
5627,1.730513,-0.449975,-1.100136,0.100852,-0.715286,-0.556452,0.325191,0.814685,0.097069,-1.649551,-0.261904,0.676207,-0.253481,-0.469944,1.584290,1.475100,-0.405767,-0.351066,-1.529638e-01,0.178463
5628,1.731128,-0.449975,1.533515,-1.308078,1.469771,-0.796308,-0.394474,0.814685,1.514862,1.280401,-0.261904,0.676207,-0.253481,-0.082850,-0.631198,-0.197324,0.134518,-0.351066,1.254272e+00,0.036156


In [None]:

# Apply data balancing technique
X_balanced, y_balanced = apply_data_balancing_technique(data_processed)