In [14]:
import numpy as np
import pandas as pd
import mlflow
import os
import warnings
from sklearn.model_selection import train_test_split,GridSearchCV
from imblearn.over_sampling import SMOTE
import mlflow
import mlflow.sklearn
from sklearn.exceptions import DataConversionWarning
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [15]:
df1 = pd.read_csv("./data/Churn_Modelling.csv")

In [16]:
df1 = df1.drop('CustomerId', axis=1, errors='ignore')
df1 = df1.drop('Surname', axis=1, errors='ignore')

In [17]:
nominal_cols = ['Geography', 'Gender']
data_nominal_encoded = pd.get_dummies(df1[nominal_cols], drop_first=True)
data_nominal_encoded = data_nominal_encoded.astype(int)

# Combine encoded columns back into the DataFrame
df = pd.concat([df1.drop(columns=nominal_cols), data_nominal_encoded], axis=1)
df.head(2)

Unnamed: 0,RowNumber,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,1,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,2,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0


In [18]:
y = df[['Exited']]
X = df[['CreditScore','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary','Geography_Germany','Geography_Spain','Gender_Male']]

In [19]:
y.value_counts()

Exited
0         7963
1         2037
Name: count, dtype: int64

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state = 0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((12740, 11), (12740, 1), (2000, 11), (2000, 1))

In [21]:
# Suppress specific warnings
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# Ensure y_train is a 1D array
# y_train = y_train.ravel()

# Define the model
classifier = RandomForestClassifier(random_state=0)

# Define the parameter distribution
param_dist = {
    'n_estimators': [9, 17, 31, 41, 51],
    'max_depth': range(2, 12),
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6]
}


In [22]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

In [23]:
classifier.score(X_test, y_test)

0.8155

In [26]:
mlflow.set_tracking_uri("http://127.0.0.1:5001")

In [27]:
with mlflow.start_run():
    # Log the initial parameter distribution
    mlflow.log_params({"param_dist": str(param_dist)})
    
    # Set up RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=classifier, 
        param_distributions=param_dist, 
        n_iter=10, 
        cv=5, 
        verbose=0, 
        random_state=0
    )
    
    # Fit the model
    random_search.fit(X_train, y_train)
    
    # Log the best parameters found
    best_params = random_search.best_params_
    mlflow.log_params(best_params)
    
    # Make predictions on the training set
    y_pred = random_search.predict(X_train)
    
    # Calculate accuracy and log the metric
    accuracy = accuracy_score(y_train, y_pred)
    mlflow.log_metric("accuracy", accuracy)
    
    # Log the best model
    mlflow.sklearn.log_model(random_search.best_estimator_, "random_forest_model")
    
    print(f"Best Parameters: {best_params}")
    print(f"Training Accuracy: {accuracy}")


2024/08/11 09:54:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run defiant-ox-83 at: http://127.0.0.1:5001/#/experiments/0/runs/2df11387f5e04e19a5aa409f08c4510e.
2024/08/11 09:54:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5001/#/experiments/0.


Best Parameters: {'n_estimators': 41, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_depth': 11}
Training Accuracy: 0.8846153846153846
