In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from joblib import dump, load


def preprocess(data: pd.DataFrame, encoder=None) -> pd.DataFrame:
    """Impute missing values and one-hot encode the categorical columns.

    Missing values in numeric columns are replaced by the column median.
    Missing values in non-numeric columns are replaced by the most frequent
    value, because a median is undefined for them. The categorical columns
    are then one-hot encoded and concatenated with the numerical columns.

    Args:
        data: Frame holding the categorical and numerical columns listed in
            the body. ``Churn`` is optional so that unlabeled rows can be
            preprocessed too; when present it is kept as the last column.
            The frame passed in is not modified.
        encoder: Optional encoder exposing the scikit-learn
            ``fit_transform`` / ``transform`` / ``get_feature_names_out``
            methods. An encoder that already has a ``categories_`` attribute
            is treated as fitted and only ``transform`` is called, which
            keeps the encoded columns identical to those seen at fit time.
            Any other encoder is fitted on ``data`` (the object passed in is
            fitted in place, so the caller can persist it afterwards). When
            omitted, a new ``OneHotEncoder(handle_unknown='ignore')`` is
            fitted, as before.

    Returns:
        A new DataFrame, indexed like ``data``, with the one-hot columns
        first, followed by the numerical columns and ``Churn`` if present.

    Raises:
        KeyError: If one of the categorical or numerical feature columns is
            missing from ``data``.
    """
    categorical_columns = ['PreferredLoginDevice', 'PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'MaritalStatus']
    numerical_columns = ['Tenure', 'CityTier', 'WarehouseToHome', 'HourSpendOnApp', 'NumberOfDeviceRegistered',
                         'SatisfactionScore', 'NumberOfAddress', 'Complain', 'OrderAmountHikeFromlastYear',
                         'CouponUsed', 'OrderCount', 'DaySinceLastOrder', 'CashbackAmount']
    target_column = 'Churn'

    # Work on a copy so the imputation below never leaks into the caller's frame.
    data = data.copy()

    for col in data.columns:
        if not data[col].isnull().any():
            continue
        if pd.api.types.is_numeric_dtype(data[col]):
            data[col] = data[col].fillna(data[col].median())
        else:
            # median() raises on string columns; fall back to the most frequent value.
            most_frequent = data[col].mode(dropna=True)
            if not most_frequent.empty:
                data[col] = data[col].fillna(most_frequent.iloc[0])

    # The target is only carried through when the input actually has it.
    selected_numerical = list(numerical_columns)
    if target_column in data.columns:
        selected_numerical.append(target_column)

    categorical_df = data[categorical_columns]
    numerical_df = data[selected_numerical]

    if encoder is None:
        encoder = OneHotEncoder(handle_unknown='ignore')
    if hasattr(encoder, 'categories_'):
        # Already fitted: reuse the fitted categories instead of re-learning them.
        encoded_categorical = encoder.transform(categorical_df)
    else:
        encoded_categorical = encoder.fit_transform(categorical_df)
    # Sparse results need densifying; an encoder configured for dense output does not.
    if hasattr(encoded_categorical, 'toarray'):
        encoded_categorical = encoded_categorical.toarray()

    # Reuse the input index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever `data` is not indexed 0..n-1.
    encoded_categorical_df = pd.DataFrame(
        encoded_categorical,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=data.index,
    )
    preprocessed_df = pd.concat([encoded_categorical_df, numerical_df], axis=1)
    return preprocessed_df

def build_model(data: pd.DataFrame, encoder=None,
                model_dir: str = "C:/Users/Mali/ai-project-methodology/models") -> tuple:
    """Train a logistic-regression churn classifier and persist it with joblib.

    The frame is split 70/30 (``random_state=30``), a ``LogisticRegression``
    is fitted on the training part and evaluated on the held-out part, and
    the model plus the encoder artifact are written to ``model_dir``.

    Args:
        data: Preprocessed frame containing the feature columns and the
            ``Churn`` target column.
        encoder: Optional fitted encoder to persist next to the model. When
            omitted, the ``OneHotEncoder`` class itself is written, which is
            the previous behaviour. NOTE(review): a pickled class carries no
            fitted categories, so pass the fitted instance whenever the
            encoding must be reproduced at prediction time.
        model_dir: Directory receiving ``encoder.joblib`` and
            ``model.joblib``. It is created if it does not exist.

    Returns:
        An 8-tuple ``(model, encoder_artifact, accuracy, report, X_train,
        X_test, y_train, y_test)``, where ``report`` is the
        ``classification_report`` dictionary for the held-out part and
        ``encoder_artifact`` is the object that was written to
        ``encoder.joblib``.

    Raises:
        KeyError: If ``data`` has no ``Churn`` column.
    """
    import os  # local import: only needed to create the output directory

    X = data.drop(columns='Churn')
    y = data['Churn']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=30)

    log_reg = LogisticRegression(solver='lbfgs', max_iter=10000)
    log_reg.fit(X_train, y_train)

    y_pred = log_reg.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Fall back to the class object only to stay compatible with existing callers.
    encoder_artifact = OneHotEncoder if encoder is None else encoder

    os.makedirs(model_dir, exist_ok=True)
    encoder_path = f"{model_dir}/encoder.joblib"
    dump(encoder_artifact, encoder_path)
    model_path = f"{model_dir}/model.joblib"
    dump(log_reg, model_path)

    # The return values match the parameters and metrics tracked with MLflow.
    return log_reg, encoder_artifact, accuracy, report, X_train, X_test, y_train, y_test

def make_predictions(input_data: pd.DataFrame,
                     model_dir: str = "../ai-project-methodology/models") -> np.ndarray:
    """Predict churn labels for ``input_data`` with the persisted model.

    Args:
        input_data: Raw frame with the columns expected by ``preprocess``.
            It is not modified.
        model_dir: Directory containing ``model.joblib``. NOTE(review): the
            default is relative to the current working directory, whereas
            ``build_model`` writes to an absolute path by default; confirm
            that both resolve to the same folder in your environment.

    Returns:
        Array of predicted labels, one per input row.
    """
    model_path = f"{model_dir}/model.joblib"
    # joblib.load unpickles arbitrary objects; only load files you trust.
    # The encoder file is no longer loaded here: its content was never used.
    model = load(model_path)

    # Pass a copy so imputation inside preprocess can never alter the caller's frame.
    preprocessed_input = preprocess(input_data.copy())
    # The label is dropped when present; unlabeled input simply has none.
    features = preprocessed_input.drop(columns='Churn', errors='ignore')

    # Align with the columns the model was fitted on: the one-hot columns
    # produced for this input depend on which categories it happens to contain.
    # Categories absent here become all-zero columns; columns unknown to the
    # model are dropped.
    expected_columns = getattr(model, 'feature_names_in_', None)
    if expected_columns is not None:
        features = features.reindex(columns=expected_columns, fill_value=0)

    predictions = model.predict(features)
    return predictions


In [16]:
import mlflow
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlflow.models import infer_signature

mlflow.set_tracking_uri(r'sqlite:///C:/Users/Mali/ai-project-methodology.db') # Works for Windows
# Or mlflow.set_tracking_uri(uri="http://localhost:5000")
df = pd.read_csv('C:/Users/Mali/ai-project-methodology/data/Dataset/ECommerce.csv')

preprocess_data = preprocess(df)
# The second return value gets its own name: unpacking it into `OneHotEncoder`
# would rebind the imported class name that preprocess() instantiates.
lr, encoder_artifact, accuracy, report, X_train, X_test, y_train, y_test = build_model(preprocess_data)

# Hyperparameters to log, read back from the fitted estimator so the tracked
# values cannot drift from the ones build_model() actually used.
fitted_params = lr.get_params()
params = {
    "solver": fitted_params["solver"],
    "max_iter": fitted_params["max_iter"],
}

# Predict on the test set
y_pred = lr.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
# Support-weighted averages from the classification report returned by build_model().
weighted_avg = report["weighted avg"]

# Create a new MLflow Experiment
mlflow.set_experiment("Tracking metrics and parameters")

# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the evaluation metrics measured on the held-out test set
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metrics({
        "weighted_precision": weighted_avg["precision"],
        "weighted_recall": weighted_avg["recall"],
        "weighted_f1": weighted_avg["f1-score"],
    })

    # Infer the model signature
    signature = infer_signature(X_train, lr.predict(X_train))

    # Log the model. A handful of rows is enough as an input example; passing
    # the full training frame would store all of it alongside the model.
    model_info = mlflow.sklearn.log_model(
        sk_model=lr,
        artifact_path="churn_model",
        signature=signature,
        input_example=X_train.head(5),
        registered_model_name="log_reg_model",
    )



Registered model 'log_reg_model' already exists. Creating a new version of this model...
Created version '9' of model 'log_reg_model'.
