In [1]:
import pandas as pd
import numpy as np
import sklearn
from functools import partial
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
import warnings
warnings.filterwarnings("ignore")

from functools import partial

In [3]:
from dotenv import load_dotenv

from pathlib import Path

env_path = Path(".env-live")

if env_path.exists():
    load_dotenv(dotenv_path=env_path)

In [4]:
import os
os.environ['JRJ_MODEL_REGISTRY_S3_BUCKET_NAME']

'273-g2'

In [5]:
from jrjModelRegistry.jrjModelRegistry import registerAJrjModel

Pinged your deployment. You successfully connected to MongoDB!


In [6]:
df = pd.read_csv("online_retail_customer_churn_cleaned.csv")
df.head()

Unnamed: 0,Customer_ID,Age,Annual_Income,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Email_Opt_In,Target_Churn,Gender_Female,Gender_Male,Gender_Other,Promotion_Response_Ignored,Promotion_Response_Responded,Promotion_Response_Unsubscribed
0,1,62,45.15,5892.58,5,22,453.8,2,0,3,129,1,1,0.0,0.0,1.0,0.0,1.0,0.0
1,2,65,79.51,9025.47,13,77,22.9,2,2,3,227,0,0,0.0,1.0,0.0,0.0,1.0,0.0
2,3,18,29.19,618.83,13,71,50.53,5,2,2,283,0,1,0.0,1.0,0.0,0.0,1.0,0.0
3,4,21,79.63,9110.3,3,33,411.83,5,3,5,226,1,1,0.0,0.0,1.0,1.0,0.0,0.0
4,5,21,77.66,5390.88,15,43,101.19,3,0,5,242,0,0,0.0,0.0,1.0,0.0,0.0,1.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Customer_ID                      1000 non-null   int64  
 1   Age                              1000 non-null   int64  
 2   Annual_Income                    1000 non-null   float64
 3   Total_Spend                      1000 non-null   float64
 4   Years_as_Customer                1000 non-null   int64  
 5   Num_of_Purchases                 1000 non-null   int64  
 6   Average_Transaction_Amount       1000 non-null   float64
 7   Num_of_Returns                   1000 non-null   int64  
 8   Num_of_Support_Contacts          1000 non-null   int64  
 9   Satisfaction_Score               1000 non-null   int64  
 10  Last_Purchase_Days_Ago           1000 non-null   int64  
 11  Email_Opt_In                     1000 non-null   int64  
 12  Target_Churn         

In [8]:
#sample data for model registration
sample_data = {
    'Annual_Income': [60.5, 85.0, 45.2],
    'Total_Spend': [8000, 9500, 3000],
    'Average_Transaction_Amount': [250.0, 310.5, 150.0],
    'Last_Purchase_Days_Ago': [30, 10, 90],
    'Num_of_Purchases': [32, 45, 12],
    'Age': [40, 28, 60],
    'Years_as_Customer': [5, 3, 10],
    'Num_of_Returns': [2, 0, 5]
}

In [9]:
# Drop unused columns
X = df.drop(columns=['Customer_ID', 'Target_Churn'])
y = df['Target_Churn']


In [10]:
#Train test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
#Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [12]:
#Logistic Regression Build and Training
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_scaled, y_train)

In [13]:
#use the model to predict churn on test data
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, mean_squared_error

y_pred = log_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred) #how it catches churn cases
precision = precision_score(y_test, y_pred) #how many of the predicted churn are actually churn
f1 = f1_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) #prediction error

print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"RMSE: {rmse:.4f}")


Accuracy: 0.4650
Recall: 0.6604
Precision: 0.4965
F1 Score: 0.5668
RMSE: 0.5350


In [14]:
summary_metrics = {
    "precision": precision,
    "recall": recall,
    "f1-score": f1,
    "accuracy": accuracy,
    "rmse": rmse
}
summary_df = pd.DataFrame(summary_metrics, index=["Logistic Regression"])
summary_df = summary_df[["accuracy", "precision", "recall", "f1-score", "rmse"]] 
summary_df


Unnamed: 0,accuracy,precision,recall,f1-score,rmse
Logistic Regression,0.465,0.496454,0.660377,0.566802,0.535


In [15]:
# Define transformer and predictor for future new data 

def churnModelPredictor(self, transformedData):
    import pandas as pd
    return self.predict(transformedData)

log_model.transformer = churnModelPredictor
log_model.mainPredictor = partial(churnModelPredictor, log_model)

In [16]:
# log_metadata = {
#     "modelName": "Group2_customerChurn_LogRegModel_1",
#     "version": "1.0.1",
#     "params": log_model.get_params(),
#     "score": float(accuracy),
#     "modelLibrary": "sklearn.linear_model.LogisticRegression",
#     "libraryMetadata": {
#         "n_features": int(X.shape[1]),
#         "n_classes": int(len(np.unique(y))),
#         "classes": log_model.classes_.tolist(),
#         "classification_report": report
#     },
#     "sampleData": {
#         "dataForTransfer": X_test[:3].to_dict(orient="records")
#     }
# }

In [17]:
#Create the metadata dictionary
log_metadata = {
    "modelName": "Group2_customerChurn_LogRegModel_1",
    "version": "1.0.1",
    "params": log_model.get_params(),
    "score": float(accuracy),
    "modelLibrary": "sklearn.linear_model.LogisticRegression",
    "libraryMetadata": {
        "n_features": int(X.shape[1]),
        "n_classes": int(len(np.unique(y))),
        "classes": log_model.classes_.tolist(),
        "classification_report": classification_report(y_test, y_pred, output_dict=True)
    },
    "sampleData": {
        "dataForTransfer": X_test.iloc[:3].to_dict(orient="records")
    }
}


In [18]:
registerAJrjModel(log_model, log_metadata)

✅ Uploaded encrypted ZIP to s3://273-g2/Group2_customerChurn_LogRegModel_1__1.0.1.pkl.zip
❌ Failed to generate URL or upload: E11000 duplicate key error collection: jrjModelRegistry.models index: modelName_1_version_1 dup key: { modelName: "Group2_customerChurn_LogRegModel_1", version: "1.0.1" }, full error: {'index': 0, 'code': 11000, 'errmsg': 'E11000 duplicate key error collection: jrjModelRegistry.models index: modelName_1_version_1 dup key: { modelName: "Group2_customerChurn_LogRegModel_1", version: "1.0.1" }', 'keyPattern': {'modelName': 1, 'version': 1}, 'keyValue': {'modelName': 'Group2_customerChurn_LogRegModel_1', 'version': '1.0.1'}}


In [19]:
#Comparison table Logistic vs. Random Forest
comparison_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest'],
    'Accuracy': [0.465, 0.545],
    'Recall': [0.66, 0.64],
    'Precision': [0.496, 0.562],
    'F1 Score': [0.566, 0.599],
})
comparison_df


Unnamed: 0,Model,Accuracy,Recall,Precision,F1 Score
0,Logistic Regression,0.465,0.66,0.496,0.566
1,Random Forest,0.545,0.64,0.562,0.599
