In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
import warnings
warnings.filterwarnings("ignore")

from functools import partial

In [3]:
from dotenv import load_dotenv

from pathlib import Path

env_path = Path(".env-live")

if env_path.exists():
    load_dotenv(dotenv_path=env_path)

In [4]:
import os
os.environ['JRJ_MODEL_REGISTRY_S3_BUCKET_NAME']

'273-g2'

In [6]:
from jrjModelRegistry.jrjModelRegistry import registerAJrjModel

Pinged your deployment. You successfully connected to MongoDB!


Base Model

In [7]:
df = pd.read_csv("online_retail_customer_churn_cleaned.csv")
df.head()

Unnamed: 0,Customer_ID,Age,Annual_Income,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Email_Opt_In,Target_Churn,Gender_Female,Gender_Male,Gender_Other,Promotion_Response_Ignored,Promotion_Response_Responded,Promotion_Response_Unsubscribed
0,1,62,45.15,5892.58,5,22,453.8,2,0,3,129,1,1,0.0,0.0,1.0,0.0,1.0,0.0
1,2,65,79.51,9025.47,13,77,22.9,2,2,3,227,0,0,0.0,1.0,0.0,0.0,1.0,0.0
2,3,18,29.19,618.83,13,71,50.53,5,2,2,283,0,1,0.0,1.0,0.0,0.0,1.0,0.0
3,4,21,79.63,9110.3,3,33,411.83,5,3,5,226,1,1,0.0,0.0,1.0,1.0,0.0,0.0
4,5,21,77.66,5390.88,15,43,101.19,3,0,5,242,0,0,0.0,0.0,1.0,0.0,0.0,1.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Customer_ID                      1000 non-null   int64  
 1   Age                              1000 non-null   int64  
 2   Annual_Income                    1000 non-null   float64
 3   Total_Spend                      1000 non-null   float64
 4   Years_as_Customer                1000 non-null   int64  
 5   Num_of_Purchases                 1000 non-null   int64  
 6   Average_Transaction_Amount       1000 non-null   float64
 7   Num_of_Returns                   1000 non-null   int64  
 8   Num_of_Support_Contacts          1000 non-null   int64  
 9   Satisfaction_Score               1000 non-null   int64  
 10  Last_Purchase_Days_Ago           1000 non-null   int64  
 11  Email_Opt_In                     1000 non-null   int64  
 12  Target_Churn         

In [9]:
sample_data = {
    'Annual_Income': [60.5, 85.0, 45.2],
    'Total_Spend': [8000, 9500, 3000],
    'Average_Transaction_Amount': [250.0, 310.5, 150.0],
    'Last_Purchase_Days_Ago': [30, 10, 90],
    'Num_of_Purchases': [32, 45, 12],
    'Age': [40, 28, 60],
    'Years_as_Customer': [5, 3, 10],
    'Num_of_Returns': [2, 0, 5]
}

In [10]:
def churnModelTransformer(dataForTransfer=None):
    import pandas as pd
    import statsmodels.api as sm
    top_features = ['Annual_Income', 'Total_Spend', 'Average_Transaction_Amount',
                    'Last_Purchase_Days_Ago', 'Num_of_Purchases', 'Age',
                    'Years_as_Customer', 'Num_of_Returns']
    
    if isinstance(dataForTransfer, pd.DataFrame):
        df = dataForTransfer.copy()
    else:
        df = pd.DataFrame(dataForTransfer)

    dfTransformer = sm.add_constant(df[top_features], has_constant='add')
    return dfTransformer

In [11]:
def churnModelPredictor(self, transformedData):
    import pandas as pd
    return self.predict(transformedData)

In [12]:
#Build Model
X = df.drop(columns=['Customer_ID', 'Target_Churn'])
y = df['Target_Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(random_state=42)

rf.fit(X_train, y_train)

In [13]:
# The score of the first model
y_pred = rf.predict(X_test)
base_score = rf.score(X_test, y_test)
base_score

0.545

In [14]:
report1 = print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.52      0.44      0.47        94
           1       0.56      0.64      0.60       106

    accuracy                           0.55       200
   macro avg       0.54      0.54      0.54       200
weighted avg       0.54      0.55      0.54       200



In [15]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [16]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.545


In [17]:
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

Recall: 0.6415094339622641


In [18]:
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

Precision: 0.5619834710743802


In [19]:
f1score = f1_score(y_test, y_pred)
print(f'F1 Score: {f1score}')

F1 Score: 0.5991189427312775


In [20]:
from functools import partial
rf.transformer = churnModelTransformer
rf.mainPredictor = partial(churnModelPredictor, rf)

In [21]:
rf_metadata = {
    "modelName": "Group2_customerChurn_RFModel_1",
    "version": "1.0.1",
    "params": rf.get_params(),                      
    "score": float(base_score),                           
    "modelLibrary": "sklearn.ensemble.RandomForestClassifier",
    "libraryMetadata": {
        "feature_importances": rf.feature_importances_.tolist(),
        "n_features": int(rf.n_features_in_),
        "n_classes": int(rf.n_classes_),
        "classes": rf.classes_.tolist(),
        "n_estimators": rf.n_estimators,
        "classification_report": report1
    },
    "sampleData": {
        "dataForTransfer": sample_data
    }
}

In [22]:
registerAJrjModel(rf, rf_metadata)

✅ Uploaded encrypted ZIP to s3://273-g2/Group2_customerChurn_RFModel_1__1.0.1.pkl.zip
❌ Failed to generate URL or upload: E11000 duplicate key error collection: jrjModelRegistry.models index: modelName_1_version_1 dup key: { modelName: "Group2_customerChurn_RFModel_1", version: "1.0.1" }, full error: {'index': 0, 'code': 11000, 'errmsg': 'E11000 duplicate key error collection: jrjModelRegistry.models index: modelName_1_version_1 dup key: { modelName: "Group2_customerChurn_RFModel_1", version: "1.0.1" }', 'keyPattern': {'modelName': 1, 'version': 1}, 'keyValue': {'modelName': 'Group2_customerChurn_RFModel_1', 'version': '1.0.1'}}


---

Hyper parameters

In [23]:
features = pd.DataFrame(rf.feature_importances_, index = X.columns)

In [24]:
top_features = features.sort_values(by=0, ascending=False).head(8).index.tolist()
X_top = X[top_features]
X_top

Unnamed: 0,Annual_Income,Total_Spend,Average_Transaction_Amount,Last_Purchase_Days_Ago,Num_of_Purchases,Age,Years_as_Customer,Num_of_Returns
0,45.15,5892.58,453.80,129,22,62,5,2
1,79.51,9025.47,22.90,227,77,65,13,2
2,29.19,618.83,50.53,283,71,18,13,5
3,79.63,9110.30,411.83,226,33,21,3,5
4,77.66,5390.88,101.19,242,43,21,15,3
...,...,...,...,...,...,...,...,...
995,143.72,1089.09,77.75,88,29,54,2,0
996,164.19,3700.24,34.45,352,90,19,9,6
997,113.31,705.85,187.37,172,69,47,17,7
998,72.98,3891.60,483.80,55,31,23,7,1


In [25]:
X_train_top, X_test_top, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

In [26]:
rf2 = RandomForestClassifier(
    n_estimators=1000,         # Number of trees in the forest
    criterion='entropy',       # Use information gain instead of Gini
    max_depth=14,              # Limit tree depth to prevent overfitting
    min_samples_split=10,      # Minimum samples required to split a node
    min_samples_leaf=4,        # Minimum samples at a leaf node
    max_features='sqrt',       # Consider sqrt(number of features) when looking for best split
    bootstrap=True,            # Use bootstrap samples
    random_state=42            # Ensure reproducibility
)

In [27]:
rf2.fit(X_train_top, y_train)

In [28]:
new_score = rf2.score(X_test_top, y_test)
new_score

0.535

In [29]:
from sklearn.metrics import classification_report

y_pred = rf2.predict(X_test_top)
report2 = print(classification_report(y_test, y_pred))
report2

              precision    recall  f1-score   support

           0       0.51      0.31      0.38        94
           1       0.55      0.74      0.63       106

    accuracy                           0.54       200
   macro avg       0.53      0.52      0.51       200
weighted avg       0.53      0.54      0.51       200



In [30]:
rf2_metadata = {
    "modelName": "Group2_customerChurn_RFModel2",
    "version": "1.0.3",
    "params": rf2.get_params(),                      
    "score": float(new_score),                           
    "modelLibrary": "sklearn.ensemble.RandomForestClassifier",
    "libraryMetadata": {
        "feature_importances": rf2.feature_importances_.tolist(),
        "n_features": int(rf2.n_features_in_),
        "n_classes": int(rf2.n_classes_),
        "classes": rf2.classes_.tolist(),
        "n_estimators": rf2.n_estimators,
        "classification_report": report2
    },
    "sampleData": {
        "dataForTransfer": sample_data
    }
}

In [31]:
rf2.transformer = churnModelTransformer
rf2.mainPredictor = partial(churnModelPredictor, rf2)

In [32]:
registerAJrjModel(rf2, rf2_metadata)

✅ Uploaded encrypted ZIP to s3://273-g2/Group2_customerChurn_RFModel2__1.0.3.pkl.zip
❌ Failed to generate URL or upload: E11000 duplicate key error collection: jrjModelRegistry.models index: modelName_1_version_1 dup key: { modelName: "Group2_customerChurn_RFModel2", version: "1.0.3" }, full error: {'index': 0, 'code': 11000, 'errmsg': 'E11000 duplicate key error collection: jrjModelRegistry.models index: modelName_1_version_1 dup key: { modelName: "Group2_customerChurn_RFModel2", version: "1.0.3" }', 'keyPattern': {'modelName': 1, 'version': 1}, 'keyValue': {'modelName': 'Group2_customerChurn_RFModel2', 'version': '1.0.3'}}
