In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
import warnings
warnings.filterwarnings("ignore")

from functools import partial

In [3]:
from dotenv import load_dotenv

from pathlib import Path

env_path = Path(".env-live")

if env_path.exists():
    load_dotenv(dotenv_path=env_path)

In [4]:
import os
os.environ['JRJ_MODEL_REGISTRY_S3_BUCKET_NAME']

'273-g2'

In [5]:
from jrjModelRegistry.jrjModelRegistry import registerAJrjModel

Pinged your deployment. You successfully connected to MongoDB!


Base Model

In [6]:
df = pd.read_csv("online_retail_customer_churn_cleaned.csv")
df.head()

Unnamed: 0,Customer_ID,Age,Annual_Income,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Email_Opt_In,Target_Churn,Gender_Female,Gender_Male,Gender_Other,Promotion_Response_Ignored,Promotion_Response_Responded,Promotion_Response_Unsubscribed
0,1,62,45.15,5892.58,5,22,453.8,2,0,3,129,1,1,0.0,0.0,1.0,0.0,1.0,0.0
1,2,65,79.51,9025.47,13,77,22.9,2,2,3,227,0,0,0.0,1.0,0.0,0.0,1.0,0.0
2,3,18,29.19,618.83,13,71,50.53,5,2,2,283,0,1,0.0,1.0,0.0,0.0,1.0,0.0
3,4,21,79.63,9110.3,3,33,411.83,5,3,5,226,1,1,0.0,0.0,1.0,1.0,0.0,0.0
4,5,21,77.66,5390.88,15,43,101.19,3,0,5,242,0,0,0.0,0.0,1.0,0.0,0.0,1.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Customer_ID                      1000 non-null   int64  
 1   Age                              1000 non-null   int64  
 2   Annual_Income                    1000 non-null   float64
 3   Total_Spend                      1000 non-null   float64
 4   Years_as_Customer                1000 non-null   int64  
 5   Num_of_Purchases                 1000 non-null   int64  
 6   Average_Transaction_Amount       1000 non-null   float64
 7   Num_of_Returns                   1000 non-null   int64  
 8   Num_of_Support_Contacts          1000 non-null   int64  
 9   Satisfaction_Score               1000 non-null   int64  
 10  Last_Purchase_Days_Ago           1000 non-null   int64  
 11  Email_Opt_In                     1000 non-null   int64  
 12  Target_Churn         

In [8]:
sample_data = pd.DataFrame({
    'Annual_Income': [60.5],
    'Total_Spend': [8000],
    'Average_Transaction_Amount': [250.0],
    'Last_Purchase_Days_Ago': [30],
    'Num_of_Purchases': [32],
    'Age': [40],
    'Years_as_Customer': [5],
    'Num_of_Returns': [2]
}).to_dict(orient="records")

In [9]:
def churnModelTransformer(dataForTransfer):
    import pandas as pd
    import statsmodels.api as sm

    top_features = ['Annual_Income', 'Total_Spend', 'Average_Transaction_Amount',
                    'Last_Purchase_Days_Ago', 'Num_of_Purchases', 'Age',
                    'Years_as_Customer', 'Num_of_Returns']
    
    if isinstance(dataForTransfer, pd.DataFrame):
        df = dataForTransfer.copy()
    else:
        df = pd.DataFrame(dataForTransfer)

    dfTransformer = sm.add_constant(df[top_features], has_constant='add')
    return dfTransformer

In [10]:
def churnModelPredictor(self, transformedData):
    
    return self.predict(transformedData).tolist()

In [11]:
#Build Model
X = df.drop(columns=['Customer_ID', 'Target_Churn'])
y = df['Target_Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, df.shape

((800, 17), (200, 17), (1000, 19))

In [12]:
rf = RandomForestClassifier(random_state=42)

rf.fit(churnModelTransformer(X_train), y_train)

In [13]:
# The score of the first model
y_train_pred = rf.predict(churnModelTransformer(X_train))
y_test_pred = rf.predict(churnModelTransformer(X_test))
y_test_pred_probs = rf.predict_proba(churnModelTransformer(X_test))

In [14]:
report1 = print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.51      0.44      0.47        94
           1       0.56      0.63      0.59       106

    accuracy                           0.54       200
   macro avg       0.54      0.53      0.53       200
weighted avg       0.54      0.54      0.54       200



In [15]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [18]:
score = accuracy_score(y_test,y_test_pred)
print(f'score: {score}')

score: 0.54


In [19]:
recall = recall_score(y_test, y_test_pred)
print(f'Recall: {recall}')

Recall: 0.6320754716981132


In [20]:
precision = precision_score(y_test, y_test_pred)
print(f'Precision: {precision}')

Precision: 0.5583333333333333


In [21]:
f1score = f1_score(y_test, y_test_pred)
print(f'F1 Score: {f1score}')

F1 Score: 0.5929203539823009


In [22]:
from functools import partial
rf.transformer = churnModelTransformer
rf.mainPredictor = partial(churnModelPredictor, rf)

In [23]:
rf.mainPredictor(rf.transformer(sample_data))

[1]

In [122]:
rf_metadata = {
    "modelName": "Group2_customerChurn_RFModel_1",
    "version": "1.0.10",
    "params":{
    "n_estimators": rf.n_estimators,
    "max_depth": rf.max_depth,
    "random_state": rf.random_state
},                      
    "score": float(score),                           
    "modelLibrary": "sklearn.ensemble.RandomForestClassifier",
    "libraryMetadata": {
        "feature_importances": rf.feature_importances_.tolist(),
        "n_features": int(rf.n_features_in_),
        "n_classes": int(rf.n_classes_),
        "classes": rf.classes_.tolist(),
        "n_estimators": rf.n_estimators,
        "classification_report": report1
    },
    "sampleData": {
        "dataForTransfer": sample_data
    }
}

In [123]:
registerAJrjModel(rf, rf_metadata)

✅ Uploaded encrypted ZIP to s3://273-g2/Group2_customerChurn_RFModel_1__1.0.10.pkl.zip


{'_id': '685d6f5fab9adc5a58ced1a8',
 'modelName': 'Group2_customerChurn_RFModel_1',
 'version': '1.0.10',
 'params': {'n_estimators': 100, 'max_depth': None, 'random_state': 42},
 'score': 0.54,
 'modelLibrary': 'sklearn.ensemble.RandomForestClassifier',
 'libraryMetadata': {'feature_importances': [0.0,
   0.14721906084445097,
   0.14469500419922265,
   0.15262880864777423,
   0.13477579210674462,
   0.1273417815081952,
   0.12029285448693638,
   0.09636594412436171,
   0.07668075408231421],
  'n_features': 9,
  'n_classes': 2,
  'classes': [0, 1],
  'n_estimators': 100,
  'classification_report': None},
 'sampleData': {'dataForTransfer': [{'Annual_Income': 60.5,
    'Total_Spend': 8000,
    'Average_Transaction_Amount': 250.0,
    'Last_Purchase_Days_Ago': 30,
    'Num_of_Purchases': 32,
    'Age': 40,
    'Years_as_Customer': 5,
    'Num_of_Returns': 2}]},
 'modelType': 'model',
 'keepLastOnly': False,
 'modelSizeBytes': 2754279,
 'zippedModelSizeBytes': 306972,
 's3Url': '273-g2/Gro

---

Hyper parameters

In [68]:
X_transformed = churnModelTransformer(X_train)
features = pd.DataFrame(
    rf.feature_importances_.reshape(-1, 1),
    index=X_transformed.columns,
    columns=["Importance"]
)

In [70]:
top_features = features.sort_values(by="Importance", ascending=False).head(8).index.tolist()
X_top = X[top_features]
X_top

Unnamed: 0,Average_Transaction_Amount,Annual_Income,Total_Spend,Last_Purchase_Days_Ago,Num_of_Purchases,Age,Years_as_Customer,Num_of_Returns
0,453.80,45.15,5892.58,129,22,62,5,2
1,22.90,79.51,9025.47,227,77,65,13,2
2,50.53,29.19,618.83,283,71,18,13,5
3,411.83,79.63,9110.30,226,33,21,3,5
4,101.19,77.66,5390.88,242,43,21,15,3
...,...,...,...,...,...,...,...,...
995,77.75,143.72,1089.09,88,29,54,2,0
996,34.45,164.19,3700.24,352,90,19,9,6
997,187.37,113.31,705.85,172,69,47,17,7
998,483.80,72.98,3891.60,55,31,23,7,1


In [71]:
X_train_top, X_test_top, y_train, y_test = train_test_split(X_top, y, test_size=0.2, random_state=42)

In [72]:
rf2 = RandomForestClassifier(
    n_estimators=1000,        
    criterion='entropy',       
    max_depth=14,              
    min_samples_split=10,     
    min_samples_leaf=4,       
    max_features='sqrt',       
    bootstrap=True,            
    random_state=42           
)

In [73]:
rf2.fit(X_train_top, y_train)

In [74]:
new_score = rf2.score(X_test_top, y_test)
new_score

0.53

In [75]:
from sklearn.metrics import classification_report

y_pred = rf2.predict(X_test_top)
report2 = print(classification_report(y_test, y_pred))
report2

              precision    recall  f1-score   support

           0       0.50      0.32      0.39        94
           1       0.54      0.72      0.62       106

    accuracy                           0.53       200
   macro avg       0.52      0.52      0.50       200
weighted avg       0.52      0.53      0.51       200



In [None]:
rf2_metadata = {
    "modelName": "Group2_customerChurn_RFModel2",
    "version": "1.0.3",
    "params": rf2.get_params(),                      
    "score": float(new_score),                           
    "modelLibrary": "sklearn.ensemble.RandomForestClassifier",
    "libraryMetadata": {
        "feature_importances": rf2.feature_importances_.tolist(),
        "n_features": int(rf2.n_features_in_),
        "n_classes": int(rf2.n_classes_),
        "classes": rf2.classes_.tolist(),
        "n_estimators": rf2.n_estimators,
        "classification_report": report2
    },
    "sampleData": {
        "dataForTransfer": sample_data
    }
}

In [None]:
rf2.transformer = churnModelTransformer
rf2.mainPredictor = partial(churnModelPredictor, rf2)

In [None]:
registerAJrjModel(rf2, rf2_metadata)