Retention Prediction:

Which factors (subscribing channel, dietary preference, meal type, location, age group, etc.) are most predictive of customer retention?

Step 1: Import Libraries

In [7]:
import os

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

Step 2: Load and Preprocess the Data

In [13]:
# Load the dataset.
# The CSV location defaults to the path used when this notebook was written;
# set the FOODFUSION_CSV environment variable to read the file from elsewhere
# without editing this cell.
DATA_PATH = os.environ.get(
    'FOODFUSION_CSV',
    'C:/JPEM_Git_Main/JPEM/JPEM_SAIT/Winter2025_PROJ406/Team7/foodfusion.csv',
)
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,user_id,date_served,subscribing_channel,converted,is_retained,dietary_preference,meal_type,location,age_group
0,1,2023-06-13,Email,True,False,Vegetarian,Lunch,Greenwich,46+ years
1,2,2023-01-27,Influencers,True,False,Vegan,Breakfast,Camden,19-24 years
2,3,2023-03-01,House Ads,True,True,Omnivore,Snack,Greenwich,31-36 years
3,4,2023-05-31,Instagram,True,True,Keto,Dinner,Hackney,19-24 years
4,5,2023-02-25,Instagram,True,True,Keto,Breakfast,Hackney,25-30 years


In [14]:
# Drop the 'date_served', 'user_id' and 'converted' columns so they are not used as features.
# errors='ignore' keeps this cell re-runnable: `data` is reassigned here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
data = data.drop(columns=['date_served', 'user_id', 'converted'], errors='ignore')

# Check for missing values
print(data.isnull().sum())

# Drop rows that contain missing values (no imputation is performed)
data = data.dropna()

# Separate features (X) and target (y)
X = data.drop(columns='is_retained')
y = data['is_retained']

# One-hot encode all categorical features in X; drop_first=True omits the first
# level of each feature, so that level has no indicator column of its own
X = pd.get_dummies(X, drop_first=True)

subscribing_channel    0
is_retained            0
dietary_preference     0
meal_type              0
location               0
age_group              0
dtype: int64


Step 3: Balance the Dataset Using SMOTE

In [15]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Cast any boolean indicator columns to integers (nothing changes when none exist)
bool_columns = X.select_dtypes(include=['bool']).columns
X = X.astype(dict.fromkeys(bool_columns, 'int'))

# Hold out 20% of the rows as a test set, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Oversample the training split only; the test split is not resampled.
# NOTE(review): SMOTE builds synthetic rows by interpolating between neighbours,
# which may not yield valid encodings for one-hot indicator columns. Consider
# SMOTEN / SMOTENC for categorical features (verify against imbalanced-learn docs).
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Report the class counts before and after resampling
print("Original class distribution:")
print(y_train.value_counts())
print("Balanced class distribution:")
print(pd.Series(y_train_balanced).value_counts())


Original class distribution:
is_retained
False    6019
True     1981
Name: count, dtype: int64
Balanced class distribution:
is_retained
True     6019
False    6019
Name: count, dtype: int64


Step 4: Train the Model

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Build a Random Forest classifier with a fixed seed and fit it on the balanced
# training data. fit() returns the estimator itself, so the assignment holds
# the trained model.
model = RandomForestClassifier(random_state=42).fit(X_train_balanced, y_train_balanced)

# Show the fitted estimator as the cell output
model


Step 5: Make Predictions

In [17]:
# Score the test set with class probabilities rather than hard labels;
# column 1 holds the probability of the positive class (`True`)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Flag a row as positive (1) when its probability reaches the cut-off (e.g., 0.3)
threshold = 0.3
y_pred = np.greater_equal(y_pred_prob, threshold).astype(int)


Step 6: Evaluate the Model

In [18]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix: rows are the actual classes, columns the predicted classes
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# Per-class precision, recall and F1-score, plus overall averages
report = classification_report(y_test, y_pred)
print(report)


[[757 748]
 [254 241]]
              precision    recall  f1-score   support

       False       0.75      0.50      0.60      1505
        True       0.24      0.49      0.32       495

    accuracy                           0.50      2000
   macro avg       0.50      0.49      0.46      2000
weighted avg       0.62      0.50      0.53      2000



Step 7: Analyze Feature Importance

In [19]:
# Rank the encoded feature columns by the fitted forest's importance scores,
# highest first
importances = model.feature_importances_
feature_names = X.columns
feature_importances = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
)
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print(feature_importances)

                            Feature  Importance
15               location_Islington    0.060739
20              age_group_46+ years    0.054982
9                  meal_type_Dinner    0.054676
0      subscribing_channel_Facebook    0.053627
1     subscribing_channel_House Ads    0.053406
6       dietary_preference_Omnivore    0.052228
12               location_Greenwich    0.051634
16            age_group_19-24 years    0.051006
18            age_group_31-36 years    0.049282
11                  meal_type_Snack    0.048843
8     dietary_preference_Vegetarian    0.048730
2   subscribing_channel_Influencers    0.047166
13                 location_Hackney    0.046195
3     subscribing_channel_Instagram    0.046046
19            age_group_37-45 years    0.045971
17            age_group_25-30 years    0.044245
10                  meal_type_Lunch    0.041154
14  location_Hammersmith and Fulham    0.040341
5           dietary_preference_Keto    0.037498
7          dietary_preference_Vegan    0