In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Train a Random Forest on the customer features to predict campaign
# response, report its test-set metrics, and save a confusion-matrix plot.

# Load the updated dataset
df = pd.read_csv('../data/ifood_df_updated.csv')

# Remove the income outlier for consistency
# (rows whose Income is missing also fail this comparison and are dropped)
df = df[df['Income'] < 100000]

# Define the features (X) and the target variable (y)
# We'll use columns that have a strong correlation or are business-relevant
features = [
    'Income', 'MntTotal', 'TotalPurchases', 'Age', 'Kids',
    'NumWebVisitsMonth', 'Recency'
]

X = df[features]
y = df['Response']

# Split the data into training and testing sets
# We use a 70/30 split, a common practice; the fixed seed keeps it reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Random Forest Classifier
# This model is a good choice as it handles a variety of data types well
# and often provides good performance out-of-the-box.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
print("Model Performance:")
print(classification_report(y_test, y_pred))

# Generate and plot the confusion matrix
# Pin the label order so the matrix is always 2x2 and its rows/columns line
# up with the hard-coded tick labels, even if one class never shows up in
# the test labels or the predictions.
# NOTE(review): assumes Response is encoded 0 = no response, 1 = response — confirm.
class_labels = [0, 1]
tick_labels = ['No Response', 'Response']
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')

# savefig does not create missing directories, so make sure the target exists
output_path = Path('../reports/confusion_matrix.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path)

print("Model has been trained, evaluated, and the confusion matrix has been saved to 'reports/confusion_matrix.png'.")

In [None]:
# Profitability analysis: compare the campaign profit on the test set for
# three targeting strategies (everyone, model-selected, perfect oracle).

# Remove the income outlier for consistency
# (a no-op if the filter was already applied; keeps this cell re-runnable)
df = df[df['Income'] < 100000]

# Define features and target, and split the data (same as before)
features = [
    'Income', 'MntTotal', 'TotalPurchases', 'Age', 'Kids',
    'NumWebVisitsMonth', 'Recency'
]
X = df[features]
y = df['Response']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model (same as before)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Assuming these business values for profit calculation
cost_per_campaign = 1  # in dollars
profit_per_response = 100 # in dollars

# Scenario 1: No Model (Blind Campaign)
# Every customer in the test set is contacted, so the cost scales with the
# whole set while revenue only comes from the actual responders.
# NOTE(review): summing y_test as a responder count assumes Response is 0/1 — confirm.
total_customers = len(y_test)
responders_in_test_set = y_test.sum()
cost_blind = total_customers * cost_per_campaign
profit_blind = (responders_in_test_set * profit_per_response) - cost_blind

print("Profitability Analysis:")
print("Scenario 1: Blind Campaign (target everyone)")
print(f"  - Total Cost: ${cost_blind}")
print(f"  - Total Profit: ${profit_blind}\n")

# Scenario 2: Using the Model (Targeted Campaign)
# labels=[0, 1] forces a 2x2 matrix; without it the matrix shrinks to 1x1
# when only one class is present and the indexing below raises IndexError.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
true_positives = cm[1, 1]  # Customers correctly predicted to respond
false_positives = cm[0, 1] # Customers incorrectly predicted to respond

# Only customers the model flags as responders are contacted
total_targeted = true_positives + false_positives
cost_targeted = total_targeted * cost_per_campaign
profit_targeted = (true_positives * profit_per_response) - cost_targeted

print("Scenario 2: Targeted Campaign (using our model)")
print(f"  - Total Customers Targeted: {total_targeted}")
print(f"  - Total Cost: ${cost_targeted}")
print(f"  - Total Profit: ${profit_targeted}\n")

# Scenario 3: The "Ideal" Model (theoretical maximum profit)
# This model perfectly identifies all responders without any false positives,
# so exactly the responders counted in Scenario 1 are contacted.
cost_ideal = responders_in_test_set * cost_per_campaign
profit_ideal = (responders_in_test_set * profit_per_response) - cost_ideal

print("Scenario 3: The 'Ideal' Model (100% accurate)")
print(f"  - Total Cost: ${cost_ideal}")
print(f"  - Total Profit: ${profit_ideal}")