# Customer Churn Prediction

In [2]:
import pandas as pd
from sqlalchemy import create_engine, text
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

ModuleNotFoundError: No module named 'sklearn'

In [None]:


# 1. Connect to the Database
# The SQLite file is referenced by a relative path ("sqlite:///" followed by
# the file name), so it is looked up relative to the working directory.
DATABASE_FILE = "classicmodels.sqlite"
engine = create_engine("sqlite:///" + DATABASE_FILE)

def get_data(sql_query: str) -> pd.DataFrame:
    """Run *sql_query* on the module-level ``engine`` and return the result.

    A connection is opened for this call only and is released when the
    ``with`` block exits.

    Args:
        sql_query: SQL statement text. It is wrapped in ``text()`` before
            being passed to pandas.

    Returns:
        The query result as a DataFrame.
    """
    statement = text(sql_query)
    with engine.connect() as conn:
        frame = pd.read_sql(statement, conn)
    return frame

# 2. Define the 'Snapshot Date' and Churn Logic
# Recency is measured backwards from a fixed reference ("snapshot") date.
# The latest order date in the dataset is usually around May 31, 2005 (check
# your data); the snapshot is deliberately set somewhat later than that.
SNAPSHOT_DATE = datetime(2006, 1, 1)

# Churn Definition: a customer is considered CHURNED if their last order was
# more than 365 days (roughly 12 months) before SNAPSHOT_DATE.

# 3. Calculate RFM Features
# The query returns one row per customer that has at least one order line.
# The CTE joins orders to orderdetails, so it holds one row per ORDER LINE;
# Frequency therefore has to count distinct order numbers, otherwise an order
# with several line items would be counted several times.
rfm_query = f"""
WITH CustomerOrders AS (
    SELECT
        o.customerNumber,
        o.orderNumber,
        o.orderDate,
        od.quantityOrdered * od.priceEach AS sales
    FROM
        orders o
    JOIN
        orderdetails od ON o.orderNumber = od.orderNumber
)
SELECT
    t1.customerNumber,
    -- Recency = whole days between the snapshot date and the last order
    CAST(JULIANDAY('{SNAPSHOT_DATE:%Y-%m-%d}') - JULIANDAY(MAX(t1.orderDate)) AS INTEGER) AS Recency,
    -- Frequency = number of distinct orders (not order lines)
    COUNT(DISTINCT t1.orderNumber) AS Frequency,
    -- Monetary = total sales value across all order lines
    SUM(t1.sales) AS Monetary,
    -- Last Order Date (for Churn Label)
    MAX(t1.orderDate) AS LastOrderDate
FROM
    CustomerOrders t1
GROUP BY
    t1.customerNumber;
"""

# Run the RFM query; the result has one row per customer it returns.
df_rfm = get_data(rfm_query)

# 4. Create the Churn Label (Target Variable Y)
# NOTE: the label below is a threshold on days since the last order, which is
# the same quantity the query exposes as 'Recency'. If Recency is later used
# as a model feature together with this label, it will leak the target.
df_rfm['LastOrderDate'] = pd.to_datetime(df_rfm['LastOrderDate'])
days_since_last_order = (SNAPSHOT_DATE - df_rfm['LastOrderDate']).dt.days

# Churn = 1 when the last order is more than 365 days old, otherwise 0 (Active).
df_rfm['Churn'] = days_since_last_order.gt(365).astype(int)

# Preview the features and report the class balance.
preview_columns = ['customerNumber', 'Recency', 'Frequency', 'Monetary', 'Churn']
churned_count = df_rfm['Churn'].sum()
active_count = len(df_rfm) - churned_count

print("--- RFM Features with Churn Label ---")
print(df_rfm[preview_columns].head())
print(f"\nChurned Customers (1): {churned_count}")
print(f"Active Customers (0): {active_count}")