<a href="https://colab.research.google.com/github/katemyer/DataScience/blob/main/propensity_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
# âš¡ BLASTPOINT-STYLE CUSTOMER CHURN PRACTICE
# ===========================================
# This mimics exactly what you'll see Friday:
# - Utility/telecom customer data
# - Predict customer churn
# - Real business context

# INSTRUCTIONS:
# 1. Run SETUP once (don't memorize)
# 2. Practice PANDAS section - type 5 times
# 3. Practice SKLEARN section - type 3 times
# """

import pandas as pd
import numpy as np

# =============================================================================
# SETUP: CREATE DATA (Run once, don't memorize!)
# =============================================================================

data = {
    'customer_id': [f'C{i:04d}' for i in range(1, 51)],

    # Demographics
    'age': [25, 45, 32, 58, 41, 29, 67, 35, 52, 28,
            38, 61, 44, 31, 56, 27, 49, 34, 64, 39,
            42, 26, 55, 33, 48, 37, 59, 30, 51, 43,
            36, 62, 47, 29, 54, 40, 65, 32, 50, 35,
            46, 28, 57, 41, 63, 34, 53, 38, 60, 31],

    'income_level': ['Medium', 'High', 'Medium', 'High', 'Medium',
                     'Low', 'High', 'Medium', 'High', 'Low',
                     'Medium', 'High', 'Medium', 'Low', 'High',
                     'Low', 'Medium', 'Low', 'High', 'Medium',
                     'Medium', 'Low', 'High', 'Medium', 'Medium',
                     'Medium', 'High', 'Low', 'High', 'Medium',
                     'Medium', 'High', 'Medium', 'Low', None,  # Missing
                     'Medium', 'High', 'Medium', 'High', 'Low',
                     'Medium', 'Low', 'High', 'Medium', None,  # Missing
                     'Low', 'High', 'Medium', 'High', 'Medium'],

    # Customer behavior
    'tenure_months': [3, 36, 12, 48, 24, 6, 60, 18, 42, 8,
                      15, 54, 30, 9, 45, 5, 33, 11, 58, 21,
                      27, 7, 51, 14, 39, 20, 56, 10, 44, 25,
                      17, 62, 35, 8, 50, 28, 64, 13, 47, 19,
                      32, 6, 53, 26, 61, 16, 49, 22, 59, 12],

    'monthly_bill': [45.5, 120.3, 67.8, 135.2, 89.4, 52.1, 145.7, 78.9, 128.6, 48.3,
                     72.4, 138.9, 95.7, 55.2, 132.4, 41.8, 105.3, 63.7, 142.1, 84.6,
                     91.2, None, 125.8, 69.4, 112.5, 76.3, 136.7, 58.9, 129.3, 87.1,  # Missing
                     74.8, 140.2, 108.6, 54.7, 133.9, 82.5, 147.3, 66.2, 127.8, 71.9,
                     98.4, 49.6, 131.5, 85.3, None, 68.1, 134.7, 79.2, 139.8, 64.5],  # Missing

    'contract_type': ['Month-to-month', '1-year', 'Month-to-month', '2-year', '1-year',
                      'Month-to-month', '2-year', 'Month-to-month', '2-year', 'Month-to-month',
                      '1-year', '2-year', '1-year', 'Month-to-month', '2-year',
                      'Month-to-month', '1-year', 'Month-to-month', '2-year', '1-year',
                      '1-year', 'Month-to-month', '2-year', 'Month-to-month', '1-year',
                      '1-year', '2-year', 'Month-to-month', '2-year', '1-year',
                      'Month-to-month', '2-year', '1-year', 'Month-to-month', '2-year',
                      '1-year', '2-year', 'Month-to-month', '2-year', 'Month-to-month',
                      '1-year', 'Month-to-month', '2-year', '1-year', '2-year',
                      'Month-to-month', '2-year', '1-year', '2-year', 'Month-to-month'],

    'payment_method': ['Auto', 'Auto', 'Manual', 'Auto', 'Auto',
                       'Manual', 'Auto', 'Manual', 'Auto', 'Manual',
                       'Auto', 'Auto', 'Auto', 'Manual', 'Auto',
                       'Manual', 'Auto', 'Manual', 'Auto', 'Auto',
                       'Auto', 'Manual', 'Auto', 'Manual', 'Auto',
                       'Auto', 'Auto', 'Manual', 'Auto', 'Auto',
                       'Manual', 'Auto', 'Auto', 'Manual', 'Auto',
                       'Auto', 'Auto', 'Manual', 'Auto', 'Manual',
                       'Auto', 'Manual', 'Auto', 'Auto', 'Auto',
                       'Manual', 'Auto', 'Auto', 'Auto', 'Manual'],

    'support_calls_last_month': [3, 0, 1, 0, 1, 4, 0, 2, 0, 5,
                                 1, 0, 1, 2, 0, 6, 1, 3, 0, 1,
                                 1, 4, 0, 2, 1, 1, 0, 3, 0, 1,
                                 2, 0, 1, 5, 0, 1, 0, 4, 0, 2,
                                 1, 3, 0, 1, 0, 2, 0, 1, 0, 1],

    # Target: 1 = churned, 0 = retained
    # Churn more likely if: short tenure, month-to-month, high support calls
    'churned': [1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
                0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
                0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
                1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
                0, 1, 0, 0, 0, 1, 0, 0, 0, 1]
}

In [36]:
# load data if csv
# df = pd.read_csv('some_file.csv')

In [37]:
# 1. explore the data
df.head()
df.info() #types

df.describe() # descriptives

df.shape #50 rows of data and 11 features

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               50 non-null     object 
 1   age                       50 non-null     int64  
 2   income_level              50 non-null     object 
 3   tenure_months             50 non-null     int64  
 4   monthly_bill              50 non-null     float64
 5   contract_type             50 non-null     object 
 6   payment_method            50 non-null     object 
 7   support_calls_last_month  50 non-null     int64  
 8   churned                   50 non-null     int64  
 9   high_support_calls        50 non-null     int64  
 10  long_tenure               50 non-null     int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 4.4+ KB


(50, 11)

In [38]:
print("âœ… Dataset created!")
print(f"   Shape: {df.shape}")
print(f"   Churn rate: {df['churned'].mean():.1%}")
print(f"   Missing values: {df.isna().sum().sum()}")
print("\nFirst 5 customers:")
print(df.head())

print("\n" + "="*70)
print("NOW DELETE THE PANDAS SECTION BELOW AND TYPE IT 5 TIMES")
print("="*70 + "\n")


# =============================================================================
# PANDAS PRACTICE - DELETE THIS SECTION AND TYPE 5 TIMES
# =============================================================================

# Step 1: Explore the data
df.head()
df.info()
df.describe()
df.shape

# Step 2: Check missing values
df.isna().sum()

# Step 3: Analyze target variable
df['churned'].value_counts()
df['churned'].mean()

# Step 4: Explore categoricals
df['contract_type'].value_counts()
df['payment_method'].value_counts()
df['income_level'].value_counts()

# Step 5: Grouping - key insights!
df.groupby('contract_type')['churned'].mean()
df.groupby('payment_method')['churned'].mean()
df.groupby('income_level')['churned'].mean()

# Step 6: Handle missing values (IMPUTATION)
# RULE: Numeric â†’ median, Categorical â†’ mode

# Numeric column - use MEDIAN (robust to outliers)
df['monthly_bill'].fillna(df['monthly_bill'].median(), inplace=True)

# Categorical column - use MODE (most frequent value)
df['income_level'].fillna(df['income_level'].mode()[0], inplace=True)

# ALWAYS verify no missing values remain
df.isna().sum()

# Step 7: Feature engineering
df['high_support_calls'] = (df['support_calls_last_month'] > 2).astype(int)
df['long_tenure'] = (df['tenure_months'] > 24).astype(int)

# Step 8: Encode categoricals
df_encoded = pd.get_dummies(df, columns=['income_level', 'contract_type', 'payment_method'], drop_first=True)

# Step 9: Prepare for modeling
X = df_encoded.drop(['customer_id', 'churned'], axis=1)
y = df_encoded['churned']

print(f"âœ… Pandas complete! X: {X.shape}, y: {y.shape}")

print("\n" + "="*70)
print("NOW DELETE THE SKLEARN SECTION BELOW AND TYPE IT 3 TIMES")
print("="*70 + "\n")


# =============================================================================
# SKLEARN PRACTICE - DELETE THIS SECTION AND TYPE 3 TIMES
# =============================================================================

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 2: Train model
model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
model.fit(X_train, y_train)

# Step 3: Make predictions
y_pred = model.predict(X_test)

# Step 4: Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.3f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Step 5: Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

print("\n" + "="*70)
print("ðŸŽ‰ COMPLETE! Key insights:")
print(f"   â€¢ Model accuracy: {accuracy:.1%}")
print(f"   â€¢ Top predictor: {feature_importance.iloc[0]['feature']}")
print(f"   â€¢ Churn rate: {df['churned'].mean():.1%}")
print("="*70)


# =============================================================================
# WHAT TO SAY OUT LOUD (Practice this!)
# =============================================================================

print("\n" + "="*70)
print("ðŸŽ¤ PRACTICE SAYING THESE PHRASES OUT LOUD:")
print("="*70)
print("""
OPENING:
"Let me start by understanding this customer churn dataset..."
"I see we have 50 customers with demographic and behavioral data"
"The churn rate is 32% - somewhat imbalanced but manageable"

DURING EXPLORATION:
"I notice 2 missing income levels and 2 missing monthly bills"
"Month-to-month contracts show much higher churn rates"
"Customers with more support calls are more likely to churn"

DURING CLEANING:
"I'll fill missing income_level with the mode since it's categorical"
"For missing monthly_bill, I'll use the median - appropriate for numeric data"
"I'm creating a feature for high support calls as this seems predictive"

DURING MODELING:
"I'm using an 80-20 split with stratification to maintain the 32% churn rate"
"I'll start with Random Forest as a baseline - it handles non-linear relationships well"
"Setting random_state=42 for reproducibility"

DURING EVALUATION:
"The model achieves about 80% accuracy"
"Looking at feature importance, tenure and contract type are top predictors"
"Month-to-month contracts and short tenure are the biggest churn indicators"

CLOSING:
"To summarize: I built a Random Forest model predicting customer churn with 80% accuracy"
"Key findings: contract type and tenure are the strongest predictors"
"Customers on month-to-month plans with short tenure are highest risk"
"Next steps would be cross-validation and hyperparameter tuning"
""")

âœ… Dataset created!
   Shape: (50, 11)
   Churn rate: 36.0%
   Missing values: 0

First 5 customers:
  customer_id  age income_level  tenure_months  monthly_bill   contract_type  \
0       C0001   25       Medium              3          45.5  Month-to-month   
1       C0002   45         High             36         120.3          1-year   
2       C0003   32       Medium             12          67.8  Month-to-month   
3       C0004   58         High             48         135.2          2-year   
4       C0005   41       Medium             24          89.4          1-year   

  payment_method  support_calls_last_month  churned  high_support_calls  \
0           Auto                         3        1                   1   
1           Auto                         0        0                   0   
2         Manual                         1        1                   0   
3           Auto                         0        0                   0   
4           Auto                         1

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['monthly_bill'].fillna(df['monthly_bill'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['income_level'].fillna(df['income_level'].mode()[0], inplace=True)


Accuracy: 1.000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         4

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10


Confusion Matrix:
[[6 0]
 [0 4]]

Top 10 Most Important Features:
                         feature  importance
9   contract_type_Month-to-month    0.248270
0                            age    0.242737
10         payment_method_Manual    0.150444
1                  tenure_months    0.127785
3       support_calls_last_month    0.090303
2                   monthly_bill    0.077988
5                    long_tenure    0.037146
6               income_level_Low    0.014330
4             high_support_calls    0.006183
8           contract_type_2-year    0.004815

ðŸŽ‰ COMPLETE! Key insights:
   â€¢ Model accuracy: 100.0%
   â€¢ Top 