In [74]:

# =============================================================================
# - Utility/telecom customer data
# - Predict customer churn
# - Real business context
#
# INSTRUCTIONS:
# 1. Run SETUP once
# 2. Practice PANDAS section
# 3. Practice SKLEARN section
# =============================================================================

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# =============================================================================
# SETUP: CREATE DATA (Run once, don't memorize!)
# =============================================================================

# Hand-written practice dataset: 50 customers, one 50-element list per column.
# dict(...) keeps keyword order, so the DataFrame columns appear in the order
# they are written below.
data = dict(
    customer_id=[f'C{n:04d}' for n in range(1, 51)],  # 'C0001' ... 'C0050'

    # --- Demographics ---
    age=[
        25, 45, 32, 58, 41, 29, 67, 35, 52, 28,
        38, 61, 44, 31, 56, 27, 49, 34, 64, 39,
        42, 26, 55, 33, 48, 37, 59, 30, 51, 43,
        36, 62, 47, 29, 54, 40, 65, 32, 50, 35,
        46, 28, 57, 41, 63, 34, 53, 38, 60, 31,
    ],

    income_level=[
        'Medium', 'High', 'Medium', 'High', 'Medium',
        'Low', 'High', 'Medium', 'High', 'Low',
        'Medium', 'High', 'Medium', 'Low', 'High',
        'Low', 'Medium', 'Low', 'High', 'Medium',
        'Medium', 'Low', 'High', 'Medium', 'Medium',
        'Medium', 'High', 'Low', 'High', 'Medium',
        'Medium', 'High', 'Medium', 'Low', None,  # Missing
        'Medium', 'High', 'Medium', 'High', 'Low',
        'Medium', 'Low', 'High', 'Medium', None,  # Missing
        'Low', 'High', 'Medium', 'High', 'Medium',
    ],

    # --- Customer behavior ---
    tenure_months=[
        3, 36, 12, 48, 24, 6, 60, 18, 42, 8,
        15, 54, 30, 9, 45, 5, 33, 11, 58, 21,
        27, 7, 51, 14, 39, 20, 56, 10, 44, 25,
        17, 62, 35, 8, 50, 28, 64, 13, 47, 19,
        32, 6, 53, 26, 61, 16, 49, 22, 59, 12,
    ],

    monthly_bill=[
        45.5, 120.3, 67.8, 135.2, 89.4, 52.1, 145.7, 78.9, 128.6, 48.3,
        72.4, 138.9, 95.7, 55.2, 132.4, 41.8, 105.3, 63.7, 142.1, 84.6,
        91.2, None, 125.8, 69.4, 112.5, 76.3, 136.7, 58.9, 129.3, 87.1,  # Missing
        74.8, 140.2, 108.6, 54.7, 133.9, 82.5, 147.3, 66.2, 127.8, 71.9,
        98.4, 49.6, 131.5, 85.3, None, 68.1, 134.7, 79.2, 139.8, 64.5,  # Missing
    ],

    contract_type=[
        'Month-to-month', '1-year', 'Month-to-month', '2-year', '1-year',
        'Month-to-month', '2-year', 'Month-to-month', '2-year', 'Month-to-month',
        '1-year', '2-year', '1-year', 'Month-to-month', '2-year',
        'Month-to-month', '1-year', 'Month-to-month', '2-year', '1-year',
        '1-year', 'Month-to-month', '2-year', 'Month-to-month', '1-year',
        '1-year', '2-year', 'Month-to-month', '2-year', '1-year',
        'Month-to-month', '2-year', '1-year', 'Month-to-month', '2-year',
        '1-year', '2-year', 'Month-to-month', '2-year', 'Month-to-month',
        '1-year', 'Month-to-month', '2-year', '1-year', '2-year',
        'Month-to-month', '2-year', '1-year', '2-year', 'Month-to-month',
    ],

    payment_method=[
        'Auto', 'Auto', 'Manual', 'Auto', 'Auto',
        'Manual', 'Auto', 'Manual', 'Auto', 'Manual',
        'Auto', 'Auto', 'Auto', 'Manual', 'Auto',
        'Manual', 'Auto', 'Manual', 'Auto', 'Auto',
        'Auto', 'Manual', 'Auto', 'Manual', 'Auto',
        'Auto', 'Auto', 'Manual', 'Auto', 'Auto',
        'Manual', 'Auto', 'Auto', 'Manual', 'Auto',
        'Auto', 'Auto', 'Manual', 'Auto', 'Manual',
        'Auto', 'Manual', 'Auto', 'Auto', 'Auto',
        'Manual', 'Auto', 'Auto', 'Auto', 'Manual',
    ],

    support_calls_last_month=[
        3, 0, 1, 0, 1, 4, 0, 2, 0, 5,
        1, 0, 1, 2, 0, 6, 1, 3, 0, 1,
        1, 4, 0, 2, 1, 1, 0, 3, 0, 1,
        2, 0, 1, 5, 0, 1, 0, 4, 0, 2,
        1, 3, 0, 1, 0, 2, 0, 1, 0, 1,
    ],

    # --- Target: 1 = churned, 0 = retained ---
    # Churn more likely if: short tenure, month-to-month, high support calls
    churned=[
        1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
        0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
    ],
)

# One row per customer, one column per key of `data`.
df = pd.DataFrame(data)

In [75]:
# Alternative to the inline SETUP data: load the dataset from a CSV file instead.
# df = pd.read_csv('some_file.csv')

In [76]:
# 1. Explore the data
# A cell only renders its LAST expression automatically, so a bare df.head()
# in the middle of the cell is silently discarded. display() shows the preview
# explicitly (display is available without an import inside Jupyter/IPython).
display(df.head())
df.info()  # column dtypes and non-null counts (printed directly by pandas)
df.shape   # (rows, columns): 50 rows x 9 columns = customer_id + 7 candidate features + the churned target

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               50 non-null     object 
 1   age                       50 non-null     int64  
 2   income_level              48 non-null     object 
 3   tenure_months             50 non-null     int64  
 4   monthly_bill              48 non-null     float64
 5   contract_type             50 non-null     object 
 6   payment_method            50 non-null     object 
 7   support_calls_last_month  50 non-null     int64  
 8   churned                   50 non-null     int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 3.6+ KB


(50, 9)

In [77]:
df.describe() # descriptives: count, mean, std, min, quartiles and max for the numeric columns only

Unnamed: 0,age,tenure_months,monthly_bill,support_calls_last_month,churned
count,50.0,50.0,48.0,50.0,50.0
mean,43.6,30.2,95.627083,1.36,0.36
std,12.224098,18.934905,33.181434,1.548666,0.484873
min,25.0,3.0,41.8,0.0,0.0
25%,33.25,13.25,68.025,0.0,0.0
50%,41.5,26.5,88.25,1.0,0.0
75%,53.75,47.75,129.85,2.0,1.0
max,67.0,64.0,147.3,6.0,1.0


In [78]:
# Check class balance: share of rows for each value of the `churned` target.
df['churned'].value_counts(normalize=True)

# 0 = retained customers: 64% stayed
# 1 = churned customers: 36% left

# Real-world churn data is often far more imbalanced (e.g. ~95% stay, ~5% churn).
# Here the split is close to 60/40, so the model should have plenty of examples of both classes to learn from.

Unnamed: 0_level_0,proportion
churned,Unnamed: 1_level_1
0,0.64
1,0.36


In [79]:
# Check missing values: number of null (NaN/None) entries in each column.
df.isna().sum()

Unnamed: 0,0
customer_id,0
age,0
income_level,2
tenure_months,0
monthly_bill,2
contract_type,0
payment_method,0
support_calls_last_month,0
churned,0


In [80]:
#Step 2: Preprocessing

In [81]:
# A. Handle the missing values found above: monthly_bill and income_level.
# NOTE(review): both fill statistics are computed on the full dataset, i.e.
# before any train/test split -- confirm that is acceptable for this exercise.

# monthly_bill is a dollar amount -> fill with the median, which is robust to outliers.
bill_median = df['monthly_bill'].median()
# income_level is categorical (Low/Medium/High), so no arithmetic is possible;
# fill with the most frequent label to keep the distribution as it is.
most_common_income = df['income_level'].mode()[0]

df['monthly_bill'] = df['monthly_bill'].fillna(bill_median)
df['income_level'] = df['income_level'].fillna(most_common_income)

# Re-check: every column should now report 0 missing values.
df.isna().sum()

Unnamed: 0,0
customer_id,0
age,0
income_level,0
tenure_months,0
monthly_bill,0
contract_type,0
payment_method,0
support_calls_last_month,0
churned,0


In [82]:
# The data is clean now.
# Encode the categorical variables into a numerical format so the ML model can process them.

# B. income_level
# Rank matters here, so the labels are mapped to ordered integers:
# as the number goes up, income goes up.
income_map = {'Low': 0, 'Medium': 1, 'High': 2}

# Guard so this cell is safe to re-run: Series.map() turns every value that is
# not a key of the dict into NaN, so mapping a second time (when the column
# already holds 0/1/2) would wipe out the whole column.
if not pd.api.types.is_numeric_dtype(df['income_level']):
    # Fail loudly on labels the map does not know instead of silently producing NaN.
    # Genuinely missing values are left alone and stay NaN, exactly as before.
    unexpected_levels = set(df['income_level'].dropna()) - set(income_map)
    if unexpected_levels:
        raise ValueError(
            f'income_level has labels missing from income_map: {sorted(map(str, unexpected_levels))}'
        )
    df['income_level'] = df['income_level'].map(income_map)


# C. Remaining non-numeric columns: customer_id, contract_type and payment_method
# customer_id is just a label, not a feature -> dropped.
# contract_type and payment_method have no natural order, so they are one-hot
# encoded into flag columns; drop_first=True drops one level per column to
# avoid a redundant flag.
df_encoded = pd.get_dummies(
    df.drop('customer_id', axis=1),
    columns=['contract_type', 'payment_method'],
    drop_first=True
)

# Preview of df (income_level is now numeric); the one-hot columns live in df_encoded.
df.head()

Unnamed: 0,customer_id,age,income_level,tenure_months,monthly_bill,contract_type,payment_method,support_calls_last_month,churned
0,C0001,25,1,3,45.5,Month-to-month,Auto,3,1
1,C0002,45,2,36,120.3,1-year,Auto,0,0
2,C0003,32,1,12,67.8,Month-to-month,Manual,1,1
3,C0004,58,2,48,135.2,2-year,Auto,0,0
4,C0005,41,1,24,89.4,1-year,Auto,1,0


In [None]:
#Step 3: Modeling

In [83]:
# Model choice
# ------------
# Linear regression is NOT used: it predicts continuous numbers (e.g. an exact
# dollar amount), whereas churn is a yes/no category.
#
# BASELINE -> logistic regression
#   - gives coefficients with a direction (+/-), so it is immediately interpretable
#   - needs scaled features, which StandardScaler can provide
#   - feature combinations (interactions) have to be specified manually
#
# OPTIMIZE -> random forest
#   - handles non-linear relationships and does not care about feature scaling
#   - can naturally pick up which features combined contribute to high churn risk
#   - provides "feature importance" automatically, e.g. the top 5 reasons for churn
#   - lets the effort go into feature engineering to extract actionable insights for the client

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# 1. Define X and y
# The target is dropped from X so there is no leakage: the model only learns
# from the predictors that are available.
X = df_encoded.drop(columns='churned')
y = df_encoded['churned']

# 2. Split the data
# - random_state is fixed so the split is reproducible: changes in results then
#   come from feature engineering, not from chance.
#   (42 is the "DS answer to the universe"; in my studies I have just used 1.)
# - stratify=y because the churned class is smaller than the retained class;
#   it makes sure both train and test contain a representative share of churners.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3. Initialise and fit
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

