# Build dim_demographics

Purpose:
- Create a clean demographic dimension for churn segmentation
- Derive interpretable age groups
- Preserve customer-level grain (one row per customer ID)

In [1]:
import pandas as pd
import numpy as np

# Folder holding the raw extracts. The trailing slash matters because the
# file name is appended by plain string concatenation.
data_path = "../data/raw/"

# Read the demographics workbook into a DataFrame.
demographics_file = data_path + "Telco_customer_churn_demographics.xlsx"
demographics = pd.read_excel(demographics_file)

# Preview the first rows to confirm the load and inspect the columns.
demographics.head()

Unnamed: 0,Customer ID,Count,Gender,Age,Under 30,Senior Citizen,Married,Dependents,Number of Dependents
0,8779-QRDMV,1,Male,78,No,Yes,No,No,0
1,7495-OOKFY,1,Female,74,No,Yes,Yes,Yes,1
2,1658-BYGOY,1,Male,71,No,Yes,No,Yes,3
3,4598-XLKNJ,1,Female,78,No,Yes,Yes,Yes,1
4,4846-WHAFZ,1,Female,80,No,Yes,Yes,Yes,1


In [2]:
def age_band(age):
    """Map an age in years to a coarse, human-readable age band.

    Bands are closed on the lower bound and open on the upper bound:
    "<30" for ages below 30, "30–44" for 30 up to (not including) 45,
    "45–59" for 45 up to (not including) 60, and "60+" for everything else.

    Parameters
    ----------
    age : number or missing value
        Age in years. None / NaN / pd.NA are treated as missing.

    Returns
    -------
    str or float
        The band label, or NaN when the age is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # age would fall through all branches and be mislabelled as "60+".
    if pd.isna(age):
        return np.nan

    if age < 30:
        return "<30"
    elif age < 45:
        return "30–44"
    elif age < 60:
        return "45–59"
    else:
        return "60+"

# Derive the band for every customer by passing each Age value to age_band.
demographics["age_group"] = demographics["Age"].map(age_band)

In [3]:
# Sanity-check the banding: number of customers that land in each age group.
age_group_counts = demographics["age_group"].value_counts()
age_group_counts

age_group
30–44    1943
45–59    1917
60+      1782
<30      1401
Name: count, dtype: int64

In [4]:
# Keep only the attributes that belong in the demographic dimension.
# .copy() makes the result an independent frame rather than a view of
# `demographics`.
dimension_columns = ["Customer ID", "Gender", "Senior Citizen", "age_group"]
dim_demographics = demographics[dimension_columns].copy()

In [5]:
# Rename to snake_case with an explicit old -> new mapping, so each new name
# is tied to its source column rather than to its position in the frame.
# "age_group" is already snake_case and is left as is. Labels missing from
# the frame are ignored, so re-running this cell is a no-op.
dim_demographics = dim_demographics.rename(
    columns={
        "Customer ID": "customer_id",
        "Gender": "gender",
        "Senior Citizen": "senior_citizen",
    }
)

In [6]:
# Print the column names, non-null counts and dtypes of the dimension table.
dim_demographics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   customer_id     7043 non-null   object
 1   gender          7043 non-null   object
 2   senior_citizen  7043 non-null   object
 3   age_group       7043 non-null   object
dtypes: object(4)
memory usage: 220.2+ KB


In [7]:
# Grain check: the dimension must hold exactly one row per customer.
# Fail loudly here instead of relying on the reader to compare the distinct
# count below with the row count by eye.
assert dim_demographics["customer_id"].is_unique, (
    "dim_demographics must contain one row per customer_id"
)

# Number of distinct customers in the dimension.
dim_demographics["customer_id"].nunique()

7043

In [8]:
# Write the finished dimension to the processed-data folder. index=False
# keeps the positional row index out of the file.
output_file = "../data/processed/dim_demographics.csv"
dim_demographics.to_csv(output_file, index=False)

Notes:
- Raw age was converted into categorical age groups to improve interpretability.
- Demographic attributes were kept minimal and descriptive to avoid overlap with service and customer dimensions.
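- Age bands (<30, 30–44, 45–59, 60+) were chosen to balance interpretability with a reasonably even spread of customers across groups (roughly 1,400–1,950 customers per band).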