# Build dim_customer

Purpose:
- Create a customer context dimension
- Capture household and relationship attributes relevant to churn
- Preserve customer-level grain

In [1]:
import pandas as pd
import numpy as np

data_path = "../data/raw/"

customer = pd.read_excel(
    data_path + "CustomerChurn.xlsx"
)

customer.head()

Unnamed: 0,LoyaltyID,Customer ID,Senior Citizen,Partner,Dependents,Tenure,Phone Service,Multiple Lines,Internet Service,Online Security,...,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn
0,318537,7590-VHVEG,No,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,152148,5575-GNVDE,No,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,326527,3668-QPYBK,No,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,845894,7795-CFOCW,No,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,503388,9237-HQITU,No,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
customer.info()
customer["Customer ID"].nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   LoyaltyID          7043 non-null   int64  
 1   Customer ID        7043 non-null   object 
 2   Senior Citizen     7043 non-null   object 
 3   Partner            7043 non-null   object 
 4   Dependents         7043 non-null   object 
 5   Tenure             7043 non-null   int64  
 6   Phone Service      7043 non-null   object 
 7   Multiple Lines     7043 non-null   object 
 8   Internet Service   7043 non-null   object 
 9   Online Security    7043 non-null   object 
 10  Online Backup      7043 non-null   object 
 11  Device Protection  7043 non-null   object 
 12  Tech Support       7043 non-null   object 
 13  Streaming TV       7043 non-null   object 
 14  Streaming Movies   7043 non-null   object 
 15  Contract           7043 non-null   object 
 16  Paperless Billing  7043 

7043

In [3]:
dim_customer = customer[
    ["Customer ID", "Partner", "Dependents"]
].copy()

In [4]:
dim_customer.columns = [
    "customer_id",
    "has_partner",
    "has_dependents",
]

In [5]:
dim_customer.nunique()
dim_customer.isna().sum()

customer_id       0
has_partner       0
has_dependents    0
dtype: int64

In [6]:
dim_customer["customer_id"].nunique()

7043

In [7]:
dim_customer.to_csv(
    "../data/processed/dim_customer.csv",
    index=False
)

Notes:
- Customer attributes were limited to household context to avoid overlap with service and demographic dimensions.
- Revenue, tenure, and churn-related fields were intentionally excluded.