In [1]:
import kagglehub
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the Telco customer-churn dataset through kagglehub and print the
# local location that the download call returns.
dataset_handle = "blastchar/telco-customer-churn"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Path to dataset files: /Users/hammadhassan/.cache/kagglehub/datasets/blastchar/telco-customer-churn/versions/1


In [4]:
# Read the raw churn CSV from the local data directory and render it.
# NOTE(review): the rendered frame contains an 'Unnamed: 0' column, which
# looks like a previously saved index -- confirm whether it should be dropped.
churnDataKaggle = './data/raw_customer_churn_data.csv'
df = pd.read_csv(filepath_or_buffer=churnDataKaggle)
display(df)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [5]:
def ensure_dir(directory):
    """Create ``directory`` (including missing parents) if it is not there yet.

    Args:
        directory: Path of the directory to create. An empty string is
            treated as "nothing to create" (it is what ``os.path.dirname``
            returns for a bare file name), so the call is a no-op.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    if not directory:
        return
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test followed by os.makedirs().
    os.makedirs(directory, exist_ok=True)

In [6]:
def clean_data(df):
    """Return a cleaned copy of the raw churn frame.

    Steps performed:
      * ``TotalCharges`` is converted to numeric; values that cannot be
        parsed become NaN and are then replaced with 0.
      * ``Churn`` is mapped from 'Yes'/'No' to 1/0 (any other value becomes NaN).
      * ``customerID`` is dropped when present.

    Args:
        df: DataFrame with at least ``TotalCharges`` and ``Churn`` columns.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    # Copy first: assigning into the caller's frame would make a second call
    # on the same object map the already-numeric Churn column to all NaN.
    df = df.copy()

    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

    if 'customerID' in df.columns:
        df = df.drop('customerID', axis=1)

    return df

In [7]:
def encode_features(df, save_encoders=True, scaler_path='models/scaler.pkl', encoder_path='models/encoder.pkl'):
    """Encode categorical columns and standardise the numeric columns.

    Processing steps:
      * 'Partner', 'Dependents', 'PhoneService' and 'PaperlessBilling' are
        mapped from 'Yes'/'No' to 1/0.
      * 'gender' (when present) is mapped from 'Male'/'Female' to 1/0.
      * The multi-category service/contract/payment columns are one-hot
        encoded with the first level of each column dropped.
      * 'tenure', 'MonthlyCharges' and 'TotalCharges' are standardised with
        a ``StandardScaler``.

    Values outside a mapping become NaN. Both the encoder and the scaler are
    fitted on the frame passed in.

    Args:
        df: DataFrame containing the columns listed above.
        save_encoders: When True, persist the fitted scaler and encoder
            with joblib.
        scaler_path: Destination file for the fitted scaler.
        encoder_path: Destination file for the fitted encoder.

    Returns:
        A new DataFrame with the encoded/scaled features; the input frame is
        left untouched.
    """
    # Work on a copy so the column assignments below do not alter the
    # caller's frame.
    df = df.copy()

    yes_no_map = {'Yes': 1, 'No': 0}
    yes_no_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
    for col in yes_no_cols:
        df[col] = df[col].map(yes_no_map)

    # 'gender' holds Male/Female rather than Yes/No, so it must not go
    # through the Yes/No map (that would turn every value into NaN).
    if 'gender' in df.columns:
        df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

    multi_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity',
                  'OnlineBackup', 'DeviceProtection', 'TechSupport',
                  'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']

    # Newer scikit-learn releases name this option `sparse_output`; fall back
    # to the older `sparse` keyword when the installed version rejects it.
    try:
        encoder = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(drop='first', sparse=False)

    encoded_multi = encoder.fit_transform(df[multi_cols])
    encoded_cols = encoder.get_feature_names_out(multi_cols)
    encoded_df = pd.DataFrame(encoded_multi, columns=encoded_cols, index=df.index)

    df = df.drop(multi_cols, axis=1)
    df = pd.concat([df, encoded_df], axis=1)

    scaler = StandardScaler()
    num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
    df[num_cols] = scaler.fit_transform(df[num_cols])

    if save_encoders:
        # Make sure the target directory of *each* artifact exists. A bare
        # file name has an empty dirname, which os.makedirs cannot create.
        for artifact_path in (scaler_path, encoder_path):
            artifact_dir = os.path.dirname(artifact_path)
            if artifact_dir:
                os.makedirs(artifact_dir, exist_ok=True)

        joblib.dump(scaler, scaler_path)
        joblib.dump(encoder, encoder_path)

    return df

In [8]:
# Hand each step its own copy of the input frame so that column assignments
# inside clean_data / encode_features cannot alter `df` or `df_clean`.
# This keeps the cell re-runnable and keeps `df_clean` free of encoding
# side effects before it is written to disk.
df_clean = clean_data(df.copy())
df_ready = encode_features(df_clean.copy(), save_encoders=True)



In [9]:
# Write df_clean to CSV without the index column.
# NOTE(review): the encoded frame `df_ready` is not persisted by this cell --
# confirm that saving only `df_clean` is intended.
df_clean.to_csv(path_or_buf='./data/telco_clean.csv', index=False)