In [2]:
import pandas as pd

# 1. Load raw data
raw_path = "../data/raw/Telco-Customer-Churn.csv"
df = pd.read_csv(raw_path)

# 2. Print shape & dtypes to confirm
print("Shape:", df.shape)
print(df.dtypes)

Shape: (7043, 21)
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


In [3]:
non_numeric = df[pd.to_numeric(df["TotalCharges"], errors="coerce").isna()]
display(non_numeric.head())
print("Number of non-numeric TotalCharges rows:", non_numeric.shape[0])

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No


Number of non-numeric TotalCharges rows: 11


In [4]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

In [5]:
df["TotalCharges"].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TotalCharges"].fillna(0, inplace=True)


In [6]:
print(df["TotalCharges"].dtype)

float64


In [7]:
binary_cols = [
    "Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn"
]

# Map Yes→1, No→0
for col in binary_cols:
    df[col] = df[col].map({"Yes": 1, "No": 0})

In [8]:
three_level_cols = [
    "MultipleLines", "OnlineSecurity", "OnlineBackup",
    "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"
]

for col in three_level_cols:
    df[col] = df[col].replace({"No phone service": "No", "No internet service": "No"})
    # Now convert to binary
    df[col] = df[col].map({"Yes": 1, "No": 0})

In [9]:
df = pd.get_dummies(df, columns=["InternetService", "Contract", "PaymentMethod"], drop_first=True)

In [10]:
df["gender"] = df["gender"].map({"Female": 0, "Male": 1})

In [11]:
df["SeniorCitizen"] = df["SeniorCitizen"].astype("category")


In [12]:
# Example buckets: 0–12 months, 13–24, 25–48, 49–72
bins = [0, 12, 24, 48, 72]
labels = ["0–12", "13–24", "25–48", "49–72"]
df["tenure_group"] = pd.cut(df["tenure"], bins=bins, labels=labels, right=True, include_lowest=True)

# One‐hot encode these buckets if you want
df = pd.get_dummies(df, columns=["tenure_group"], drop_first=True)


In [13]:
df.drop(columns=["customerID"], inplace=True)


In [14]:
display(df.head())


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_13–24,tenure_group_25–48,tenure_group_49–72
0,0,0,1,0,1,0,0,0,1,0,...,False,False,False,False,False,True,False,False,False,False
1,1,0,0,0,34,1,0,1,0,1,...,False,False,True,False,False,False,True,False,True,False
2,1,0,0,0,2,1,0,1,1,0,...,False,False,False,False,False,False,True,False,False,False
3,1,0,0,0,45,0,0,1,0,1,...,False,False,True,False,False,False,False,False,True,False
4,0,0,0,0,2,1,0,0,0,0,...,True,False,False,False,False,True,False,False,False,False


In [15]:
display(df.isnull().sum().to_frame("null_count"))


Unnamed: 0,null_count
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
OnlineSecurity,0
OnlineBackup,0
DeviceProtection,0


In [16]:
print(df.dtypes)


gender                                      int64
SeniorCitizen                            category
Partner                                     int64
Dependents                                  int64
tenure                                      int64
PhoneService                                int64
MultipleLines                               int64
OnlineSecurity                              int64
OnlineBackup                                int64
DeviceProtection                            int64
TechSupport                                 int64
StreamingTV                                 int64
StreamingMovies                             int64
PaperlessBilling                            int64
MonthlyCharges                            float64
TotalCharges                              float64
Churn                                       int64
InternetService_Fiber optic                  bool
InternetService_No                           bool
Contract_One year                            bool


In [17]:
processed_path = "../data/processed/telco_churn_clean.csv"
df.to_csv(processed_path, index=False)
