In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the cleaned dataset written by the previous notebook and preview it.
cleaned_data_path = "../data/cleaned_data.csv"
df = pd.read_csv(cleaned_data_path)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


**Explanation:**  
Loaded the cleaned dataset from the previous notebook (`cleaned_data.csv`) to prepare it for feature engineering and modeling.


In [2]:
# Columns holding string categories; these are one-hot encoded later on.
categorical_cols = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# Continuous columns; these are standardized later on.
numeric_cols = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]


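**Explanation:**  
Grouped the feature columns by the preprocessing they need:  
- **Categorical columns:** Need encoding  
- **Numerical columns:** May require scaling  

`SeniorCitizen` is already numeric (0/1 in the preview above), so it appears in neither list and is left unchanged.  
This distinction ensures each column receives the correct preprocessing before modeling.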


In [3]:
# One-hot encode every categorical column; drop_first=True omits the first
# level of each feature so the indicator columns are not perfectly collinear.
# Columns not listed in categorical_cols pass through unchanged.
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)
df_encoded.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,1,29.85,29.85,0,False,True,False,False,...,False,False,False,False,False,False,True,False,True,False
1,5575-GNVDE,0,34,56.95,1889.5,0,True,False,False,True,...,False,False,False,False,True,False,False,False,False,True
2,3668-QPYBK,0,2,53.85,108.15,1,True,False,False,True,...,False,False,False,False,False,False,True,False,False,True
3,7795-CFOCW,0,45,42.3,1840.75,0,True,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,9237-HQITU,0,2,70.7,151.65,1,False,False,False,True,...,False,False,False,False,False,False,True,False,True,False


**Explanation:**  
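Applied **one-hot encoding** to convert the categorical variables into indicator columns.  
- `drop_first=True` drops one level per feature, which avoids perfectly collinear dummy columns (multicollinearity)  
- The encoded features are boolean (`True`/`False`); `customerID` is still a string identifier, so it must be excluded before the data is passed to a model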

In [4]:
# Standardize the numeric columns of df_encoded, overwriting them in place.
# NOTE(review): the scaler is fit on every row of df_encoded; if a train/test
# split happens downstream, confirm this does not leak test-set statistics.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded[numeric_cols])
df_encoded[numeric_cols] = scaled_values
df_encoded.head()

Unnamed: 0,customerID,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,-1.280248,-1.161694,-0.994194,0,False,True,False,False,...,False,False,False,False,False,False,True,False,True,False
1,5575-GNVDE,0,0.064303,-0.260878,-0.17374,0,True,False,False,True,...,False,False,False,False,True,False,False,False,False,True
2,3668-QPYBK,0,-1.239504,-0.363923,-0.959649,1,True,False,False,True,...,False,False,False,False,False,False,True,False,False,True
3,7795-CFOCW,0,0.512486,-0.74785,-0.195248,0,True,False,False,False,...,False,False,False,False,True,False,False,False,False,False
4,9237-HQITU,0,-1.239504,0.196178,-0.940457,1,False,False,False,True,...,False,False,False,False,False,False,True,False,True,False


**Explanation:**  
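Applied **StandardScaler** to the numerical features (`tenure`, `MonthlyCharges`, `TotalCharges`):  
- Centers each feature around 0 and scales it to unit variance  
- Important for algorithms sensitive to feature scales (e.g., Logistic Regression, SVM)  
- Note: the scaler is fit on the full dataset here; if the data is later split into train and test sets, fitting on the training portion only avoids leaking test-set statistics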


In [5]:
# Target vector: the Churn column.
y = df_encoded['Churn']
# Feature matrix: every remaining column of df_encoded (only Churn is removed).
X = df_encoded.drop(columns='Churn')

**Explanation:**  
Separated the dataset into:  
- **X (features):** Every column except `Churn` (this still includes the `customerID` identifier)  
- **y (target):** Churn (0/1)  

This is the standard format for machine learning models.


In [6]:
# Re-attach the target as the last column and persist the result without the
# DataFrame index so the next notebook can read it back directly.
processed_data = pd.concat(objs=[X, y], axis="columns")
processed_data.to_csv(
    "../data/processed_data.csv",
    index=False,
)
print("Processed data saved to ../data/processed_data.csv")

Processed data saved to ../data/processed_data.csv


**Explanation:**  
Saved the fully **encoded and scaled dataset** as `processed_data.csv` in the `data/` folder.  
- This dataset is ready for the next notebook: **Model Training**.
