In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Load the raw dataset from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="Data.csv")

In [16]:
# Heading and the first five rows, each on its own line of plain-text output.
print("Displaying first 5 rows of the dataset:", df.head(5), sep="\n")

Displaying first 5 rows of the dataset:
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport Streamin

In [17]:
# df.shape is a (rows, columns) tuple.
print("Dataset Shape (Rows, Columns):", df.shape, sep="\n")

Dataset Shape (Rows, Columns):
(7043, 21)


In [18]:
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" after the report.
df.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-n

In [19]:
# describe() summarises only the numeric columns by default.
print(f"Statistical Summary of Numerical Columns:\n{df.describe()}")

Statistical Summary of Numerical Columns:
       SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000


In [20]:
# Per-column count of missing entries (isna is the same check as isnull).
print("Checking missing values column-wise:", df.isna().sum(), sep="\n")

Checking missing values column-wise:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [21]:
print("Converting TotalCharges column to numeric...")

# errors="coerce" turns values that cannot be parsed as numbers into NaN
# instead of raising, so they show up in the missing-value count below.
total_charges_numeric = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges_numeric

print(
    "Conversion done.",
    "Missing values after conversion:",
    total_charges_numeric.isna().sum(),
    sep="\n",
)


Converting TotalCharges column to numeric...
Conversion done.
Missing values after conversion:
11


In [22]:
print("Filling missing values in TotalCharges with median...")

# NOTE(review): the median is taken over the whole frame, before any
# train/test split — confirm that is acceptable for the modeling step.
median_value = df["TotalCharges"].median()
print("Median value used:", median_value)

# Replace only the missing entries; every other value is kept as-is.
df["TotalCharges"] = df["TotalCharges"].mask(
    df["TotalCharges"].isna(), median_value
)

print("Missing values after filling:", df["TotalCharges"].isna().sum(), sep="\n")


Filling missing values in TotalCharges with median...
Median value used: 1397.475
Missing values after filling:
0


In [23]:
print("Dropping customerID column (not useful for modeling)...")

# Rebind `df` rather than mutating it in place, and tolerate the column
# already being gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=["customerID"], errors="ignore")

print("customerID column dropped.")
print("Remaining columns:")
print(df.columns)


Dropping customerID column (not useful for modeling)...
customerID column dropped.
Remaining columns:
Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')


In [24]:
# Columns stored with object dtype are treated as categorical.
print("Identifying categorical columns...")
cat_cols = df.select_dtypes(include="object").columns
print("Categorical Columns:", cat_cols, sep="\n")

# Columns stored as int64 or float64 are treated as numerical.
print("Identifying numerical columns...")
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
print("Numerical Columns:", num_cols, sep="\n")


Identifying categorical columns...
Categorical Columns:
Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'Churn'],
      dtype='object')
Identifying numerical columns...
Numerical Columns:
Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')


In [25]:
print("Applying One-Hot Encoding to categorical variables...")

# drop_first=True drops the first level of each encoded column, leaving
# k-1 indicator columns for a column with k categories.
df_encoded = pd.get_dummies(data=df, drop_first=True)

print(
    "Encoding completed ✅",
    "New dataset shape after encoding:",
    df_encoded.shape,
    sep="\n",
)


Applying One-Hot Encoding to categorical variables...
Encoding completed ✅
New dataset shape after encoding:
(7043, 31)


In [26]:
print("Separating features and target variable...")

# The target is the "Churn_Yes" column; all other columns are features.
y = df_encoded["Churn_Yes"]
X = df_encoded.drop(columns=["Churn_Yes"])

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")


Separating features and target variable...
Feature matrix shape: (7043, 30)
Target vector shape: (7043,)


In [27]:
print("\nSplitting dataset into training and testing sets...")

# 80/20 split with a fixed seed so the partition is reproducible.
# NOTE(review): the split is not stratified on y — confirm the class
# balance of the target makes that acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train-Test split completed ✅")



Splitting dataset into training and testing sets...
Train-Test split completed ✅


In [28]:
# Report the shape of each of the four partitions.
print("\nFinal Dataset Shapes:")
print(f"X_train: {X_train.shape}")
print(f"X_test : {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test : {y_test.shape}")



Final Dataset Shapes:
X_train: (5634, 30)
X_test : (1409, 30)
y_train: (5634,)
y_test : (1409,)
