## Data Preprocessing for Customer Churn Prediction

This notebook performs data preprocessing steps to clean and prepare the dataset for modeling.


In [30]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn .model_selection import train_test_split

In [None]:
df=pd.read_csv(r'C:\Users\raich\Desktop\project2\-Customer-Churn-Demand-Prediction-Using-ML\Data\cleaned_telco.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Churn_flag
0,0,7590-VHVEG,Female,No,Yes,No,1,No,,DSL,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0.0
1,1,5575-GNVDE,Male,No,No,No,34,Yes,No,DSL,...,No,No,No,One year,No,Mailed check,56.950001,1889.5,No,0.0
2,2,3668-QPYBK,Male,No,No,No,2,Yes,No,DSL,...,No,No,No,Month-to-month,Yes,Mailed check,53.849998,108.150002,Yes,1.0
3,3,7795-CFOCW,Male,No,No,No,45,No,,DSL,...,Yes,No,No,One year,No,Bank transfer (automatic),42.299999,1840.75,No,0.0
4,4,9237-HQITU,Female,No,No,No,2,Yes,No,Fiber optic,...,No,No,No,Month-to-month,Yes,Electronic check,70.699997,151.649994,Yes,1.0


: 

In [None]:
df.isnull().sum()[df.isnull().sum() > 0]

MultipleLines       269
OnlineSecurity      651
OnlineBackup        651
DeviceProtection    651
TechSupport         651
StreamingTV         651
StreamingMovies     651
Churn                 1
Churn_flag            1
dtype: int64

: 

In [None]:
print(pd.Series(df['Churn'].value_counts()))
df = df.dropna(subset=['Churn'])


Churn
No     3706
Yes    1336
Name: count, dtype: int64


: 

In [None]:
col_to_fill=['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

df[col_to_fill]=df[col_to_fill].fillna('No')



: 

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5042 entries, 0 to 5042
Data columns (total 25 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   SeniorCitizen                          5042 non-null   int32  
 1   Partner                                5042 non-null   int32  
 2   Dependents                             5042 non-null   int32  
 3   tenure                                 5042 non-null   int64  
 4   PhoneService                           5042 non-null   int32  
 5   MultipleLines                          5042 non-null   int32  
 6   OnlineSecurity                         5042 non-null   int32  
 7   OnlineBackup                           5042 non-null   int32  
 8   DeviceProtection                       5042 non-null   int32  
 9   TechSupport                            5042 non-null   int32  
 10  StreamingTV                            5042 non-null   int32  
 11  Streaming

: 

In [None]:
df=df.drop(columns=['customerID','Unnamed: 0'])

: 

In [None]:
col_to_fix=[
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'PaperlessBilling',
 'Churn']

: 

In [None]:
label={}
for col in col_to_fix:
    if df[col].dtype=='object':
        le=LabelEncoder()
        df[col]=le.fit_transform(df[col])
        label[col]=le

: 

In [None]:
one_hot_cols=['gender','InternetService','Contract','PaymentMethod']

: 

In [None]:

# Fix column name formatting just in case
df.columns = df.columns.str.strip()

# Columns you want to encode
one_hot_cols = ['gender', 'InternetService', 'Contract', 'PaymentMethod']

# Create OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit and transform
encoded = encoder.fit_transform(df[one_hot_cols])

# Create DataFrame for encoded columns
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(one_hot_cols))

# Reset index to match with original DataFrame
encoded_df.index = df.index

# Drop original columns and add encoded ones
df = df.drop(one_hot_cols, axis=1)
df = pd.concat([df, encoded_df], axis=1)


: 

In [None]:
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

: 

In [None]:
scalar=StandardScaler()
scalar.fit_transform(df[numeric_cols])

array([[-1.28728744, -1.17197345, -1.00135582],
       [ 0.0583594 , -0.27049061, -0.1808993 ],
       [-1.24651026, -0.37361234, -0.96681075],
       ...,
       [-0.87951567, -1.18028972, -0.86167549],
       [-1.1649559 ,  0.30998447, -0.87925686],
       [ 1.36322906,  1.34951724,  2.0051906 ]])

: 

: 