## Data Preprocessing for Customer Churn Prediction

This notebook performs data preprocessing steps to clean and prepare the dataset for modeling.


In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn .model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [4]:
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run unchanged on a machine with the same directory layout.
raw_csv_path = r'C:\Users\raich\Desktop\project2\-Customer-Churn-Demand-Prediction-Using-ML\Data\cleaned_telco.csv'
df = pd.read_csv(raw_csv_path)
# Quick visual check of the first rows.
df.head()

Unnamed: 0.1,Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Churn_flag
0,0,7590-VHVEG,Female,No,Yes,No,1,No,,DSL,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,0.0
1,1,5575-GNVDE,Male,No,No,No,34,Yes,No,DSL,...,No,No,No,One year,No,Mailed check,56.950001,1889.5,No,0.0
2,2,3668-QPYBK,Male,No,No,No,2,Yes,No,DSL,...,No,No,No,Month-to-month,Yes,Mailed check,53.849998,108.150002,Yes,1.0
3,3,7795-CFOCW,Male,No,No,No,45,No,,DSL,...,Yes,No,No,One year,No,Bank transfer (automatic),42.299999,1840.75,No,0.0
4,4,9237-HQITU,Female,No,No,No,2,Yes,No,Fiber optic,...,No,No,No,Month-to-month,Yes,Electronic check,70.699997,151.649994,Yes,1.0


In [5]:
# Count missing values per column once, then show only the columns that have any.
null_counts = df.isnull().sum()
null_counts[null_counts > 0]

MultipleLines       269
OnlineSecurity      651
OnlineBackup        651
DeviceProtection    651
TechSupport         651
StreamingTV         651
StreamingMovies     651
Churn                 1
Churn_flag            1
dtype: int64

### Handle Missing Values
We drop rows with missing `Churn` values and fill NA in selected service columns with "No".


In [6]:
# Class balance of the target; value_counts() skips NaN by default, so the
# unlabeled row is not part of these counts.
print(df['Churn'].value_counts())

# Rows without a Churn label are removed from the frame.
df = df.dropna(subset=['Churn'])


Churn
No     3706
Yes    1336
Name: count, dtype: int64


In [7]:
# Service columns whose missing entries are replaced with the string 'No'.
col_to_fill = [
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

# A per-column fill mapping restricts the replacement to the listed columns;
# every other column is left untouched.
df = df.fillna(value=dict.fromkeys(col_to_fill, 'No'))



In [8]:
# Print the row count, column dtypes and non-null counts of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5042 entries, 0 to 5042
Data columns (total 23 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        5042 non-null   int64  
 1   customerID        5042 non-null   object 
 2   gender            5042 non-null   object 
 3   SeniorCitizen     5042 non-null   object 
 4   Partner           5042 non-null   object 
 5   Dependents        5042 non-null   object 
 6   tenure            5042 non-null   int64  
 7   PhoneService      5042 non-null   object 
 8   MultipleLines     5042 non-null   object 
 9   InternetService   5042 non-null   object 
 10  OnlineSecurity    5042 non-null   object 
 11  OnlineBackup      5042 non-null   object 
 12  DeviceProtection  5042 non-null   object 
 13  TechSupport       5042 non-null   object 
 14  StreamingTV       5042 non-null   object 
 15  StreamingMovies   5042 non-null   object 
 16  Contract          5042 non-null   object 
 17  

In [9]:
# Columns removed before encoding. NOTE(review): 'Unnamed: 0' looks like a row
# index saved by an earlier to_csv call, and customerID like a per-customer
# identifier - confirm neither is needed downstream.
non_feature_cols = ['customerID', 'Unnamed: 0']
df = df.drop(columns=non_feature_cols)

In [10]:
# Columns to be label-encoded (integer codes) by the encoding loop.
# The order is kept as-is because it determines the insertion order of the
# encoder dictionary built from this list.
col_to_fix = [
    'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
    'PaperlessBilling',
    'Churn',
]

In [11]:
# Fitted encoders are kept per column so the integer codes can be mapped back
# to the original labels later (via inverse_transform / classes_).
label = {}
for col in col_to_fix:
    # Only object-dtype columns are encoded; anything else is skipped.
    if df[col].dtype != 'object':
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label[col] = le

In [12]:
# Categorical columns that are one-hot encoded rather than label-encoded.
one_hot_cols = [
    'gender',
    'InternetService',
    'Contract',
    'PaymentMethod',
]

In [13]:

# Strip stray whitespace from the column names before selecting by name.
df.columns = df.columns.str.strip()

# Columns to one-hot encode.
one_hot_cols = ['gender', 'InternetService', 'Contract', 'PaymentMethod']

# drop='first' omits one indicator per feature; sparse_output=False makes the
# encoder return a dense array instead of a sparse matrix.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded = encoder.fit_transform(df[one_hot_cols])

# Build the indicator frame directly on df's index: rows were dropped earlier,
# so a fresh 0..n-1 index would not line up with df when concatenating.
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(one_hot_cols),
    index=df.index,
)

# Replace the original categorical columns with their indicator columns.
df = pd.concat([df.drop(columns=one_hot_cols), encoded_df], axis=1)


In [14]:
# Numeric feature columns that are passed to the scaler.
numeric_cols = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]

In [15]:
# Standardise the numeric features (zero mean, unit variance per column).
# fit_transform returns a NEW array and leaves its input untouched, so the
# result must be written back into df. Previously it was only displayed, which
# meant the exported CSVs, the train/test split and SMOTE all used the raw,
# unscaled values.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaling. If strict separation is
# required, fit on the training split only and transform the test split with it.
scalar = StandardScaler()
df[numeric_cols] = scalar.fit_transform(df[numeric_cols])

# Display the scaled values that are now stored in df.
df[numeric_cols].to_numpy()

array([[-1.28728744, -1.17197345, -1.00135582],
       [ 0.0583594 , -0.27049061, -0.1808993 ],
       [-1.24651026, -0.37361234, -0.96681075],
       ...,
       [-0.87951567, -1.18028972, -0.86167549],
       [-1.1649559 ,  0.30998447, -0.87925686],
       [ 1.36322906,  1.34951724,  2.0051906 ]])

In [16]:
# Train/test split.
# 'Churn' (label-encoded above) and 'Churn_flag' both hold the target, so both
# are removed from the feature matrix to keep the label out of the features.
x = df.drop(columns=['Churn', 'Churn_flag'])
y = df['Churn_flag']

# stratify=y preserves the churn / non-churn ratio in both splits. The classes
# are imbalanced (3706 'No' vs 1336 'Yes' before cleaning), so an unstratified
# split can leave the test set with a different class ratio than the training
# set. random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [17]:
# Balance the classes with SMOTE. Only the training split is resampled;
# x_test / y_test are not passed to the sampler and stay unchanged.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(
    x_train,
    y_train,
)

[WinError 2] The system cannot find the file specified
  File "c:\Users\raich\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\raich\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\raich\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\raich\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [18]:
# Persist the fully preprocessed frame; index=False keeps the row index out of
# the file so no extra 'Unnamed: 0' column appears when it is read back.
cleaned_output_path = r'C:\Users\raich\Desktop\project2\-Customer-Churn-Demand-Prediction-Using-ML\Data\cleaned_telco2.csv'
df.to_csv(cleaned_output_path, index=False)

In [20]:
# Write each split to its own CSV in the project's Data directory.
# The file name is the dictionary key plus '.csv'; dict order fixes the write order.
data_dir = r'C:\Users\raich\Desktop\project2\-Customer-Churn-Demand-Prediction-Using-ML\Data'
splits_to_save = {
    'x_train': x_train,                      # feature training data
    'x_test': x_test,                        # feature testing data
    'y_train': y_train,                      # target training data
    'y_test': y_test,                        # target testing data
    'x_train_resampled': x_train_resampled,  # SMOTE-resampled feature training data
    'y_train_resampled': y_train_resampled,  # SMOTE-resampled target training data
}
for split_name, split_data in splits_to_save.items():
    split_data.to_csv(data_dir + '\\' + split_name + '.csv', index=False)

# ✅ Summary of Preprocessing

- Handled missing values
- Encoded categorical features
- Scaled numeric features
- Dataset is ready for training models
- Handled class imbalance using SMOTE (applied to the training split only)
- Saved the full preprocessed dataset, the training and testing datasets, and the SMOTE-resampled training set