In [69]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

In [46]:
# Load the raw Telco customer-churn table from the project-relative data folder.
data = pd.read_csv('data/Telco_Customer_Churn.csv')

In [47]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [49]:
# Column names, non-null counts and dtypes of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [50]:
# Build the working frame from a copy of the raw data (so `data` stays intact)
# and leave out the customerID column.
df = data.copy().drop(columns=['customerID'])

In [51]:
# Integer codes for the multi-valued categorical columns.
# Outer keys are column names; each inner dict maps a raw label to its code.
# NOTE(review): PaymentMethod and InternetService have no natural order, yet
# they receive ordinal codes that the logistic regression further down will
# read as magnitudes -- consider one-hot encoding these columns.
def _no_yes_codes(absent_label):
    """Return the 0/1/2 coding for a column valued No / Yes / <service absent>."""
    return {'No': 0, 'Yes': 1, absent_label: 2}


mappings = {
    'TechSupport': _no_yes_codes('No internet service'),
    'MultipleLines': _no_yes_codes('No phone service'),
    'OnlineSecurity': _no_yes_codes('No internet service'),
    'OnlineBackup': _no_yes_codes('No internet service'),
    'DeviceProtection': _no_yes_codes('No internet service'),
    'StreamingTV': _no_yes_codes('No internet service'),
    'StreamingMovies': _no_yes_codes('No internet service'),
    'Contract': {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
    'PaymentMethod': {
        'Electronic check': 0,
        'Mailed check': 1,
        'Bank transfer (automatic)': 2,
        'Credit card (automatic)': 3,
    },
    'InternetService': {'DSL': 0, 'Fiber optic': 1, 'No': 2},
}



In [52]:
# Replace every categorical label with its integer code from `mappings`.
# Series.map performs the dictionary lookup in one vectorized call.
for co, mapping in mappings.items():
    encoded = df[co].map(mapping)
    # map() yields NaN for labels that are not keys of the mapping (for example
    # when this cell is re-run on already-encoded data). Fail loudly, as the
    # plain dictionary lookup would, but say which column and labels are at fault.
    unmapped = df.loc[encoded.isna(), co].unique()
    if len(unmapped):
        raise KeyError(
            f"Column {co!r} contains values missing from its mapping: {list(unmapped)}"
        )
    df[co] = encoded

In [53]:
# Preview the first five rows, transposed so that each column is shown as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,2,0,0,2,0
InternetService,0,0,0,0,1
OnlineSecurity,0,1,1,1,0
OnlineBackup,1,0,1,0,0


In [54]:
# Encode the Yes/No columns as 1/0 and gender as 1 for 'Male', 0 otherwise.
binary_mapping = ['Partner','Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for col in binary_mapping:
    # A column that is already numeric has been encoded by an earlier run of
    # this cell; comparing its integers with 'Yes' would reset every value to 0,
    # so it is left untouched.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].eq('Yes').astype('int64')
# Same re-run guard for gender.
if not pd.api.types.is_numeric_dtype(df['gender']):
    df['gender'] = df['gender'].eq('Male').astype('int64')

In [55]:
# Show the dtype of each continuous feature, one per line.
for column in ('MonthlyCharges', 'TotalCharges', 'tenure'):
    print(df[column].dtype)

float64
object
int64


In [56]:
# Preview the first five rows (transposed) after the binary encoding above.
df.head().T

Unnamed: 0,0,1,2,3,4
gender,0.0,1.0,1.0,1.0,0.0
SeniorCitizen,0.0,0.0,0.0,0.0,0.0
Partner,1.0,0.0,0.0,0.0,0.0
Dependents,0.0,0.0,0.0,0.0,0.0
tenure,1.0,34.0,2.0,45.0,2.0
PhoneService,0.0,1.0,1.0,0.0,1.0
MultipleLines,2.0,0.0,0.0,2.0,0.0
InternetService,0.0,0.0,0.0,0.0,1.0
OnlineSecurity,0.0,1.0,1.0,1.0,0.0
OnlineBackup,1.0,0.0,1.0,0.0,0.0


In [57]:
# TotalCharges has dtype object; parse it as numbers, turning anything that
# cannot be parsed into NaN, then display how many entries ended up as NaN.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges
total_charges.isna().sum()

11

In [58]:
# Discard the rows whose TotalCharges is NaN (values that could not be parsed
# as numbers in the previous cell). The remaining rows keep their original
# index labels.
df = df.dropna(subset=['TotalCharges'])

In [59]:
# Preview the first five rows (transposed) after dropping rows with missing TotalCharges.
df.head().T

Unnamed: 0,0,1,2,3,4
gender,0.0,1.0,1.0,1.0,0.0
SeniorCitizen,0.0,0.0,0.0,0.0,0.0
Partner,1.0,0.0,0.0,0.0,0.0
Dependents,0.0,0.0,0.0,0.0,0.0
tenure,1.0,34.0,2.0,45.0,2.0
PhoneService,0.0,1.0,1.0,0.0,1.0
MultipleLines,2.0,0.0,0.0,2.0,0.0
InternetService,0.0,0.0,0.0,0.0,1.0
OnlineSecurity,0.0,1.0,1.0,1.0,0.0
OnlineBackup,1.0,0.0,1.0,0.0,0.0


In [60]:
# Rescale the continuous features to the [0, 1] range.
# MinMaxScaler comes from the import cell at the top of the notebook.
# NOTE(review): the scaler is fit on the whole frame before the train/test
# split further down, so the test rows influence the fitted minima/maxima.
# Consider fitting on the training rows only and transforming the test rows.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = MinMaxScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


In [61]:
# Preview the first five rows (transposed) after min-max scaling the continuous columns.
df.head().T

Unnamed: 0,0,1,2,3,4
gender,0.0,1.0,1.0,1.0,0.0
SeniorCitizen,0.0,0.0,0.0,0.0,0.0
Partner,1.0,0.0,0.0,0.0,0.0
Dependents,0.0,0.0,0.0,0.0,0.0
tenure,0.0,0.464789,0.014085,0.619718,0.014085
PhoneService,0.0,1.0,1.0,0.0,1.0
MultipleLines,2.0,0.0,0.0,2.0,0.0
InternetService,0.0,0.0,0.0,0.0,1.0
OnlineSecurity,0.0,1.0,1.0,1.0,0.0
OnlineBackup,1.0,0.0,1.0,0.0,0.0


In [62]:
# Min-max scale the three continuous columns with a second scaler instance.
# NOTE(review): the `scaler` cell earlier in the notebook already scaled these
# columns, so this fit sees values that are already within [0, 1].
rescale_cols = ['MonthlyCharges', 'TotalCharges', 'tenure']
sc = MinMaxScaler()
sc.fit(df[rescale_cols])
df[rescale_cols] = sc.transform(df[rescale_cols])

In [63]:
# Preview the first five rows (transposed) after the second scaling pass.
df.head().T

Unnamed: 0,0,1,2,3,4
gender,0.0,1.0,1.0,1.0,0.0
SeniorCitizen,0.0,0.0,0.0,0.0,0.0
Partner,1.0,0.0,0.0,0.0,0.0
Dependents,0.0,0.0,0.0,0.0,0.0
tenure,0.0,0.464789,0.014085,0.619718,0.014085
PhoneService,0.0,1.0,1.0,0.0,1.0
MultipleLines,2.0,0.0,0.0,2.0,0.0
InternetService,0.0,0.0,0.0,0.0,1.0
OnlineSecurity,0.0,1.0,1.0,1.0,0.0
OnlineBackup,1.0,0.0,1.0,0.0,0.0


In [64]:
# Target vector: the encoded Churn column.
y = df['Churn']
# Feature matrix: every remaining column.
X = df.drop(columns=['Churn'])

In [65]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [66]:
# Fit a logistic-regression classifier on the training split, allowing the
# solver up to 1000 iterations. fit() returns the estimator itself, so the
# trailing expression displays the fitted model just as before.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model

In [70]:
# Evaluate the classifier fitted in the previous cell on the held-out split.
# Re-creating LogisticRegression() here would silently replace the
# max_iter=1000 model with one that uses the default iteration limit.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 and overall accuracy on the test set.
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1033
           1       0.62      0.51      0.56       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



In [74]:
# Persist the encoded and scaled frame for later use.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if downstream readers do not expect it -- confirm.
df.to_csv("data/preprocessed_data.csv")

# Key Metrics Breakdown
## For Class 0 (Non-Churn):

    Precision (0.83): Out of all the predictions made for class 0, 83% were correct.
    Recall (0.89): Out of all actual instances of class 0, 89% were correctly identified.
    F1-Score (0.86): The harmonic mean of precision and recall, showing a good balance for class 0.

## For Class 1 (Churn):

    Precision (0.62): Out of all the predictions made for class 1, 62% were correct.
    Recall (0.51): Out of all actual instances of class 1, only 51% were identified.
    F1-Score (0.56): Indicates moderate performance for identifying churners, likely due to lower recall.

## Overall Metrics:

    Accuracy (0.79): The model correctly predicted the target class for 79% of the samples.
    Macro Avg: The unweighted mean of each metric (precision, recall, F1-score) across the two classes, treating both classes equally.
        Macro Recall (0.70): The average of the two class recalls (0.89 and 0.51); it is pulled down by the weak recall on class 1.
    Weighted Avg: Averages the metrics, weighted by the number of instances in each class.

## Analysis of Results

    The model performs well for class 0 (non-churners) but struggles with class 1 (churners), which is a common issue in imbalanced datasets.
    Lower recall for churners means the model misses roughly half of the actual churners (false negatives for class 1).
    This suggests that the model is biased towards the majority class (0), which makes up about 73% of the test set (1033 of 1407 samples).