In [5]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Load dataset
df = pd.read_csv("churn_dataa.csv")

# Drop 'customerID' column
df = df.drop(columns='customerID')

# Handle missing or whitespace in 'TotalCharges'.
# Empty / whitespace-only entries become NaN, the column is parsed as numeric,
# and the gaps are filled with the column mean. Each result is assigned back to
# the column: calling fillna(inplace=True) on the df['TotalCharges'] selection
# is chained assignment, which pandas deprecates and which leaves `df`
# untouched when Copy-on-Write is active.
df['TotalCharges'] = df['TotalCharges'].replace(r'^\s*$', np.nan, regex=True)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())

# Replace categorical values: fold the "no service" answers into plain 'No'
df = df.replace('No internet service', 'No').replace('No phone service', 'No')

# Convert Yes/No columns to 1/0
yes_no_columns = ['Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
                  'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']
yes_no_map = {'Yes': 1, 'No': 0}
for col in yes_no_columns:
    # map() yields an integer column directly instead of relying on replace()
    # downcasting an object column; astype(int) raises right here if a value
    # other than 'Yes'/'No' (or a missing value) slipped through.
    df[col] = df[col].map(yes_no_map).astype(int)

# Convert gender to 1/0
df['gender'] = df['gender'].map({'Female': 1, 'Male': 0}).astype(int)

# One-hot encoding for categorical variables
df2 = pd.get_dummies(data=df, columns=['InternetService', 'Contract', 'PaymentMethod'])

# Scaling numerical features
# NOTE(review): the scaler is fit on every row before the train/test split, so
# the held-out rows influence the min/max used for scaling. Left as-is because
# later cells reuse the scaled `df2` and persist this `scaler`; consider
# fitting on the training rows only.
cols_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = MinMaxScaler()
df2[cols_to_scale] = scaler.fit_transform(df2[cols_to_scale])

# Define features (X) and target (y)
X = df2.drop(columns='Churn')
y = df2['Churn']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

# Logistic Regression model
def logistic_regression_model(X_train, y_train, X_test, y_test, weights=None):
    """Fit a logistic regression, print its test-set metrics, return predictions.

    Parameters
    ----------
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_test, y_test
        Held-out features to predict on and the labels the predictions are
        scored against.
    weights
        Forwarded to ``LogisticRegression(class_weight=...)``; ``None`` by
        default.

    Returns
    -------
    The predictions produced by ``predict(X_test)``. The fitted estimator
    itself is local to this function and is not returned.
    """
    classifier = LogisticRegression(class_weight=weights, max_iter=1000)
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)

    # Compute and print each metric in turn: confusion matrix first, then the
    # per-class precision/recall/F1 report.
    for heading, metric in (
        ("Confusion Matrix: \n", confusion_matrix),
        ("Classification Report: \n", classification_report),
    ):
        print(heading, metric(y_test, predictions))

    return predictions

# Train logistic regression without class weights
y_preds = logistic_regression_model(X_train, y_train, X_test, y_test)

# Undersampling to handle class imbalance
df_class_0 = df2[df2['Churn'] == 0]
df_class_1 = df2[df2['Churn'] == 1]

# Count rows per label directly. Unpacking value_counts() binds the names by
# frequency order (largest first), not by class label.
count_class_0 = len(df_class_0)
count_class_1 = len(df_class_1)

# Under-sample majority class
# (assumes class 0 is the majority; min() keeps the sample size valid either way).
# random_state makes the drawn subset, and therefore the metrics printed below,
# reproducible across Restart & Run All.
df_class_0_under = df_class_0.sample(min(count_class_0, count_class_1), random_state=15)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

# Define features and target for undersampled dataset
X_under = df_test_under.drop('Churn', axis='columns')
y_under = df_test_under['Churn']

# Train-test split on undersampled data (stratified so both halves stay balanced)
X_train_under, X_test_under, y_train_under, y_test_under = train_test_split(
    X_under, y_under, test_size=0.2, random_state=15, stratify=y_under
)

# Train logistic regression on undersampled data
y_preds_under = logistic_regression_model(X_train_under, y_train_under, X_test_under, y_test_under)



Confusion Matrix: 
 [[914 109]
 [183 203]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      1023
           1       0.65      0.53      0.58       386

    accuracy                           0.79      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.78      0.79      0.79      1409

Confusion Matrix: 
 [[272 102]
 [ 83 291]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.77      0.73      0.75       374
           1       0.74      0.78      0.76       374

    accuracy                           0.75       748
   macro avg       0.75      0.75      0.75       748
weighted avg       0.75      0.75      0.75       748



In [7]:
import joblib

# Save the trained model to a file.
# logistic_regression_model() keeps its estimator in a local variable and
# returns only the predictions, so no `log_reg` exists at notebook scope on a
# fresh kernel. Fit the estimator explicitly here (same settings the helper
# uses by default) so this cell runs under Restart & Run All.
# NOTE(review): this fits on the baseline split (X_train, y_train); switch to
# X_train_under / y_train_under if the undersampled model is the one to ship.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
joblib.dump(log_reg, 'logistic_regression_model.joblib')

['logistic_regression_model.joblib']

In [9]:
# Persist the fitted MinMaxScaler (fitted on tenure, MonthlyCharges and
# TotalCharges) so it can be reloaded later.
joblib.dump(value=scaler, filename='scaler_churn.joblib')

['scaler_churn.joblib']