In [115]:
# Importing the libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [116]:
# Load the raw churn workbook into a DataFrame (path is relative to the
# notebook's working directory).
data = pd.read_excel(io='02 Churn-Dataset.xlsx')

### Convert variables to 0/1

In [117]:
# Encode every two-valued categorical column as 0/1.
# 'No phone service' / 'No internet service' are folded into 0, i.e. they are
# treated the same as 'No'.
yes_no = {'No': 0, 'Yes': 1}
no_phone = {'No': 0, 'No phone service': 0, 'Yes': 1}
no_internet = {'No': 0, 'No internet service': 0, 'Yes': 1}

binary_encodings = {
    'gender': {'Male': 0, 'Female': 1},  # Male:0 | Female:1
    'Partner': yes_no,                   # No:0 | Yes:1
    'Dependents': yes_no,
    'PhoneService': yes_no,
    'MultipleLines': no_phone,
    'OnlineSecurity': no_internet,
    'OnlineBackup': no_internet,
    'DeviceProtection': no_internet,
    'TechSupport': no_internet,
    'StreamingTV': no_internet,
    'StreamingMovies': no_internet,
    'PaperlessBilling': yes_no,
    'Churn': yes_no,
}

for column, codes in binary_encodings.items():
    observed = data[column].dropna()

    # Re-running this cell must be a no-op: calling .map() on a column that is
    # already 0/1 would silently replace every value with NaN.
    if observed.isin([0, 1]).all():
        continue

    # .map() turns any label missing from the dict into NaN without warning;
    # fail loudly instead so bad input is caught here rather than at model.fit.
    unexpected = set(observed.unique()) - set(codes)
    if unexpected:
        raise ValueError(
            f'Unexpected values in {column!r}: {sorted(unexpected, key=str)}'
        )

    data[column] = data[column].map(codes)

In [118]:
# Drop the rows whose TotalCharges is the single-space placeholder ' ' so the
# column can be converted to float.
# .copy() makes the filtered frame independent of the original; assigning a
# column on the bare filtered slice triggers pandas' SettingWithCopyWarning.
data = data[data['TotalCharges'] != ' '].copy()
data['TotalCharges'] = data['TotalCharges'].astype('float')

### Encoding

In [119]:
# Keep the join key plus the three multi-category columns that still need
# one-hot encoding.
dummies_data = data.loc[
    :,
    ['customerID', 'InternetService', 'Contract', 'PaymentMethod'],
]

In [120]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each variable, which becomes the implicit baseline category.
dummies_data = pd.get_dummies(
    data=dummies_data,
    columns=['InternetService', 'Contract', 'PaymentMethod'],
    drop_first=True,
)

In [121]:
# Convert True/False to 0/1
# Cast by column name and build a new frame instead of assigning through
# .iloc[:, 1:]. The slice assignment depends on 'customerID' being the first
# column and on pandas upcasting the existing columns during assignment,
# which newer pandas versions deprecate.
indicator_columns = [col for col in dummies_data.columns if col != 'customerID']
dummies_data = dummies_data.astype({col: int for col in indicator_columns})

In [122]:
# The raw categorical columns are replaced by their dummy columns, so remove
# them from the main frame.
data = data.drop(
    labels=['InternetService', 'Contract', 'PaymentMethod'],
    axis='columns',
)

In [123]:
# Attach the one-hot columns to the main frame by customer.
# validate='one_to_one' makes pandas raise a MergeError if 'customerID' is
# duplicated on either side; without it, duplicate keys would silently
# multiply rows in the training data.
encoded_data = data.merge(
    dummies_data,
    on='customerID',
    validate='one_to_one',
)

### Logistic Regression

In [124]:
# Target: the 0/1 churn flag.
y = encoded_data['Churn']
# Features: every remaining column except the identifier and the target.
X = encoded_data.drop(columns=['customerID', 'Churn'])

In [125]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [126]:
# Fit a logistic-regression classifier on the training split, allowing the
# solver up to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)

In [127]:
# Predict the class label for every row of the held-out test set.
y_pred = model.predict(X=X_test)

In [128]:
# Put the three pieces on the same 0..n-1 index so they can be placed side by
# side: predictions become a named Series, and the test features/labels drop
# the row labels they inherited from the full dataset.
y_pred_df = pd.Series(data=y_pred, name='Predicted')
X_test_df, y_test_df = (
    part.reset_index(drop=True) for part in (X_test, y_test)
)

In [129]:
# Join test features, true labels and predictions column-wise into one table.
result_df = pd.concat(
    objs=[X_test_df, y_test_df, y_pred_df],
    axis='columns',
)

In [130]:
# Display the combined results table as the cell output.
result_df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,numTechTickets,InternetService_Fiber optic,InternetService_No,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn,Predicted
0,0,1,1,0,61,1,1,0,0,0,...,1,0,1,0,1,0,0,0,0,0
1,1,0,0,0,19,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,1,0,13,1,1,0,0,1,...,2,1,0,0,0,1,0,0,1,1
3,0,0,1,0,37,1,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,6,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1402,0,0,0,0,1,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1403,0,0,0,0,12,1,1,0,0,0,...,0,1,0,0,0,0,1,0,0,1
1404,1,0,0,0,26,1,0,0,1,1,...,0,0,0,1,0,0,0,1,0,0
1405,1,1,0,0,35,1,1,1,1,1,...,0,1,0,1,0,0,0,0,0,0


In [131]:
# Fraction of test rows whose predicted label equals the true label.
# accuracy_score is already imported in the notebook's first cell, so it is
# not re-imported here.
# NOTE(review): judge this against the majority-class baseline, i.e. the
# accuracy obtained by always predicting the more frequent class.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.85


In [132]:
# Confusion matrix expressed as a percentage of all test samples.
# confusion_matrix is imported in the notebook's first (imports) cell.
# scikit-learn's layout: rows are the true class, columns the predicted class,
# so with labels 0/1 the result reads [[TN, FP], [FN, TP]].
conf_matrix = confusion_matrix(y_test, y_pred)
total_samples = conf_matrix.sum()
conf_matrix_percent = np.round(conf_matrix / total_samples * 100, 2)
print('Confusion Matrix:')
print(conf_matrix_percent)

Confusion Matrix:
[[66.6   6.82]
 [ 8.6  17.98]]
