In [None]:
# Ian Hedges, Collin Glover
# MGMT388 Lab 5

In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score

# Task 1: Load the Telco Customer Churn dataset using Pandas
# The Google Drive location is tried first (it only exists in Colab with Drive
# mounted); a copy next to the notebook is the fallback so the cell also runs
# elsewhere instead of stopping with FileNotFoundError.
DATA_PATHS = [
    "/content/drive/MyDrive/Telco-Customer-Churn.csv",
    "Telco-Customer-Churn.csv",
]

for data_path in DATA_PATHS:
    try:
        # read_csv already returns a DataFrame; no pd.DataFrame() wrapper needed.
        data = pd.read_csv(data_path)
        break
    except FileNotFoundError:
        continue
else:
    # No candidate path could be opened: fail with the same exception type,
    # but list every location that was tried.
    raise FileNotFoundError(
        "Telco-Customer-Churn.csv not found; looked in: " + ", ".join(DATA_PATHS)
    )

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Telco-Customer-Churn.csv'

In [None]:
# Task 2: Perform exploratory data analysis (EDA)
# Print the data.describe() table underneath a header line
numeric_summary = data.describe()
print("Summary Statistics:", numeric_summary, sep="\n")

In [None]:
# Print the gender column
gender_series = data['gender']
print(gender_series)

In [None]:
# Preview the first five rows of the data
data.head(5)

In [None]:
# Create a pairplot of the data frame (the frame is seaborn's first argument)
sns.pairplot(data)

In [None]:
# Data visualization: box plot of tenure grouped by the Churn column
tenure_ax = sns.boxplot(data=data, x='Churn', y='tenure')
tenure_ax.set_title('Tenure vs Churn')
plt.show()

In [None]:
# Count plots showing the distribution of each categorical variable
cat_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
for column in cat_cols:
    # Hand seaborn the whole frame and name the column via x=...; passing the
    # bare Series as `data` makes seaborn treat it as wide-form input rather
    # than counting the categories of that column.
    sns.countplot(x=column, data=data)
    # Title each figure so it can be identified when skimming the output.
    plt.title(f'Distribution of {column}')
    plt.show()

In [None]:
# TotalCharges contains " " where a value is missing; turn those entries into
# NaN first so the cast to float below does not raise a ValueError.
blank_as_nan = data['TotalCharges'].replace(' ', np.nan)
data['TotalCharges'] = blank_as_nan.astype(float)

In [None]:
# Encode categorical variables using OneHotEncoder
cat_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

# Keep the encoder's default (sparse) output and densify with .toarray().
# The dense-output keyword was renamed from `sparse` to `sparse_output` and the
# old name was later removed, so passing either keyword fails on some
# scikit-learn versions; .toarray() works on all of them.
encoder = OneHotEncoder(drop='first')
encoded_data = encoder.fit_transform(data[cat_cols]).toarray()

# drop='first' discards the first category of every feature, so only
# values[1:] correspond to columns in encoded_data.
encoded_columns = encoder.categories_
new_columns = [
    f"{col}_{value}"
    for col, values in zip(cat_cols, encoded_columns)
    for value in values[1:]
]

# Reuse data's index so the concat below lines up row-for-row even when data
# is not on a default RangeIndex.
encoded_df = pd.DataFrame(encoded_data, columns=new_columns, index=data.index)

# Replace the raw categorical columns with their one-hot encoded versions
# (rebinding `data` instead of mutating the frame in place).
data = pd.concat([data.drop(columns=cat_cols), encoded_df], axis=1)


In [None]:
# Removing NaN data from dataset (11 total)
# Show the per-column missing counts before dropping: a bare expression that is
# not the last line of a cell is never displayed, so it has to be printed.
missing_counts = data.isna().sum()
print(missing_counts[missing_counts > 0])

# Drop every row that still contains at least one NaN.
data = data.dropna()

In [None]:
# Split the dataset into training and testing sets
# Features are every column except customerID and Churn; Churn is the target.
X = data.drop(columns=['customerID', 'Churn'])
y = data['Churn']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Task 4: Build a logistic regression model
# Solver settings are collected in one place; max_iter caps the solver iterations.
logit_settings = {'max_iter': 1000}
model = LogisticRegression(**logit_settings)
model.fit(X_train, y_train)

In [None]:
# Task 5: Evaluate the performance of the model
y_pred = model.predict(X_test)

# Name the positive class explicitly. Picking it as y_test.unique()[1] depends
# on the order in which labels happen to appear in the test split, so the
# metrics could silently describe the non-churn class instead.
# NOTE(review): assumes churners are labelled 'Yes' in the Churn column (as in
# the Telco dataset) -- adjust if the labels differ.
positive_label = 'Yes'

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=positive_label)
recall = recall_score(y_test, y_pred, pos_label=positive_label)
f1 = f1_score(y_test, y_pred, pos_label=positive_label)

print("Model Evaluation:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
# Task 6: Visualize the results of the logistic regression model
# ROC curve for logistic regression.
# The target is converted to 0/1 against an explicitly named positive class and
# the matching predict_proba column is looked up through model.classes_, so the
# labels and the scores are guaranteed to refer to the same class.
# NOTE(review): assumes churners are labelled 'Yes' in the Churn column (as in
# the Telco dataset) -- adjust if the labels differ.
positive_label = 'Yes'
y_test_encoded = (y_test == positive_label).astype(int)
positive_col = list(model.classes_).index(positive_label)
y_pred_prob = model.predict_proba(X_test)[:, positive_col]

fpr, tpr, thresholds = roc_curve(y_test_encoded, y_pred_prob)
auc = roc_auc_score(y_test_encoded, y_pred_prob)

plt.plot(fpr, tpr, label=f'Logistic regression (AUC = {auc:.3f})')
# Diagonal reference line: the ROC curve of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Confusion matrix
# Fix the row/column order to the classes the model was trained on and print
# those class names on the axes, so the cells can be read without guessing
# which index is which class.
class_labels = list(model.classes_)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap="Reds",
    fmt="d",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Distribution of probability
# Histogram of the second predict_proba column for the test set, labelled on
# the plot as the probability of churn.
churn_prob = model.predict_proba(X_test)[:, 1]
prob_ax = sns.histplot(churn_prob)
prob_ax.set_xlabel('Probability of Churn')
prob_ax.set_ylabel('Frequency')
prob_ax.set_title('Distribution of Churn Probability')
plt.show()
