In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the data
data = pd.read_csv("files_for_lab/Customer-Churn.csv")
data  

# Split the data into features (X) and labels (y)
X = pd.get_dummies(data.drop('Churn', axis=1))
y = data['Churn']

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# SMOTE to upsample the data
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train a logistic regression model on SMOTE data
logistic_regression = LogisticRegression(random_state=42, max_iter=100000)
logistic_regression.fit(X_train_smote, y_train_smote)

# Predict labels on the test set and calculate accuracy
lr_predictions_smote = logistic_regression.predict(X_test)
lr_accuracy_smote = accuracy_score(y_test, lr_predictions_smote)
print("Logistic Regression Accuracy (SMOTE):", lr_accuracy_smote)

# Train a decision tree model on SMOTE data
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train_smote, y_train_smote)

# Predict labels on the test set and calculate accuracy
dt_predictions_smote = decision_tree.predict(X_test)
dt_accuracy_smote = accuracy_score(y_test, dt_predictions_smote)
print("Decision Tree Accuracy (SMOTE):", dt_accuracy_smote)

# Apply TomekLinks to reduce sampling
tomek = TomekLinks()
X_tl, y_tl = tomek.fit_resample(X, y)

# Split the reduced data into training and testing sets (80% training, 20% testing)
X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(X_tl, y_tl, test_size=0.2, random_state=42)

# Train a logistic regression model on TomekLinks data
logistic_regression_tl = LogisticRegression(random_state=42, max_iter=100000)
logistic_regression_tl.fit(X_train_tl, y_train_tl)

# Predict labels on the test set and calculate accuracy
lr_predictions_tl = logistic_regression_tl.predict(X_test_tl)
lr_accuracy_tl = accuracy_score(y_test_tl, lr_predictions_tl)
print("Logistic Regression Accuracy (TomekLinks):", lr_accuracy_tl)

# Train a decision tree model on TomekLinks data
decision_tree_tl = DecisionTreeClassifier()
decision_tree_tl.fit(X_train_tl, y_train_tl)

# Predict labels on the test set and calculate accuracy
dt_predictions_tl = decision_tree_tl.predict(X_test_tl)
dt_accuracy_tl = accuracy_score(y_test_tl, dt_predictions_tl)
print("Decision Tree Accuracy (TomekLinks):", dt_accuracy_tl)


Logistic Regression Accuracy (SMOTE): 0.7885024840312278
Decision Tree Accuracy (SMOTE): 0.7388218594748048
Logistic Regression Accuracy (TomekLinks): 0.8043972706595905
Decision Tree Accuracy (TomekLinks): 0.7983320697498104
