In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [33]:
# Load the data
data = pd.read_csv("files_for_lab/Customer-Churn.csv")
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [34]:
# Split the data into features (X) and labels (y)
X = pd.get_dummies(data.drop('Churn', axis=1))
y = data['Churn']

In [35]:
# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [36]:
# SMOTE to upsample the data
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [37]:
# Train a logistic regression model on SMOTE data
logistic_regression = LogisticRegression(random_state=42, max_iter=100000)
logistic_regression.fit(X_train_smote, y_train_smote)

LogisticRegression(max_iter=100000, random_state=42)

In [38]:
# Predict labels on the test set and calculate accuracy
lr_predictions_smote = logistic_regression.predict(X_test)
lr_accuracy_smote = accuracy_score(y_test, lr_predictions_smote)
print("Logistic Regression Accuracy (SMOTE):", lr_accuracy_smote)

Logistic Regression Accuracy (SMOTE): 0.7899219304471257


In [39]:
# Train a decision tree model on SMOTE data
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train_smote, y_train_smote)

DecisionTreeClassifier()

In [40]:
# Predict labels on the test set and calculate accuracy
dt_predictions_smote = decision_tree.predict(X_test)
dt_accuracy_smote = accuracy_score(y_test, dt_predictions_smote)
print("Decision Tree Accuracy (SMOTE):", dt_accuracy_smote)

Decision Tree Accuracy (SMOTE): 0.7480482611781405


In [41]:
# Apply TomekLinks to reduce sampling
tomek = TomekLinks()
X_tl, y_tl = tomek.fit_resample(X, y)

In [42]:
# Split the reduced data into training and testing sets
X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(X_tl, y_tl, test_size=0.2, random_state=42)

In [43]:
# Train a logistic regression model on TomekLinks data
logistic_regression_tl = LogisticRegression(random_state=42, max_iter=100000)
logistic_regression_tl.fit(X_train_tl, y_train_tl)

LogisticRegression(max_iter=100000, random_state=42)

In [44]:
# Predict labels on the test set and calculate accuracy
lr_predictions_tl = logistic_regression_tl.predict(X_test_tl)
lr_accuracy_tl = accuracy_score(y_test_tl, lr_predictions_tl)
print("Logistic Regression Accuracy (TomekLinks):", lr_accuracy_tl)

Logistic Regression Accuracy (TomekLinks): 0.8043972706595905


In [45]:
# Train a decision tree model on TomekLinks data
decision_tree_tl = DecisionTreeClassifier()
decision_tree_tl.fit(X_train_tl, y_train_tl)

DecisionTreeClassifier()

In [46]:
# Predict labels on the test set and calculate accuracy
dt_predictions_tl = decision_tree_tl.predict(X_test_tl)
dt_accuracy_tl = accuracy_score(y_test_tl, dt_predictions_tl)
print("Decision Tree Accuracy (TomekLinks):", dt_accuracy_tl)

Decision Tree Accuracy (TomekLinks): 0.796057619408643
