In [None]:
# Load the dataset and explore the variables.

In [1]:
import pandas as pd

# Load the customer-churn dataset (relative path inside the lab folder).
data = pd.read_csv('files_for_lab/customer_churn.csv')

# Explore the variables: first rows, then column names, dtypes and non-null counts.
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the summary.
data.info()


   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [2]:
# We will try to predict the variable Churn using a logistic regression on the variables tenure, SeniorCitizen and MonthlyCharges.

In [3]:
# The Churn column is the label the model will learn to predict.
target = data.loc[:, 'Churn']

In [5]:
# Display the Churn column to check what values the target takes before modelling.
data['Churn']

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7043, dtype: object

In [6]:
from sklearn.preprocessing import StandardScaler

# Select the three predictor columns used by the model.
independent_vars = data.loc[:, ['tenure', 'SeniorCitizen', 'MonthlyCharges']]

# Standardise the predictors with StandardScaler.
# NOTE(review): the scaler is fit on every row here, i.e. before the
# train/test split performed in a later cell, so test-set statistics
# influence the scaling applied to the training data.
scaler = StandardScaler()
scaler.fit(independent_vars)
scaled_independent_vars = scaler.transform(independent_vars)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_independent_vars,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit the baseline logistic regression on the training portion only.
logreg = LogisticRegression().fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
logreg


LogisticRegression()

In [15]:
from sklearn.metrics import accuracy_score, classification_report

# Predict once on the held-out test set and reuse the result for every metric.
y_pred = logreg.predict(X_test)
# Alias kept so any cell that refers to `predictions` keeps working;
# it is the same array as y_pred rather than a second, identical predict() call.
predictions = y_pred

# Overall share of correctly classified test rows.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1, which expose how the minority class fares.
print(classification_report(y_test, predictions))


Accuracy: 0.8076650106458482
              precision    recall  f1-score   support

          No       0.83      0.92      0.88      1036
         Yes       0.70      0.49      0.57       373

    accuracy                           0.81      1409
   macro avg       0.76      0.70      0.72      1409
weighted avg       0.80      0.81      0.80      1409



In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Accuracy is label-agnostic; the other three scores are computed for the
# 'Yes' (churn) class by passing it as the positive label.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, pos_label='Yes')
    for scorer in (precision_score, recall_score, f1_score)
)

# Print each metric on its own line, in the same order as it was computed.
metric_report = {
    "Accuracy:": accuracy,
    "Precision:": precision,
    "Recall:": recall,
    "F1-score:": f1,
}
for metric_label, metric_value in metric_report.items():
    print(metric_label, metric_value)

Accuracy: 0.8076650106458482
Precision: 0.6961538461538461
Recall: 0.48525469168900803
F1-score: 0.5718799368088469


In [17]:
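# Even a simple model gives us more than 70% accuracy. Why? Because the dataset is imbalanced: in the test set above, 1036 of 1409 customers (about 74%) did not churn, so always predicting "No" would already reach that accuracy. The majority class (non-churn) likely dominates the predictions, leading to a high accuracy even if the model is not performing well on the minority class (churn).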

In [18]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only, so the test set is left untouched.
# SMOTE generates its synthetic samples randomly; seeding it (with the same
# seed used for the train/test split) makes the resampled data, and therefore
# the accuracy printed below, repeatable from run to run.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Build and evaluate the logistic regression model on the resampled data,
# scoring it against the original, un-resampled test set.
logreg_smote = LogisticRegression()
logreg_smote.fit(X_train_smote, y_train_smote)
y_pred_smote = logreg_smote.predict(X_test)
accuracy_smote = accuracy_score(y_test, y_pred_smote)
print("Accuracy (with SMOTE):", accuracy_smote)


Accuracy (with SMOTE): 0.7444996451383961


In [19]:
from imblearn.under_sampling import TomekLinks

# Resample the training split with TomekLinks; the test set is not resampled.
tomek = TomekLinks()
X_train_tomek, y_train_tomek = tomek.fit_resample(X_train, y_train)

# Train a fresh logistic regression on the resampled data and score it
# against the original test set.
logreg_tomek = LogisticRegression().fit(X_train_tomek, y_train_tomek)
y_pred_tomek = logreg_tomek.predict(X_test)
accuracy_tomek = accuracy_score(y_test, y_pred_tomek)
print("Accuracy (with TomekLinks):", accuracy_tomek)


Accuracy (with TomekLinks): 0.794180269694819
