## Apply SMOTE for upsampling the data

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder

In [3]:
data = pd.read_csv(r"C:\Users\HP\Desktop\Iron Hack\Python\Labs\lab-cross-validation\files_for_lab\Customer-Churn.csv")
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [13]:
data.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [18]:
data['MonthlyChargesCategory'] = pd.cut(data['MonthlyCharges'], bins=2, labels=['Low', 'High'])

data = data.drop(columns=['MonthlyCharges'])

In [19]:
# One-hot encode categorical columns
categorical_cols = data.select_dtypes(include=['object', 'category']).columns
encoder = OneHotEncoder(sparse=False, drop='first')

# Apply one-hot encoding to categorical columns
encoded_categorical = pd.DataFrame(encoder.fit_transform(data[categorical_cols]), columns=encoder.get_feature_names_out(categorical_cols))

# Select numerical columns
numerical_cols = data.select_dtypes(include=[np.number])

# Combine encoded categorical and numerical columns
X = pd.concat([numerical_cols, encoded_categorical], axis=1)
y = data['MonthlyChargesCategory']



In [20]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f'Original dataset shape: {y_train.value_counts()}')
print(f'Resampled dataset shape: {y_train_smote.value_counts()}')

Original dataset shape: MonthlyChargesCategory
High    3029
Low     2605
Name: count, dtype: int64
Resampled dataset shape: MonthlyChargesCategory
Low     3029
High    3029
Name: count, dtype: int64


In [21]:
# Train the logistic regression model
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train_smote, y_train_smote)

# Predict on the test set
y_pred_logreg = logreg.predict(X_test)

# Compute accuracy
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f'Logistic Regression Accuracy: {accuracy_logreg}')

Logistic Regression Accuracy: 1.0


In [22]:
# Train the decision tree model
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train_smote, y_train_smote)

# Predict on the test set
y_pred_tree = tree_model.predict(X_test)

# Compute accuracy
accuracy_tree = accuracy_score(y_test, y_pred_tree)
print(f'Decision Tree Accuracy: {accuracy_tree}')

Decision Tree Accuracy: 1.0


In [23]:
print(f'Comparison of Accuracies:')
print(f'Logistic Regression Accuracy: {accuracy_logreg}')
print(f'Decision Tree Accuracy: {accuracy_tree}')

Comparison of Accuracies:
Logistic Regression Accuracy: 1.0
Decision Tree Accuracy: 1.0


## Apply TomekLinks for downsampling

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import TomekLinks
from sklearn.preprocessing import OneHotEncoder

In [26]:
# Apply TomekLinks
tomek = TomekLinks()
X_train_tomek, y_train_tomek = tomek.fit_resample(X_train, y_train)

print(f'Original dataset shape: {y_train.value_counts()}')
print(f'Resampled dataset shape: {y_train_tomek.value_counts()}')

Original dataset shape: MonthlyChargesCategory
High    3029
Low     2605
Name: count, dtype: int64
Resampled dataset shape: MonthlyChargesCategory
High    3003
Low     2605
Name: count, dtype: int64


In [27]:
# Train the logistic regression model
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train_tomek, y_train_tomek)

# Predict on the test set
y_pred_logreg = logreg.predict(X_test)

# Compute accuracy
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f'Logistic Regression Accuracy: {accuracy_logreg}')

Logistic Regression Accuracy: 1.0


In [29]:
# Train the decision tree model
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train_tomek, y_train_tomek)

# Predict on the test set
y_pred_tree = tree_model.predict(X_test)

# Compute accuracy
accuracy_tree = accuracy_score(y_test, y_pred_tree)
print(f'Decision Tree Accuracy: {accuracy_tree}')

Decision Tree Accuracy: 1.0


In [30]:
print(f'Comparison of Accuracies:')
print(f'Logistic Regression Accuracy: {accuracy_logreg}')
print(f'Decision Tree Accuracy: {accuracy_tree}')

Comparison of Accuracies:
Logistic Regression Accuracy: 1.0
Decision Tree Accuracy: 1.0


In [31]:
# Reapply TomekLinks
X_train_tomek2, y_train_tomek2 = tomek.fit_resample(X_train_tomek, y_train_tomek)

print(f'After first application: {y_train_tomek.value_counts()}')
print(f'After second application: {y_train_tomek2.value_counts()}')

After first application: MonthlyChargesCategory
High    3003
Low     2605
Name: count, dtype: int64
After second application: MonthlyChargesCategory
High    3003
Low     2605
Name: count, dtype: int64
