In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks

In [3]:
# Load the dataset and split it into features and target
data = pd.read_csv(r"D:\DAI 101\ulalala\suv_data.csv")
X = data[['Age', 'EstimatedSalary']]
y = data['Purchased']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Function to train the logistic regression model and evaluate it
def evaluate_model(X_train, y_train, X_test, y_test):
    model = LogisticRegression(class_weight='balanced')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # Calculate evaluation metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    
    return acc, f1, auc

In [5]:
# Baseline model without any resampling
print("Baseline (No Resampling):")
acc, f1, auc = evaluate_model(X_train, y_train, X_test, y_test)
print(f"Accuracy: {acc}, F1 Score: {f1}, AUC: {auc}")

Baseline (No Resampling):
Accuracy: 0.875, F1 Score: 0.8421052631578947, AUC: 0.9594870300204021


In [6]:
# Apply Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

print("\nRandom Oversampling:")
acc, f1, auc = evaluate_model(X_resampled, y_resampled, X_test, y_test)
print(f"Accuracy: {acc}, F1 Score: {f1}, AUC: {auc}")


Random Oversampling:
Accuracy: 0.8666666666666667, F1 Score: 0.8297872340425532, AUC: 0.9594870300204021


In [7]:
# Apply Random Undersampling
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

print("\nRandom Undersampling:")
acc, f1, auc = evaluate_model(X_resampled, y_resampled, X_test, y_test)
print(f"Accuracy: {acc}, F1 Score: {f1}, AUC: {auc}")


Random Undersampling:
Accuracy: 0.8666666666666667, F1 Score: 0.8297872340425532, AUC: 0.9591955698047216


In [8]:
# Apply SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print("\nSMOTE:")
acc, f1, auc = evaluate_model(X_resampled, y_resampled, X_test, y_test)
print(f"Accuracy: {acc}, F1 Score: {f1}, AUC: {auc}")


SMOTE:
Accuracy: 0.8666666666666667, F1 Score: 0.8297872340425532, AUC: 0.9591955698047216


In [9]:
# Apply Tomek Links undersampling
tomek = TomekLinks()
X_resampled, y_resampled = tomek.fit_resample(X_train, y_train)

print("\nTomek Links:")
acc, f1, auc = evaluate_model(X_resampled, y_resampled, X_test, y_test)
print(f"Accuracy: {acc}, F1 Score: {f1}, AUC: {auc}")


Tomek Links:
Accuracy: 0.8666666666666667, F1 Score: 0.8333333333333334, AUC: 0.9580297289419994


In [10]:
# Evaluate using class weights (built-in class balancing)
print("\nClass Weight Balanced:")
acc, f1, auc = evaluate_model(X_train, y_train, X_test, y_test)
print(f"Accuracy: {acc}, F1 Score: {f1}, AUC: {auc}")


Class Weight Balanced:
Accuracy: 0.875, F1 Score: 0.8421052631578947, AUC: 0.9594870300204021
