In [225]:
import pandas as pd
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### set a seed value for reproducibility

In [227]:
seed_value = 42
np.random.seed(seed_value)
random.seed(seed_value)

# Load preprocessed data
X_train = pd.read_csv("X_train.csv")
X_val = pd.read_csv("X_val.csv")
X_test = pd.read_csv("X_test.csv")
y_train = pd.read_csv("y_train.csv").values.ravel()
y_val = pd.read_csv("y_val.csv").values.ravel()
y_test = pd.read_csv("y_test.csv").values.ravel()


### Train logistic regression model

In [229]:
# Helper function to evaluate the model
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    return accuracy, precision, recall, f1

In [230]:
# Experiment 1: Logistic Regression with Original Features (6 different regularization values)
print("\nLogistic Regression with Original Features:")
for C in [0.01, 0.1, 1, 10, 100, 1000]:
    print(f"Regularization C={C}")
    logistic_model = LogisticRegression(C=C, penalty='l2', solver='liblinear', random_state=seed_value)
    logistic_model.fit(X_train, y_train)
    print("Validation Metrics:")
    evaluate_model(logistic_model, X_val, y_val)
    print("-" * 40)


Logistic Regression with Original Features:
Regularization C=0.01
Validation Metrics:
Accuracy: 0.90
Precision: 0.92
Recall: 0.91
F1-Score: 0.92
----------------------------------------
Regularization C=0.1
Validation Metrics:
Accuracy: 0.90
Precision: 0.92
Recall: 0.92
F1-Score: 0.92
----------------------------------------
Regularization C=1
Validation Metrics:
Accuracy: 0.90
Precision: 0.91
Recall: 0.93
F1-Score: 0.92
----------------------------------------
Regularization C=10
Validation Metrics:
Accuracy: 0.90
Precision: 0.91
Recall: 0.93
F1-Score: 0.92
----------------------------------------
Regularization C=100
Validation Metrics:
Accuracy: 0.90
Precision: 0.92
Recall: 0.93
F1-Score: 0.92
----------------------------------------
Regularization C=1000
Validation Metrics:
Accuracy: 0.90
Precision: 0.91
Recall: 0.93
F1-Score: 0.92
----------------------------------------


In [231]:
# Experiment 2: Polynomial Features (degree 2) + Logistic Regression
print("\nLogistic Regression with Polynomial Features (degree 2):")
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)

for C in [0.01, 0.1, 1, 10, 100, 1000]:
    print(f"Regularization C={C}")
    logistic_model = LogisticRegression(C=C, penalty='l2', solver='liblinear', random_state=seed_value)
    logistic_model.fit(X_train_poly, y_train)
    print("Validation Metrics:")
    evaluate_model(logistic_model, X_val_poly, y_val)
    print("-" * 40)


Logistic Regression with Polynomial Features (degree 2):
Regularization C=0.01
Validation Metrics:
Accuracy: 0.90
Precision: 0.92
Recall: 0.92
F1-Score: 0.92
----------------------------------------
Regularization C=0.1
Validation Metrics:
Accuracy: 0.90
Precision: 0.92
Recall: 0.92
F1-Score: 0.92
----------------------------------------
Regularization C=1
Validation Metrics:
Accuracy: 0.90
Precision: 0.92
Recall: 0.92
F1-Score: 0.92
----------------------------------------
Regularization C=10
Validation Metrics:
Accuracy: 0.90
Precision: 0.92
Recall: 0.92
F1-Score: 0.92
----------------------------------------
Regularization C=100
Validation Metrics:
Accuracy: 0.90
Precision: 0.92
Recall: 0.92
F1-Score: 0.92
----------------------------------------
Regularization C=1000
Validation Metrics:
Accuracy: 0.90
Precision: 0.92
Recall: 0.92
F1-Score: 0.92
----------------------------------------


In [232]:
# Experiment 3: PCA Transformation + Logistic Regression
print("\nLogistic Regression with PCA Transformation:")
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

for C in [0.01, 0.1, 1, 10, 100, 1000]:
    print(f"Regularization C={C}")
    logistic_model = LogisticRegression(C=C, penalty='l2', solver='liblinear', random_state=seed_value)
    logistic_model.fit(X_train_pca, y_train)
    print("Validation Metrics:")
    evaluate_model(logistic_model, X_val_pca, y_val)
    print("-" * 40)


Logistic Regression with PCA Transformation:
Regularization C=0.01
Validation Metrics:
Accuracy: 0.62
Precision: 0.62
Recall: 1.00
F1-Score: 0.77
----------------------------------------
Regularization C=0.1
Validation Metrics:
Accuracy: 0.62
Precision: 0.62
Recall: 1.00
F1-Score: 0.77
----------------------------------------
Regularization C=1
Validation Metrics:
Accuracy: 0.62
Precision: 0.62
Recall: 1.00
F1-Score: 0.77
----------------------------------------
Regularization C=10
Validation Metrics:
Accuracy: 0.62
Precision: 0.62
Recall: 1.00
F1-Score: 0.77
----------------------------------------
Regularization C=100
Validation Metrics:
Accuracy: 0.62
Precision: 0.62
Recall: 1.00
F1-Score: 0.77
----------------------------------------
Regularization C=1000
Validation Metrics:
Accuracy: 0.62
Precision: 0.62
Recall: 1.00
F1-Score: 0.77
----------------------------------------
