In [274]:
import pandas as pd
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [275]:
seed_value = 42 # set a seed value for reproducibility
np.random.seed(seed_value)
random.seed(seed_value)

X_train = pd.read_csv("X_train.csv")
X_val = pd.read_csv("X_val.csv")
X_test = pd.read_csv("X_test.csv")
y_train = pd.read_csv("y_train.csv").values.ravel()
y_val = pd.read_csv("y_val.csv").values.ravel()
y_test = pd.read_csv("y_test.csv").values.ravel()

### Train logistic regression model

In [277]:
# Helper function to print results of the model
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    return accuracy, precision, recall, f1

In [278]:
# model 1: Logistic Regression with Original Features and 6 different regularization values
print("\nLogistic Regression with Original Features:")
for C in [0.01, 0.1, 1, 10, 100, 1000]:
    print(f"Regularization C={C}")
    logistic_model = LogisticRegression(C=C, penalty='l2', solver='liblinear', random_state=seed_value)
    logistic_model.fit(X_train, y_train)
    print("Validation Metrics:")
    evaluate_model(logistic_model, X_val, y_val)
    print("-" * 40)


Logistic Regression with Original Features:
Regularization C=0.01
Validation Metrics:
Accuracy: 0.91
Precision: 0.96
Recall: 0.88
F1-Score: 0.92
----------------------------------------
Regularization C=0.1
Validation Metrics:
Accuracy: 0.90
Precision: 0.95
Recall: 0.90
F1-Score: 0.92
----------------------------------------
Regularization C=1
Validation Metrics:
Accuracy: 0.90
Precision: 0.94
Recall: 0.90
F1-Score: 0.92
----------------------------------------
Regularization C=10
Validation Metrics:
Accuracy: 0.90
Precision: 0.94
Recall: 0.90
F1-Score: 0.92
----------------------------------------
Regularization C=100
Validation Metrics:
Accuracy: 0.90
Precision: 0.94
Recall: 0.90
F1-Score: 0.92
----------------------------------------
Regularization C=1000
Validation Metrics:
Accuracy: 0.90
Precision: 0.94
Recall: 0.90
F1-Score: 0.92
----------------------------------------


In [279]:
# model 2: Polynomial Features + Logistic Regression
print("\nLogistic Regression with Polynomial Features (degree 2):")
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)

for C in [0.01, 0.1, 1, 10, 100, 1000]:
    print(f"Regularization C={C}")
    logistic_model = LogisticRegression(C=C, penalty='l1', solver='liblinear', random_state=seed_value)
    logistic_model.fit(X_train_poly, y_train)
    print("Validation Metrics:")
    evaluate_model(logistic_model, X_val_poly, y_val)
    print("-" * 40)


Logistic Regression with Polynomial Features (degree 2):
Regularization C=0.01
Validation Metrics:
Accuracy: 0.91
Precision: 0.94
Recall: 0.91
F1-Score: 0.93
----------------------------------------
Regularization C=0.1
Validation Metrics:
Accuracy: 0.91
Precision: 0.94
Recall: 0.92
F1-Score: 0.93
----------------------------------------
Regularization C=1
Validation Metrics:
Accuracy: 0.94
Precision: 0.95
Recall: 0.94
F1-Score: 0.95
----------------------------------------
Regularization C=10
Validation Metrics:
Accuracy: 0.93
Precision: 0.95
Recall: 0.94
F1-Score: 0.95
----------------------------------------
Regularization C=100
Validation Metrics:
Accuracy: 0.93
Precision: 0.95
Recall: 0.94
F1-Score: 0.95
----------------------------------------
Regularization C=1000
Validation Metrics:
Accuracy: 0.93
Precision: 0.95
Recall: 0.94
F1-Score: 0.95
----------------------------------------


In [280]:
# model 3: PCA Transformation + Logistic Regression
print("\nLogistic Regression with PCA Transformation (95% variance retained):")
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

for C in [0.01, 0.1, 1, 10, 100, 1000]:
    print(f"Regularization C={C}")
    logistic_model = LogisticRegression(C=C, penalty='l1', solver='liblinear', random_state=seed_value)
    logistic_model.fit(X_train_pca, y_train)
    print("Validation Metrics:")
    evaluate_model(logistic_model, X_val_pca, y_val)
    print("-" * 40)


Logistic Regression with PCA Transformation (95% variance retained):
Regularization C=0.01
Validation Metrics:
Accuracy: 0.47
Precision: 0.60
Recall: 0.48
F1-Score: 0.53
----------------------------------------
Regularization C=0.1
Validation Metrics:
Accuracy: 0.47
Precision: 0.60
Recall: 0.48
F1-Score: 0.53
----------------------------------------
Regularization C=1
Validation Metrics:
Accuracy: 0.47
Precision: 0.60
Recall: 0.48
F1-Score: 0.53
----------------------------------------
Regularization C=10
Validation Metrics:
Accuracy: 0.47
Precision: 0.60
Recall: 0.48
F1-Score: 0.53
----------------------------------------
Regularization C=100
Validation Metrics:
Accuracy: 0.47
Precision: 0.60
Recall: 0.48
F1-Score: 0.53
----------------------------------------
Regularization C=1000
Validation Metrics:
Accuracy: 0.47
Precision: 0.60
Recall: 0.48
F1-Score: 0.53
----------------------------------------


### Lets do a final run on the test data

In [282]:
# Best Logistic Regression Model: PCA transformation
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)
logistic_model = LogisticRegression(C=C, penalty='l1', solver='liblinear', random_state=seed_value)
logistic_model.fit(X_train, y_train)
# Evaluate on Test Set
y_test_pred = logistic_model.predict(X_test)

accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

print(f"Test Set Metrics for Logistic Regression (C=10):")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Test Set Metrics for Logistic Regression (C=10):
Accuracy: 0.91
Precision: 0.95
Recall: 0.90
F1-Score: 0.93
