In [47]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import time


In [34]:
# Binary Classification

num_splits = 5

for i in range(num_splits):
    start_time=time.time()
    train_file_path = f'./F24_Proj3_Data/split_{i+1}/train.csv'
    test_file_path = f'./F24_Proj3_Data/split_{i+1}/test.csv'
    test_y_file_path = f'./F24_Proj3_Data/split_{i+1}/test_y.csv'

    # Load data
    X_train = pd.read_csv(train_file_path).iloc[:, 3:]
    y_train = pd.read_csv(train_file_path).iloc[:, 1]

    X_test = pd.read_csv(test_file_path).iloc[:, 2:]
    y_test = pd.read_csv(test_y_file_path).iloc[:, 1]

    clf = LogisticRegression(penalty=None, solver='lbfgs', max_iter=1000)
    clf.fit(X_train, y_train)

    y_pred_proba = clf.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, y_pred_proba)
    
    print(f'Split {i}: AUC Score for split {i}: {auc_score:.7f} | Execution time : {round(time.time() - start_time, 4)} seconds')


Split 0: AUC Score for split 0: 0.9861161 | Execution time : 14.9251 seconds
Split 1: AUC Score for split 1: 0.9860022 | Execution time : 15.3945 seconds
Split 2: AUC Score for split 2: 0.9855030 | Execution time : 16.0086 seconds
Split 3: AUC Score for split 3: 0.9858564 | Execution time : 16.8945 seconds
Split 4: AUC Score for split 4: 0.9856297 | Execution time : 15.0286 seconds


In [43]:
# Use LogisticRegressionCV with elastic net penalty. Using cross validation and finding the best C and l1 ratio to speed up training.

num_splits = 5
cv = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

for i in range(num_splits):
    start_time = time.time()
    train_file_path = f'./F24_Proj3_Data/split_{2}/train.csv'
    test_file_path = f'./F24_Proj3_Data/split_{2}/test.csv'
    test_y_file_path = f'./F24_Proj3_Data/split_{2}/test_y.csv'

    # Load data
    X_train = pd.read_csv(train_file_path).iloc[:, 3:]
    y_train = pd.read_csv(train_file_path).iloc[:, 1]

    X_test = pd.read_csv(test_file_path).iloc[:, 2:]
    y_test = pd.read_csv(test_y_file_path).iloc[:, 1]

    # LogisticRegressionCV with 'elasticnet' penalty
    log_reg_cv = LogisticRegressionCV(
        penalty='elasticnet',
        solver='saga',
        l1_ratios=[0.1],  
        cv=cv,
        max_iter=1000,
        n_jobs=-1,
        random_state=42
    )
    
    log_reg_cv.fit(X_train, y_train)

    best_C = log_reg_cv.C_
    best_l1_ratio = log_reg_cv.l1_ratio_

    print(f"Split {i+1}: Best C: {best_C} | Best l1_ratio: {best_l1_ratio}")

    y_pred_proba = log_reg_cv.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, y_pred_proba)
    print(f'Split {i+1}: AUC Score for LogisticRegressionCV: {auc_score:.7f} | Execution time: {round(time.time() - start_time, 4)} seconds')



Split 1: Best C: [2.7825594] | Best l1_ratio: [0.1]
Split 1: AUC Score for LogisticRegressionCV: 0.9865602 | Execution time: 949.7106 seconds


KeyboardInterrupt: 

In [46]:
# Use LogisticRegression with best C and l1 ratio we found in the CV approach
num_splits = 5

for i in range(num_splits):
    start_time = time.time()
    train_file_path = f'./F24_Proj3_Data/split_{i+1}/train.csv'
    test_file_path = f'./F24_Proj3_Data/split_{i+1}/test.csv'
    test_y_file_path = f'./F24_Proj3_Data/split_{i+1}/test_y.csv'

    # Load data
    X_train = pd.read_csv(train_file_path).iloc[:, 3:]
    y_train = pd.read_csv(train_file_path).iloc[:, 1]

    X_test = pd.read_csv(test_file_path).iloc[:, 2:]
    y_test = pd.read_csv(test_y_file_path).iloc[:, 1]

    log_reg = LogisticRegression(
        penalty='elasticnet',
        solver='saga',
        l1_ratio=0.1,  
        C=2.7825594,
        max_iter=1000,
        n_jobs=-1,
        random_state=42
    )
    
    log_reg.fit(X_train, y_train)

    y_pred_proba = log_reg.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, y_pred_proba)
    print(f'Split {i+1}: AUC Score for LogisticRegressionCV: {auc_score:.7f} | Execution time: {round(time.time() - start_time, 4)} seconds')

Split 1: AUC Score for LogisticRegressionCV: 0.9869905 | Execution time: 24.5737 seconds
Split 2: AUC Score for LogisticRegressionCV: 0.9865600 | Execution time: 25.2715 seconds
Split 3: AUC Score for LogisticRegressionCV: 0.9862596 | Execution time: 25.7928 seconds
Split 4: AUC Score for LogisticRegressionCV: 0.9867768 | Execution time: 24.2103 seconds
Split 5: AUC Score for LogisticRegressionCV: 0.9862021 | Execution time: 25.2237 seconds
