In [21]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def split_and_visualize_dataset(file_path, output_dir):
    """
    Splits the dataset into training, validation, and test sets,
    and saves a visualization of the split sizes.

    Parameters:
    - file_path: str, path to the CSV file.
    - output_dir: str, directory where the visualization will be saved.

    Returns:
    - None, but saves the visualization as a PNG file in the specified directory.
    """
    # Load the dataset
    data = pd.read_csv(file_path)

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Separate features and target variable
    X = data.drop(columns=["Market_Label", "Date"])
    y = data["Market_Label"]

    # First, split into training (70%) and temp (30% for validation and test)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )

    # Split the temp set further into validation (15%) and test (15%)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
    )

    # Visualize the split sizes
    split_sizes = [len(X_train), len(X_val), len(X_test)]
    labels = ['Training Set', 'Validation Set', 'Test Set']

    plt.figure(figsize=(8, 5))
    plt.bar(labels, split_sizes, color=['blue', 'orange', 'green'])
    plt.title('Dataset Split Visualization', fontsize=14)
    plt.ylabel('Number of Entries', fontsize=12)
    plt.xlabel('Dataset Splits', fontsize=12)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Save the plot
    output_file = os.path.join(output_dir, "dataset_split_visualization.png")
    plt.savefig(output_file)
    plt.close()

    print(f"Visualization saved to {output_file}")
    # Return the splits
    return X_train, X_val, X_test, y_train, y_val, y_test

def train_logistic_regression(X_train, y_train, X_val, y_val):
    """
    Trains Logistic Regression models with L1 and L2 regularization and evaluates them.

    Parameters:
    - X_train, y_train: Training features and labels.
    - X_val, y_val: Validation features and labels.

    Returns:
    - None, prints evaluation metrics.
    """
    for penalty in ["l1", "l2"]:
        print(f"\nTraining Logistic Regression with {penalty.upper()} regularization")
        model = LogisticRegression(penalty=penalty, solver="liblinear", random_state=42)
        model.fit(X_train, y_train)

        # Predict on validation set
        y_pred = model.predict(X_val)

        # Evaluate the model
        accuracy = accuracy_score(y_val, y_pred)
        print(f"Accuracy with {penalty.upper()} regularization: {accuracy:.4f}")
        print("Classification Report:\n", classification_report(y_val, y_pred))

def plot_confusion_matrix(y_true, y_pred, title, output_dir):
    """
    Plots and saves a confusion matrix visualization.

    Parameters:
    - y_true: True labels.
    - y_pred: Predicted labels.
    - title: Title for the plot.
    - output_dir: Directory where the plot will be saved.

    Returns:
    - None
    """
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[-1, 1])
    
    plt.figure(figsize=(8, 6))
    disp.plot(cmap='Blues', values_format='d')
    plt.title(title)
    
    output_file = os.path.join(output_dir, f"{title.replace(' ', '_').lower()}_confusion_matrix.png")
    plt.savefig(output_file)
    plt.close()
    print(f"{title} Confusion Matrix saved to {output_file}")

def fine_tune_logistic_regression(X_train, y_train):
    """
    Fine-tunes Logistic Regression hyperparameters using GridSearchCV.

    Parameters:
    - X_train, y_train: Training features and labels.

    Returns:
    - Best estimator from GridSearchCV.
    """
    param_grid = {
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear']
    }

    grid_search = GridSearchCV(
        LogisticRegression(random_state=42),
        param_grid,
        cv=5,
        scoring='accuracy',
        verbose=1
    )

    grid_search.fit(X_train, y_train)

    print("\nBest Hyperparameters:")
    print(grid_search.best_params_)
    print("\nBest Cross-Validated Accuracy:")
    print(grid_search.best_score_)

    return grid_search.best_estimator_

# split the standardscaled nasdaq data
csv_file_path_standardscaled_nasdaq = "../data/standardscaler/cleaned_normalized_combined_data_nasdaq.csv"
output_directory_standardscaled_nasdaq = "../data/data_split/standard_scaler_nasdaq"
split_and_visualize_dataset(csv_file_path_standardscaled_nasdaq, output_directory_standardscaled_nasdaq)

# split the standardscaled sp500
csv_file_path_standardscaled_sp500 = "../data/standardscaler/cleaned_normalized_combined_data_sp500.csv"
output_directory_standardscaled_sp500 = "../data/data_split/standard_scaler_sp500"
split_and_visualize_dataset(csv_file_path_standardscaled_sp500, output_directory_standardscaled_sp500)

print("Run logistic regression model for standardscaled nasdaq data")

# Run logistic regression model for standardscaled nasdaq data
X_train, X_val, X_test, y_train, y_val, y_test = split_and_visualize_dataset(csv_file_path_standardscaled_nasdaq, output_directory_standardscaled_nasdaq)
train_logistic_regression(X_train, y_train, X_val, y_val)

# Train and plot confusion matrices for both L1 and L2 regularization - Standard-scaled nasdaq
for penalty in ["l1", "l2"]:
    print(f"\nEvaluating Logistic Regression with {penalty.upper()} regularization")
    model = LogisticRegression(penalty=penalty, solver="liblinear", random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    
    # Plot confusion matrix
    output_dir_standardscaled_nasdaq = '../output/logistic_regression/nasdaq'
    plot_confusion_matrix(y_val, y_pred, f"Logistic Regression ({penalty.upper()})", output_dir_standardscaled_nasdaq)

# Perform fine-tuning of Logistic Regression - Standard-scaled nasdaq
best_model = fine_tune_logistic_regression(X_train, y_train)

# Evaluate the best model on the validation set - Standard-scaled nasdaq
y_pred = best_model.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))

print("Run logistic regression model for standardscaled sp500 data")

# Run logistic regression model for standardscaled sp500 data
X_train, X_val, X_test, y_train, y_val, y_test = split_and_visualize_dataset(csv_file_path_standardscaled_sp500, output_directory_standardscaled_sp500)
train_logistic_regression(X_train, y_train, X_val, y_val)

# Train and plot confusion matrices for both L1 and L2 regularization - Standard-scaled sp500
for penalty in ["l1", "l2"]:
    print(f"\nEvaluating Logistic Regression with {penalty.upper()} regularization")
    model = LogisticRegression(penalty=penalty, solver="liblinear", random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    
    # Plot confusion matrix
    output_dir_standardscaled_sp500 = '../output/logistic_regression/sp500'
    plot_confusion_matrix(y_val, y_pred, f"Logistic Regression ({penalty.upper()})", output_dir_standardscaled_sp500)

# Perform fine-tuning of Logistic Regression - Standard-scaled sp500
best_model = fine_tune_logistic_regression(X_train, y_train)

# Evaluate the best model on the validation set - Standard-scaled sp500
y_pred = best_model.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))



Visualization saved to ../data/data_split/standard_scaler_nasdaq\dataset_split_visualization.png
Visualization saved to ../data/data_split/standard_scaler_sp500\dataset_split_visualization.png
Run logistic regression model for standardscaled nasdaq data
Visualization saved to ../data/data_split/standard_scaler_nasdaq\dataset_split_visualization.png

Training Logistic Regression with L1 regularization
Accuracy with L1 regularization: 0.8655
Classification Report:
               precision    recall  f1-score   support

          -1       0.68      0.57      0.62       188
           1       0.90      0.94      0.92       786

    accuracy                           0.87       974
   macro avg       0.79      0.75      0.77       974
weighted avg       0.86      0.87      0.86       974


Training Logistic Regression with L2 regularization
Accuracy with L2 regularization: 0.8655
Classification Report:
               precision    recall  f1-score   support

          -1       0.68      0.57

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>