In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from utility_functions import SquaredHingeLoss, get_acc

In [None]:
# Define the linear model
class LinearModel(nn.Module):
    """Linear model with a single output: one ``nn.Linear`` layer mapping
    ``input_size`` features to one value per sample."""

    def __init__(self, input_size):
        """Build the layer.

        Args:
            input_size: number of input features per sample.
        """
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return its output."""
        prediction = self.linear(x)
        return prediction

In [None]:
# Define the L1 regularization term
def l1_regularization(model, lambda_l1):
    """Return the L1 penalty ``lambda_l1 * sum(|p|)`` over the parameters of ``model``.

    Every tensor yielded by ``model.parameters()`` is included, so for an
    ``nn.Linear`` layer the bias is penalised together with the weights.

    Args:
        model: ``nn.Module`` whose parameters are penalised.
        lambda_l1: multiplier applied to the summed absolute values.

    Returns:
        A 0-dim tensor that can be added to a loss and back-propagated.
        A model without parameters yields a zero tensor scaled by
        ``lambda_l1``, so callers always receive a tensor.
    """
    # abs().sum() is the entry-wise L1 norm; it replaces the deprecated
    # torch.norm(param, p=1) call while computing the same quantity.
    abs_sums = [param.abs().sum() for param in model.parameters()]
    if not abs_sums:
        return lambda_l1 * torch.zeros(())
    return lambda_l1 * sum(abs_sums)

In [None]:
# Keep sequenceID plus the int64/float64 columns that contain no NaN or infinite values
def filter_numeric_columns(df):
    """Keep ``sequenceID`` plus every complete, finite int64/float64 column.

    A column other than ``sequenceID`` is kept only when its dtype is
    ``int64`` or ``float64``, it has no missing values and it contains
    neither ``+inf`` nor ``-inf``. Kept columns follow ``df.columns`` order,
    after ``sequenceID``, which always comes first.

    Args:
        df: DataFrame that contains a ``sequenceID`` column.

    Returns:
        ``df`` restricted to ``sequenceID`` and the qualifying columns.

    Raises:
        KeyError: if ``df`` has no ``sequenceID`` column.
    """
    kept_columns = ['sequenceID']
    for column in df.columns:
        # sequenceID is already kept; appending it again when it is numeric
        # would duplicate the column and turn df['sequenceID'] into a
        # DataFrame for downstream code.
        if column == 'sequenceID':
            continue
        values = df[column]
        if values.dtype not in ('int64', 'float64'):
            continue
        if not values.notna().all():
            continue
        if values.isin([np.inf, -np.inf]).any():
            continue
        kept_columns.append(column)
    return df[kept_columns]

In [None]:
# Split the inputs, outputs and evaluation tables into train/test parts for one fold
def get_fold_dfs(fold, fold_df, inputs_df, outputs_df, evaluation_df):
    """Split the data tables into train and test parts for one fold.

    Rows of ``fold_df`` whose ``fold`` value equals ``fold`` define the test
    sequences; all other rows define the training sequences. Each table is
    then restricted by ``sequenceID`` membership.

    Both input frames go through ``filter_numeric_columns`` and are then
    limited to the columns that survived in *both* of them, so the train and
    test feature matrices always have identical columns in identical order.
    When the two filtered frames already agree, nothing is removed.

    Args:
        fold: label of the held-out fold.
        fold_df: DataFrame with ``sequenceID`` and ``fold`` columns.
        inputs_df: feature table with a ``sequenceID`` column.
        outputs_df: target table with a ``sequenceID`` column.
        evaluation_df: evaluation table with a ``sequenceID`` column.

    Returns:
        Tuple ``(train_inputs_df, train_outputs_df, train_eval_df,
        test_inputs_df, test_eval_df)``.
    """
    # Compute fold membership once instead of once per table.
    train_ids = fold_df.loc[fold_df['fold'] != fold, 'sequenceID']
    test_ids = fold_df.loc[fold_df['fold'] == fold, 'sequenceID']

    train_inputs_df = filter_numeric_columns(inputs_df[inputs_df['sequenceID'].isin(train_ids)])
    test_inputs_df = filter_numeric_columns(inputs_df[inputs_df['sequenceID'].isin(test_ids)])

    # A column can be complete in one split but hold NaN/inf in the other;
    # keeping it on one side only would give the model mismatched features.
    train_columns = train_inputs_df.columns
    test_columns = test_inputs_df.columns
    train_inputs_df = train_inputs_df.loc[:, train_columns.isin(test_columns)]
    test_inputs_df = test_inputs_df.loc[:, test_columns.isin(train_columns)]

    # NOTE(review): inputs and outputs are row-filtered independently, so
    # callers that pair them positionally rely on both tables listing
    # sequenceIDs in the same order - confirm against the data files.
    train_outputs_df = outputs_df[outputs_df['sequenceID'].isin(train_ids)]
    train_eval_df = evaluation_df[evaluation_df['sequenceID'].isin(train_ids)]
    test_eval_df = evaluation_df[evaluation_df['sequenceID'].isin(test_ids)]

    return train_inputs_df, train_outputs_df, train_eval_df, test_inputs_df, test_eval_df

In [None]:
# Cross-validated search over the L1 penalty strength, run once per dataset.
# For every candidate lambda_l1 a fresh LinearModel is trained on each fold's
# training split and scored on the held-out split with get_acc; the candidate
# with the highest mean score is remembered in best_lambda_l1.
for dataset in ['detailed', 'systematic', 'epigenomic']:
    # Paths setup
    # NOTE(review): acc_rate_path and output_df_path are not used in the
    # lines visible here - confirm whether later code consumes them.
    fold_path = f'training_data/{dataset}/folds.csv'
    inputs_path = f'training_data/{dataset}/inputs_old.csv'
    outputs_path = f'training_data/{dataset}/outputs.csv'
    evaluation_path = f'training_data/{dataset}/evaluation.csv'
    acc_rate_path = f'acc_rate/{dataset}.csv'
    output_df_path = f'record_dataframe/{dataset}/'

    # Read dataframes
    fold_df = pd.read_csv(fold_path)
    inputs_df = pd.read_csv(inputs_path)
    outputs_df = pd.read_csv(outputs_path)
    evaluation_df = pd.read_csv(evaluation_path)

    # Number of folds (count of distinct values in the 'fold' column)
    n_folds = fold_df['fold'].nunique()

    # Define candidate lambda_l1 values
    candidate_lambda_l1 = [0, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 20]

    # Initialize variables to store best lambda_l1 and corresponding best accuracy
    # best_accuracy starts at 0 and is only replaced by a strictly greater
    # value, so ties keep the earlier candidate and best_lambda_l1 stays None
    # if no candidate averages above 0.
    best_lambda_l1 = None
    best_accuracy = 0

    # Iterate over each candidate lambda_l1
    for lambda_l1 in candidate_lambda_l1:
        total_acc = 0
        # NOTE(review): assumes the fold labels are exactly 1..n_folds - confirm.
        for fold in range(1, n_folds + 1):
            train_inputs_df, train_outputs_df, train_eval_df, test_inputs_df, test_eval_df = get_fold_dfs(fold, fold_df, inputs_df, outputs_df, evaluation_df)

            # Convert data to tensors
            # Features are every column except sequenceID. The two target
            # columns are reshaped to column vectors and joined along dim=1,
            # giving one (min.log.lambda, max.log.lambda) pair per row.
            # NOTE(review): inputs and outputs are paired by row position -
            # presumably both CSVs list sequenceIDs in the same order; verify.
            inputs = torch.Tensor(train_inputs_df.drop(columns=['sequenceID']).to_numpy())
            test_inputs = torch.Tensor(test_inputs_df.drop(columns=['sequenceID']).to_numpy())
            targets_low = torch.Tensor(train_outputs_df['min.log.lambda'].to_numpy().reshape(-1, 1))
            targets_high = torch.Tensor(train_outputs_df['max.log.lambda'].to_numpy().reshape(-1, 1))
            outputs = torch.cat((targets_low, targets_high), dim=1)

            # Hyperparameters
            # n_iters is only an upper bound; the early-stopping check below
            # can end training sooner.
            lr = 0.00001
            n_iters = 10000000

            # Initialize the model (a new one per fold and per candidate;
            # no random seed is set in the visible code)
            model = LinearModel(inputs.shape[1])

            # Define loss function and optimizer
            # SquaredHingeLoss comes from utility_functions; its exact
            # definition is not visible here.
            criterion = SquaredHingeLoss()
            optimizer = optim.Adam(model.parameters(), lr=lr)

            # Training loop
            # Each iteration is one optimizer step on the whole training
            # tensor. Early stopping tracks the training loss *including* the
            # L1 penalty and stops after `patience` consecutive iterations
            # without a strict improvement.
            best_loss = float('inf')
            patience = 100000
            wait = 0
            for epoch in range(n_iters):
                model.train()
                loss = criterion(model(inputs), outputs)
                loss += l1_regularization(model, lambda_l1)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if loss < best_loss:
                    best_loss = loss
                    wait = 0
                else:
                    wait += 1
                    if wait >= patience:
                        print("Early stopping after {} epochs without improvement.".format(patience))
                        break

            # Calculate accuracy
            # Predict on the held-out rows without tracking gradients and
            # flatten the predictions to a 1-D array.
            with torch.no_grad():
                lldas = model(test_inputs).numpy().reshape(-1)

            # Pair each test sequenceID with its prediction and score the
            # table with get_acc (imported helper; semantics not shown here).
            lldas_df = pd.DataFrame(list(zip(test_inputs_df['sequenceID'], lldas)), columns=['sequenceID', 'llda'])
            acc = get_acc(test_eval_df, lldas_df)
            total_acc += acc
            print(dataset, fold, acc)

        # Calculate average accuracy across folds
        avg_acc = total_acc / n_folds

        # Check if this lambda_l1 gives better accuracy
        if avg_acc > best_accuracy:
            best_accuracy = avg_acc
            best_lambda_l1 = lambda_l1