In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import os

In [2]:
# Label for this run; train_and_predict embeds it as the first component of
# every prediction file name it writes under predictions/.
category = 'proposed'

In [3]:
def train_and_predict(dataset, test_fold, number_of_features):
    """Fit lower/upper linear models on all folds but one and save test-fold predictions.

    Two ``LinearRegression`` models are trained: one on the ``.lower`` feature
    columns against ``min.log.lambda`` and one on the ``.upper`` feature
    columns against ``max.log.lambda``.  Their test-fold predictions are
    averaged and written to
    ``predictions/{category}.{dataset}.{test_fold}.{number_of_features}.csv``
    with the columns ``sequenceID`` and ``llda``.

    Parameters
    ----------
    dataset : str
        Name of a sub-directory of ``../../training_data`` that contains
        ``folds.csv``, ``features_sorted_transformed.csv`` and ``target.csv``.
    test_fold
        Value of the ``fold`` column that marks the held-out rows; every other
        row is used for training.
    number_of_features : int
        Only the first ``1 + 2 * number_of_features`` columns of the feature
        file are used (presumably ``sequenceID`` followed by one lower/upper
        column pair per feature -- TODO confirm against the feature file).

    Returns
    -------
    None
        The predictions are only written to disk.

    Notes
    -----
    Reads the module-level ``category`` to build the output file name.
    """
    # Load data
    data_dir = f'../../training_data/{dataset}'
    folds_df = pd.read_csv(f'{data_dir}/folds.csv')
    features_df = pd.read_csv(f'{data_dir}/features_sorted_transformed.csv').iloc[:, :1 + number_of_features * 2]
    target_df = pd.read_csv(f'{data_dir}/target.csv')

    # Split sequence ids into training and test sets
    train_ids = folds_df.loc[folds_df['fold'] != test_fold, 'sequenceID']
    test_ids = folds_df.loc[folds_df['fold'] == test_fold, 'sequenceID']

    features_df_test = features_df[features_df['sequenceID'].isin(test_ids)]

    # Pair every training feature row with its target by sequenceID instead of
    # by row position, so the two CSV files do not have to share a row order.
    # validate= makes a target file with duplicated ids fail loudly rather
    # than silently duplicating training rows.
    train_df = features_df[features_df['sequenceID'].isin(train_ids)].merge(
        target_df[['sequenceID', 'min.log.lambda', 'max.log.lambda']],
        on='sequenceID',
        how='inner',
        validate='many_to_one',
    )

    def train_model(suffix, target_column):
        """Fit on the ``.{suffix}`` feature columns and predict the test fold."""
        feature_cols = features_df.filter(regex=fr'\.{suffix}$').columns

        # Keep only rows with a usable target: drops NaN and both infinities
        # (an unbounded limit carries no regression target).
        y_all = train_df[target_column].to_numpy(dtype=float)
        usable = np.isfinite(y_all)

        # nan_to_num maps NaN to 0 (and +/-inf to the largest finite floats)
        X_train = np.nan_to_num(train_df[feature_cols].to_numpy()[usable])
        y_train = y_all[usable]
        X_test = np.nan_to_num(features_df_test[feature_cols].to_numpy())

        # Train model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predict
        return model.predict(X_test)

    # Train and predict for both lower and upper models
    lda_lower_pred = train_model('lower', 'min.log.lambda')
    lda_upper_pred = train_model('upper', 'max.log.lambda')

    # Average the two predictions element-wise
    pred_lldas = (lda_lower_pred + lda_upper_pred) / 2

    # Make sure the output directory exists before writing
    os.makedirs('predictions', exist_ok=True)
    lldas_df = pd.DataFrame({
        'sequenceID': features_df_test['sequenceID'].to_numpy(),
        'llda': pred_lldas,
    })
    lldas_df.to_csv(f'predictions/{category}.{dataset}.{test_fold}.{number_of_features}.csv', index=False)

In [4]:
# Root directory holding one sub-directory per dataset.
folder_path = '../../training_data'
# os.listdir returns entries in arbitrary order; sort the dataset names so the
# processing order is reproducible from run to run.
datasets = sorted(
    name
    for name in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, name))
)

In [5]:
# Read each dataset's fold labels once, up front.  Using the labels that are
# actually present (rather than assuming they are exactly 1..K) keeps the loop
# correct for 0-based or non-contiguous fold numbering.
fold_labels = {
    dataset: sorted(
        pd.read_csv(f'../../training_data/{dataset}/folds.csv')['fold'].unique().tolist()
    )
    for dataset in datasets
}

# One prediction file per (feature count, dataset, held-out fold);
# feature counts run from 1 through 20.
for number_of_features in range(1, 21):
    for dataset in datasets:
        for test_fold in fold_labels[dataset]:
            train_and_predict(dataset, test_fold, number_of_features)