In [10]:
import numpy as np
import pandas as pd
from utility_functions import get_acc, add_row_to_csv

In [11]:
def BIC(inputs_df):
    """Build the per-sequence ``llda`` table from the raw inputs.

    Parameters
    ----------
    inputs_df : pandas.DataFrame
        Must contain the columns ``'sequenceID'`` and ``'length'``.

    Returns
    -------
    pandas.DataFrame
        Two columns, in this order: ``'sequenceID'`` (copied from the input)
        and ``'llda'``, which is ``log(log(length))`` computed row by row.
        The index of ``inputs_df`` is preserved.

    Notes
    -----
    No validation is done on ``length``: a value of 1 gives ``-inf`` and
    values below 1 give ``NaN`` (plain ``numpy.log`` semantics).
    """
    # Start from a copy so the caller's frame (often a filtered slice) is
    # never written to.
    result = inputs_df[['sequenceID']].copy()
    result['llda'] = np.log(np.log(inputs_df['length']))
    return result

In [12]:
# For each dataset, compute the BIC() feature table on every fold and hand the
# resulting accuracy to add_row_to_csv under 'acc_rate_csvs/<dataset>.csv'.
# NOTE(review): get_acc / add_row_to_csv come from utility_functions and are
# not visible here; presumably get_acc scores lldas_df against the evaluation
# rows and add_row_to_csv appends one row per call -- confirm.
for dataset in ['detailed', 'systematic', 'epigenomic']:

    # training data
    data_dir = 'training_data/' + dataset
    fold_path = data_dir + '/folds.csv'
    inputs_path = data_dir + '/inputs.csv'
    evaluation_path = data_dir + '/evaluation.csv'

    # raw dfs
    fold_df = pd.read_csv(fold_path)
    inputs_df = pd.read_csv(inputs_path)
    evaluation_df = pd.read_csv(evaluation_path)

    # number of folds
    n_folds = fold_df['fold'].nunique()

    # Iterate over the fold labels that actually occur in folds.csv rather
    # than assuming they are exactly 1..n_folds: with 0-based or
    # non-contiguous labels, range(1, n_folds + 1) would skip real folds and
    # score empty ones. .tolist() yields native Python scalars, so for the
    # usual 1..n labelling the values written to the CSV are unchanged.
    fold_labels = sorted(fold_df['fold'].dropna().unique().tolist())

    # main function
    # total_acc is never updated in this cell; kept only in case a later cell
    # reads it.
    total_acc = 0
    for fold in fold_labels:
        # sequence IDs assigned to this fold (computed once, used for both
        # the inputs and the evaluation subset)
        fold_seq_ids = fold_df.loc[fold_df['fold'] == fold, 'sequenceID']
        fold_inputs_df = inputs_df[inputs_df['sequenceID'].isin(fold_seq_ids)]
        fold_eval_df = evaluation_df[evaluation_df['sequenceID'].isin(fold_seq_ids)]

        lldas_df = BIC(fold_inputs_df)
        add_row_to_csv('acc_rate_csvs/' + dataset + '.csv',
                       ['method', 'fold', 'feature engineer', 'acc'],
                       ['BIC.1', fold, 'yes', get_acc(fold_eval_df, lldas_df)])