In [2]:
from pathlib import Path

import pandas as pd
from sklearn.metrics import accuracy_score, mean_squared_error

from utils.prepare_data import read_data, save_data
from utils.project_parameters import summary_statistic_order

### Get performance metrics for each feature subset

In [7]:
def get_validation_metric(filename, df, is_baseline=False):
    """Score one table of validation predictions and label the result.

    The target name and the feature-subset code are parsed from the stem of
    ``filename``, which is split on underscores:

    * baseline files must have exactly three fields; the first is the target
      and the feature subset is fixed to ``'1111111'``;
    * every other file must have exactly four fields; the first is the target
      and the third has the form ``<prefix>-<code>``, where ``<code>`` becomes
      the feature subset.

    The metric depends on the columns present in ``df``: if it has a
    ``true_log_selection_coefficient`` column the predictions are scored with
    root-mean-squared error, otherwise with classification accuracy on the
    ``true_ix`` / ``predicted_ix`` columns.

    Parameters
    ----------
    filename : str or path-like
        Path of the file ``df`` was read from; only its stem is used.
    df : pandas.DataFrame
        Table of true and predicted values.
    is_baseline : bool, default False
        Whether ``filename`` follows the three-field baseline naming scheme.

    Returns
    -------
    tuple
        ``(feature_subset, target, metric, value)`` where ``metric`` is
        ``"rmse"`` or ``"accuracy"``.

    Raises
    ------
    ValueError
        If the stem does not have the expected number of underscore-separated
        fields.
    IndexError
        If the feature field of a non-baseline file contains no ``'-'``.
    """
    name_fields = Path(filename).stem.split('_')
    if is_baseline:
        target, _, _ = name_fields
        # NOTE(review): presumably "every summary statistic enabled" — confirm
        # that this matches the length of summary_statistic_order.
        feature_subset = '1111111'
    else:
        target, _, features, _ = name_fields
        feature_subset = features.split('-')[1]

    if 'true_log_selection_coefficient' in df.columns:
        metric = "rmse"
        # The `squared=False` keyword was deprecated in scikit-learn 1.4 and
        # removed in 1.6. Taking the square root of the plain MSE gives the
        # same number for a single 1-D target and works on every release.
        value = mean_squared_error(
            y_true=df["true_log_selection_coefficient"],
            y_pred=df["predicted_log_selection_coefficient"],
        ) ** 0.5
    else:
        metric = "accuracy"
        value = accuracy_score(
            y_true=df["true_ix"],
            y_pred=df["predicted_ix"],
        )
    return (feature_subset, target, metric, value)

In [18]:
# Every record returned by get_validation_metric has these four fields.
metric_columns = ['feature_subset', 'target', 'metric', 'value']

# One row per feature-subset validation file (files are read lazily, as the
# frame is built).
subset_rows = (
    get_validation_metric(path, read_data(path))
    for path in snakemake.input["feature_subsets"]
)
feature_analysis = pd.DataFrame.from_records(subset_rows, columns=metric_columns)

# One row per baseline validation file.
baseline_rows = (
    get_validation_metric(path, read_data(path), is_baseline=True)
    for path in snakemake.input["baseline"]
)
baseline = pd.DataFrame.from_records(baseline_rows, columns=metric_columns)

# Stack the baseline rows underneath the feature-subset rows; each frame keeps
# its own row labels.
feature_analysis = pd.concat([feature_analysis, baseline])

In [20]:
# Write the combined metrics table to the Snakemake output registered under
# "feature_analysis" (the on-disk format is decided by the project's save_data helper).
save_data(feature_analysis, snakemake.output["feature_analysis"])

### Get grid of feature subsets

In [21]:
# Distinct feature-subset codes across all scored files, in order of first appearance.
subsets = feature_analysis["feature_subset"].unique()

In [23]:
def subset_to_features(subset):
    """Expand a feature-subset code into one integer flag per character.

    Parameters
    ----------
    subset : str
        Code whose characters are each parseable by ``int`` (e.g. ``'1010'``).

    Returns
    -------
    tuple
        The original code followed by the integer value of each of its
        characters, e.g. ``('1010', 1, 0, 1, 0)``.
    """
    return (subset, *map(int, subset))

In [24]:
# First column holds the code itself; the remaining columns are named after the
# entries of summary_statistic_order and hold the per-character flags.
grid_columns = ["feature_subset", *summary_statistic_order]
grid = pd.DataFrame.from_records(
    map(subset_to_features, subsets),
    columns=grid_columns,
)

In [26]:
# Write the subset-to-feature grid to the Snakemake output registered under
# "feature_analysis_code" (the on-disk format is decided by the project's save_data helper).
save_data(grid, snakemake.output["feature_analysis_code"])