In [2]:
from pathlib import Path

import pandas as pd
import fastai
import fastai.learner
import fastai.metrics
import fastai.callback.schedule
import fastai.callback.progress

from utils.prepare_data import read_data, save_data, target_columns, balancing_functions
from utils.project_parameters import data_dimension, summary_statistic_order, neural_network_batch_size
import utils.deeplearning

In [3]:
class SubsetSweepsDataset(utils.deeplearning.SweepsDataset):
    """SweepsDataset built from a random subsample of a dataframe.

    The dataframe is subsampled (and optionally rebalanced) first; the reduced
    frame is then handed to the parent ``SweepsDataset`` constructor together
    with the remaining arguments.

    Parameters
    ----------
    prop_subset : float
        Fraction of rows of ``df`` to keep (forwarded to ``DataFrame.sample`` as ``frac``).
    balancing_func : callable or None
        If not None, called as ``balancing_func(sampled_df, seed=seed)`` and its
        return value replaces the sampled frame.
    data_tar, target_column, is_validation, feature_subset
        Forwarded unchanged to the parent constructor.
    df : pandas.DataFrame
        Frame to subsample.
    seed : optional
        Random state for the subsampling and the balancing function. The
        default ``None`` keeps the previous unseeded behaviour; pass a fixed
        value to make the subset reproducible.
    """

    def __init__(self, prop_subset, balancing_func, data_tar, df, target_column,
                 is_validation=False, feature_subset=None, seed=None):
        # Shrink the frame before the parent sees it, so the parent only ever
        # works with the subsampled rows.
        smaller_df = self.subset_df(df, prop_subset, balancing_func, seed=seed)
        super().__init__(
            data_tar, smaller_df, target_column,
            is_validation=is_validation, feature_subset=feature_subset
        )

    def subset_df(self, df, proportion, balancing_func, seed=None):
        """Return a random ``proportion`` of the rows of ``df``, optionally rebalanced.

        ``seed`` is used as ``random_state`` for ``DataFrame.sample`` and is also
        passed to ``balancing_func`` (when one is given). With ``seed=None`` the
        result differs between calls.
        """
        result = df.sample(frac=proportion, random_state=seed)
        if balancing_func is not None:
            result = balancing_func(result, seed=seed)
        return result

In [4]:
# The Snakemake "target" wildcard selects both the label column and the
# matching balancing function for this run.
target_name = snakemake.wildcards["target"]
target_col = target_columns[target_name]
balance = balancing_functions[target_name]

# No feature restriction is requested; None is passed on to the datasets as-is.
feature_subset = None

# Number of entries in summary_statistic_order; used as the network's in_channels.
num_channels = len(summary_statistic_order)

In [10]:
# Build the validation dataset once; it is reused for every training subset.
validation_df = read_data(snakemake.input["validation"])
validation_dset = utils.deeplearning.SweepsDataset(
    snakemake.input["data"],
    validation_df,
    target_col,
    is_validation=True,
    feature_subset=feature_subset,
)

In [11]:
# Fractions of the training table to train on, from 10% up to the full table.
sample_proportions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Read the training table once instead of once per proportion. Each
# SubsetSweepsDataset draws its own sample from it via DataFrame.sample, which
# returns a new frame, so sharing the source frame across iterations is safe.
training_df = read_data(snakemake.input["training"])

# Maps each sample proportion to the DataLoaders built for it (the name is
# kept because the training cell iterates over this dict).
dataset_sizes = dict()
for prop in sample_proportions:
    train_ds = SubsetSweepsDataset(
        prop, balance, snakemake.input["data"], training_df,
        target_col, feature_subset=feature_subset
    )

    # Every proportion is paired with the same validation dataset.
    loader = fastai.data.core.DataLoaders.from_dsets(
        train_ds, validation_dset, bs=neural_network_batch_size
    )

    dataset_sizes[prop] = loader

In [17]:
# Number of training epochs, supplied as a Snakemake parameter.
num_epochs = int(snakemake.params["epochs"])

# Loss to optimise for each task type.
loss_functions = {
    'classification': fastai.losses.CrossEntropyLossFlat(),
    'regression': fastai.losses.MSELossFlat(),
}

# Metric to report for each task type.
metric_functions = {
    'classification': fastai.metrics.accuracy,
    'regression': fastai.metrics.rmse,
}

# The validation dataset reports the task type and the network output size;
# the task type then picks the loss and the metric from the tables above.
model_type, output_dim = validation_dset.get_task()
loss = loss_functions[model_type]
metric = metric_functions[model_type]

In [20]:
# Accumulator for the per-proportion training reports.
fit_reports = list()

In [21]:
# Reset the accumulator in the same cell that fills it, so re-running this
# cell does not append a second copy of every report.
fit_reports = []

for prop, loader in dataset_sizes.items():
    # Train a freshly initialised network for each training-set proportion.
    neural_network = utils.deeplearning.SimpleCNN2Layer(
        input_dim=data_dimension, output_dim=output_dim, in_channels=num_channels
    )
    model = fastai.learner.Learner(loader, neural_network, loss_func=loss, metrics=metric)
    model.fit_one_cycle(num_epochs)

    # Turn the recorder's logged values into a table. metric_names[1:-1] drops
    # the first and last names so the remaining ones line up with the value
    # columns (presumably the dropped names are "epoch" and "time" — confirm
    # against the fastai Recorder docs).
    fit_report = (
        pd.DataFrame.from_records(
            model.recorder.values, columns=model.recorder.metric_names[1:-1]
        )
        .assign(
            epoch=range(1, model.recorder.n_epoch + 1),  # 1-based epoch index
            sample_prop=prop,
            training_samples=len(loader.train_ds)
        )
    )

    fit_reports.append(fit_report)

In [22]:
# Stack the per-proportion reports row-wise into one table (each report keeps
# its own row index).
all_report = pd.concat(fit_reports, axis=0)

In [None]:
# Write the combined report to the path Snakemake expects for "fit_report".
fit_report_path = snakemake.output["fit_report"]
save_data(all_report, fit_report_path)