In [43]:
from pathlib import Path

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

from utils.prepare_data import target_columns, read_data
from utils.project_parameters import gradient_boosting_params
from utils.deeplearning import SweepsDataset

In [3]:
# Passed unchanged to both SweepsDataset loaders below.
# NOTE(review): None presumably means "use every feature" - confirm in SweepsDataset.
feature_subset = None

# Map the Snakemake `target` wildcard onto the configured target column entry.
target_col = target_columns[snakemake.wildcards["target"]]

In [26]:
# Build one SweepsDataset per split. Both share the same data input, target
# column and feature subset; only the validation loader passes is_validation=True.
training_loader = SweepsDataset(
    snakemake.input["data"],
    read_data(snakemake.input["training"]),
    target_col,
    feature_subset=feature_subset,
)
validation_loader = SweepsDataset(
    snakemake.input["data"],
    read_data(snakemake.input["validation"]),
    target_col,
    is_validation=True,
    feature_subset=feature_subset,
)

# Each split gets its own dict so later cells can attach "X" and "y" next to the loader.
datasets = {
    "training": {"loader": training_loader},
    "validation": {"loader": validation_loader},
}

In [64]:
# Turn every split into a tabular (X, y) pair for scikit-learn.
# NOTE(review): X rows follow the key order of `loader.data`, whereas y follows the
# row order of `loader.df`; both are later paired positionally - confirm the two
# orders always agree.
for split in datasets.values():
    loader = split["loader"]

    # One row per entry of loader.data: each value is flattened to a 1-D vector.
    flattened_rows = (tensor.flatten().numpy() for tensor in loader.data.values())
    split["X"] = pd.DataFrame.from_records(flattened_rows, index=loader.data.keys())

    # Target values taken from the loader's table, re-labelled by uuid.
    targets = loader.df[loader.target_column]
    split["y"] = targets.set_axis(loader.df.uuid)

In [65]:
# Override the number of boosting stages with the value from the workflow config.
n_estimators = int(snakemake.config["num_gradient_boosting_estimators"])
gradient_boosting_params["n_estimators"] = n_estimators

In [66]:
# Choose the estimator that matches the task reported by the training loader;
# the second element returned by get_task() is not needed here.
task, _ = datasets["training"]["loader"].get_task()

if task == "classification":
    model = GradientBoostingClassifier()
elif task == "regression":
    model = GradientBoostingRegressor()
else:
    # Fail loudly instead of hitting a NameError on a fresh kernel, or silently
    # reusing a stale `model` left over from an earlier run of this cell.
    raise ValueError(f"Unsupported task type: {task!r}")

# Apply the shared hyperparameters held in gradient_boosting_params.
model.set_params(**gradient_boosting_params)

In [69]:
# Fit on the training split only; the validation split is used later for inference.
train_split = datasets["training"]
model.fit(train_split["X"], train_split["y"])

### Save model fitting outcomes

In [115]:
def get_training_inferences(model, datasets):
    """Collect per-sample predictions for the training and validation splits.

    Parameters
    ----------
    model
        Fitted estimator. Classification requires ``classes_``, ``predict`` and
        ``predict_proba``; regression only requires ``predict``.
    datasets : dict
        Mapping with ``"training"`` and ``"validation"`` entries. Each entry is a
        dict holding ``"loader"`` (provides ``get_task()``, a ``df`` with a
        ``uuid`` column and, for regression, ``labels``), ``"X"`` (features) and
        ``"y"`` (true targets). The task is read from the training loader only.

    Returns
    -------
    dict
        ``{"training": DataFrame, "validation": DataFrame}``, each indexed by
        ``uuid``. Classification frames hold one probability column per class
        followed by ``true_ix``, ``predicted_ix``, ``true_label`` and
        ``predicted_label``. Regression frames hold ``predicted_<labels>`` and
        ``true_<labels>``.

    Raises
    ------
    ValueError
        If the task is neither ``"classification"`` nor ``"regression"``, or if
        a label is not among ``model.classes_``.
    """
    task, _ = datasets["training"]["loader"].get_task()
    if task not in ("classification", "regression"):
        # Previously this surfaced as an UnboundLocalError on `result`.
        raise ValueError(f"Unsupported task type: {task!r}")

    # Class order defines both the probability columns and the *_ix values;
    # it does not depend on the split, so compute it once.
    labels = model.classes_.tolist() if task == "classification" else None

    inferences = dict()
    for kind in ["training", "validation"]:
        dataset = datasets[kind]
        features = dataset["X"]
        # Use a plain array so `assign` places uuids positionally. A Series would
        # be aligned on index labels against the fresh RangeIndex of the
        # prediction frame and yield NaN uuids if loader.df has any other index.
        uuids = dataset["loader"].df.uuid.to_numpy()

        if task == "classification":
            true_label = dataset["y"]
            true_ix = [labels.index(lab) for lab in true_label]
            predicted_probas = model.predict_proba(features)
            predicted_label = model.predict(features)
            predicted_ix = [labels.index(lab) for lab in predicted_label]
            result = (
                pd.DataFrame(predicted_probas, columns=labels)
                .assign(
                    uuid=uuids,
                    true_ix=true_ix,
                    predicted_ix=predicted_ix,
                    true_label=true_label.values,
                    predicted_label=predicted_label,
                )
                .set_index("uuid")
            )
        else:  # regression
            # `labels` is concatenated onto the column prefixes below, so it is
            # presumably a single string - TODO confirm against the loader.
            label = dataset["loader"].labels
            result = (
                pd.DataFrame(
                    model.predict(features), columns=["predicted_" + label]
                )
                .assign(
                    uuid=uuids,
                    true=dataset["y"].values,
                )
                .set_index("uuid")
                .rename({"true": "true_" + label}, axis="columns")
            )
        inferences[kind] = result
    return inferences

In [116]:
# Run inference on both splits and write each table as a tab-separated file,
# keeping the uuid index as the first column.
inferences = get_training_inferences(model, datasets)

for split_name, output_key in (
    ("training", "training_inferences"),
    ("validation", "validation_inferences"),
):
    inferences[split_name].to_csv(snakemake.output[output_key], sep='\t', index=True)