In [1]:
import pandas as pd
import numpy as np
import os
import time
import copy
import pathlib, tempfile

import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling shared by every figure in the notebook: hide the top/right
# spines and draw a light-gray grid on top of seaborn's "ticks" style.
custom_params = {
    "axes.spines.right": False,
    "grid.color": "lightgray",
    "axes.grid": True,
    "axes.spines.top": False,
}
sns.set_theme(style="ticks", rc=custom_params)

from survivors import metrics as metr
import survivors.datasets as ds
import survivors.constants as cnt

# Load the IPython profiling extensions (line_profiler and scalene).
# NOTE(review): neither extension's magics are used in the cells visible here;
# the benchmarks below rely on the built-in %timeit only.
%load_ext line_profiler
%load_ext scalene

Scalene extension successfully loaded. Note: Scalene currently only
supports CPU+GPU profiling inside Jupyter notebooks. For full Scalene
profiling, use the command line version.


In [2]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from survivors.experiments.grid import generate_sample, prepare_sample, count_metric

# Load the GBSG dataset through the survivors helper; it yields the feature
# frame, the target `y`, the feature names and two further values
# (`categ`, `sch_nan`) that are not used in the cells visible here.
X, y, features, categ, sch_nan = ds.load_gbsg_dataset()

# Working frame for the benchmarks: a copy of the features with the "time" and
# "cens" fields of `y` appended as the last two columns (X itself is untouched).
df = X.assign(time=y["time"], cens=y["cens"])

In [3]:
def get_comb_fast(df, features):
    """Build one parameter dict per feature from a single column selection.

    The frame is narrowed to ``features`` followed by the ``cnt.CENS_NAME``
    and ``cnt.TIME_NAME`` columns, converted to a NumPy array and transposed
    so that every row holds one column of the selection.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every name in ``features`` plus the two columns
        named by ``cnt.CENS_NAME`` and ``cnt.TIME_NAME``.
    features : list
        Column names to produce entries for (must be a ``list`` because it is
        concatenated with another list).

    Returns
    -------
    list of dict
        One dict per feature with keys ``"arr"`` (rows stacked in the order
        feature, censoring column, time column) and ``"type_attr"`` (always
        the string ``"categ"``).
    """
    selected = features + [cnt.CENS_NAME, cnt.TIME_NAME]
    matrix = df[selected].to_numpy().T
    # The last two rows are the censoring and time columns shared by all entries.
    cens_time_rows = matrix[-2:]

    # zip() with `features` keeps the result length bounded by the number of
    # requested features, exactly as pairing the two sequences would.
    return [
        {"arr": np.vstack((feature_row, cens_time_rows)), "type_attr": "categ"}
        for feature_row, _ in zip(matrix[:-2], features)
    ]

def get_comb_fastx3(df, features):
    """Build one parameter dict per feature from a column-name lookup table.

    The whole frame is converted once into a ``{column name: values}``
    mapping, and each entry is assembled by looking up the feature column and
    the ``cnt.CENS_NAME`` / ``cnt.TIME_NAME`` columns in that mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every name in ``features`` plus the two columns
        named by ``cnt.CENS_NAME`` and ``cnt.TIME_NAME``.
    features : iterable
        Column names to produce entries for.

    Returns
    -------
    list of dict
        One dict per feature with keys ``"arr"`` (rows stacked in the order
        feature, censoring column, time column) and ``"type_attr"`` (always
        the string ``"categ"``).
    """
    column_values = dict(zip(df.columns, df.values.T))

    result = []
    for feature_name in features:
        # Lookups stay inside the loop so an empty `features` never touches
        # the censoring/time columns.
        stacked = np.vstack((
            column_values[feature_name],
            column_values[cnt.CENS_NAME],
            column_values[cnt.TIME_NAME],
        ))
        result.append({"arr": stacked, "type_attr": "categ"})
    return result

# Micro-benchmark both implementations on the same frame and feature list.
%timeit get_comb_fast(df, features)
%timeit get_comb_fastx3(df, features)

547 µs ± 1.47 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
171 µs ± 423 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [6]:
def get_and_mean(X, features):
    """Reference (pandas-based) summary of a column subset.

    Parameters
    ----------
    X : pandas.DataFrame
        Source frame.
    features : list
        Column labels to select. Must include ``"time"`` and ``"cens"``,
        because those two columns are extracted from the selected subset.

    Returns
    -------
    tuple
        ``(shape, features_predict, lists)`` where ``shape`` is the shape of
        the selected sub-frame, ``features_predict`` maps each selected column
        to its mean (pandas default, i.e. NaN values are skipped) and
        ``lists`` maps ``"time"`` and ``"cens"`` to plain Python lists of
        their values.

    Raises
    ------
    KeyError
        If a requested label, ``"time"`` or ``"cens"`` is missing.
    """
    X_sub = X.loc[:, features]
    shape = X_sub.shape
    features_predict = X_sub.mean(axis=0).to_dict()
    lists = X_sub[["time", "cens"]].to_dict(orient="list")
    # Hand the results back instead of dropping them, so the output can be
    # inspected and compared against alternative implementations.
    return shape, features_predict, lists

def get_and_mean_fast(X, features):
    """NumPy-based summary of a column subset, avoiding pandas sub-frame copies.

    The frame is converted once into a ``{column label: values}`` mapping and
    the statistics are read from that mapping.

    Parameters
    ----------
    X : pandas.DataFrame
        Source frame.
    features : list
        Column labels to summarise (labels must be hashable).

    Returns
    -------
    tuple
        ``(shape, features_predict, lists)`` where ``shape`` is
        ``(number of rows, len(features))``, ``features_predict`` maps every
        column of ``X`` that is listed in ``features`` to its NaN-skipping
        mean, and ``lists`` maps the ``"time"`` / ``"cens"`` columns of ``X``
        to their value arrays.

    Notes
    -----
    Differences from a pandas ``.loc[:, features]`` based computation:
    labels in ``features`` that are absent from ``X`` are silently skipped
    rather than raising, ``lists`` holds NumPy arrays rather than Python
    lists, and the means are NumPy scalars.
    """
    wanted = set(features)  # O(1) membership instead of scanning the list per column
    columns = dict(zip(X.columns, X.values.T))

    shape = (X.shape[0], len(features))
    lists = {name: values for name, values in columns.items() if name in ("time", "cens")}
    # nanmean (not mean) so missing values are skipped, matching the default
    # skipna=True behaviour of pandas' DataFrame.mean.
    features_predict = {
        name: np.nanmean(values) for name, values in columns.items() if name in wanted
    }
    # Hand the results back instead of dropping them, so the output can be
    # inspected and compared against the pandas-based computation.
    return shape, features_predict, lists
    
# Micro-benchmark both variants on the same inputs. "time" and "cens" are
# appended to the feature list because get_and_mean extracts those two columns
# from the feature subset it selects.
# NOTE(review): features[1:-1] drops the first and last feature; the reason is
# not visible in this notebook, so confirm it is intentional.
%timeit get_and_mean(df, features[1:-1] + ["time", "cens"])
%timeit get_and_mean_fast(df, features[1:-1] + ["time", "cens"])

1.35 ms ± 92.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
231 µs ± 1.14 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
