In [1]:
import logging
# set seed
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
import utils as ut
import experiment as exp
from evaluation import *
from sklearn.metrics import mean_squared_error
from torch.utils.tensorboard import SummaryWriter
import torch
import random
#define fixed_hyperparams and create a config gen
from configurations import RandomConfigGen, Configuration
from torch import nn
from deep_net import RandomNet
from experiment import run_experiment
import regex as re
from pathlib import *
from plot import *
from tqdm.notebook import tqdm, trange
from sk_models import setup_pls_models_exh, StandardScaler, PLSRegression, CustomWrapper

# Reproducibility: a single base seed drives every random number generator
# used by the notebook. The global generators (torch, `random`, numpy) each
# get a different offset; `random_state` is a dedicated numpy generator that
# is handed explicitly to the data sampling / evaluation helpers below.
seed = 1
random_state = np.random.RandomState(seed)
np.random.seed(seed + 2)
random.seed(seed + 1)
torch.manual_seed(seed)

In [2]:
# Per-dataset metadata, keyed by dataset name (the data file name without
# its ".csv" extension).

# Columns that identify a sample; they are passed to TabularDataset as id_cols.
id_col_db = dict(
    A_C_OF_ALPHA=["sample_id"],
    A_C_OF_SIWARE=[],
    A_AL_RT=[],
    PLN7=["db_id", "sample_id"],
    mango_684_990=['Set', 'Season', 'Region', 'Date', 'Type', 'Cultivar', 'Pop', 'Temp', "FruitID"],
)

# Target column(s) per dataset, passed to TabularDataset as output_cols.
# None presumably lets TabularDataset fall back to its default - TODO confirm.
output_col_db = {
    **dict.fromkeys(['A_C_OF_ALPHA', 'A_C_OF_SIWARE', 'A_AL_RT', 'PLN7'], None),
    'mango_684_990': ['DM'],
}


In [3]:
# Set up input and output locations and look up the dataset's metadata.

file_name = "mango_684_990.csv"
# Dataset name = file name with a trailing ".csv" extension stripped.
dataset_name = re.sub(r'\.(?=csv$)[^.]+$', '',file_name)

# NOTE(review): absolute, machine-specific paths - the notebook only runs
# unchanged on a machine with this directory layout.
data_path = Path('D:/workspace/lazydeep/data/soil_data/')
model_path = Path('D:/workspace/lazydeep/experiments/1.01/')   # pre-trained models are read from here
log_path = Path("D:/workspace/lazydeep/experiments/1.04")      # results of this notebook are written here
n_components = 59   # number of PLS components used by the preprocessing step

data_file = data_path / file_name
log_dir = log_path / dataset_name
model_dir = model_path / dataset_name

# parents=True creates the experiment directory itself when it does not exist
# yet (a plain mkdir() raises FileNotFoundError in that case); exist_ok=True
# makes re-running the cell a no-op.
log_dir.mkdir(parents=True, exist_ok=True)
print(log_dir)

id_cols =id_col_db[dataset_name]
output_cols = output_col_db[dataset_name]


D:\workspace\lazydeep\experiments\1.04\mango_684_990


In [4]:
# Load the raw table for the selected dataset.
data = pd.read_csv(data_file)
# Every dataset except the mango file is shuffled row-wise. No random_state is
# passed, so the shuffle draws from numpy's global generator (seeded above).
if not file_name == 'mango_684_990.csv': 
    data = data.sample(frac=1)
# NOTE(review): nrow/ncol are captured BEFORE ut.sample_data runs; if that
# helper changes the number of rows, nrow is stale when it is later passed to
# build_predictors - confirm.
nrow, ncol = data.shape
data = ut.sample_data(data,random_state)
# Column count minus the id columns minus one (presumably the single target
# column - TODO confirm).
n_features = ncol - 1-len(id_cols)
dataset = TabularDataset(data,id_cols = id_cols, cat_cols=None, output_cols=output_cols, ignore_cols= None)
# Alternative kept for reference: same call with output_cols=None.
#dataset = TabularDataset(data,id_cols = id_cols, cat_cols=None, output_cols=None, ignore_cols= None)


In [5]:
# Number of pre-trained deep models to load; they are named random_0 ... random_{n_models-1}.
n_models = 100

model_names = [f"random_{i}" for i in range(0,n_models)]
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code - only point model_dir at trusted experiment output.
deep_models = {name:torch.load(model_dir/"models"/name/"_model") for name in model_names}
# Per-fold / final weights are loaded later through the load_fun_* callbacks.
print(f"Loaded {len(deep_models)} models")

# Create one output sub-directory per model under log_dir (predictions and
# plots for that model are written there).
for name in model_names:
    sub_path = log_dir / name
    if not sub_path.exists():
        sub_path.mkdir()

Loaded 100 models


In [6]:
# Configure logging: the root logger (logger_name="") writes to log.txt and a
# separate "summary" logger writes to summary.txt, both inside log_dir.
ut.setup_logger(logger_name="",file_name=log_dir/"log.txt")
ut.setup_logger(logger_name="summary",file_name=log_dir/"summary.txt")
summary_logger = logging.getLogger("summary")
# TensorBoard writer, handed to the deep evaluation scheme below.
tb = SummaryWriter(log_dir/"tb")

In [7]:
# Hyper-parameters handed to DeepScheme: batch size, loss function, epoch count.
fixed_hyperparams = {'bs': 32,'loss': nn.MSELoss(),'epochs': 100}
# PLS preprocessing object (PLSRegression as imported from sk_models).
preprocessing = PLSRegression(n_components=n_components)

# The mango dataset uses its own splitter; every other dataset uses the
# generic cross-validation evaluation.
if dataset_name == 'mango_684_990':
    eval_ = MangoesSplitter(preprocessing=preprocessing,tensorboard=None,time=True,random_state=random_state)
else:
    eval_ = CrossValEvaluation(preprocessing=preprocessing,tensorboard=None,time=True,random_state=random_state)
    
print(f"Dataset shape is {data.shape}")
# NOTE(review): the five placeholders below are never read in the visible
# part of this notebook.
scores={} #->model->knn:{fold_0:number,...,fold_n:number,mean:number,median:number
preds={} #model-> foldsxknn_models
deep_scores_dict={}
deep_preds_dict={}
actual_y = None

# Evaluation is pinned to the CPU; the CUDA selection is kept as a comment.
device = "cpu" #torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Callbacks that restore previously saved state from model_dir:
#   *_cv    -> per-fold state  (files named _fold_<k>)
#   *_build -> final state     (files named _final)
# The *_pp_* variants restore the preprocessing object instead of a model.
load_fun_cv = lambda name,model, fold : model.load_state(model_dir/'models'/name/f"_fold_{fold}")
load_fun_pp_cv = lambda fold : preprocessing.from_state(preprocessing.load_state(model_dir/'preprocessing'/f"_fold_{fold}"))
load_fun_build = lambda name,model : model.load_state(model_dir/'models'/name/f"_final")
load_fun_pp_build = lambda : preprocessing.from_state(preprocessing.load_state(model_dir/'preprocessing'/f"_final"))

Dataset shape is (11691, 113)


In [8]:
# Score the pre-trained deep models themselves. update=False together with the
# load_fun callbacks means saved states are restored from disk
# (presumably without re-training - TODO confirm against DeepScheme).
deep_scheme = DeepScheme(None, fixed_hyperparams=fixed_hyperparams,loss_eval=loss_target,device=device,tensorboard=tb,adaptive_lr=True,update=False)
# Cross-validation pass (per-fold states) ...
deep_scores, deep_preds, _ , _, _,_ = eval_.evaluate(deep_models,dataset,deep_scheme,logger_name="log",load_fun=load_fun_cv,load_fun_pp = load_fun_pp_cv)
# ... and the final build pass (final states), logged separately.
deep_scores_final, deep_preds_final, _ ,_, _,_ = eval_.build(deep_models,dataset,deep_scheme,logger_name="test_log",load_fun=load_fun_build,load_fun_pp = load_fun_pp_build)

# One record per deep model: its name, the predictor tag "deep", then its scores.
all_scores = [
    {'model_num': model_name, "predictor": "deep", **model_scores}
    for model_name, model_scores in ut.flip_dicts(deep_scores).items()
]

# Same layout for the final (test) scores.
all_scores_final = [
    {'model_num': model_name, "predictor": "deep", **model_scores}
    for model_name, model_scores in ut.flip_dicts(deep_scores_final).items()
]

Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 5879 - Val 1929 - Test 1905-----------------------------------'
Tested (test) on 1905 instances with mean losses of: random_0:0.8233,random_1:0.807,random_2:0.6161,random_3:264.4626,random_4:0.8111,random_5:0.5017,random_6:271.8982,random_7:0.5023,random_8:0.5081,random_9:0.9613,random_10:0.7669,random_11:0.5153,random_12:0.5384,random_13:0.7874,random_14:6.1302,random_15:0.975,random_16:0.5693,random_17:1.1252,random_18:0.5785,random_19:0.5572,random_20:0.8867,random_21:0.4698,random_22:0.9858,random_23:0.7741,random_24:0.8587,random_25:0.7535,random_26:0.5371,random_27:0.8033,random_28:0.8394,random_29:0.7193,random_30:0.562,random_31:0.9906,random_32:0.6003,random_33:273.8991,random_34:0.7762,random_35:0.7732,random_36:0.8258,random_37:0.9836,random_38:6.1238,random_39:0.6793,random_40:0.7944,random_41:0.5056,random_42:0.5355,random_43:0.7195,random_44:0.8293,random_45:280.2945,random_46:0.9234,

In [9]:
# Rank the deep models by ascending MSE and print one " - "-separated line per
# model: rank first, then every column of the score row.
scores_df_sorted = pd.DataFrame(all_scores).sort_values(by='MSE')

for i, (index, row) in enumerate(scores_df_sorted.iterrows()):
    s = f"{i} - " + " - ".join(f"{value}" for value in row.tolist())
    print(s)

0 - random_59 - deep - 0.41344394868440204 - 0.4268353875640713 - 0.38668853449156804 - 0.4572330106437476 - 0.45067953918038345 - 0.42715798708023967 - 0.9296530954646005
1 - random_21 - deep - 0.46982851644826373 - 0.4829046178351888 - 0.4421578718634937 - 0.5073498061939122 - 0.48945786849971135 - 0.47852009789036615 - 0.9211944791793387
2 - random_82 - deep - 0.4732125640228352 - 0.4900174628743125 - 0.4473183443082259 - 0.5129462654654137 - 0.49000114415925466 - 0.4828983874404394 - 0.9204734365526689
3 - random_77 - deep - 0.4860427215969156 - 0.4911194901332221 - 0.43916691134255387 - 0.5363065382234411 - 0.5134993734357267 - 0.49345653250734556 - 0.9187346587571994
4 - random_5 - deep - 0.5017142913316491 - 0.48238601705912126 - 0.45223166167766826 - 0.5203985189180867 - 0.5261903511747895 - 0.4966379684436345 - 0.9182107210646717
5 - random_32 - deep - 0.6002785096018333 - 0.47158061427533476 - 0.41952084913411336 - 0.5031710790293249 - 0.4923633533434727 - 0.49703222356799426

In [None]:
def build_predictors(n):
    """Build the kNN regressors to evaluate for a dataset with ``n`` rows.

    For every neighbourhood size k in a fixed grid with ``k * 2 < n``, two
    wrapped ``KNeighborsRegressor`` instances are created: one with uniform
    and one with distance weighting.

    Returns a dict keyed ``knn_unif_n=<k>`` / ``knn_dist_n=<k>``, in
    increasing k with the uniform variant first; empty when no k qualifies.
    """
    usable_sizes = [k for k in (5, 10, 20, 50, 100, 200, 500, 1000) if k * 2 < n]
    predictors = {}
    for k in usable_sizes:
        predictors.update({
            f'knn_unif_n={k}': CustomWrapper(KNeighborsRegressor(n_neighbors=k, weights='uniform')),
            f'knn_dist_n={k}': CustomWrapper(KNeighborsRegressor(n_neighbors=k, weights='distance')),
        })
    return predictors

# For every deep model: evaluate the kNN predictors combined with that model,
# append the scores to the shared score lists, and write the per-model
# predictions and plots into log_dir/<model name>/.
for deep_name,deep_model in tqdm(deep_models.items()):
    logging.getLogger().info(f"Running model {deep_name}")
    # The evaluation helpers take a dict of models; pass just this one.
    temp_dict = {deep_name:deep_model}

    # Fresh predictors per model. NOTE(review): unlike the deep-only run, no
    # load_fun / load_fun_pp is passed to evaluate/build here - confirm the
    # scheme uses the intended model and preprocessing state.
    lwr_scheme = DeepLWRScheme_1_to_n(lwr_models = build_predictors(nrow),n_neighbours=500,loss_fun_sk = mean_squared_error)
    lwr_scores, lwr_preds, _ , _, _,_= eval_.evaluate(temp_dict,dataset,lwr_scheme,logger_name="log")
    lwr_scores_final, lwr_preds_final, _ , _, _,_= eval_.build(temp_dict,dataset,lwr_scheme,logger_name="test_log")

    # Scores: one record per kNN predictor, tagged with the deep model's name.
    for k,v in ut.flip_dicts(lwr_scores).items():
        dict1 = {'model_num':deep_name,"predictor":k}
        all_scores.append({**dict1,**v})

    for k,v in ut.flip_dicts(lwr_scores_final).items():
        dict1 = {'model_num':deep_name,"predictor":k}
        all_scores_final.append({**dict1,**v})

    # Add the plain deep model's predictions as an extra "deep" column so they
    # are saved and plotted next to the kNN predictions.
    lwr_preds['deep'] = deep_preds[deep_name]
    lwr_preds_final['deep'] = deep_preds_final[deep_name]

    lwr_preds.to_csv(log_dir/deep_name/ f"predictions.csv",index=False)
    lwr_preds_final.to_csv(log_dir/deep_name/ f"predictions_test.csv",index=False)

    # Prediction / residual plots for the cross-validation predictions.
    # todo save predictions - appending solns
    plot_preds_and_res(lwr_preds,name_lambda=lambda x:f"{deep_name} with {x} predictor",save_lambda= lambda x:f"deep_lwr{x}",save_loc=log_dir/deep_name)

  0%|          | 0/100 [00:00<?, ?it/s]

Running model random_0'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 5879 - Val 1929 - Test 1905-----------------------------------'
Finished training DeepLWR with a train loss of knn_unif_n=5:0.4686,knn_dist_n=5:0.0,knn_unif_n=10:0.5813,knn_dist_n=10:0.0,knn_unif_n=20:0.6474,knn_dist_n=20:0.0,knn_unif_n=50:0.7085,knn_dist_n=50:0.0,knn_unif_n=100:0.7357,knn_dist_n=100:0.0,knn_unif_n=200:0.7602,knn_dist_n=200:0.0,knn_unif_n=500:0.8167,knn_dist_n=500:0.0,knn_unif_n=1000:0.9093,knn_dist_n=1000:0.0'
Tested (test) on 1905 instances with mean losses of: knn_unif_n=5:0.736,knn_dist_n=5:0.6635,knn_unif_n=10:0.715,knn_dist_n=10:0.6507,knn_unif_n=20:0.712,knn_dist_n=20:0.6567,knn_unif_n=50:0.7278,knn_dist_n=50:0.6847,knn_unif_n=100:0.7481,knn_dist_n=100:0.712,knn_unif_n=200:0.7664,knn_dist_n=200:0.7364,knn_unif_n=500:0.8262,knn_dist_n=500:0.7894,knn_unif_n=1000:0.9202,knn_dist_n=1000:0.8569'
-----------------------------------Fold 1 - Train 5855 - Val 

In [None]:
# Persist all cross-validation scores (deep + kNN predictors) to scores.csv.
scores_df = pd.DataFrame(all_scores)
scores_df.to_csv(log_dir/f"scores.csv",index=False)

# Rank every (deep model, predictor) pair by ascending MSE.
scores_df_sorted = pd.DataFrame(scores_df).sort_values(by='MSE')

# Write the full ranking to the summary log and remember the five best pairs
# as (model_num, predictor, MSE, R2) for the test-set comparison.
best_5 = []
summary_logger.info(f"Rank - " +" - ".join(list(scores_df_sorted.columns)))
for i,(index,row) in enumerate(scores_df_sorted.iterrows()):
    if i < 5:
        best_5.append((row["model_num"],row["predictor"],row["MSE"],row["R2"]))
    # The comprehension's `i` is local to the comprehension; it does not
    # clobber the rank `i` of the enclosing loop.
    s = f"{i} - " + " - ".join([f"{i}" for i in row.tolist()])
    summary_logger.info(s)

In [None]:
# Persist the final (test-set) scores to test_scores.csv.
scores_df_final = pd.DataFrame(all_scores_final)
scores_df_final.to_csv(log_dir/f"test_scores.csv",index=False)

# For the five best pairs found on the validation ranking, log their
# validation scores next to the scores of the same pair on the test set.
summary_logger.info("-----------------------\n Best 5 on Test Set \n ---------------------")
# One header name per logged field (each row below has seven fields).
summary_logger.info("Rank - Deep Model - Predictor - Val MSE - Val R2 - Test MSE - Test R2")
for i, (j,k,v,x) in enumerate(best_5):
    # j = deep model name, k = predictor name, v = validation MSE, x = validation R2
    row = scores_df_final.loc[(scores_df_final['model_num']==j) & (scores_df_final['predictor'] == k)].iloc[0]
    s = f"{i} - {j} - {k} - {v} - {x} - {row['MSE']} - {row['R2']}"
    summary_logger.info(s)


In [None]:
# Take 1 is a scatter plot: rank the deep models by their own R2, plot them by
# rank, then overlay the kNN predictors built on each model.

# Deep-only rows sorted by ascending R2; "order" is the rank (0 = worst R2).
deep_set = scores_df[scores_df["predictor"]=="deep"].sort_values("R2")
# Derive the ranks from the actual number of rows instead of assuming exactly
# 100 deep models, so changing n_models does not break the assignment.
deep_set["order"] = np.arange(len(deep_set))
# Model name -> rank lookup used by order_models().
deep_ordering = {model_name: rank for rank, model_name in enumerate(deep_set["model_num"])}

def order_models(x):
    """Translate deep-model names into their rank positions.

    Each name in ``x`` is looked up in the module-level ``deep_ordering``
    mapping at call time; the ranks are returned as a list in the same order
    as ``x``. An unknown name raises ``KeyError``.
    """
    return [deep_ordering[model_name] for model_name in x]

# Scatter of R2 against deep-model rank, one series per predictor
# (the plain deep model and every kNN variant).
fig, ax = plt.subplots()
set_deep = False  # NOTE(review): never read in the visible notebook
knn_models = scores_df["predictor"].unique()
for knn_model in knn_models:
    subset = scores_df[scores_df["predictor"]==knn_model]
    # Draw the deep baseline with larger markers than the kNN predictors.
    s=3
    if knn_model == "deep":
        s=10
    ax.scatter(x=order_models(subset["model_num"].tolist()), y=subset["R2"], s=s, label=knn_model)

#ax.set_ylim(0,scores_db["deep_mean"].max())
# Only R2 values in [0, 1] are shown; negative scores fall outside the axes.
ax.set_ylim(0,1)
# Legend is anchored outside the axes, to the right.
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
ax.set_ylabel("R2 Score")
ax.set_xlabel("Deep Model Rank")
#ax.set_yscale("symlog")
ax.set_title("Summary of LWR improvements over Deep Models")
plt.savefig(log_dir/f"summary_plot.png", bbox_inches='tight')
logging.getLogger().info("Wrote Summary Graph")

In [None]:
# Annotate every score row with the n_features attribute of its deep model.
# Mutates scores_df in place, so the subsets derived from it in later cells
# carry the new column.
scores_df["n_features"] = [deep_models[i].n_features for i in scores_df["model_num"]] 

In [None]:
from matplotlib.colors import Colormap
import seaborn as sns #heatmap of features - pls model - score
class nlcmap(Colormap):
    """Non-linear wrapper around an existing colormap.

    Values are remapped so that the given ``levels`` become evenly spaced
    along the wrapped colormap, giving more colour resolution to ranges where
    the levels are dense.

    NOTE(review): ``Colormap.__init__`` is never called, so only the
    attributes set below exist on the instance. A later cell of this notebook
    redefines ``nlcmap`` with ``object`` as its base class.
    """

    def __init__(self, cmap, levels):
        """Store the wrapped colormap and precompute the level mapping.

        cmap:   colormap to wrap; must expose ``N`` and ``monochrome`` and be
                callable as ``cmap(values, alpha)``.
        levels: break points; not sorted here, and ``np.interp`` (used in
                ``__call__``) expects them in increasing order.
        """
        self.cmap = cmap
        self.N = cmap.N
        self.monochrome = self.cmap.monochrome
        self.levels = np.asarray(levels, dtype='float64')
        self._x = self.levels
        self.levmax = self.levels.max()
        self.levmin = self.levels.min()
        self.transformed_levels = np.linspace(self.levmin, self.levmax, #uniform spacing along levels (colour segments)
             len(self.levels))

    def __call__(self, xi, alpha=1.0, **kw):
        """Map ``xi`` to colours; extra keyword arguments are accepted and ignored."""
        # Piecewise-linear remap: the original levels are sent onto the
        # uniformly spaced levels.
        yi = np.interp(xi, self._x, self.transformed_levels)
        # Normalise to [0, 1] over the level range before the colour lookup.
        return self.cmap((yi-self.levmin) / (self.levmax-self.levmin), alpha)
    
# Break points for the non-linear colour scale: the end points 0 and 1 plus
# denser steps towards 1.
levels = np.concatenate((
    [0, 1],
    [0.6,0.8,0.9,0.95,0.98]
    ))

# Drop anything above 1, then sort in place (the list above is not ordered).
levels = levels[levels <= 1]
levels.sort()
cmap_nonlin = nlcmap(plt.cm.YlGnBu, levels)


In [None]:
# Split the score table by predictor family: the plain deep models, the
# distance-weighted kNN predictors and the uniformly weighted kNN predictors.
scores_df_base = scores_df[scores_df["predictor"]=='deep']
scores_df_dist = scores_df[scores_df["predictor"].str.contains('dist')]   # predictor names containing "dist"
scores_df_unif = scores_df[scores_df["predictor"].str.contains('unif')] 

In [None]:
def rand_jitter(arr):
    """Return ``arr`` with a small amount of Gaussian noise added.

    The noise standard deviation is 1% of the value range
    (``max(arr) - min(arr)``), so a constant input comes back unchanged.
    Noise is drawn from numpy's global random generator.
    """
    value_range = max(arr) - min(arr)
    noise_scale = .01 * value_range
    noise = np.random.randn(len(arr))
    return arr + noise * noise_scale

In [None]:
# Heat scatter for the distance-weighted kNN predictors: n_features (x)
# against number of neighbours (y), coloured by R2.
subset = scores_df_dist[["predictor","n_features","R2"]]
#print(subset)
# Defensive filter: drop any "deep" rows.
subset = subset[np.logical_not(subset["predictor"]=="deep")]
# Replace the predictor name by its neighbour count, e.g. "knn_dist_n=50" -> 50.
trans = subset["predictor"].transform(lambda x: int(x.replace("knn_dist_n=",""))).tolist()
# NOTE(review): assignment through .loc on a filtered frame; pandas may emit
# SettingWithCopyWarning here.
subset.loc[:,"predictor"]=trans
subset=subset.sort_values("predictor",ascending=False)

fig, ax = plt.subplots()
# Both coordinates are jittered so points sharing the same values do not overlap.
sc = ax.scatter(x=rand_jitter(subset["n_features"]), y=rand_jitter(subset["predictor"]), s=20,c=subset["R2"],cmap=cmap_nonlin,vmin=0)
ax.set_xlabel("Number of Features")
ax.set_ylabel("Number of Neighbours")

cbar = fig.colorbar(sc,label="R2 Score")

ax.set_title("LWR performance as a function of the number of components")
# NOTE(review): the uniform-weight cell that follows saves to this same file name.
plt.savefig(log_dir/f"heat_scatter.png", bbox_inches='tight')



In [None]:
# Heat scatter for the uniformly weighted kNN predictors: n_features (x)
# against number of neighbours (y), coloured by R2.
subset = scores_df_unif[["predictor","n_features","R2"]]
# Defensive filter: drop any "deep" rows.
subset = subset[np.logical_not(subset["predictor"]=="deep")]
# Replace the predictor name by its neighbour count, e.g. "knn_unif_n=50" -> 50.
trans = subset["predictor"].transform(lambda x: int(x.replace("knn_unif_n=",""))).tolist()
# assign() builds a new frame with an integer column instead of writing
# through .loc into a filtered slice (avoids SettingWithCopyWarning).
subset = subset.assign(predictor=trans)
subset=subset.sort_values("predictor",ascending=False)

fig, ax = plt.subplots()
# Both coordinates are jittered so points sharing the same values do not overlap.
sc = ax.scatter(x=rand_jitter(subset["n_features"]), y=rand_jitter(subset["predictor"]), s=20,c=subset["R2"],cmap=cmap_nonlin,vmin=0)
ax.set_xlabel("Number of Features")
ax.set_ylabel("Number of Neighbours")

cbar = fig.colorbar(sc,label="R2 Score")

ax.set_title("LWR performance as a function of the number of components")
# Own file name: saving to "heat_scatter.png" would overwrite the figure
# written for the distance-weighted predictors.
plt.savefig(log_dir/f"heat_scatter_unif.png", bbox_inches='tight')



In [None]:
# Figure 1: R2 against deep-model rank for every distance-weighted kNN
# predictor, with the plain deep models overlaid.
fig, ax = plt.subplots()
knn_models = scores_df_dist["predictor"].unique()
for knn_model in knn_models:
    subset = scores_df_dist[scores_df_dist["predictor"]==knn_model]
    s=3
    ax.scatter(x=order_models(subset["model_num"].tolist()), y=subset["R2"], s=s, label=knn_model)

# Deep baseline, drawn with larger markers.
ax.scatter(x=order_models(scores_df_base["model_num"].tolist()), y=scores_df_base["R2"], s=10, label='deep')


#ax.set_ylim(0,scores_db["deep_mean"].max())
# Only R2 values in [0, 1] are shown.
ax.set_ylim(0,1)
# Legend is anchored outside the axes, to the right.
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
ax.set_ylabel("R^2 Score")
ax.set_xlabel("Deep Model Rank")
#ax.set_ylim(0,200)
#ax.set_yscale("symlog")
ax.set_title("Summary of LWR improvements over Deep Models")
plt.savefig(log_dir/f"summary_plot_dist.png", bbox_inches='tight')
logging.getLogger().info("Wrote Summary Graph")

# Figure 2: R2 difference (kNN predictor minus deep baseline) per deep model.
fig, ax = plt.subplots()
knn_models = scores_df_dist["predictor"].unique()
for knn_model in knn_models:
    subset = scores_df_dist[scores_df_dist["predictor"]==knn_model]
    s=3
    # NOTE(review): positional subtraction - assumes the rows of `subset` and
    # of scores_df_base list the deep models in the same order and have the
    # same length; confirm.
    y1 = subset["R2"].to_numpy() - scores_df_base["R2"].to_numpy()
    
    ax.scatter(x=order_models(subset["model_num"].tolist()), y=y1, s=s, label=knn_model)

# Differences can be negative, hence the symmetric range.
ax.set_ylim(-1,1)

ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
ax.set_ylabel("Difference, KNN R2 - BaseR2")
ax.set_xlabel("Deep Model Rank")

ax.set_title("Summary of LWR improvements over Deep Models")
plt.savefig(log_dir/f"improve_plot_dist.png", bbox_inches='tight')

In [None]:
# Figure 1: R2 against deep-model rank for every uniformly weighted kNN
# predictor, with the plain deep models overlaid.
fig, ax = plt.subplots()
set_deep = False  # NOTE(review): never read in the visible notebook
knn_models = scores_df_unif["predictor"].unique()
for knn_model in knn_models:
    subset = scores_df_unif[scores_df_unif["predictor"]==knn_model]
    s=3
    ax.scatter(x=order_models(subset["model_num"].tolist()), y=subset["R2"], s=s, label=knn_model)

# Deep baseline, drawn with larger markers.
ax.scatter(x=order_models(scores_df_base["model_num"].tolist()), y=scores_df_base["R2"], s=10, label='deep')


#ax.set_ylim(0,scores_db["deep_mean"].max())
# Only R2 values in [0, 1] are shown.
ax.set_ylim(0,1)
# Legend is anchored outside the axes, to the right.
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
ax.set_ylabel("R^2 Score")
ax.set_xlabel("Deep Model Rank")
#ax.set_ylim(0,200)
#ax.set_yscale("symlog")
ax.set_title("Summary of LWR improvements over Deep Models")
plt.savefig(log_dir/f"summary_plot_unif.png", bbox_inches='tight')
logging.getLogger().info("Wrote Summary Graph")

# Figure 2: R2 difference (kNN predictor minus deep baseline) per deep model.
fig, ax = plt.subplots()
knn_models = scores_df_unif["predictor"].unique()
for knn_model in knn_models:
    subset = scores_df_unif[scores_df_unif["predictor"]==knn_model]
    s=3
    # NOTE(review): positional subtraction - assumes the rows of `subset` and
    # of scores_df_base list the deep models in the same order and have the
    # same length; confirm.
    y1 = subset["R2"].to_numpy() - scores_df_base["R2"].to_numpy()
    
    ax.scatter(x=order_models(subset["model_num"].tolist()), y=y1, s=s, label=knn_model)

# Differences can be negative, hence the symmetric range.
ax.set_ylim(-1,1)

ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
ax.set_ylabel("Difference, KNN R2 - BaseR2")
ax.set_xlabel("Deep Model Rank")

ax.set_title("Summary of LWR improvements over Deep Models")
plt.savefig(log_dir/f"improve_plot_unif.png", bbox_inches='tight')


In [None]:
import seaborn as sns
from matplotlib.colors import Colormap
class nlcmap(object):
    """Callable that applies a colormap on a non-linear value scale.

    The supplied ``levels`` are mapped onto evenly spaced positions between
    the smallest and largest level, so regions where levels are dense receive
    a larger share of the wrapped colormap's colours.
    """

    def __init__(self, cmap, levels):
        """Wrap ``cmap`` and precompute the evenly spaced target levels.

        ``cmap`` must expose ``N`` and ``monochrome`` and be callable as
        ``cmap(values, alpha)``. ``levels`` is used as given (no sorting).
        """
        self.cmap = cmap
        self.N = cmap.N
        self.monochrome = cmap.monochrome
        level_array = np.asarray(levels, dtype='float64')
        self.levels = level_array
        self._x = level_array
        self.levmax = level_array.max()
        self.levmin = level_array.min()
        # Same number of points as `levels`, uniformly spaced (colour segments).
        self.transformed_levels = np.linspace(self.levmin, self.levmax, len(level_array))

    def __call__(self, xi, alpha=1.0, **kw):
        """Return the wrapped colormap's colours for ``xi``; extra keywords are ignored."""
        # Piecewise-linear remap from the original levels to the uniform ones.
        remapped = np.interp(xi, self._x, self.transformed_levels)
        # Rescale to [0, 1] over the level range for the colour lookup.
        return self.cmap((remapped - self.levmin) / (self.levmax - self.levmin), alpha)
    
# Break points for the non-linear colour scale: the end points 0 and 1 plus
# denser steps towards 1.
levels = np.concatenate((
    [0, 1],
    [0.6,0.8,0.9,0.95,0.98]
    ))

# Drop anything above 1, then sort in place (the list above is not ordered).
levels = levels[levels <= 1]
levels.sort()
print(levels)
# Rebinds cmap_nonlin to an instance of the nlcmap class defined in this cell.
cmap_nonlin = nlcmap(plt.cm.YlGnBu, levels) 

In [None]:
# Heat map of the plain deep models' R2: a single "deep" row with one column
# per model, columns ordered by deep-model rank.
subset = scores_df_base[["model_num",'predictor' ,"R2"]]

# Long -> wide: rows are predictors, columns are deep models.
wide = subset.pivot(index = "predictor",columns= "model_num",values="R2")
# Order the model columns by their rank via order_models.
wide = wide.sort_index(axis=1,key=order_models)

ax = sns.heatmap(wide, linewidth=0.0,vmin=0,center=0,cbar_kws={'label':"R2 Score"},cmap=cmap_nonlin)

# NOTE(review): title and y label mention neighbours, but the only row here
# is the "deep" predictor.
ax.set_title("Grid Search for number of neighbours and deep model ")
ax.set_xlabel("Deep Model")
ax.set_ylabel("Number of Neighbours")

plt.savefig(log_dir/"pls_heatmap.png", bbox_inches='tight')


In [None]:
# Heat map of R2 for the distance-weighted kNN predictors: rows are neighbour
# counts, columns are deep models ordered by rank.

subset = scores_df_dist[["model_num","predictor","R2"]]
subset = subset.sort_values('model_num', key=order_models)

# Replace the predictor name by its neighbour count, e.g. "knn_dist_n=50" -> 50.
trans = subset["predictor"].transform(lambda x: int(x.replace("knn_dist_n=",""))).tolist()
subset.loc[:,"predictor"]=trans

# Long -> wide: rows are neighbour counts, columns are deep models.
wide = subset.pivot(index = "predictor",columns= "model_num",values="R2")
# Order the model columns by their rank via order_models.
wide = wide.sort_index(axis=1,key=order_models)

ax = sns.heatmap(wide, linewidth=0.0,vmin=0,center=0,cbar_kws={'label':"R2 Score"},cmap=cmap_nonlin)

ax.set_title("Grid Search for number of neighbours and deep model ")
ax.set_xlabel("Deep Model")
ax.set_ylabel("Number of Neighbours")

# The figure is only displayed; saving is disabled.
#plt.savefig(log_dir/"pls_heatmap.png", bbox_inches='tight')

In [None]:
# Heat map of R2 for the uniformly weighted kNN predictors: rows are neighbour
# counts, columns are deep models ordered by rank.

subset = scores_df_unif[["model_num","predictor","R2"]]
subset = subset.sort_values('model_num', key=order_models)

# Replace the predictor name by its neighbour count, e.g. "knn_unif_n=50" -> 50.
trans = subset["predictor"].transform(lambda x: int(x.replace("knn_unif_n=",""))).tolist()
subset.loc[:,"predictor"]=trans

# Long -> wide: rows are neighbour counts, columns are deep models.
wide = subset.pivot(index = "predictor",columns= "model_num",values="R2")
# Order the model columns by their rank via order_models.
wide = wide.sort_index(axis=1,key=order_models)

ax = sns.heatmap(wide, linewidth=0.0,vmin=0,center=0,cbar_kws={'label':"R2 Score"},cmap=cmap_nonlin)

ax.set_title("Grid Search for number of neighbours and deep model ")
ax.set_xlabel("Deep Model")
ax.set_ylabel("Number of Neighbours")

# The figure is only displayed; saving is disabled.
#plt.savefig(log_dir/"pls_heatmap.png", bbox_inches='tight')


In [None]:
# Rank the deep models by their R2 on the final (test) set, plot them by rank,
# then overlay the kNN predictors.

# Deep-only rows sorted by ascending R2; "order" is the rank (0 = worst R2).
deep_set = scores_df_final[scores_df_final["predictor"]=="deep"].sort_values("R2")
# Derive the ranks from the actual number of rows instead of assuming exactly
# 100 deep models, so changing n_models does not break the assignment.
deep_set["order"] = np.arange(len(deep_set))
# Model name -> rank lookup used by order_models().
# NOTE: this rebinds the global deep_ordering, so cells above that call
# order_models() will use the test-set ranking if re-run after this cell.
deep_ordering = {model_name: rank for rank, model_name in enumerate(deep_set["model_num"])}

def order_models(x):
    """Map each deep-model name in ``x`` to its rank.

    Ranks come from the module-level ``deep_ordering`` dict, read at call
    time; the result is a list aligned with ``x``. Unknown names raise
    ``KeyError``.
    """
    ranks = []
    for model_name in x:
        ranks.append(deep_ordering[model_name])
    return ranks

# Scatter of test-set R2 against deep-model rank, one series per predictor
# (the plain deep model and every kNN variant).
fig, ax = plt.subplots()
set_deep = False  # NOTE(review): never read in the visible notebook
knn_models = scores_df_final["predictor"].unique()
for knn_model in knn_models:
    subset = scores_df_final[scores_df_final["predictor"]==knn_model]
    # Draw the deep baseline with larger markers than the kNN predictors.
    s=3
    if knn_model == "deep":
        s=10
    ax.scatter(x=order_models(subset["model_num"].tolist()), y=subset["R2"], s=s, label=knn_model)

#ax.set_ylim(0,scores_db["deep_mean"].max())
# Only R2 values in [0, 1] are shown; negative scores fall outside the axes.
ax.set_ylim(0,1)
# Legend is anchored outside the axes, to the right.
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
ax.set_ylabel("R^2 Score")
ax.set_xlabel("Deep Model Rank")
#ax.set_ylim(0,200)
#ax.set_yscale("symlog")
ax.set_title("Summary of LWR improvements over Deep Models")
plt.savefig(log_dir/f"summary_plot_final.png", bbox_inches='tight')

# Test-set improvement plot: R2 difference (kNN predictor minus deep baseline)
# per deep model, one series per kNN predictor.
fig, ax = plt.subplots()
set_deep = False
knn_models = scores_df_final["predictor"].unique()
# Baseline R2 of the plain deep models, computed once rather than per series.
base_r2_final = scores_df_final[scores_df_final["predictor"]=='deep']["R2"].to_numpy()
for knn_model in knn_models:
    # The baseline itself is not plotted as a difference.
    if knn_model == "deep":
        continue
    subset = scores_df_final[scores_df_final["predictor"]==knn_model]
    s=3
    # Positional subtraction: relies on `subset` and the baseline rows listing
    # the deep models in the same order (both follow the order in which
    # scores were appended).
    y1 = subset["R2"].to_numpy() - base_r2_final
    ax.scatter(x=order_models(subset["model_num"].tolist()), y=y1, s=s, label=knn_model)

# Differences can be negative; a (0, 1) range would hide every predictor that
# is worse than its deep model. Range and label match the other improvement
# plots in this notebook.
ax.set_ylim(-1,1)
# Legend is anchored outside the axes, to the right.
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
ax.set_ylabel("Difference, KNN R2 - BaseR2")
ax.set_xlabel("Deep Model Rank")
ax.set_title("Summary of LWR improvements over Deep Models")
plt.savefig(log_dir/f"improvement_plot_final.png", bbox_inches='tight')