In [1]:
import logging
# set seed
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
import utils as ut
import experiment as exp
from evaluation import *
from sklearn.metrics import mean_squared_error
from torch.utils.tensorboard import SummaryWriter
import torch
import random
#define fixed_hyperparams and create a config gen
from configurations import RandomConfigGen, Configuration
from torch import nn
from deep_net import RandomNet
from experiment import run_experiment
import regex as re
from pathlib import *
from plot import *
from sk_models import setup_pls_models_exh, StandardScaler, PLSRegression, DeepKNN,CustomWrapper,KNNBoost
from tqdm.notebook import tqdm, trange

# Seed every random number generator used by the notebook so that a
# Restart & Run All reproduces the same shuffles and splits.
# Each library gets a different offset from the base seed.
seed = 1
torch.manual_seed(seed)
random.seed(seed + 1)
np.random.seed(seed + 2)
# Dedicated generator handed explicitly to the sampling/evaluation helpers.
random_state = np.random.RandomState(seed)
import gc
# Release cached GPU memory and unreachable Python objects left by earlier runs.
torch.cuda.empty_cache()
gc.collect()

# Querying device 0 raises on a machine without CUDA; the rest of the notebook
# runs on the CPU, so only report the GPU when one is actually present.
if torch.cuda.is_available():
    print(f"GPU detected is {torch.cuda.get_device_name(0)}")
else:
    print("No GPU detected; running on CPU")

GPU detected is GeForce GTX 970


In [2]:
# Identifier (non-feature) columns for each known dataset, keyed by dataset name.
# An empty list means the dataset has no identifier columns.
id_col_db = {
    'A_C_OF_ALPHA': ["sample_id"],
    'A_C_OF_SIWARE': [],
    'A_AL_RT': [],
    'PLN7': ["db_id", "sample_id"],
    'mango_684_990': [
        'Set', 'Season', 'Region', 'Date', 'Type',
        'Cultivar', 'Pop', 'Temp', "FruitID",
    ],
}

# Target column(s) for each known dataset, keyed by dataset name.
# None leaves the choice to TabularDataset (presumably a default target
# column - TODO confirm against TabularDataset).
output_col_db = {
    'A_C_OF_ALPHA': None,
    'A_C_OF_SIWARE': None,
    'A_AL_RT': None,
    'PLN7': None,
    'mango_684_990': ['DM'],
}


In [3]:
# Set up input and output locations for the dataset under study.

file_name = "PLN7.csv"
# Dataset name = file name with a trailing ".csv" removed (other names pass through unchanged).
dataset_name = re.sub(r'\.(?=csv$)[^.]+$', '',file_name)

# NOTE(review): absolute, machine-specific paths - adjust these when running elsewhere.
data_path = Path('D:/workspace/lazydeep/data/soil_data/')
model_path = Path('D:/workspace/lazydeep/experiments/2.00/')
log_path = Path("D:/workspace/lazydeep/experiments/2.04_reverse")

# Per-dataset locations; the dataset name is computed once and reused.
data_file = data_path / file_name
log_dir = log_path / dataset_name
model_dir = model_path / dataset_name

# Create the log directory together with any missing parents. A bare mkdir()
# raises FileNotFoundError when the experiment folder does not exist yet.
log_dir.mkdir(parents=True, exist_ok=True)
print(log_dir)

# Column roles for this dataset, looked up in the tables defined earlier.
id_cols = id_col_db[dataset_name]
output_cols = output_col_db[dataset_name]


D:\workspace\lazydeep\experiments\2.04_reverse\A_C_OF_SIWARE


In [4]:
# Load the raw table and shuffle its rows. No random_state is passed to
# DataFrame.sample, so the shuffle draws from NumPy's global RNG.
shuffled = pd.read_csv(data_file).sample(frac=1)
# Project helper; draws from the dedicated RandomState generator.
data = ut.sample_data(shuffled, random_state)

nrow, ncol = data.shape
# Feature count = all columns minus the id columns and one further column
# (presumably the single target column - TODO confirm for multi-output datasets).
n_features = ncol - 1 - len(id_cols)

dataset = TabularDataset(
    data,
    id_cols=id_cols,
    cat_cols=None,
    output_cols=output_cols,
    ignore_cols=None,
)
print(data.shape)

(13916, 247)


In [5]:
# Configure two loggers that write into the dataset's log directory:
# the root logger ("") and a separate "summary" logger for ranked results.
for _logger_name, _log_file in (("", "log.txt"), ("summary", "summary.txt")):
    ut.setup_logger(logger_name=_logger_name, file_name=log_dir / _log_file)

summary_logger = logging.getLogger("summary")
# TensorBoard event files go to a "tb" sub-folder of the log directory.
tb = SummaryWriter(log_dir / "tb")


In [6]:
# Load the previously trained deep models ("random_0" ... "random_{n_models-1}")
# from the model directory.
n_models = 100
model_names = [f"random_{i}" for i in range(0,n_models)]
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code - only load model files that come from a trusted source.
deep_models = {name:torch.load(model_dir/"models"/name/"_model") for name in model_names}
print(f"Loaded {len(deep_models)} models")

# Hyperparameters shared by every model, and the preprocessing applied by the evaluator.
fixed_hyperparams = {'bs': 32,'loss': nn.MSELoss(),'epochs': 100}
preprocessing = StandardScaler()

# One output folder per model for its predictions and plots.
model_names = deep_models.keys()
for name in model_names:
    sub_path = log_dir / name
    # parents/exist_ok make the cell safe to re-run and robust to a missing log_dir.
    sub_path.mkdir(parents=True, exist_ok=True)

Loaded 100 models


In [7]:
# The mango dataset uses its own splitter; every other dataset goes through
# the generic cross-validation evaluation.
_evaluation_cls = MangoesSplitter if dataset_name == 'mango_684_990' else CrossValEvaluation
eval_ = _evaluation_cls(
    preprocessing=preprocessing,
    tensorboard=None,
    time=True,
    random_state=random_state,
)

In [8]:
# Result accumulators.
scores = {}             # model -> knn predictor -> {fold_0: number, ..., fold_n: number, mean: number, median: number}
preds = {}              # model -> folds x knn_models
deep_scores_dict = {}
deep_preds_dict = {}
actual_y = None
device = "cpu"


def load_fun_cv(name, model, fold):
    """Restore `model` from the state saved for cross-validation fold `fold` of model `name`."""
    return model.load_state(model_dir/'models'/name/f"_fold_{fold}")


def load_fun_build(name, model):
    """Restore `model` from the final state saved for model `name`."""
    return model.load_state(model_dir/'models'/name/f"_final")


# Preprocessing state is not restored from disk: no loader is supplied for
# either the per-fold or the final preprocessing.
load_fun_pp_cv = None
load_fun_pp_build = None


In [9]:
# Score the stored deep models as they are (update=False): first over the
# cross-validation folds, then with the final saved state.
deep_scheme = DeepScheme(
    None,
    fixed_hyperparams=fixed_hyperparams,
    loss_eval=loss_target,
    device=device,
    tensorboard=tb,
    adaptive_lr=False,
    update=False,
)

# Both calls return six values; only scores and predictions are kept.
deep_scores, deep_preds, _, _, _, _ = eval_.evaluate(
    deep_models,
    dataset,
    deep_scheme,
    logger_name="log",
    load_fun=load_fun_cv,
    load_fun_pp=load_fun_pp_cv,
)
deep_scores_final, deep_preds_final, _, _, _, _ = eval_.build(
    deep_models,
    dataset,
    deep_scheme,
    logger_name="test_log",
    load_fun=load_fun_build,
    load_fun_pp=load_fun_pp_build,
)

Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 6956 - Val 2320 - Test 2320-----------------------------------'
Tested (test) on 2320 instances with mean losses of: random_0:628.4683,random_1:258.9156,random_2:301.2125,random_3:682.699,random_4:153.2868,random_5:117.8278,random_6:404.3059,random_7:312.3909,random_8:343.6564,random_9:146.2347,random_10:156.4236,random_11:175.8986,random_12:180.1223,random_13:142.6379,random_14:307.0093,random_15:184.7355,random_16:116.8974,random_17:127.7617,random_18:145.4492,random_19:188.9521,random_20:424.769,random_21:343.6529,random_22:197.0028,random_23:133.7144,random_24:343.617,random_25:342.9373,random_26:126.9094,random_27:258.0966,random_28:197.9956,random_29:177.0037,random_30:215.7712,random_31:161.9369,random_32:216.8611,random_33:621.9322,random_34:132.3104,random_35:343.6219,random_36:119.5602,random_37:178.8677,random_38:148.7674,random_39:217.5964,random_40:111.1418,random_41:171.1611,random_42

In [10]:
# Flatten the score dictionaries into one record per deep model, tagged with
# predictor "deep". Boosted predictors are appended to these lists later on.
all_scores = [
    {'model_num': model_name, "predictor": "deep", **model_scores}
    for model_name, model_scores in ut.flip_dicts(deep_scores).items()
]

all_scores_final = [
    {'model_num': model_name, "predictor": "deep", **model_scores}
    for model_name, model_scores in ut.flip_dicts(deep_scores_final).items()
]

In [11]:
# Rank the deep models by MSE (lowest first), print the ranking, and keep
# the names of the n best models for the boosting stage.
n = 30
scores_df_sorted = pd.DataFrame(all_scores).sort_values(by='MSE')

for rank, (_row_label, ranked_row) in enumerate(scores_df_sorted.iterrows()):
    print(f"{rank} - " + " - ".join([f"{value}" for value in ranked_row.tolist()]))

best_n = scores_df_sorted['model_num'].head(n).tolist()

0 - random_77 - deep - 114.54153915931438 - 152.8207722639616 - 104.25755461725298 - 110.82466808728691 - 104.50628847004576 - 117.38991802182585 - 0.6620716666595301
1 - random_5 - deep - 117.82781637783708 - 160.39428847799428 - 106.77852464143754 - 108.9054703895467 - 112.2098785703066 - 121.22290283731783 - 0.6510377193474814
2 - random_94 - deep - 122.7633532030829 - 149.67107165337842 - 113.71855478257956 - 127.20804967711639 - 113.69057255687771 - 125.41009271592955 - 0.6389841280263233
3 - random_56 - deep - 114.26998714578563 - 166.4831926708542 - 116.44538460254874 - 109.14064373133151 - 128.5843738228558 - 126.9836186780309 - 0.6344544459650241
4 - random_40 - deep - 111.14184654498922 - 165.918114705351 - 138.65532447449758 - 111.19993794956511 - 115.57543765352224 - 128.4966362148703 - 0.6300989484644296
5 - random_66 - deep - 113.39283807688746 - 156.3097939501553 - 124.72496610101105 - 126.55761228053926 - 121.97465503950887 - 128.59066218726588 - 0.6298282775185955
6 - 

In [12]:
# Same ranking for the scores obtained with the final saved model states.
scores_df_sorted_final = pd.DataFrame(all_scores_final).sort_values(by='MSE')

for rank, (_row_label, ranked_row) in enumerate(scores_df_sorted_final.iterrows()):
    fields = [f"{value}" for value in ranked_row.tolist()]
    print(f"{rank} - " + " - ".join(fields))

0 - random_26 - deep - 108.9328985074833 - 0.6688365711308477
1 - random_57 - deep - 110.37025384063605 - 0.664466913046357
2 - random_56 - deep - 115.68681646990136 - 0.6483042006405713
3 - random_16 - deep - 115.74186881727334 - 0.6481368377559626
4 - random_5 - deep - 116.05796744694129 - 0.6471758763981366
5 - random_77 - deep - 117.85318823448966 - 0.6417182829646813
6 - random_82 - deep - 118.04934539534422 - 0.6411219518390445
7 - random_18 - deep - 120.62528767974072 - 0.633290911894659
8 - random_55 - deep - 121.30038129414473 - 0.631238581338722
9 - random_36 - deep - 121.5920608791385 - 0.6303518555394353
10 - random_40 - deep - 122.32375614472875 - 0.6281274521097997
11 - random_79 - deep - 125.37283424443685 - 0.6188580470703312
12 - random_68 - deep - 126.28232979846372 - 0.6160931186571568
13 - random_89 - deep - 131.9252204730282 - 0.5989383467734897
14 - random_51 - deep - 134.01242141695366 - 0.592593113783276
15 - random_94 - deep - 134.0267225006046 - 0.592549637515

In [None]:
def build_predictors(n,deep):
    """Build the KNNBoost predictors to stack on top of one deep model.

    For every neighbourhood size k in (5, 10, 20, 50, 100) with 2*k < n, four
    predictors are created, all with triangle weights and triangle errors:
    additive and multiplicative convolution, each with reverse=True and
    reverse=False. The uniform-weight/uniform-error variants (uu, ut, tu)
    are currently disabled.

    Args:
        n: size bound; a neighbourhood size k is used only when 2*k < n.
        deep: deep model passed as first argument to every KNNBoost.

    Returns:
        dict mapping predictor name (e.g. "knn_tta_n=5") to its KNNBoost
        instance, in creation order.
    """
    def boost(k, convolution, reverse):
        return KNNBoost(deep,n_neighbors=k, weights='triangle',errors='triangle',convolution=convolution,reverse=reverse)

    predictors = {}
    for k in (5, 10, 20, 50, 100):
        if k * 2 >= n:
            continue
        # NOTE(review): the keys containing "_r" are built with reverse=False and
        # the plain keys with reverse=True - confirm this naming is intended.
        predictors[f'knn_tta_n={k}'] = boost(k, 'additive', True)
        predictors[f'knn_ttm_n={k}'] = boost(k, 'multiplicative', True)
        predictors[f'knn_tta_r_n={k}'] = boost(k, 'additive', False)
        predictors[f'knn_ttm_r_n={k}'] = boost(k, 'multiplicative', False)
    return predictors

# Boosting stage: for each of the best deep models (names collected in `best_n`),
# evaluate the KNNBoost predictors built on top of it, record their scores and
# save predictions and plots into that model's log folder.
# NOTE: this rebinds `deep_models` to the reduced set, so re-running earlier
# cells that expect all models requires reloading them first.
deep_models = {k:v for k,v in deep_models.items() if k in best_n}
for deep_name,deep_model in tqdm(deep_models.items()):
    # Disabled filter for resuming a partial run; it is the reason the loop body
    # is indented one extra level.
    #if int(deep_name.replace("random_",""))>80:
        logging.getLogger().info(f"Running model {deep_name}")
        # The evaluator takes a dict of models; wrap the single model under its name.
        temp_dict = {deep_name:deep_model}

        # NOTE(review): unlike the deep-only evaluation, no load_fun is passed to
        # evaluate/build here - confirm the deep model weights in use for each
        # fold are the intended ones.
        lwr_scheme = BoostScheme(boost_models = build_predictors(nrow,deep_model),loss_fun_sk = mean_squared_error)
        lwr_scores, lwr_preds, _ , _, _,_= eval_.evaluate(temp_dict,dataset,lwr_scheme,logger_name="log")
        lwr_scores_final, lwr_preds_final, _ , _, _,_= eval_.build(temp_dict,dataset,lwr_scheme,logger_name="test_log")

        #scores
        # One record per boosted predictor, appended to the lists that already
        # hold the "deep" records.
        for k,v in ut.flip_dicts(lwr_scores).items(): 
            dict1 = {'model_num':deep_name,"predictor":k}
            all_scores.append({**dict1,**v})

        for k,v in ut.flip_dicts(lwr_scores_final).items():
            dict1 = {'model_num':deep_name,"predictor":k}
            all_scores_final.append({**dict1,**v})

        # Add the deep model's own predictions under a 'deep' key/column next to
        # the boosted ones (presumably row-aligned tables - TODO confirm).
        lwr_preds['deep'] = deep_preds[deep_name]
        lwr_preds_final['deep'] = deep_preds_final[deep_name]

        lwr_preds.to_csv(log_dir/deep_name/ f"predictions.csv",index=False)
        lwr_preds_final.to_csv(log_dir/deep_name/ f"predictions_test.csv",index=False)

        #preds
        # todo save predictions - appending solns
        # The lambdas read `deep_name` when called, so they are only valid while
        # this iteration is current.
        plot_preds_and_res(lwr_preds,name_lambda=lambda x:f"{deep_name} with {x} predictor",save_lambda= lambda x:f"deep_lwr{x}",save_loc=log_dir/deep_name)


  0%|          | 0/30 [00:00<?, ?it/s]

Running model random_5'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 6956 - Val 2320 - Test 2320-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=5:66.5942,knn_ttm_n=5:67.3029,knn_tta_r_n=5:66.9742,knn_ttm_r_n=5:94.4129,knn_tta_n=10:90.6863,knn_ttm_n=10:109.9736,knn_tta_r_n=10:90.9707,knn_ttm_r_n=10:96.6801,knn_tta_n=20:102.151,knn_ttm_n=20:131.014,knn_tta_r_n=20:107.2269,knn_ttm_r_n=20:105.8213,knn_tta_n=50:115.8851,knn_ttm_n=50:141.7073,knn_tta_r_n=50:124.9301,knn_ttm_r_n=50:118.8978,knn_tta_n=100:123.7304,knn_ttm_n=100:146.7115,knn_tta_r_n=100:142.0179,knn_ttm_r_n=100:130.0352'
Tested (test) on 2320 instances with mean losses of: knn_tta_n=5:161.8807,knn_ttm_n=5:212.7199,knn_tta_r_n=5:118.9677,knn_ttm_r_n=5:121.7863,knn_tta_n=10:129.4011,knn_ttm_n=10:170.1184,knn_tta_r_n=10:115.7604,knn_ttm_r_n=10:117.6012,knn_tta_n=20:125.5879,knn_ttm_n=20:173.2984,knn_tta_r_n=20:118.2479,knn_ttm_r_n=20:1



Running model random_10'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 6956 - Val 2320 - Test 2320-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=5:79.7346,knn_ttm_n=5:89.3123,knn_tta_r_n=5:83.9233,knn_ttm_r_n=5:117.4489,knn_tta_n=10:105.1867,knn_ttm_n=10:128.0077,knn_tta_r_n=10:108.9992,knn_ttm_r_n=10:123.3244,knn_tta_n=20:121.031,knn_ttm_n=20:155.0152,knn_tta_r_n=20:120.6709,knn_ttm_r_n=20:124.9093,knn_tta_n=50:130.7096,knn_ttm_n=50:179.5609,knn_tta_r_n=50:132.0949,knn_ttm_r_n=50:133.5477,knn_tta_n=100:133.8198,knn_ttm_n=100:181.5562,knn_tta_r_n=100:144.1191,knn_ttm_r_n=100:140.9349'
Tested (test) on 2320 instances with mean losses of: knn_tta_n=5:167.3993,knn_ttm_n=5:240.2674,knn_tta_r_n=5:145.6001,knn_ttm_r_n=5:161.9462,knn_tta_n=10:133.8939,knn_ttm_n=10:215.6523,knn_tta_r_n=10:131.7665,knn_ttm_r_n=10:149.8351,knn_tta_n=20:129.8783,knn_ttm_n=20:184.8017,knn_tta_r_n=20:122.3513,knn_ttm_r_



Running model random_13'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 6956 - Val 2320 - Test 2320-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=5:82.6741,knn_ttm_n=5:84.6269,knn_tta_r_n=5:77.901,knn_ttm_r_n=5:110.5996,knn_tta_n=10:102.2544,knn_ttm_n=10:120.6952,knn_tta_r_n=10:102.5488,knn_ttm_r_n=10:110.3904,knn_tta_n=20:114.8371,knn_ttm_n=20:149.7867,knn_tta_r_n=20:119.1556,knn_ttm_r_n=20:118.7476,knn_tta_n=50:126.3513,knn_ttm_n=50:156.4842,knn_tta_r_n=50:135.7595,knn_ttm_r_n=50:130.78,knn_tta_n=100:135.965,knn_ttm_n=100:158.9708,knn_tta_r_n=100:150.899,knn_ttm_r_n=100:142.0522'
Tested (test) on 2320 instances with mean losses of: knn_tta_n=5:168.4318,knn_ttm_n=5:214.4528,knn_tta_r_n=5:132.3399,knn_ttm_r_n=5:142.0545,knn_tta_n=10:138.7349,knn_ttm_n=10:180.8493,knn_tta_r_n=10:127.1002,knn_ttm_r_n=10:129.5858,knn_tta_n=20:133.2606,knn_ttm_n=20:181.1691,knn_tta_r_n=20:128.7089,knn_ttm_r_n=20



Running model random_17'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 6956 - Val 2320 - Test 2320-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=5:87.6962,knn_ttm_n=5:91.7489,knn_tta_r_n=5:82.8034,knn_ttm_r_n=5:108.9282,knn_tta_n=10:106.0138,knn_ttm_n=10:119.0272,knn_tta_r_n=10:107.4197,knn_ttm_r_n=10:115.3724,knn_tta_n=20:118.483,knn_ttm_n=20:139.4538,knn_tta_r_n=20:121.57,knn_ttm_r_n=20:122.754,knn_tta_n=50:129.3087,knn_ttm_n=50:149.7233,knn_tta_r_n=50:137.6743,knn_ttm_r_n=50:133.1268,knn_tta_n=100:135.2301,knn_ttm_n=100:149.903,knn_tta_r_n=100:152.7471,knn_ttm_r_n=100:141.3033'
Tested (test) on 2320 instances with mean losses of: knn_tta_n=5:156.8526,knn_ttm_n=5:197.0341,knn_tta_r_n=5:131.324,knn_ttm_r_n=5:151.8508,knn_tta_n=10:140.1925,knn_ttm_n=10:192.4577,knn_tta_r_n=10:126.308,knn_ttm_r_n=10:138.6735,knn_tta_n=20:134.3958,knn_ttm_n=20:185.6907,knn_tta_r_n=20:132.1501,knn_ttm_r_n=20:1



Running model random_18'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 6956 - Val 2320 - Test 2320-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=5:84.8876,knn_ttm_n=5:92.6667,knn_tta_r_n=5:72.2737,knn_ttm_r_n=5:101.0052,knn_tta_n=10:98.0413,knn_ttm_n=10:130.9656,knn_tta_r_n=10:98.3819,knn_ttm_r_n=10:100.6771,knn_tta_n=20:108.4243,knn_ttm_n=20:139.0783,knn_tta_r_n=20:116.0395,knn_ttm_r_n=20:113.3671,knn_tta_n=50:121.0343,knn_ttm_n=50:142.0967,knn_tta_r_n=50:134.0079,knn_ttm_r_n=50:127.1509,knn_tta_n=100:130.4867,knn_ttm_n=100:146.9572,knn_tta_r_n=100:151.8725,knn_ttm_r_n=100:138.6118'
Tested (test) on 2320 instances with mean losses of: knn_tta_n=5:212.3135,knn_ttm_n=5:265.9893,knn_tta_r_n=5:137.7814,knn_ttm_r_n=5:141.5728,knn_tta_n=10:167.3632,knn_ttm_n=10:241.5095,knn_tta_r_n=10:133.4232,knn_ttm_r_n=10:137.3134,knn_tta_n=20:142.764,knn_ttm_n=20:210.7257,knn_tta_r_n=20:135.5924,knn_ttm_r_n=



Running model random_23'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 6956 - Val 2320 - Test 2320-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=5:72.5169,knn_ttm_n=5:76.4332,knn_tta_r_n=5:74.9248,knn_ttm_r_n=5:100.7546,knn_tta_n=10:93.9102,knn_ttm_n=10:121.0051,knn_tta_r_n=10:100.9929,knn_ttm_r_n=10:107.5162,knn_tta_n=20:111.0386,knn_ttm_n=20:144.6821,knn_tta_r_n=20:119.0592,knn_ttm_r_n=20:118.2378,knn_tta_n=50:123.5076,knn_ttm_n=50:146.0451,knn_tta_r_n=50:137.435,knn_ttm_r_n=50:130.9722,knn_tta_n=100:132.4241,knn_ttm_n=100:147.6003,knn_tta_r_n=100:155.3188,knn_ttm_r_n=100:142.848'
Tested (test) on 2320 instances with mean losses of: knn_tta_n=5:140.728,knn_ttm_n=5:158.4904,knn_tta_r_n=5:125.987,knn_ttm_r_n=5:136.2631,knn_tta_n=10:129.3899,knn_ttm_n=10:170.088,knn_tta_r_n=10:124.8381,knn_ttm_r_n=10:129.1754,knn_tta_n=20:124.7627,knn_ttm_n=20:162.0504,knn_tta_r_n=20:129.5598,knn_ttm_r_n=20:



Running model random_31'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 6956 - Val 2320 - Test 2320-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=5:80.6512,knn_ttm_n=5:92.3091,knn_tta_r_n=5:75.0418,knn_ttm_r_n=5:98.4646,knn_tta_n=10:100.1457,knn_ttm_n=10:126.944,knn_tta_r_n=10:100.5107,knn_ttm_r_n=10:106.9623,knn_tta_n=20:107.6547,knn_ttm_n=20:134.7776,knn_tta_r_n=20:114.0705,knn_ttm_r_n=20:114.0232,knn_tta_n=50:118.9542,knn_ttm_n=50:132.0238,knn_tta_r_n=50:130.8644,knn_ttm_r_n=50:124.9905,knn_tta_n=100:127.6705,knn_ttm_n=100:131.4156,knn_tta_r_n=100:145.1676,knn_ttm_r_n=100:135.2682'
Tested (test) on 2320 instances with mean losses of: knn_tta_n=5:182.8862,knn_ttm_n=5:227.2018,knn_tta_r_n=5:132.985,knn_ttm_r_n=5:139.6964,knn_tta_n=10:159.5483,knn_ttm_n=10:205.1334,knn_tta_r_n=10:126.4056,knn_ttm_r_n=10:128.758,knn_tta_n=20:127.8748,knn_ttm_n=20:181.6778,knn_tta_r_n=20:125.9859,knn_ttm_r_n=2



Running model random_34'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 6956 - Val 2320 - Test 2320-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=5:78.4476,knn_ttm_n=5:79.5815,knn_tta_r_n=5:81.6414,knn_ttm_r_n=5:110.4625,knn_tta_n=10:107.6416,knn_ttm_n=10:115.6093,knn_tta_r_n=10:111.5536,knn_ttm_r_n=10:118.0369,knn_tta_n=20:124.1914,knn_ttm_n=20:140.168,knn_tta_r_n=20:127.3968,knn_ttm_r_n=20:128.2228,knn_tta_n=50:136.855,knn_ttm_n=50:150.8833,knn_tta_r_n=50:144.7816,knn_ttm_r_n=50:139.9101,knn_tta_n=100:147.1811,knn_ttm_n=100:156.8776,knn_tta_r_n=100:160.6781,knn_ttm_r_n=100:148.8631'
Tested (test) on 2320 instances with mean losses of: knn_tta_n=5:163.5935,knn_ttm_n=5:195.1693,knn_tta_r_n=5:129.9931,knn_ttm_r_n=5:159.7416,knn_tta_n=10:162.9136,knn_ttm_n=10:188.6856,knn_tta_r_n=10:142.9233,knn_ttm_r_n=10:152.7732,knn_tta_n=20:161.778,knn_ttm_n=20:203.1827,knn_tta_r_n=20:147.0078,knn_ttm_r_n=



Running model random_36'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 6956 - Val 2320 - Test 2320-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=5:155.5688,knn_ttm_n=5:118.3972,knn_tta_r_n=5:107.7122,knn_ttm_r_n=5:144.7215,knn_tta_n=10:145.4943,knn_ttm_n=10:136.2091,knn_tta_r_n=10:129.0886,knn_ttm_r_n=10:137.6472,knn_tta_n=20:152.5497,knn_ttm_n=20:158.8012,knn_tta_r_n=20:138.1031,knn_ttm_r_n=20:138.6002,knn_tta_n=50:222.0901,knn_ttm_n=50:162.0862,knn_tta_r_n=50:145.3959,knn_ttm_r_n=50:143.342,knn_tta_n=100:194.5135,knn_ttm_n=100:161.842,knn_tta_r_n=100:156.6153,knn_ttm_r_n=100:149.823'
Tested (test) on 2320 instances with mean losses of: knn_tta_n=5:242.963,knn_ttm_n=5:267.1585,knn_tta_r_n=5:145.3433,knn_ttm_r_n=5:160.6784,knn_tta_n=10:182.8224,knn_ttm_n=10:205.7293,knn_tta_r_n=10:143.1327,knn_ttm_r_n=10:138.3739,knn_tta_n=20:169.4497,knn_ttm_n=20:180.7242,knn_tta_r_n=20:142.3967,knn_ttm_r_



Running model random_40'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 6956 - Val 2320 - Test 2320-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=5:83.083,knn_ttm_n=5:90.2559,knn_tta_r_n=5:81.6405,knn_ttm_r_n=5:116.7607,knn_tta_n=10:101.8838,knn_ttm_n=10:132.1124,knn_tta_r_n=10:106.1036,knn_ttm_r_n=10:117.5145,knn_tta_n=20:114.9996,knn_ttm_n=20:141.9521,knn_tta_r_n=20:121.113,knn_ttm_r_n=20:123.0693,knn_tta_n=50:125.4699,knn_ttm_n=50:147.341,knn_tta_r_n=50:135.1819,knn_ttm_r_n=50:132.6006,knn_tta_n=100:131.7942,knn_ttm_n=100:145.7487,knn_tta_r_n=100:149.1726,knn_ttm_r_n=100:139.8743'
Tested (test) on 2320 instances with mean losses of: knn_tta_n=5:146.9956,knn_ttm_n=5:202.61,knn_tta_r_n=5:125.0695,knn_ttm_r_n=5:147.1782,knn_tta_n=10:135.5435,knn_ttm_n=10:173.7699,knn_tta_r_n=10:123.5667,knn_ttm_r_n=10:127.9325,knn_tta_n=20:129.0395,knn_ttm_n=20:168.0221,knn_tta_r_n=20:128.0577,knn_ttm_r_n=20



Running model random_48'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 6956 - Val 2320 - Test 2320-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=5:83.2266,knn_ttm_n=5:92.089,knn_tta_r_n=5:81.7606,knn_ttm_r_n=5:108.3909,knn_tta_n=10:100.8413,knn_ttm_n=10:135.3013,knn_tta_r_n=10:105.93,knn_ttm_r_n=10:114.8644,knn_tta_n=20:117.6208,knn_ttm_n=20:152.4433,knn_tta_r_n=20:121.7122,knn_ttm_r_n=20:123.0727,knn_tta_n=50:127.6681,knn_ttm_n=50:151.5843,knn_tta_r_n=50:138.0572,knn_ttm_r_n=50:133.441,knn_tta_n=100:134.1105,knn_ttm_n=100:150.4941,knn_tta_r_n=100:155.6957,knn_ttm_r_n=100:142.7583'
Tested (test) on 2320 instances with mean losses of: knn_tta_n=5:153.3733,knn_ttm_n=5:199.2892,knn_tta_r_n=5:131.0667,knn_ttm_r_n=5:140.3101,knn_tta_n=10:140.9532,knn_ttm_n=10:191.2664,knn_tta_r_n=10:127.1121,knn_ttm_r_n=10:131.6029,knn_tta_n=20:127.3953,knn_ttm_n=20:177.8833,knn_tta_r_n=20:129.228,knn_ttm_r_n=20

In [None]:
# Persist every collected score (deep and boosted), then log the full ranking
# by MSE and remember the five best (model, predictor) pairs.
scores_df = pd.DataFrame(all_scores)
scores_df.to_csv(log_dir/f"scores.csv",index=False)
scores_df_sorted = scores_df.sort_values(by='MSE')

header = f"Rank - " + " - ".join(list(scores_df_sorted.columns))
summary_logger.info(header)

best_5 = []
for rank, (_row_label, ranked_row) in enumerate(scores_df_sorted.iterrows()):
    if rank < 5:
        # (deep model, predictor, MSE, R2) of a top-five entry
        best_5.append(tuple(ranked_row[col] for col in ("model_num", "predictor", "MSE", "R2")))
    summary_logger.info(f"{rank} - " + " - ".join([f"{value}" for value in ranked_row.tolist()]))

In [None]:
# Persist the final-build (test) scores and report how the five entries that
# ranked best by MSE in the earlier summary perform on the test set.
scores_df_final = pd.DataFrame(all_scores_final)
scores_df_final.to_csv(log_dir/f"test_scores.csv",index=False)

summary_logger.info("-----------------------\n Best 5 on Test Set \n ---------------------")
# The header names every field written per row: the earlier MSE/R2 followed by the test MSE/R2.
summary_logger.info("Rank - Deep Model - Predictor - Val MSE - Val R2 - Test MSE - Test R2")
for i, (j,k,v,x) in enumerate(best_5):
    # j = deep model name, k = predictor name, v/x = MSE/R2 from the earlier ranking.
    matches = scores_df_final.loc[(scores_df_final['model_num']==j) & (scores_df_final['predictor'] == k)]
    if matches.empty:
        # Report the gap instead of failing with an IndexError on .iloc[0].
        summary_logger.info(f"{i} - {j} - {k} - {v} - {x} - no test score available")
        continue
    row = matches.iloc[0]
    s = f"{i} - {j} - {k} - {v} - {x} - {row['MSE']} - {row['R2']}"
    summary_logger.info(s)

In [None]:
# Graph the deep models by rank, then overlay the KNN models.
# Rank = position of the deep model when the "deep" rows of scores_df are
# sorted by R2 in ascending order (rank 0 = lowest R2).

deep_set = scores_df[scores_df["predictor"]=="deep"].sort_values("R2")
# Derive the rank range from the data; a hard-coded 100 raises a length
# mismatch whenever the number of deep models differs.
deep_set["order"] = list(range(len(deep_set)))
deep_ordering = {row["model_num"]:row["order"] for index, row in deep_set.iterrows()}

def order_models(x):
    """Translate a list of deep model names into their ranks from `deep_ordering`."""
    return [deep_ordering[model_name] for model_name in x]

# Scatter R2 against deep-model rank, one series per predictor; the deep
# models themselves are drawn with larger markers.
fig, ax = plt.subplots()
set_deep = False
knn_models = scores_df["predictor"].unique()
for knn_model in knn_models:
    subset = scores_df.loc[scores_df["predictor"] == knn_model]
    s = 10 if knn_model == "deep" else 3
    ranks = order_models(subset["model_num"].tolist())
    ax.scatter(x=ranks, y=subset["R2"], s=s, label=knn_model)

# R2 axis fixed to [0, 1]; legend placed outside the axes on the right.
ax.set(
    ylim=(0, 1),
    ylabel="R^2 Score",
    xlabel="Deep Model Rank",
    title="Summary of LWR improvements over Deep Models",
)
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
plt.savefig(log_dir/f"summary_plot.png", bbox_inches='tight')

In [None]:
# Split the score table by predictor family, using the tag embedded in the
# predictor name. '_tta' and '_ttm' also match the '_tta_r' / '_ttm_r' names.
_predictor_names = scores_df["predictor"]
scores_df_base = scores_df.loc[_predictor_names == 'deep']
scores_df_uu = scores_df.loc[_predictor_names.str.contains('_uu')]
scores_df_ut = scores_df.loc[_predictor_names.str.contains('_ut')]
scores_df_tu = scores_df.loc[_predictor_names.str.contains('_tu')]
scores_df_tta = scores_df.loc[_predictor_names.str.contains('_tta')]
scores_df_ttm = scores_df.loc[_predictor_names.str.contains('_ttm')]

In [None]:
# R2 by deep-model rank: deep baseline (large markers) overlaid with the
# predictors whose name contains '_tta'.
fig, ax = plt.subplots()
knn_models = scores_df_tta["predictor"].unique()
baseline_ranks = order_models(scores_df_base["model_num"].tolist())
ax.scatter(x=baseline_ranks, y=scores_df_base["R2"], s=10, label='deep')
for knn_model in knn_models:
    subset = scores_df_tta.loc[scores_df_tta["predictor"] == knn_model]
    s = 3
    ranks = order_models(subset["model_num"].tolist())
    ax.scatter(x=ranks, y=subset["R2"], s=s, label=knn_model)

# R2 axis fixed to [0, 1]; legend placed outside the axes on the right.
ax.set(
    ylim=(0, 1),
    ylabel="R^2 Score",
    xlabel="Deep Model Rank",
    title="Summary of LWR improvements over Deep Models",
)
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
plt.savefig(log_dir/f"summary_plot_tta.png", bbox_inches='tight')


In [None]:
# R2 by deep-model rank: deep baseline (large markers) overlaid with the
# predictors whose name contains '_ttm'.
fig, ax = plt.subplots()
knn_models = scores_df_ttm["predictor"].unique()
baseline_ranks = order_models(scores_df_base["model_num"].tolist())
ax.scatter(x=baseline_ranks, y=scores_df_base["R2"], s=10, label='deep')
for knn_model in knn_models:
    subset = scores_df_ttm.loc[scores_df_ttm["predictor"] == knn_model]
    s = 3
    ranks = order_models(subset["model_num"].tolist())
    ax.scatter(x=ranks, y=subset["R2"], s=s, label=knn_model)

# R2 axis fixed to [0, 1]; legend placed outside the axes on the right.
ax.set(
    ylim=(0, 1),
    ylabel="R^2 Score",
    xlabel="Deep Model Rank",
    title="Summary of LWR improvements over Deep Models",
)
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
plt.savefig(log_dir/f"summary_plot_ttm.png", bbox_inches='tight')


In [None]:
# Graph the deep models by rank on the final set, then overlay the KNN models.
# Rank = position of the deep model when the "deep" rows of scores_df_final
# are sorted by R2 in ascending order (rank 0 = lowest R2).

deep_set = scores_df_final[scores_df_final["predictor"]=="deep"].sort_values("R2")
# Derive the rank range from the data; a hard-coded 100 raises a length
# mismatch whenever the number of deep models differs.
deep_set["order"] = list(range(len(deep_set)))
deep_ordering = {row["model_num"]:row["order"] for index, row in deep_set.iterrows()}

def order_models(x):
    """Map deep model names to their ranks in the current `deep_ordering`.

    Same body as the earlier definition in this notebook; the lookup table is
    read at call time, so it follows whichever `deep_ordering` is bound.
    """
    ranks = []
    for model_name in x:
        ranks.append(deep_ordering[model_name])
    return ranks

# Final-set counterpart of the summary plot: R2 against deep-model rank, one
# series per predictor, deep models drawn with larger markers.
fig, ax = plt.subplots()
set_deep = False
knn_models = scores_df_final["predictor"].unique()
for knn_model in knn_models:
    subset = scores_df_final.loc[scores_df_final["predictor"] == knn_model]
    s = 10 if knn_model == "deep" else 3
    ranks = order_models(subset["model_num"].tolist())
    ax.scatter(x=ranks, y=subset["R2"], s=s, label=knn_model)

# R2 axis fixed to [0, 1]; legend placed outside the axes on the right.
ax.set(
    ylim=(0, 1),
    ylabel="R^2 Score",
    xlabel="Deep Model Rank",
    title="Summary of LWR improvements over Deep Models",
)
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
plt.savefig(log_dir/f"summary_plot_final.png", bbox_inches='tight')

# Improvement plot: for every boosted predictor, plot its final-set R2 minus
# the R2 of the deep model it was built on, against that deep model's rank.
fig, ax = plt.subplots()
set_deep = False
knn_models = scores_df_final["predictor"].unique()
# Baseline R2 of each deep model, indexed by model name. The boosted rows cover
# only a subset of the deep models, so the baseline must be looked up per model
# rather than subtracted as a whole array (different length and ordering).
deep_r2_by_model = scores_df_final[scores_df_final["predictor"]=='deep'].set_index("model_num")["R2"]
for knn_model in knn_models:
    if knn_model == "deep":
        # The baseline has no improvement over itself.
        continue
    subset = scores_df_final[scores_df_final["predictor"]==knn_model]
    s=3
    baseline_r2 = subset["model_num"].map(deep_r2_by_model).to_numpy()
    y1 = subset["R2"].to_numpy() - baseline_r2
    ax.scatter(x=order_models(subset["model_num"].tolist()), y=y1, s=s, label=knn_model)

# NOTE(review): the y-axis is limited to [0, 1], so predictors that score
# worse than their deep model (negative difference) are not visible.
ax.set_ylim(0,1)
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
ax.set_ylabel("R^2 Score")
ax.set_xlabel("Deep Model Rank")
ax.set_title("Summary of LWR improvements over Deep Models")
plt.savefig(log_dir/f"improvement_plot_final.png", bbox_inches='tight')