In [1]:
import logging
# set seed
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
import utils as ut
import experiment as exp
from evaluation import *
from sklearn.metrics import mean_squared_error
from torch.utils.tensorboard import SummaryWriter
import torch
import random
#define fixed_hyperparams and create a config gen
from configurations import RandomConfigGen, Configuration
from torch import nn
from deep_net import RandomNet
from experiment import run_experiment
import regex as re
from pathlib import *
from plot import *
from sk_models import setup_pls_models_exh, StandardScaler, PLSRegression, DeepKNN,CustomWrapper,LWRBoost
from tqdm.notebook import tqdm, trange

# Seed every random number generator used by the experiment so a fresh kernel
# reproduces the same run. Each library gets its own offset from the base seed.
seed = 1
torch.manual_seed(seed)
random.seed(seed + 1)
np.random.seed(seed + 2)
# Dedicated generator that is handed to the data sampler and the evaluators.
random_state = np.random.RandomState(seed)
import gc
# Drop cached CUDA memory and unreachable objects left by an earlier run in this kernel.
torch.cuda.empty_cache()
gc.collect()

# get_device_name raises on machines without CUDA, so only query it when a GPU
# is actually present; the notebook itself can run on the CPU.
if torch.cuda.is_available():
    print(f"GPU detected is {torch.cuda.get_device_name(0)}")
else:
    print("No GPU detected; running on CPU")

GPU detected is GeForce GTX 970


In [2]:
# Identifier columns of each dataset, keyed by dataset name (the CSV file name
# without its extension). These columns are excluded from the feature count.
id_col_db = {
    'A_C_OF_ALPHA': ["sample_id"],
    'A_C_OF_SIWARE': [],
    'A_AL_RT': [],
    'PLN7': ["db_id", "sample_id"],
    'mango_684_990': [
        'Set', 'Season', 'Region', 'Date', 'Type',
        'Cultivar', 'Pop', 'Temp', "FruitID",
    ],
}

# Output (target) columns of each dataset. None is passed unchanged to
# TabularDataset - presumably it then falls back to a default target column;
# TODO confirm against TabularDataset.
output_col_db = {
    'A_C_OF_ALPHA': None,
    'A_C_OF_SIWARE': None,
    'A_AL_RT': None,
    'PLN7': None,
    'mango_684_990': ['DM'],
}


In [3]:
# Set up input and output locations for the selected dataset.

file_name = "A_C_OF_SIWARE.csv"
# Dataset name = file name with a trailing ".csv" removed. It keys the column
# lookup tables and names the per-dataset sub-directories below.
dataset_name = re.sub(r'\.(?=csv$)[^.]+$', '',file_name)


data_path = Path('D:/workspace/lazydeep/data/soil_data/')
model_path = Path('D:/workspace/lazydeep/experiments/2.00/')
log_path = Path("D:/workspace/lazydeep/experiments/2.05_reverse")

data_file = data_path / file_name
# Reuse dataset_name rather than re-running the same substitution per path.
log_dir = log_path / dataset_name
model_dir = model_path / dataset_name

# parents=True also creates a missing experiment root directory;
# exist_ok=True keeps the cell safe to re-run.
log_dir.mkdir(parents=True, exist_ok=True)
print(log_dir)

# Column roles for this dataset (see id_col_db / output_col_db).
id_cols =id_col_db[dataset_name]
output_cols = output_col_db[dataset_name]


D:\workspace\lazydeep\experiments\2.05_reverse\PLN7


In [4]:
# Read the raw table and shuffle its rows (frac=1 keeps every row).
data = pd.read_csv(data_file).sample(frac=1)
# Project helper that receives the seeded generator; what it samples is not
# visible in this notebook - see utils.sample_data.
data = ut.sample_data(data, random_state)

# Feature count: every column except the identifier columns and one more
# column (presumably the single target column - TODO confirm).
nrow, ncol = data.shape
n_features = ncol - len(id_cols) - 1

dataset = TabularDataset(
    data,
    id_cols=id_cols,
    cat_cols=None,
    output_cols=output_cols,
    ignore_cols=None,
)
print(data.shape)
print(data.shape)

(10000, 129)


In [5]:
# Configure logging inside log_dir: the root logger (empty name) writes to
# log.txt and a separate "summary" logger writes to summary.txt.
for _logger_name, _log_file in (("", "log.txt"), ("summary", "summary.txt")):
    ut.setup_logger(logger_name=_logger_name, file_name=log_dir / _log_file)

summary_logger = logging.getLogger("summary")
# TensorBoard event files go to the "tb" sub-directory of log_dir.
tb = SummaryWriter(log_dir / "tb")


In [6]:
# Hyperparameters shared by every sampled network.
n_models = 100
epochs = 100
bs = 32
fixed_hyperparams = {'bs': bs,'loss': nn.MSELoss(),'epochs': epochs}
# Everything is pinned to the CPU; the commented expression is the GPU-aware alternative.
device = "cpu"#torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#setup models
# Generator for random network/optimiser configurations.
# NOTE(review): n_features=50 here is independent of the dataset's n_features
# used as input_size below - presumably a layer-width setting; confirm in RandomConfigGen.
config_gen = RandomConfigGen(lr= (0,1),
                             allow_increase_size=False,
                             n_features=50,
                             opt=[torch.optim.SGD,
                                  torch.optim.Adam],
                             lr_update = [None,
                                          torch.optim.lr_scheduler.ReduceLROnPlateau,
                                          torch.optim.lr_scheduler.ExponentialLR,
                                          torch.optim.lr_scheduler.CosineAnnealingLR],
                            dropout = [True,False],
                            batch_norm = [True,False])
# One sampled configuration per model, named "random_<index>".
configs = {f"random_{i}":config_gen.sample() for i in range(n_models)}
config_gen.save(log_dir/'config_gen.txt')

# One network per configuration; input_size is the dataset's feature count.
models = {name:RandomNet(input_size=n_features,
                         n_layers=config.n_layers,
                         act_function=config.act_function,
                         n_features = config.n_features,
                         dropout=config.dropout,
                         batch_norm=config.batch_norm,
                         device=device,dtype=torch.float)
          for name, config in configs.items()}
preprocessing = StandardScaler()

# Snapshot the names as a list so later rebinding of `models` cannot change it.
model_names = list(models)
for name in model_names:
    # Per-model output directory; parents/exist_ok make the cell re-runnable
    # even when log_dir was removed or already populated.
    sub_path = log_dir / name
    sub_path.mkdir(parents=True, exist_ok=True)

In [7]:
# The mango dataset uses its own splitter; every other dataset goes through
# CrossValEvaluation. Both are constructed with identical arguments.
evaluator_cls = MangoesSplitter if dataset_name == 'mango_684_990' else CrossValEvaluation
eval_ = evaluator_cls(
    preprocessing=preprocessing,
    tensorboard=None,
    time=True,
    random_state=random_state,
)

In [8]:
# Result containers; nothing in the cells shown here fills them.
scores = {}  #->model->knn:{fold_0:number,...,fold_n:number,mean:number,median:number
preds = {}  #model-> foldsxknn_models
deep_scores_dict = {}
deep_preds_dict = {}
actual_y = None


def load_fun_cv(name, model, fold):
    """Load the state saved for model `name` on cross-validation fold `fold`."""
    return model.load_state(model_dir/'models'/name/f"_fold_{fold}")


def load_fun_build(name, model):
    """Load the state saved for the final build of model `name`."""
    return model.load_state(model_dir/'models'/name/f"_final")


# No preprocessing state is restored from disk; the disabled loaders are kept for reference.
load_fun_pp_cv = None #lambda fold : preprocessing.from_state(preprocessing.load_state(model_dir/'preprocessing'/f"_fold_{fold}"))
load_fun_pp_build = None #lambda : preprocessing.from_state(preprocessing.load_state(model_dir/'preprocessing'/f"_final"))


In [9]:
# Scheme that scores the deep networks; update=False and adaptive_lr=False are
# passed through to DeepScheme unchanged.
deep_scheme = DeepScheme(
    configs,
    fixed_hyperparams=fixed_hyperparams,
    loss_eval=loss_target,
    device=device,
    tensorboard=tb,
    adaptive_lr=False,
    update=False,
)

# Both calls return six values; only scores and predictions are kept.
# Model state is restored through the load_fun callbacks.
deep_scores, deep_preds, _, _, _, _ = eval_.evaluate(
    models,
    dataset,
    deep_scheme,
    logger_name="log",
    load_fun=load_fun_cv,
    load_fun_pp=load_fun_pp_cv,
)
deep_scores_final, deep_preds_final, _, _, _, _ = eval_.build(
    models,
    dataset,
    deep_scheme,
    logger_name="test_log",
    load_fun=load_fun_build,
    load_fun_pp=load_fun_pp_build,
)

Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Tested (test) on 1667 instances with mean losses of: random_0:168.6752,random_1:142.1982,random_2:177.255,random_3:2788.8217,random_4:160.6797,random_5:154.4269,random_6:2779.8074,random_7:401.2411,random_8:2765.041,random_9:149.3919,random_10:167.512,random_11:169.7688,random_12:153.2087,random_13:161.1382,random_14:222.6899,random_15:160.8311,random_16:161.9417,random_17:176.623,random_18:155.8045,random_19:302.548,random_20:160.1394,random_21:155.9266,random_22:155.6808,random_23:160.6616,random_24:466.815,random_25:135.623,random_26:167.997,random_27:155.127,random_28:179.1668,random_29:136.1725,random_30:162.3986,random_31:160.6022,random_32:2757.4355,random_33:2775.6764,random_34:156.4009,random_35:162.9483,random_36:158.9109,random_37:147.8967,random_38:165.4038,random_39:2747.4657,random_40:170.8331,random_41:139.9774,random_42

In [10]:
# One record per deep model from eval_.evaluate: identifying fields first,
# then that model's scores as returned by ut.flip_dicts.
all_scores = [
    {'model_num': model_name, "predictor": "deep", **model_scores}
    for model_name, model_scores in ut.flip_dicts(deep_scores).items()
]

# Same layout for the scores returned by eval_.build.
all_scores_final = [
    {'model_num': model_name, "predictor": "deep", **model_scores}
    for model_name, model_scores in ut.flip_dicts(deep_scores_final).items()
]

In [11]:
# Rank the deep models by ascending MSE and print one " - "-separated line per model.
scores_df_sorted = pd.DataFrame(all_scores).sort_values(by='MSE')
for rank, (_index, row) in enumerate(scores_df_sorted.iterrows()):
    fields = [f"{value}" for value in row.tolist()]
    print(f"{rank} - " + " - ".join(fields))

0 - random_29 - deep - 136.1725005096637 - 132.73565888018686 - 144.03927211383896 - 116.98450558812392 - 125.26079097715747 - 131.04092569468253 - 0.7267939682642388
1 - random_60 - deep - 142.17859075050833 - 128.13758790600278 - 142.85257588222345 - 122.92048939937303 - 132.18773698749519 - 133.65686071986772 - 0.721340029170626
2 - random_63 - deep - 139.99827609551332 - 136.84030156859254 - 150.87722592963098 - 119.90759105155735 - 129.62207239615816 - 135.45165808914675 - 0.7175980724923514
3 - random_41 - deep - 139.9773749138112 - 134.79085337107384 - 143.33308914999228 - 127.02502206937463 - 133.13876126622523 - 135.65435717780355 - 0.7171754670097146
4 - random_15 - deep - 160.83109276582184 - 139.18237164639825 - 142.10121095516615 - 112.93918198382869 - 129.92729984755132 - 136.99996686917987 - 0.714370017627415
5 - random_38 - deep - 165.40380923906773 - 126.28380756343849 - 151.49548406892717 - 117.41265477138121 - 128.35157377007198 - 137.79304447122234 - 0.7127165373626

In [12]:
# Rank the deep models by ascending MSE on the scores returned by eval_.build.
scores_df_sorted_final = pd.DataFrame(all_scores_final).sort_values(by='MSE')

n = 30
for rank, (_index, row) in enumerate(scores_df_sorted_final.iterrows()):
    fields = [f"{value}" for value in row.tolist()]
    print(f"{rank} - " + " - ".join(fields))

# Names of the n best-ranked models, best first.
# NOTE(review): the shortlist is chosen from the eval_.build ("test_log")
# scores - if those are test-set scores this is model selection on the test
# set; confirm this is intended.
best_n = scores_df_sorted_final['model_num'].head(n).tolist()

0 - random_60 - deep - 125.80911623732636 - 0.7385531124617956
1 - random_25 - deep - 126.50414113452753 - 0.7371087648535346
2 - random_89 - deep - 127.04149021236918 - 0.735992087079102
3 - random_59 - deep - 128.68949149600525 - 0.7325673367974703
4 - random_75 - deep - 128.9325215528976 - 0.7320622903123423
5 - random_38 - deep - 129.77903000569577 - 0.7303031411594194
6 - random_50 - deep - 131.2077916805784 - 0.7273339978723203
7 - random_4 - deep - 131.8868952787036 - 0.7259227366906511
8 - random_41 - deep - 134.28216908942576 - 0.7209450617705464
9 - random_29 - deep - 135.5702412752982 - 0.7182682886241107
10 - random_0 - deep - 135.7012607060716 - 0.7179960140592205
11 - random_51 - deep - 135.83677560040803 - 0.7177143973656219
12 - random_20 - deep - 135.83817254590082 - 0.7177114943402922
13 - random_9 - deep - 136.25181961887546 - 0.716851884615632
14 - random_63 - deep - 137.08992446864116 - 0.7151101991880946
15 - random_74 - deep - 137.24237700707667 - 0.7147933839773

In [None]:
def build_predictors(n,deep,neighbor_counts=(100, 500, 1000)):
    """Build the LWRBoost predictors that are stacked on one deep model.

    For every neighbourhood size ``k`` in ``neighbor_counts`` with
    ``2 * k < n``, four LWRBoost instances are created. All use triangle
    weights and triangle errors; they differ in ``convolution`` (additive or
    multiplicative) and in the ``reverse`` flag.

    Args:
        n: Size limit for the neighbourhoods (the caller passes the number of
            data rows); sizes with ``2 * k >= n`` are skipped.
        deep: Deep model passed as first argument to every LWRBoost.
        neighbor_counts: Candidate neighbourhood sizes. Defaults to the
            original hard-coded ``(100, 500, 1000)``.

    Returns:
        dict mapping ``"knn_<tag>_n=<k>"`` to the LWRBoost instance, ordered
        by neighbourhood size and then tta, ttm, tta_r, ttm_r.
    """
    # (name tag, convolution, reverse) for the four variants built per size.
    # NOTE(review): the "_r" tags carry reverse=False and the plain tags
    # reverse=True - confirm the naming is intended before relying on it.
    variants = (
        ("tta", "additive", True),
        ("ttm", "multiplicative", True),
        ("tta_r", "additive", False),
        ("ttm_r", "multiplicative", False),
    )

    predictors = {}
    for k in neighbor_counts:
        if k * 2 >= n:
            # Neighbourhood too large for the amount of data available.
            continue
        for tag, convolution, reverse in variants:
            predictors[f'knn_{tag}_n={k}'] = LWRBoost(deep,
                                                      n_neighbors=k,
                                                      weights='triangle',
                                                      errors='triangle',
                                                      convolution=convolution,
                                                      reverse=reverse)

    return predictors

# Keep only the deep models that made the best_n shortlist.
models = {name: net for name, net in models.items() if name in best_n}
for deep_name, deep_model in tqdm(models.items()):
    # Only models whose numeric suffix is above 40 are processed.
    # NOTE(review): looks like a leftover "resume from model 41" filter; the
    # shortlisted models with a lower index get no boosted scores - confirm.
    if int(deep_name.replace("random_", "")) <= 40:
        continue

    logging.getLogger().info(f"Running model {deep_name}")
    temp_dict = {deep_name: deep_model}

    # Evaluate every boosted predictor built on top of this one deep model.
    lwr_scheme = BoostScheme(
        boost_models=build_predictors(nrow, deep_model),
        loss_fun_sk=mean_squared_error,
    )
    lwr_scores, lwr_preds, _, _, _, _ = eval_.evaluate(
        temp_dict, dataset, lwr_scheme, logger_name="log"
    )
    lwr_scores_final, lwr_preds_final, _, _, _, _ = eval_.build(
        temp_dict, dataset, lwr_scheme, logger_name="test_log"
    )

    #scores
    # One record per boosted predictor, appended to the deep-model records.
    all_scores.extend(
        {'model_num': deep_name, "predictor": predictor, **predictor_scores}
        for predictor, predictor_scores in ut.flip_dicts(lwr_scores).items()
    )
    all_scores_final.extend(
        {'model_num': deep_name, "predictor": predictor, **predictor_scores}
        for predictor, predictor_scores in ut.flip_dicts(lwr_scores_final).items()
    )

    # Add the plain deep-model predictions as an extra column for comparison.
    lwr_preds['deep'] = deep_preds[deep_name]
    lwr_preds_final['deep'] = deep_preds_final[deep_name]

    out_dir = log_dir / deep_name
    lwr_preds.to_csv(out_dir / f"predictions.csv", index=False)
    lwr_preds_final.to_csv(out_dir / f"predictions_test.csv", index=False)

    #preds
    # TODO: save predictions - appending solns
    plot_preds_and_res(
        lwr_preds,
        name_lambda=lambda x: f"{deep_name} with {x} predictor",
        save_lambda=lambda x: f"deep_lwr{x}",
        save_loc=out_dir,
    )


  0%|          | 0/30 [00:00<?, ?it/s]

Running model random_41'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:45.1988,knn_ttm_n=100:49.8315,knn_tta_r_n=100:49.3822,knn_ttm_r_n=100:49.9464,knn_tta_n=500:76.4518,knn_ttm_n=500:80.315,knn_tta_r_n=500:78.2082,knn_ttm_r_n=500:77.9281,knn_tta_n=1000:80.9944,knn_ttm_n=1000:84.6034,knn_tta_r_n=1000:82.5705,knn_ttm_r_n=1000:82.3121'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:1770.0925,knn_ttm_n=100:3147.2647,knn_tta_r_n=100:570.3344,knn_ttm_r_n=100:738.1917,knn_tta_n=500:106.6066,knn_ttm_n=500:136.3864,knn_tta_r_n=500:88.4249,knn_ttm_r_n=500:88.7726,knn_tta_n=1000:90.0039,knn_ttm_n=1000:105.9309,knn_tta_r_n=1000:82.7845,knn_ttm_r_n=1000:83.9355'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Model



Running model random_44'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:19.7142,knn_ttm_n=100:15.9751,knn_tta_r_n=100:29.4283,knn_ttm_r_n=100:21.4403,knn_tta_n=500:74.3062,knn_ttm_n=500:62.4721,knn_tta_r_n=500:83.2269,knn_ttm_r_n=500:68.4843,knn_tta_n=1000:87.6345,knn_ttm_n=1000:80.631,knn_tta_r_n=1000:94.1244,knn_ttm_r_n=1000:85.8992'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:302.0417,knn_ttm_n=100:395.3003,knn_tta_r_n=100:203.9057,knn_ttm_r_n=100:231.6413,knn_tta_n=500:140.3139,knn_ttm_n=500:171.7646,knn_tta_r_n=500:113.0881,knn_ttm_r_n=500:118.0225,knn_tta_n=1000:117.9271,knn_ttm_n=1000:142.4397,knn_tta_r_n=1000:109.7565,knn_ttm_r_n=1000:111.1736'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Mo



Running model random_46'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:56.0561,knn_ttm_n=100:56.242,knn_tta_r_n=100:62.0288,knn_ttm_r_n=100:58.7513,knn_tta_n=500:91.1588,knn_ttm_n=500:96.5718,knn_tta_r_n=500:93.3194,knn_ttm_r_n=500:91.3925,knn_tta_n=1000:97.4751,knn_ttm_n=1000:104.5133,knn_tta_r_n=1000:98.7586,knn_ttm_r_n=1000:98.006'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:1399.8803,knn_ttm_n=100:1692.908,knn_tta_r_n=100:547.7816,knn_ttm_r_n=100:2094.4374,knn_tta_n=500:108.8315,knn_ttm_n=500:135.8376,knn_tta_r_n=500:103.0339,knn_ttm_r_n=500:104.2757,knn_tta_n=1000:102.4784,knn_ttm_n=1000:124.014,knn_tta_r_n=1000:100.3332,knn_ttm_r_n=1000:102.3193'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted M



Running model random_49'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:120.7012,knn_ttm_n=100:118.7622,knn_tta_r_n=100:123.0442,knn_ttm_r_n=100:120.9853,knn_tta_n=500:128.6183,knn_ttm_n=500:130.2521,knn_tta_r_n=500:130.0228,knn_ttm_r_n=500:129.4231,knn_tta_n=1000:130.1407,knn_ttm_n=1000:131.8011,knn_tta_r_n=1000:131.3326,knn_ttm_r_n=1000:130.8327'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:145.346,knn_ttm_n=100:162.8653,knn_tta_r_n=100:139.2093,knn_ttm_r_n=100:140.4895,knn_tta_n=500:140.5175,knn_ttm_n=500:146.7017,knn_tta_r_n=500:139.014,knn_ttm_r_n=500:138.7812,knn_tta_n=1000:140.3226,knn_ttm_n=1000:144.7428,knn_tta_r_n=1000:139.7339,knn_ttm_r_n=1000:139.3143'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training



Running model random_50'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:75.7404,knn_ttm_n=100:79.1792,knn_tta_r_n=100:77.8376,knn_ttm_r_n=100:79.0053,knn_tta_n=500:88.4366,knn_ttm_n=500:91.0446,knn_tta_r_n=500:88.9239,knn_ttm_r_n=500:88.5711,knn_tta_n=1000:89.8002,knn_ttm_n=1000:93.1573,knn_tta_r_n=1000:90.0705,knn_ttm_r_n=1000:89.9692'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:2545982.7745,knn_ttm_n=100:2585507.518,knn_tta_r_n=100:28518.9427,knn_ttm_r_n=100:1888104.001,knn_tta_n=500:440.9412,knn_ttm_n=500:774.0295,knn_tta_r_n=500:196.0632,knn_ttm_r_n=500:204.1737,knn_tta_n=1000:83.5672,knn_ttm_n=1000:86.7884,knn_tta_r_n=1000:83.5415,knn_ttm_r_n=1000:83.6758'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training B



Running model random_51'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:29.4222,knn_ttm_n=100:30.9317,knn_tta_r_n=100:37.5551,knn_ttm_r_n=100:51.9144,knn_tta_n=500:87.5217,knn_ttm_n=500:90.5191,knn_tta_r_n=500:91.686,knn_ttm_r_n=500:88.2182,knn_tta_n=1000:97.9895,knn_ttm_n=1000:101.9561,knn_tta_r_n=1000:101.3694,knn_ttm_r_n=1000:100.0527'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:460.6546,knn_ttm_n=100:577.5423,knn_tta_r_n=100:319.0736,knn_ttm_r_n=100:390.5624,knn_tta_n=500:271.9159,knn_ttm_n=500:424.8677,knn_tta_r_n=500:146.9255,knn_ttm_r_n=500:154.5191,knn_tta_n=1000:130.2617,knn_ttm_n=1000:189.6144,knn_tta_r_n=1000:126.4501,knn_ttm_r_n=1000:115.7302'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted



Running model random_59'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:44.8219,knn_ttm_n=100:27.9334,knn_tta_r_n=100:55.6413,knn_ttm_r_n=100:37.321,knn_tta_n=500:78.7566,knn_ttm_n=500:64.452,knn_tta_r_n=500:84.3899,knn_ttm_r_n=500:73.8146,knn_tta_n=1000:85.2069,knn_ttm_n=1000:76.8665,knn_tta_r_n=1000:88.5918,knn_ttm_r_n=1000:82.6704'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:240.7736,knn_ttm_n=100:331.4459,knn_tta_r_n=100:102.4715,knn_ttm_r_n=100:108.3479,knn_tta_n=500:119.6391,knn_ttm_n=500:187.4471,knn_tta_r_n=500:93.4132,knn_ttm_r_n=500:93.5874,knn_tta_n=1000:106.3515,knn_ttm_n=1000:147.7095,knn_tta_r_n=1000:92.9385,knn_ttm_r_n=1000:95.2008'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models 



Running model random_60'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:89.8349,knn_ttm_n=100:95.3021,knn_tta_r_n=100:91.3956,knn_ttm_r_n=100:92.1041,knn_tta_n=500:100.4641,knn_ttm_n=500:105.4696,knn_tta_r_n=500:100.6775,knn_ttm_r_n=500:100.8116,knn_tta_n=1000:102.017,knn_ttm_n=1000:106.38,knn_tta_r_n=1000:102.1035,knn_ttm_r_n=1000:102.2189'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:121.5049,knn_ttm_n=100:147.782,knn_tta_r_n=100:108.8611,knn_ttm_r_n=100:109.7172,knn_tta_n=500:108.155,knn_ttm_n=500:114.985,knn_tta_r_n=500:105.5409,knn_ttm_r_n=500:105.9437,knn_tta_n=1000:107.7588,knn_ttm_n=1000:113.7387,knn_tta_r_n=1000:105.4135,knn_ttm_r_n=1000:105.5327'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted



Running model random_61'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:82.3236,knn_ttm_n=100:75.3724,knn_tta_r_n=100:90.4462,knn_ttm_r_n=100:81.8794,knn_tta_n=500:107.7864,knn_ttm_n=500:109.9985,knn_tta_r_n=500:111.551,knn_ttm_r_n=500:107.4992,knn_tta_n=1000:112.4509,knn_ttm_n=1000:117.0939,knn_tta_r_n=1000:115.4619,knn_ttm_r_n=1000:112.7533'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:165.6381,knn_ttm_n=100:266.0053,knn_tta_r_n=100:149.3796,knn_ttm_r_n=100:164.078,knn_tta_n=500:129.9102,knn_ttm_n=500:151.5498,knn_tta_r_n=500:128.7837,knn_ttm_r_n=500:128.8806,knn_tta_n=1000:129.4471,knn_ttm_n=1000:142.6631,knn_tta_r_n=1000:130.7412,knn_ttm_r_n=1000:129.6152'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boo



Running model random_63'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:75.9649,knn_ttm_n=100:76.1873,knn_tta_r_n=100:78.3905,knn_ttm_r_n=100:79.0853,knn_tta_n=500:91.7581,knn_ttm_n=500:93.8262,knn_tta_r_n=500:92.7593,knn_ttm_r_n=500:92.5593,knn_tta_n=1000:93.9438,knn_ttm_n=1000:96.3421,knn_tta_r_n=1000:94.8428,knn_ttm_r_n=1000:94.5728'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:150.3684,knn_ttm_n=100:196.6327,knn_tta_r_n=100:101.823,knn_ttm_r_n=100:104.1691,knn_tta_n=500:98.8998,knn_ttm_n=500:112.7632,knn_tta_r_n=500:92.8744,knn_ttm_r_n=500:92.858,knn_tta_n=1000:93.7863,knn_ttm_n=1000:102.928,knn_tta_r_n=1000:92.6164,knn_ttm_r_n=1000:92.6589'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models wit



Running model random_66'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:86.6481,knn_ttm_n=100:85.9808,knn_tta_r_n=100:91.2034,knn_ttm_r_n=100:87.3906,knn_tta_n=500:105.793,knn_ttm_n=500:111.6892,knn_tta_r_n=500:107.4209,knn_ttm_r_n=500:106.6237,knn_tta_n=1000:108.7311,knn_ttm_n=1000:115.06,knn_tta_r_n=1000:110.0185,knn_ttm_r_n=1000:109.9325'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:1680.8586,knn_ttm_n=100:1736.3692,knn_tta_r_n=100:422.4062,knn_ttm_r_n=100:665.0146,knn_tta_n=500:123.532,knn_ttm_n=500:135.982,knn_tta_r_n=500:122.4345,knn_ttm_r_n=500:123.0419,knn_tta_n=1000:122.9989,knn_ttm_n=1000:131.172,knn_tta_r_n=1000:122.721,knn_ttm_r_n=1000:122.5052'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Booste



Running model random_69'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:97.0349,knn_ttm_n=100:86.0278,knn_tta_r_n=100:100.2788,knn_ttm_r_n=100:95.0677,knn_tta_n=500:109.7198,knn_ttm_n=500:110.5861,knn_tta_r_n=500:111.4047,knn_ttm_r_n=500:108.9714,knn_tta_n=1000:112.8894,knn_ttm_n=1000:115.8001,knn_tta_r_n=1000:113.536,knn_ttm_r_n=1000:111.8758'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:130.1614,knn_ttm_n=100:149.9807,knn_tta_r_n=100:120.72,knn_ttm_r_n=100:123.7433,knn_tta_n=500:126.238,knn_ttm_n=500:130.1433,knn_tta_r_n=500:120.9631,knn_ttm_r_n=500:125.3081,knn_tta_n=1000:121.2335,knn_ttm_n=1000:126.6228,knn_tta_r_n=1000:120.2873,knn_ttm_r_n=1000:119.735'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boost



Running model random_74'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:23.6201,knn_ttm_n=100:15.3611,knn_tta_r_n=100:35.3051,knn_ttm_r_n=100:24.4648,knn_tta_n=500:66.6241,knn_ttm_n=500:55.4095,knn_tta_r_n=500:75.5313,knn_ttm_r_n=500:60.7219,knn_tta_n=1000:76.7301,knn_ttm_n=1000:71.856,knn_tta_r_n=1000:82.8979,knn_ttm_r_n=1000:73.332'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:194.4718,knn_ttm_n=100:262.1575,knn_tta_r_n=100:140.5536,knn_ttm_r_n=100:143.554,knn_tta_n=500:96.8714,knn_ttm_n=500:125.0706,knn_tta_r_n=500:87.992,knn_ttm_r_n=500:89.2701,knn_tta_n=1000:91.6043,knn_ttm_n=1000:107.3663,knn_tta_r_n=1000:88.1903,knn_ttm_r_n=1000:87.9016'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with



Running model random_75'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:42.9764,knn_ttm_n=100:36.9344,knn_tta_r_n=100:51.3917,knn_ttm_r_n=100:348.4152,knn_tta_n=500:83.0349,knn_ttm_n=500:82.1407,knn_tta_r_n=500:86.0978,knn_ttm_r_n=500:110.8749,knn_tta_n=1000:90.3201,knn_ttm_n=1000:92.0633,knn_tta_r_n=1000:91.8303,knn_ttm_r_n=1000:103.0534'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:60871.6832,knn_ttm_n=100:42226.4501,knn_tta_r_n=100:1964.3574,knn_ttm_r_n=100:3997.0565,knn_tta_n=500:11637.7345,knn_ttm_n=500:21097.8601,knn_tta_r_n=500:833.2598,knn_ttm_r_n=500:1469.2201,knn_tta_n=1000:160.1304,knn_ttm_n=1000:979.9175,knn_tta_r_n=1000:102.0423,knn_ttm_r_n=1000:106.7116'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished trai



Running model random_76'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:41.0022,knn_ttm_n=100:32.9633,knn_tta_r_n=100:48.7865,knn_ttm_r_n=100:37944448.3818,knn_tta_n=500:69.2004,knn_ttm_n=500:65.9809,knn_tta_r_n=500:73.1057,knn_ttm_r_n=500:66.8379,knn_tta_n=1000:76.2102,knn_ttm_n=1000:76.861,knn_tta_r_n=1000:78.2079,knn_ttm_r_n=1000:74.4199'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:172.4072,knn_ttm_n=100:295.7784,knn_tta_r_n=100:121.262,knn_ttm_r_n=100:199.1787,knn_tta_n=500:84.9076,knn_ttm_n=500:102.8917,knn_tta_r_n=500:73.6308,knn_ttm_r_n=500:73.6601,knn_tta_n=1000:78.3897,knn_ttm_n=1000:92.8325,knn_tta_r_n=1000:72.1234,knn_ttm_r_n=1000:71.7986'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Mode



Running model random_89'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:83.1675,knn_ttm_n=100:80.2557,knn_tta_r_n=100:84.971,knn_ttm_r_n=100:82.4558,knn_tta_n=500:91.0837,knn_ttm_n=500:94.6604,knn_tta_r_n=500:91.4274,knn_ttm_r_n=500:90.8931,knn_tta_n=1000:91.8421,knn_ttm_n=1000:95.5897,knn_tta_r_n=1000:92.3841,knn_ttm_r_n=1000:92.0475'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:97.702,knn_ttm_n=100:112.8929,knn_tta_r_n=100:91.3205,knn_ttm_r_n=100:92.6884,knn_tta_n=500:92.8264,knn_ttm_n=500:101.0849,knn_tta_r_n=500:90.4885,knn_ttm_r_n=500:90.7752,knn_tta_n=1000:91.2142,knn_ttm_n=1000:98.7029,knn_tta_r_n=1000:89.9088,knn_ttm_r_n=1000:90.1766'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a



Running model random_93'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:20.5172,knn_ttm_n=100:19.5918,knn_tta_r_n=100:28.6766,knn_ttm_r_n=100:62.75,knn_tta_n=500:78.6161,knn_ttm_n=500:74.4669,knn_tta_r_n=500:84.4201,knn_ttm_r_n=500:75.6386,knn_tta_n=1000:91.1646,knn_ttm_n=1000:91.2913,knn_tta_r_n=1000:94.8645,knn_ttm_r_n=1000:91.3078'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:53750.8135,knn_ttm_n=100:100622.2745,knn_tta_r_n=100:11726.2771,knn_ttm_r_n=100:11445.2774,knn_tta_n=500:575.093,knn_ttm_n=500:1268.753,knn_tta_r_n=500:148.719,knn_ttm_r_n=500:150.826,knn_tta_n=1000:116.6264,knn_ttm_n=1000:215.1249,knn_tta_r_n=1000:115.0586,knn_ttm_r_n=1000:124.6985'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boost



Running model random_99'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Models with a train loss of knn_tta_n=100:24.0973,knn_ttm_n=100:18.6791,knn_tta_r_n=100:33.6269,knn_ttm_r_n=100:24.1146,knn_tta_n=500:74.591,knn_ttm_n=500:60.1455,knn_tta_r_n=500:83.3532,knn_ttm_r_n=500:67.762,knn_tta_n=1000:85.836,knn_ttm_n=1000:76.8515,knn_tta_r_n=1000:91.7596,knn_ttm_r_n=1000:81.8997'
Tested (test) on 1667 instances with mean losses of: knn_tta_n=100:1277.4683,knn_ttm_n=100:5482.7018,knn_tta_r_n=100:849.5069,knn_ttm_r_n=100:1515.501,knn_tta_n=500:124.2552,knn_ttm_n=500:206.3748,knn_tta_r_n=500:102.9291,knn_ttm_r_n=500:116.5705,knn_tta_n=1000:109.8167,knn_ttm_n=1000:132.967,knn_tta_r_n=1000:103.5063,knn_ttm_r_n=1000:102.413'
-----------------------------------Fold 1 - Train 4999 - Val 1667 - Test 1667-----------------------------------'
Finished training Boosted Mode

In [None]:
# Persist every score record (deep and boosted) from eval_.evaluate.
scores_df = pd.DataFrame(all_scores)
scores_df.to_csv(log_dir/f"scores.csv",index=False)

# Ascending MSE: the best model/predictor combination comes first.
scores_df_sorted = scores_df.sort_values(by='MSE')

# Write the full ranking to the summary log, header line first.
summary_logger.info(f"Rank - " + " - ".join(scores_df_sorted.columns))
for rank, (_index, row) in enumerate(scores_df_sorted.iterrows()):
    fields = [f"{value}" for value in row.tolist()]
    summary_logger.info(f"{rank} - " + " - ".join(fields))

# (model, predictor, MSE, R2) of the five best-ranked combinations.
best_5 = [
    (row["model_num"], row["predictor"], row["MSE"], row["R2"])
    for _index, row in scores_df_sorted.head(5).iterrows()
]

In [None]:
# Persist every score record (deep and boosted) from eval_.build.
scores_df_final = pd.DataFrame(all_scores_final)
scores_df_final.to_csv(log_dir/f"test_scores.csv",index=False)

summary_logger.info("-----------------------\n Best 5 on Test Set \n ---------------------")
# The header names all seven fields logged per row; the previous header listed only five.
summary_logger.info("Rank - Deep Model - Predictor - Val MSE - Val R2 - Test MSE - Test R2")
for rank, (model_name, predictor, val_mse, val_r2) in enumerate(best_5):
    # Look up the eval_.build scores of the same model/predictor combination.
    is_match = (scores_df_final['model_num'] == model_name) & (scores_df_final['predictor'] == predictor)
    row = scores_df_final.loc[is_match].iloc[0]
    s = f"{rank} - {model_name} - {predictor} - {val_mse} - {val_r2} - {row['MSE']} - {row['R2']}"
    summary_logger.info(s)

In [None]:
# Graph the deep models by rank, then overlay the boosted (knn) predictors.

# Ascending R2, so order 0 is the worst deep model and the last order the best.
deep_set = scores_df[scores_df["predictor"]=="deep"].sort_values("R2")
# Size the order column from the data instead of assuming exactly 100 deep
# models; a different model count previously raised a length-mismatch error.
deep_set = deep_set.assign(order=range(len(deep_set)))
deep_ordering = dict(zip(deep_set["model_num"], deep_set["order"]))

def order_models(x):
    """Map a list of deep-model names to their positions in deep_ordering."""
    return [deep_ordering[name] for name in x]

fig, ax = plt.subplots()
set_deep = False
knn_models = scores_df["predictor"].unique()
for knn_model in knn_models:
    subset = scores_df[scores_df["predictor"]==knn_model]
    # Deep-model baseline points are drawn larger than the boosted predictors.
    s = 10 if knn_model == "deep" else 3
    ax.scatter(x=order_models(subset["model_num"].tolist()), y=subset["R2"], s=s, label=knn_model)

#ax.set_ylim(0,scores_db["deep_mean"].max())
ax.set_ylim(0,1)
# plot residuals
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
ax.set_ylabel("R^2 Score")
ax.set_xlabel("Deep Model Rank")
#ax.set_ylim(0,200)
#ax.set_yscale("symlog")
ax.set_title("Summary of LWR improvements over Deep Models")
plt.savefig(log_dir/f"summary_plot.png", bbox_inches='tight')

In [None]:
# Graph the deep models by rank on the final score table, overlay the other
# predictors, then plot each predictor's R2 difference to its deep model.
# Rank 0 is the deep model with the lowest R2 in scores_df_final.

deep_set = scores_df_final[scores_df_final["predictor"] == "deep"].sort_values("R2").copy()
# Size the rank column from the data: a hard-coded range(0, 100) raises a
# length-mismatch ValueError whenever there are not exactly 100 deep rows.
deep_set["order"] = list(range(len(deep_set)))
deep_ordering = {model_num: rank for rank, model_num in enumerate(deep_set["model_num"])}

def order_models(x):
    """Map each model number in ``x`` to its rank in ``deep_ordering``.

    ``deep_ordering`` is read from module scope at call time, so the result
    follows whichever cell most recently rebuilt it. Raises ``KeyError`` for a
    model number that has no "deep" row.
    """
    return [deep_ordering[i] for i in x]

# --- Figure 1: R2 of every predictor against deep-model rank ---
fig, ax = plt.subplots()
knn_models = scores_df_final["predictor"].unique()
for knn_model in knn_models:
    subset = scores_df_final[scores_df_final["predictor"] == knn_model]
    # Draw the deep baseline with larger markers than the other predictors.
    s = 10 if knn_model == "deep" else 3
    ax.scatter(x=order_models(subset["model_num"].tolist()), y=subset["R2"], s=s, label=knn_model)

ax.set_ylim(0, 1)
# Anchor the legend outside the axes so it does not cover the points.
ax.legend(loc='upper right', bbox_to_anchor=(1.4, 1))
ax.set_ylabel("R^2 Score")
ax.set_xlabel("Deep Model Rank")
ax.set_title("Summary of LWR improvements over Deep Models")
plt.savefig(log_dir / "summary_plot_final.png", bbox_inches='tight')

# --- Figure 2: R2 of each non-deep predictor minus the R2 of its deep model ---
# Look the baseline up by model_num instead of subtracting by row position, so
# the difference stays correct even if the rows of a predictor are ordered
# differently from the "deep" rows.
deep_r2_by_model = deep_set.set_index("model_num")["R2"]

fig, ax = plt.subplots()
for knn_model in knn_models:
    if knn_model == "deep":
        continue
    subset = scores_df_final[scores_df_final["predictor"] == knn_model]
    s = 3
    y1 = (subset["R2"] - subset["model_num"].map(deep_r2_by_model)).to_numpy()
    ax.scatter(x=order_models(subset["model_num"].tolist()), y=y1, s=s, label=knn_model)

# NOTE(review): y is a difference here, so the 0-1 limits hide every predictor
# that scores below its deep model (negative values) - confirm this is intended.
ax.set_ylim(0, 1)
ax.legend(loc='upper right', bbox_to_anchor=(1.4, 1))
ax.set_ylabel("R^2 Score")
ax.set_xlabel("Deep Model Rank")
ax.set_title("Summary of LWR improvements over Deep Models")
plt.savefig(log_dir / "improvement_plot_final.png", bbox_inches='tight')

In [None]:
# Split scores_df by predictor name: the "deep" baseline rows, plus one frame
# per predictor family selected by a substring of the predictor name.
scores_df_base = scores_df.loc[scores_df["predictor"] == 'deep']
scores_df_uu = scores_df.loc[scores_df["predictor"].str.contains('_uu')]
scores_df_ut = scores_df.loc[scores_df["predictor"].str.contains('_ut')]
scores_df_tu = scores_df.loc[scores_df["predictor"].str.contains('_tu')]
scores_df_tta = scores_df.loc[scores_df["predictor"].str.contains('_tta')]
scores_df_ttm = scores_df.loc[scores_df["predictor"].str.contains('_ttm')]

In [None]:
# Scatter the "deep" baseline R2 and the R2 of every "_tta" predictor against
# deep-model rank, and save the figure as summary_plot_tta.png.
# NOTE(review): order_models uses whichever deep_ordering was built last; in
# notebook order that is the one derived from scores_df_final, while the R2
# values plotted here come from scores_df - confirm this is intended.
fig, ax = plt.subplots()
knn_models = scores_df_tta["predictor"].unique()

# Baseline first, with larger markers than the overlaid predictors.
ax.scatter(
    x=order_models(scores_df_base["model_num"].tolist()),
    y=scores_df_base["R2"],
    s=10,
    label='deep',
)
for knn_model in knn_models:
    subset = scores_df_tta.loc[scores_df_tta["predictor"] == knn_model]
    ax.scatter(
        x=order_models(subset["model_num"].tolist()),
        y=subset["R2"],
        s=3,
        label=knn_model,
    )

# Legend is anchored outside the axes so it does not cover the points.
ax.legend(loc='upper right', bbox_to_anchor=(1.4, 1))
ax.set(
    ylim=(0, 1),
    ylabel="R^2 Score",
    xlabel="Deep Model Rank",
    title="Summary of LWR improvements over Deep Models",
)
plt.savefig(log_dir / "summary_plot_tta.png", bbox_inches='tight')


In [None]:
# Scatter the "deep" baseline R2 and the R2 of every "_ttm" predictor against
# deep-model rank, and save the figure as summary_plot_ttm.png.
# NOTE(review): order_models uses whichever deep_ordering was built last; in
# notebook order that is the one derived from scores_df_final, while the R2
# values plotted here come from scores_df - confirm this is intended.
fig, ax = plt.subplots()
knn_models = scores_df_ttm["predictor"].unique()

# Baseline first, with larger markers than the overlaid predictors.
ax.scatter(
    x=order_models(scores_df_base["model_num"].tolist()),
    y=scores_df_base["R2"],
    s=10,
    label='deep',
)
for knn_model in knn_models:
    subset = scores_df_ttm.loc[scores_df_ttm["predictor"] == knn_model]
    ax.scatter(
        x=order_models(subset["model_num"].tolist()),
        y=subset["R2"],
        s=3,
        label=knn_model,
    )

# Legend is anchored outside the axes so it does not cover the points.
ax.legend(loc='upper right', bbox_to_anchor=(1.4, 1))
ax.set(
    ylim=(0, 1),
    ylabel="R^2 Score",
    xlabel="Deep Model Rank",
    title="Summary of LWR improvements over Deep Models",
)
plt.savefig(log_dir / "summary_plot_ttm.png", bbox_inches='tight')
