In [1]:
import logging
# set seed
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
import utils as ut
import experiment as exp
from evaluation import *
from sklearn.metrics import mean_squared_error
from torch.utils.tensorboard import SummaryWriter
import torch
import random
#define fixed_hyperparams and create a config gen
from configurations import RandomConfigGen, Configuration
from torch import nn
from deep_net import RandomNet
from experiment import run_experiment
import regex as re
from pathlib import *
from plot import *
from sk_models import setup_pls_models_slim

# Seed every random source used in this notebook so reruns are repeatable.
# Each global generator gets its own offset from the base seed.
seed = 1
random.seed(seed + 1)        # Python's built-in generator
np.random.seed(seed + 2)     # NumPy's global generator
torch.manual_seed(seed)      # PyTorch's global generator
# Dedicated NumPy generator that is passed explicitly to sampling/splitting code.
random_state = np.random.RandomState(seed)

In [2]:
# Configure input/output locations, then make sure the log directory exists.

file_name = "A_AL_RT.csv"

# Identifier columns handed to the dataset and subtracted from the feature
# count; none for this file (earlier candidates: ["db_id", "sample_id"], ["sample_id"]).
id_cols = []

data_path = Path('D:/workspace/lazydeep/data/soil_data/')
model_path = Path('D:/workspace/lazydeep/experiments/1.01/')
log_path = Path("D:/workspace/lazydeep/experiments/1.02")
n_components = 22  # passed to the PLS preprocessing step

data_file = data_path / file_name
# Strip a trailing ".csv" once and reuse the result for both directories.
experiment_name = re.sub(r'\.(?=csv$)[^.]+$', '', file_name)
log_dir = log_path / experiment_name
model_dir = model_path / experiment_name

# parents=True also creates a missing experiment root; exist_ok=True makes
# reruns harmless without a separate exists() check.
log_dir.mkdir(parents=True, exist_ok=True)
print(log_dir)


D:\workspace\lazydeep\experiments\1.02\A_AL_RT


In [3]:
# Read the CSV, remember its dimensions as read, then pass it through ut.sample_data.
data = pd.read_csv(data_file)
nrow, ncol = data.shape  # captured before sample_data is applied
data = ut.sample_data(data, random_state)

# Feature count = all columns minus the id columns minus one more column
# (presumably the single target column — confirm).
n_features = ncol - len(id_cols) - 1
dataset = TabularDataset(
    data,
    id_cols=id_cols,
    cat_cols=None,
    output_cols=None,
    ignore_cols=None,
)
print(data.shape)

(1438, 1702)


In [4]:
# Load the previously saved deep models and create one log sub-directory per model.
n_models = 100
model_names = [f"random_{i}" for i in range(n_models)]
# NOTE(review): torch.load unpickles arbitrary objects; only point it at trusted model files.
deep_models = {
    name: torch.load(model_dir / "models" / name / "_model")
    for name in model_names
}
print(f"Loaded {len(deep_models)} models")
for name in model_names:
    sub_path = log_dir / name
    # exist_ok replaces the separate exists() check and keeps reruns harmless;
    # parents=True covers a log_dir that has not been created yet.
    sub_path.mkdir(parents=True, exist_ok=True)

Loaded 100 models


In [5]:
# Logging setup: the root logger (empty name) is configured with log.txt and a
# separate "summary" logger with summary.txt, both under log_dir.
ut.setup_logger(
    logger_name="",
    file_name=log_dir / "log.txt",
)
ut.setup_logger(
    logger_name="summary",
    file_name=log_dir / "summary.txt",
)
summary_logger = logging.getLogger("summary")

# TensorBoard writer rooted at <log_dir>/tb.
tb = SummaryWriter(log_dir / "tb")


In [6]:
# Shared training settings, preprocessing and the cross-validation evaluator.
fixed_hyperparams = {
    'bs': 32,
    'loss': nn.MSELoss(),
    'epochs': 100,
}
preprocessing = Preprocess_PLS(n_components=n_components)
# NOTE: this name shadows the builtin `eval`; it is kept because later cells use it.
eval = CrossValEvaluation(
    preprocessing=preprocessing,
    tensorboard=None,
    time=True,
    random_state=random_state,
)

# Result containers.
scores = {}  # -> model -> knn: {fold_0: number, ..., fold_n: number, mean: number, median: number}
preds = {}   # model -> folds x knn_models
deep_scores_dict = {}
deep_preds_dict = {}
actual_y = None

# Fixed to CPU; the GPU alternative would be
# torch.device("cuda:0" if torch.cuda.is_available() else "cpu").
device = "cpu"


def load_fun_cv(name, model, fold):
    """Load the state saved for cross-validation fold `fold` of model `name` into `model`."""
    return model.load_state(model_dir / 'models' / name / f"_fold_{fold}")


def load_fun_build(name, model):
    """Load the state saved as `_final` for model `name` into `model`."""
    return model.load_state(model_dir / 'models' / name / "_final")



In [25]:
# Load the chosen model and carve the row positions into a train part and a test part.
# NOTE(review): the earlier note here named random_57 as the best model
# (MSE:0.4028, R2:0.9337) while the code loads random_5 — confirm which is intended.
model = torch.load(model_dir / "models" / "random_5" / "_model")
# presumably sklearn's train_test_split, brought in by a star import — confirm
tt_splitter = train_test_split
# Split positions 0..len(data)-1 without shuffling, 5/6 for training.
train_split1, test_split = tt_splitter(
    list(range(len(data))),
    train_size=5/6,
    random_state=random_state,
    shuffle=False,
)

In [26]:
# Score random_5 on the held-out test split, once with each of its saved
# 'init_state' and 'final' weights.

# Bind the split-off test data to its own name so the index list in
# `test_split` is not overwritten and this cell can be rerun.
train_split, val_data, test_data = dataset.split(train_split1, None, test_split, preprocessing=preprocessing)
test_X, test_y = zip(*[(X, y) for X, y in test_data])
test_y = np.asarray(test_y)
# Stack the per-sample inputs into one array before building the tensor;
# building a tensor straight from a tuple of arrays is much slower.
test_inputs = torch.tensor(np.asarray(test_X)).float()

model = torch.load(model_dir / "models" / "random_5" / "_model")
for code in ['init_state', 'final']:
    model.load_state_dict(torch.load(model_dir / "models" / "random_5" / f"_{code}"))
    # Inference: switch layers such as dropout/batch-norm to evaluation
    # behaviour and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        preds = model(test_inputs).detach().numpy()

    val_score = r2_score(test_y, preds)
    val_mse = mean_squared_error(test_y, preds)

    print(val_score, val_mse)

[[ 0.01695968  0.02120247  0.07917055 ...  0.11714174  0.08865453
   0.05088219]
 [ 0.01810302  0.01935943  0.07575022 ...  0.09206153  0.05771873
   0.08069276]
 [ 0.01921557  0.01744857  0.07227662 ...  0.0805834   0.04197767
   0.09448845]
 ...
 [ 0.02311599  0.01646588  0.01949793 ... -0.01682693 -0.00362798
   0.01046275]
 [ 0.02308568  0.01651355  0.01956011 ... -0.01889519 -0.00592329
   0.01299535]
 [ 0.0230551   0.01656241  0.01962823 ... -0.02135868 -0.00859426
   0.01603646]]
[[ 0.02477695 -0.02356564 -0.03739578 -0.03558914  0.02909668 -0.03307994
   0.02678511 -0.03401774 -0.01054564 -0.02391075  0.03878953 -0.02226663
   0.02977767 -0.03633407  0.02407958 -0.06044229  0.03856325  0.04666698
   0.03257466 -0.03104083 -0.06529193  0.05352792]]
-2.163925624453733 2541.6070273990995
0.8663137513298369 107.39124411161661


In [23]:
# Re-score random_5 on every cross-validation fold using that fold's saved weights.
# NOTE(review): `train_split` is whatever the preceding cell left in that name
# (the object returned by dataset.split), not the index list `train_split1` —
# confirm this is what K2Fold.split expects.
cv_splitter = K2Fold(n_splits=5, random_state=random_state)
for fold, (train_ind, val_ind, test_ind) in enumerate(cv_splitter.split(train_split)):
    train_data, val_data, test_data = dataset.split(train_ind, val_ind, test_ind, preprocessing=preprocessing)
    test_X, test_y = zip(*[(X, y) for X, y in test_data])
    test_y = np.asarray(test_y)

    model.load_state_dict(torch.load(model_dir / "models" / "random_5" / f"_fold_{fold}"))
    # Inference: switch layers such as dropout/batch-norm to evaluation
    # behaviour and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        # Stack the inputs into one array first; a tensor built from a tuple
        # of arrays is much slower to construct.
        preds = model(torch.tensor(np.asarray(test_X)).float()).detach().numpy()

    val_score = r2_score(test_y, preds)
    val_mse = mean_squared_error(test_y, preds)

    print(val_score, val_mse)



[[ 0.01694714  0.02137554  0.08178384 ...  0.05415289  0.00442955
   0.06188886]
 [ 0.01809617  0.01955103  0.07852984 ...  0.03513465  0.02700274
   0.08497646]
 [ 0.01919185  0.01766278  0.07534896 ...  0.03013528  0.03091026
   0.08608117]
 ...
 [ 0.02322585  0.01677321  0.02119172 ... -0.00607184  0.0093803
  -0.00449615]
 [ 0.02319835  0.01681831  0.02124571 ... -0.00836434  0.01181686
  -0.00196554]
 [ 0.02317117  0.01686441  0.02131657 ... -0.01139777  0.0150539
   0.00126707]]
[[ 0.02498952 -0.02320319 -0.04105547 -0.03646868  0.028643   -0.03229133
   0.02282377 -0.03420892 -0.00947398 -0.02095455  0.03703694 -0.04147501
  -0.04219318  0.01547907  0.01398815  0.08056134 -0.02697616  0.03705634
  -0.03712943 -0.02817297  0.06312697  0.03979943]]
0.85559276381646 116.61514177257519
[[ 0.01728657  0.02055388 -0.0749399  ...  0.04875266 -0.03983927
  -0.06500631]
 [ 0.01836846  0.01895595 -0.07216779 ...  0.03292234 -0.03312869
  -0.07433996]
 [ 0.01940502  0.01727566 -0.06954137 

In [None]:
# Evaluate every deep model on its own: cross-validated first, then the final build.
deep_scheme = DeepScheme(
    None,
    fixed_hyperparams=fixed_hyperparams,
    loss_eval=loss_target,
    device=device,
    tensorboard=tb,
    adaptive_lr=True,
    update=False,
)
deep_scores, deep_preds, _, _, _ = eval.evaluate(
    deep_models, dataset, deep_scheme, logger_name="log", load_fun=load_fun_cv
)
deep_scores_final, deep_preds_final, _, _, _ = eval.build(
    deep_models, dataset, deep_scheme, logger_name="test_log", load_fun=load_fun_build
)

# One record per deep model, tagged with the "deep" predictor; later cells
# append the LWR records to these same lists.
all_scores = [
    {'model_num': model_name, "predictor": "deep", **metrics}
    for model_name, metrics in ut.flip_dicts(deep_scores).items()
]

all_scores_final = [
    {'model_num': model_name, "predictor": "deep", **metrics}
    for model_name, metrics in ut.flip_dicts(deep_scores_final).items()
]

In [None]:
# For each deep model: run the LWR scheme on top of it, collect the scores,
# save the predictions next to the deep model's own, and plot them.
for deep_name, deep_model in deep_models.items():
    logging.getLogger().info(f"Running model {deep_name}")
    temp_dict = {deep_name: deep_model}

    lwr_scheme = DeepLWRScheme_1_to_n(
        lwr_models=setup_pls_models_slim(nrow),
        n_neighbours=500,
        loss_fun_sk=mean_squared_error,
    )
    lwr_scores, lwr_preds, _, _, _ = eval.evaluate(
        temp_dict, dataset, lwr_scheme, logger_name="log", load_fun=load_fun_cv
    )
    lwr_scores_final, lwr_preds_final, _, _, _ = eval.build(
        temp_dict, dataset, lwr_scheme, logger_name="test_log", load_fun=load_fun_build
    )

    # Scores: one record per predictor, tagged with the current deep model.
    all_scores.extend(
        {'model_num': deep_name, "predictor": predictor, **metrics}
        for predictor, metrics in ut.flip_dicts(lwr_scores).items()
    )
    all_scores_final.extend(
        {'model_num': deep_name, "predictor": predictor, **metrics}
        for predictor, metrics in ut.flip_dicts(lwr_scores_final).items()
    )

    # Predictions: add the plain deep predictions as a 'deep' column, then save.
    lwr_preds['deep'] = deep_preds[deep_name]
    lwr_preds_final['deep'] = deep_preds_final[deep_name]

    model_log_dir = log_dir / deep_name
    lwr_preds.to_csv(model_log_dir / "predictions.csv", index=False)
    lwr_preds_final.to_csv(model_log_dir / "predictions_test.csv", index=False)

    # todo save predictions - appending solns
    plot_preds_and_res(
        lwr_preds,
        name_lambda=lambda x: f"{deep_name} with {x} predictor",
        save_lambda=lambda x: f"deep_lwr{x}",
        save_loc=model_log_dir,
    )



In [None]:
# Persist the cross-validation and test score tables.
scores_df = pd.DataFrame(all_scores)
scores_df.to_csv(log_dir / "scores.csv", index=False)
scores_df_final = pd.DataFrame(all_scores_final)
scores_df_final.to_csv(log_dir / "test_scores.csv", index=False)

# Rank every (model, predictor) pair by ascending MSE.
scores_df_sorted = scores_df.sort_values(by='MSE')

# Keep the five best-ranked entries for the test-set comparison.
best_5 = [
    (top_row["model_num"], top_row["predictor"], top_row["MSE"], top_row["R2"])
    for _label, top_row in scores_df_sorted.head(5).iterrows()
]

# Write the full ranking to the summary log, one line per entry.
summary_logger.info("Rank - " + " - ".join(scores_df_sorted.columns))
for rank, (_label, ranked_row) in enumerate(scores_df_sorted.iterrows()):
    fields = [f"{value}" for value in ranked_row.tolist()]
    summary_logger.info(f"{rank} - " + " - ".join(fields))

In [None]:
# Log how the five best cross-validated (model, predictor) pairs score in the
# final test-score table.
summary_logger.info("-----------------------\n Best 5 on Test Set \n ---------------------")
# The header names all seven fields written on each row below.
summary_logger.info("Rank - Deep Model - Predictor - Val MSE - Val R2 - Test MSE - Test R2")
for rank, (model_name, predictor, cv_mse, cv_r2) in enumerate(best_5):
    # Look up the matching test-set record for this (model, predictor) pair.
    is_match = (scores_df_final['model_num'] == model_name) & (scores_df_final['predictor'] == predictor)
    test_row = scores_df_final.loc[is_match].iloc[0]
    summary_logger.info(
        f"{rank} - {model_name} - {predictor} - {cv_mse} - {cv_r2} - {test_row['MSE']} - {test_row['R2']}"
    )


In [None]:
# Summary scatter plot: rank the deep models by their own R2, then plot every
# predictor's R2 at the rank of the deep model it was built on.

deep_set = scores_df[scores_df["predictor"] == "deep"].sort_values("R2")
# Number the rows by position instead of assuming exactly 100 deep models;
# assign() also avoids writing a column into a filtered frame.
deep_set = deep_set.assign(order=list(range(len(deep_set))))
deep_ordering = dict(zip(deep_set["model_num"], deep_set["order"]))


def order_models(x):
    """Map each deep-model name in `x` to its rank in `deep_ordering`."""
    return [deep_ordering[name] for name in x]


fig, ax = plt.subplots()
for predictor in scores_df["predictor"].unique():
    subset = scores_df[scores_df["predictor"] == predictor]
    # Draw the plain deep models with larger markers so they stand out.
    marker_size = 10 if predictor == "deep" else 3
    ax.scatter(
        x=order_models(subset["model_num"].tolist()),
        y=subset["R2"],
        s=marker_size,
        label=predictor,
    )

ax.set_ylim(0, 1)
ax.legend(loc='upper right', bbox_to_anchor=(1.4, 1))
ax.set_ylabel("R2 Score")
ax.set_xlabel("Deep Model Rank")
ax.set_title("Summary of LWR improvements over Deep Models")
fig.savefig(log_dir / "summary_plot.png", bbox_inches='tight')
logging.getLogger().info("Wrote Summary Graph")