In [1]:
#imports
import logging
# set seed
import pandas as pd
import numpy as np
import utils as ut
import experiment as ex
from evaluation import *
from sklearn.metrics import mean_squared_error
from torch.utils.tensorboard import SummaryWriter
import torch
import random
#define fixed_hyperparams and create a config gen
from configurations import RandomConfigGen, Configuration
from torch import nn
from deep_net import RandomNet
from experiment import run_experiment
import regex as re
from pathlib import *
from sk_models import PLSRegression, StandardScaler
import matplotlib.pyplot as plt
from river_models import *

from river import stream,linear_model,preprocessing, ensemble, metrics, optim
from river.ensemble import SRPRegressor
from river.neighbors import KNNRegressor 
from sklearn.metrics import mean_squared_error
from river.utils import dict2numpy, numpy2dict

# Seed every random number generator used below so a fresh run is repeatable.
seed = 1
torch.manual_seed(seed)
random.seed(seed + 1)
np.random.seed(seed + 2)
# Dedicated generator handed to the shuffling / splitting calls further down.
random_state = np.random.RandomState(seed)
import gc
# Release cached CUDA memory and unreachable Python objects before starting.
torch.cuda.empty_cache()
gc.collect()

# get_device_name(0) raises when no CUDA device is present, so only query it
# when one is actually available.
if torch.cuda.is_available():
    print(f"GPU detected is {torch.cuda.get_device_name(0)}")
else:
    print("No CUDA GPU detected")

GPU detected is GeForce GTX 970


In [2]:
# Set up the input/output directories and choose the dataset to load.

# Parameters for this run.
file_name = "PLN7.csv" #"mango_684_990.csv" #"mango_729_975.csv" #fitlered=513-1050
id_cols =["db_id","sample_id"] #
output_cols = None
data_path = Path('D:/workspace/lazydeep/data/soil_data/')
log_path = Path("D:/workspace/lazydeep/experiments/6.02_v3") #1.01/")
# parents=True also creates missing parent directories; exist_ok=True makes
# re-running the cell a no-op when the directory is already there.
log_path.mkdir(parents=True, exist_ok=True)

data_file = data_path / file_name
# Drop a trailing ".csv" so the per-dataset output directory is named after the file.
log_dir = log_path / re.sub(r'\.(?=csv$)[^.]+$', '',file_name)
log_dir.mkdir(parents=True, exist_ok=True)
print(f"Output directory is {log_dir}")

Output directory is D:\workspace\lazydeep\experiments\6.02_v3\PLN7


In [3]:
# Load the dataset and shuffle all rows using the seeded generator.
data = pd.read_csv(data_file)
data = data.sample(frac=1, random_state=random_state)

# The first 10,000 shuffled rows are split in order (shuffle=False) into a
# 5/6 pretraining part and a 1/6 pretest part; rows 10,000-109,999 form the stream.
pre_ind = list(range(0, 10000))
pretrain_ind, pretest_ind = train_test_split(
    pre_ind,
    train_size=5/6,
    random_state=random_state,
    shuffle=False,
)
stream_ind = list(range(10000, 110000))

# Wrap each positional slice in the project's TabularDataset with the same column roles.
pretrain_data = ut.TabularDataset(
    data.iloc[pretrain_ind, :],
    id_cols=id_cols, cat_cols=None, output_cols=output_cols, ignore_cols=None,
)
pretest_data = ut.TabularDataset(
    data.iloc[pretest_ind, :],
    id_cols=id_cols, cat_cols=None, output_cols=output_cols, ignore_cols=None,
)
stream_data = ut.TabularDataset(
    data.iloc[stream_ind, :],
    id_cols=id_cols, cat_cols=None, output_cols=output_cols, ignore_cols=None,
)

nrow, ncol = data.shape
nrow_train, nrow_test, nrow_stream = len(pretrain_data), len(pretest_data), len(stream_data)

print(f"train: {nrow_train}, test: {nrow_test}, stream: {nrow_stream}")

train: 8333, test: 1667, stream: 100000


In [4]:

def build_model(dir_,id_,scaler_):
    """Load a saved deep model and chain it behind a preprocessing step.

    Args:
        dir_: Experiment directory (a Path) containing a ``models`` sub-directory.
        id_: Name of the model's sub-directory inside ``dir_/"models"``.
        scaler_: Preprocessing object that is wrapped in ``StreamWrapper``.

    Returns:
        The pipeline ``StreamWrapper(scaler_) | StreamDeep(model)``.
    """
    model_dir = dir_/"models"/id_
    # NOTE(review): torch.load unpickles the file, so only load trusted model files.
    deep_ = torch.load(model_dir/"_model")
    # Apply the state stored in the "_final" file to the loaded model object.
    deep_.load_state(model_dir/"_final")

    pipeline = StreamWrapper(scaler_)|StreamDeep(deep_)
    return pipeline


# Experiment directories that hold the saved models and their preprocessing state.
pls_model_dir = Path("D:/workspace/lazydeep/experiments/1.01/PLN7")
deep_model_dir = Path("D:/workspace/lazydeep/experiments/2.00/PLN7")

# Rebuild each preprocessing step from the "_final" state stored with its experiment.
pls_scaler = PLSRegression(n_components=34).from_state(
    PLSRegression(n_components=34).load_state(pls_model_dir/'preprocessing'/"_final")
)
deep_scaler = StandardScaler().from_state(
    StandardScaler().load_state(deep_model_dir/'preprocessing'/"_final")
)

# Candidate model ids per preprocessing variant, and the index of the one to use.
pls_nums = ["random_82","random_24","random_10","random_4","random_73"]
deep_nums = ["random_29","random_60","random_63","random_41","random_15"]
deep_num = 1
pls_num = 0



In [5]:
#setup evaluation
def setup_models(window_sizes=[1000,10000,20000],proportions =[0.01,0.05,0.1,0.5,0.8,1]):
    """Build the locally weighted regression pipelines to evaluate.

    For every window size and proportion, two pipelines are created: one on
    top of the standard-scaled deep model ("lwr_std_<ws>_<p>") and one on top
    of the PLS deep model ("lwr_pls_<ws>_<p>"). Each uses
    ``int(ws * p)`` neighbours and a window of ``ws``.

    Reads the module-level ``deep_model_dir``, ``deep_nums``, ``deep_num``,
    ``deep_scaler`` and their ``pls_`` counterparts.

    Returns:
        dict mapping pipeline name to pipeline object.
    """
    pipelines = {}

    for window in window_sizes:
        for fraction in proportions:
            neighbours = int(window*fraction)
            suffix = f"{window}_{fraction}"
            # build_model is called once per entry, so every pipeline gets its own model.
            pipelines[f"lwr_std_{suffix}"] = (
                build_model(deep_model_dir, deep_nums[deep_num], deep_scaler)
                | StreamLocalWeightedRegression(n_neighbors=neighbours, window_size=window, floor=True)
            )
            pipelines[f"lwr_pls_{suffix}"] = (
                build_model(pls_model_dir, pls_nums[pls_num], pls_scaler)
                | StreamLocalWeightedRegression(n_neighbors=neighbours, window_size=window, floor=True)
            )

    return pipelines


In [6]:
# Set up the models and, for each reported score, one metric tracker per model.
# The dict built below rebinds the name `metrics`, hiding the river `metrics`
# module imported at the top of the notebook. Importing the module under an
# alias here keeps this cell runnable when it is executed again afterwards.
from river import metrics as river_metrics

river_models = setup_models()
full_set = river_models.keys()
metrics = {'R2':{name:river_metrics.R2() for name in full_set},
           'R2_rolling': {name:RollingR2(window_size=1000) for name in full_set},
           'MSE':{name:river_metrics.MSE() for name in full_set},
           'MSE_rolling':{name:RollingMSE(window_size=1000) for name in full_set}
          }


In [7]:
# So far we have established that our metrics and scores are correct.
from tqdm.notebook import tqdm, trange

In [8]:
#take our pretrained models, now evaluate them on 

In [9]:
# Run prequential_evaluate over the pretraining split; its first two return
# values are discarded and the updated models and metrics are kept.
_, _, river_models, metrics = prequential_evaluate(
    pretrain_data,
    river_models,
    metrics,
    pretrain=len(pretrain_data),
    num_its=len(pretrain_data),
)

  0%|          | 0/8333 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Score the models on the held-out pretest split, one iteration per row of that split.
preds_test, scores_test, metrics = score_evaluate(
    pretest_data,
    river_models,
    metrics,
    num_its=len(pretest_data),
)

In [None]:
# Prequential evaluation on the stream split with no pretraining.
# NOTE(review): num_its=20000 is smaller than the 100000-row stream — presumably
# this caps how many rows are processed; confirm against prequential_evaluate.
preds_stream, scores_stream, river_models, metrics = prequential_evaluate(
    stream_data,
    river_models,
    metrics,
    pretrain=0,
    num_its=20000,
)

In [None]:
def zip_dict(dict1,dict2):
    """Concatenate the values of two dicts key by key.

    Args:
        dict1: Mapping whose keys define the keys of the result.
        dict2: Mapping that must contain every key of ``dict1``.

    Returns:
        A new dict with ``dict1[k] + dict2[k]`` for every key ``k`` of ``dict1``.

    Raises:
        KeyError: If a key of ``dict1`` is missing from ``dict2``.
    """
    dict12 = {k:dict1[k]+dict2[k] for k in dict1.keys()}
    # The original returned the undefined name `dict12b`, a NameError on every call.
    return dict12

def zip_nested_dict(dict1,dict2):
    """Concatenate the values of two two-level dicts, entry by entry.

    The outer and inner keys are taken from ``dict1``; ``dict2`` must provide
    the same keys. Returns a new nested dict whose leaves are
    ``dict1[outer][inner] + dict2[outer][inner]``.
    """
    return {
        outer: {
            inner: left + dict2[outer][inner]
            for inner, left in inner_map.items()
        }
        for outer, inner_map in dict1.items()
    }

In [None]:
# Join the pretest predictions with the stream predictions, per model.
preds = zip_dict(dict1=preds_test, dict2=preds_stream)

In [None]:
# Join the pretest scores with the stream scores, per metric and per model.
scores = zip_nested_dict(dict1=scores_test, dict2=scores_stream)

In [None]:
# Tabulate the combined predictions and store them in the output directory.
preds_df = pd.DataFrame(data=preds)
preds_df.to_csv(log_dir / "preds_df.csv")


In [None]:
#findings
#1) preprocessing works, random lr things for lr don't
#2) standardisation acts as regularisation

In [None]:
# Plot the R2 score of every model against the stream index and save the scores as CSV.
fig, ax = plt.subplots()

ax.set_xlabel("Stream Index")
ax.set_ylabel("R^2 Score")
ax.set_title("Streaming performance ")

scores_df = pd.DataFrame(scores['R2'])
scores_df.to_csv(log_dir/"r2_scores.csv")
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, Series) pairs on old and new pandas versions.
for model_name, model_scores in scores_df.items():
    ax.plot(model_scores.index,model_scores,'-',label = f"{model_name}")

# Dashed vertical line at len(pretest_data) — presumably where the pretest
# scores end and the stream scores begin.
ax.plot([len(pretest_data),len(pretest_data)],[-1,1],c="black",ls='--')
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
plt.savefig(log_dir / "r2_plot.png",bbox_inches='tight')
# Set after saving, so the saved r2_plot.png keeps the default y-range.
ax.set_ylim(0.4,0.8)
#plt.savefig(log_dir / f"r2_plot_v2.png",bbox_inches='tight')


In [None]:
# Plot the rolling R2 score of every model and save two versions of the figure:
# one with the default y-range and one clipped to [-1, 1].
fig, ax = plt.subplots()

ax.set_xlabel("Stream Index")
ax.set_ylabel("R^2 Score")
ax.set_title("Streaming performance (rolling average) ")

scores_df = pd.DataFrame(scores['R2_rolling'])
scores_df.to_csv(log_dir/"r2_scores_rolling.csv")
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, Series) pairs. The stray no-op `columnData` expression is dropped.
for model_name, model_scores in scores_df.items():
    ax.plot(model_scores.index,model_scores,label = f"{model_name}")
# Dashed vertical line at len(pretest_data) — presumably where the pretest
# scores end and the stream scores begin.
ax.plot([len(pretest_data),len(pretest_data)],[-1,1],c="black",ls='--')

ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
plt.savefig(log_dir / "r2_plot_rolling.png",bbox_inches='tight')
ax.set_ylim(-1,1)
plt.savefig(log_dir / "r2_plot_rolling_v2.png",bbox_inches='tight')

In [None]:
# Plot the MSE of every model and save two versions of the figure:
# one with the default y-range and one clipped to [0, 600].
fig, ax = plt.subplots()

ax.set_xlabel("Stream Index")
ax.set_ylabel("MSE")
ax.set_title("Streaming performance ")

scores_df = pd.DataFrame(scores['MSE'])
scores_df.to_csv(log_dir/"MSE.csv")
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, Series) pairs on old and new pandas versions.
for model_name, model_scores in scores_df.items():
    ax.plot(model_scores.index,model_scores,'-',label = f"{model_name}")
# Dashed vertical line at len(pretest_data) — presumably where the pretest
# scores end and the stream scores begin.
ax.plot([len(pretest_data),len(pretest_data)],[0,1000],c="black",ls='--')
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
plt.savefig(log_dir / "mse_plot.png",bbox_inches='tight')
ax.set_ylim(0,600)
plt.savefig(log_dir / "mse_plot_v2.png",bbox_inches='tight')

In [None]:
# Plot the rolling MSE of every model and save two versions of the figure:
# one with the default y-range and one clipped to [0, 1000].
fig, ax = plt.subplots()

ax.set_xlabel("Stream Index")
ax.set_ylabel("MSE")
ax.set_title("Streaming performance (rolling average) ")

scores_df = pd.DataFrame(scores['MSE_rolling'])
scores_df.to_csv(log_dir/"MSE_rolling.csv")
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, Series) pairs on old and new pandas versions.
for model_name, model_scores in scores_df.items():
    ax.plot(model_scores.index,model_scores,label = f"{model_name}")
# Dashed vertical line at len(pretest_data) — presumably where the pretest
# scores end and the stream scores begin.
ax.plot([len(pretest_data),len(pretest_data)],[0,1000],c="black",ls='--')
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
plt.savefig(log_dir / "mse_plot_rolling.png",bbox_inches='tight')
ax.set_ylim(0,1000)
plt.savefig(log_dir / "mse_plot_rolling_v2.png",bbox_inches='tight')

In [None]:
# Rebuild the rolling-R2 table (one column per model) for the per-group plots.
scores_df = pd.DataFrame(data=scores["R2_rolling"])

In [None]:
def take_subset_by_str(dataset,string):
    """Return the columns of ``dataset`` whose name contains ``string``.

    Args:
        dataset: DataFrame to select columns from.
        string: Substring that a column name must contain to be kept.

    Returns:
        A DataFrame with only the matching columns, in their original order.
    """
    matching = [col for col in dataset.columns.tolist() if string in col]
    # Index the frame that was passed in. The original indexed the global
    # `scores_df`, which ignored the argument and failed when that global was missing.
    return dataset[matching]
    

In [None]:
# One rolling-R2 figure per preprocessing variant and window size, so the
# proportions can be compared within each group.
pp_opts = ['pls_','std_']
ws_opts = ['1000_','10000_','20000_']

for pp_opt in pp_opts:
    subset1 = take_subset_by_str(scores_df,pp_opt)
    for ws_opt in ws_opts:
        subset2 = take_subset_by_str(subset1,ws_opt)

        fig, ax = plt.subplots()

        ax.set_xlabel("Stream Index")
        ax.set_ylabel("R2")
        ax.set_title("Streaming performance (rolling average) ")

        # DataFrame.iteritems() was removed in pandas 2.0; items() yields the
        # same (column name, Series) pairs on old and new pandas versions.
        for model_name, model_scores in subset2.items():
            # Strip the group identifiers from the column name for the legend label.
            ax.plot(model_scores.index,model_scores,label = f"{model_name.replace(pp_opt,'').replace(ws_opt,'')}")
        # Dashed vertical line at len(pretest_data) — presumably where the
        # pretest scores end and the stream scores begin.
        ax.plot([len(pretest_data),len(pretest_data)],[-1,1],c="black",ls='--')

        ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
        plt.savefig(log_dir / f"r2_plot_{pp_opt}{ws_opt}_.png",bbox_inches='tight')
        ax.set_ylim(-1,1)
        plt.savefig(log_dir / f"r2_plot_v2_{pp_opt}{ws_opt}_.png",bbox_inches='tight')