In [None]:
#imports
import logging
# set seed
import pandas as pd
import numpy as np
import utils as ut
import experiment as ex
from evaluation import *
from sklearn.metrics import mean_squared_error
from torch.utils.tensorboard import SummaryWriter
import torch
import random
#define fixed_hyperparams and create a config gen
from configurations import RandomConfigGen, Configuration
from torch import nn
from deep_net import RandomNet
from experiment import run_experiment
import regex as re
from pathlib import *
from sk_models import PLSRegression
import matplotlib.pyplot as plt
from river_models import StreamLocalWeightedRegression

from river import stream,linear_model,preprocessing, ensemble, metrics, optim
from river.neighbors import KNNRegressor 
from sklearn.metrics import mean_squared_error
from river.utils import dict2numpy, numpy2dict

# Seed every random number generator used in the notebook so runs are repeatable.
# Each library gets a different offset of the same base seed.
seed = 1
torch.manual_seed(seed)
random.seed(seed + 1)
np.random.seed(seed + 2)
# Dedicated RandomState handed to pandas/sklearn calls further down.
random_state = np.random.RandomState(seed)
import gc
# Release cached GPU memory and collect garbage left over from earlier runs in this kernel.
torch.cuda.empty_cache()
gc.collect()

# Querying the device name raises when no CUDA device is present, so check first.
if torch.cuda.is_available():
    print(f"GPU detected is {torch.cuda.get_device_name(0)}")
else:
    print("No GPU detected; running on CPU")

In [None]:
# Set up input and output directories.

# Set up input and output formats; the data itself is loaded in a later cell.

# Parameters to set for each run.
file_name = "PLN7.csv" #"mango_684_990.csv" #"mango_729_975.csv" #filtered=513-1050
id_cols =["db_id","sample_id"] #
output_cols = None
data_path = Path('D:/workspace/lazydeep/data/soil_data/')
log_path = Path("D:/workspace/lazydeep/experiments/5.02") #1.01/")
# parents=True creates missing intermediate directories; exist_ok=True makes the
# cell safe to re-run when the directory is already there.
log_path.mkdir(parents=True, exist_ok=True)

data_file = data_path / file_name
# One output directory per dataset: the file name with its ".csv" extension stripped.
log_dir = log_path / re.sub(r'\.(?=csv$)[^.]+$', '',file_name)
log_dir.mkdir(parents=True, exist_ok=True)
print(f"Output directory is {log_dir}")

In [None]:
def batch_predict(models,X):
    """
    Predict every row of X with every model, one row at a time.

    models: dict mapping a name to an object exposing predict_one(x_dict)
    X: indexable collection of rows; each row is converted with numpy2dict

    Returns a dict mapping each model name to its list of predictions, in row order.
    """
    preds = {name: [] for name in models.keys()}
    for i in range(0, len(X)):
        xi = numpy2dict(X[i])
        # Use the models passed in, not the notebook-level river_models global.
        for name, model in models.items():
            preds[name].append(model.predict_one(xi))
    return preds
        
def batch_score(models,X,y):
    """
    Score every model on (X, y) using one-at-a-time predictions.

    models: dict mapping a name to an object exposing predict_one(x_dict)
    X: indexable collection of rows; each row is converted with numpy2dict
    y: targets aligned with X

    Returns (scores, mse): two dicts keyed by model name, holding
    r2_score(y, preds) and mean_squared_error(y, preds) respectively.
    """
    preds = {name: [] for name in models.keys()}
    for i in range(0, len(X)):
        xi = numpy2dict(X[i])
        # Use the models passed in, not the notebook-level river_models global.
        for name, model in models.items():
            preds[name].append(model.predict_one(xi))

    scores = {name: r2_score(y, pred) for name, pred in preds.items()}
    mse = {name: mean_squared_error(y, pred) for name, pred in preds.items()}
    return scores, mse
        

def batch_learn(models,X,y):
    """
    Train every model on (X, y) one sample at a time.

    models: dict mapping a name to an object exposing learn_one(x_dict, y)
    X: indexable collection of rows; each row is converted with numpy2dict
    y: targets aligned with X

    Returns the same models dict that was passed in (the models are updated in place).
    """
    for i in range(0, len(X)):
        xi = numpy2dict(X[i])
        yi = y[i]
        # Use the models passed in, not the notebook-level river_models global.
        for model in models.values():
            model.learn_one(xi, yi)
    return models

In [None]:
# Read the table and shuffle all rows once with the shared RandomState.
data = pd.read_csv(data_file).sample(frac=1, random_state=random_state)

# Positions 0..9999 of the shuffled table form the pre-training pool, split in
# order (shuffle=False) with 5/6 of it going to the training part.
pre_ind = list(range(0, 10000))
pretrain_ind, pretest_ind = train_test_split(
    pre_ind,
    train_size=5/6,
    random_state=random_state,
    shuffle=False,
)
# Positions 10000..109999 are replayed as the stream.
stream_ind = list(range(10000, 110000))

# All three subsets are wrapped with the same column configuration.
dataset_kwargs = dict(id_cols=id_cols, cat_cols=None, output_cols=output_cols, ignore_cols=None)
pretrain_data = ut.TabularDataset(data.iloc[pretrain_ind, :], **dataset_kwargs)
pretest_data = ut.TabularDataset(data.iloc[pretest_ind, :], **dataset_kwargs)
stream_data = ut.TabularDataset(data.iloc[stream_ind, :], **dataset_kwargs)

nrow, ncol = data.shape
nrow_train, nrow_test, nrow_stream = len(pretrain_data), len(pretest_data), len(stream_data)

print(f"train: {nrow_train}, test: {nrow_test}, stream: {nrow_stream}")

In [None]:
#setup evaluation
from river.neighbors import KNNRegressor

def setup_models(n_neighbors=500, window_size=10000):
    """
    Build the streaming models compared in this notebook.

    n_neighbors, window_size: forwarded to StreamLocalWeightedRegression.

    Returns a dict keyed by model name:
      'lwr1' - StreamLocalWeightedRegression on the raw features
      'lwr2' - the same model behind a river StandardScaler
    """
    lwr1 = StreamLocalWeightedRegression(n_neighbors=n_neighbors, window_size=window_size)
    lwr2 = (preprocessing.StandardScaler() | StreamLocalWeightedRegression(n_neighbors=n_neighbors, window_size=window_size))
    # TODO: the exponential-histogram variants (lwr3/lwr4) referenced class names that are
    # not imported or defined in this notebook, so building them raised NameError.
    # Add them back once the class is available.

    return {
            'lwr1':lwr1,
            'lwr2':lwr2,

           }
            #,'lin1':lin1,
            #'lin2':lin2,
            #'lin3':lin3
            #'lwr1':(preprocessing.StandardScaler()|StreamLocalWeightedRegression(n_neighbors= 500, window_size=10000))
            #'lwr2':(preprocessing.StandardScaler()|StreamLocalWeightedRegression(n_neighbors= 800, window_size=10000)),
           # 'lwr3':(preprocessing.StandardScaler()|StreamLocalWeightedRegression(n_neighbors= 1000, window_size=10000))
           



In [None]:
#setup our metrics and stores of results
river_models = setup_models()
full_set = river_models.keys()

# The dict below is bound to the name `metrics`, which later cells rely on, but that
# hides the river `metrics` module after the first run. Building from a separate
# alias keeps this cell re-runnable.
from river import metrics as river_metrics

# One metric object per (metric kind, model name); the rolling variants use a 1000-sample window.
metrics = {'R2':{name:river_metrics.R2() for name in full_set},
           'R2_rolling':{name:river_metrics.Rolling(river_metrics.R2(), window_size=1000) for name in full_set},
           'MSE':{name:river_metrics.MSE() for name in full_set},
           'MSE_rolling':{name:river_metrics.Rolling(river_metrics.MSE(), window_size=1000) for name in full_set}
          }
if False:
    # Disabled: empty per-model score lists for the batch warm-up cells below.
    scores = {'R2':{name:[] for name in full_set},
              'MSE':{name:[] for name in full_set},
              'R2_rolling':{name:[] for name in full_set},
              'MSE_rolling':{name:[] for name in full_set}
             }

In [None]:
# Disabled (if False): batch warm-up of the models on the pre-training split.
if False:

    # Full pre-training split, plus the first 1000 stream rows (test_X/test_y are not used below).
    train_X,train_y = pretrain_data[:]
    test_X, test_y = stream_data[0:1000]

    # Fit every model on the pre-training rows, one sample at a time.
    batch_learn(river_models,train_X,train_y)

    # Score and predict on the same rows the models were just trained on.
    batch_r2,batch_mse = batch_score(river_models,train_X,train_y)
    preds = batch_predict(river_models,train_X)
    # Keep the targets next to the predictions under the 'y' key.
    preds['y'] = train_y.tolist()

In [None]:
if False:
    # Disabled: print each model's batch MSE, then replay its predictions
    # through every streaming metric registered for that model.
    for name in river_models:
        model_preds = preds[name]
        mse = mean_squared_error(preds['y'], model_preds)
        print(f" {name} :  {mse}")
        for i, pred in enumerate(model_preds):
            y = train_y[i]
            for metric_by_model in metrics.values():
                metric_by_model[name].update(y, pred)

In [None]:
if False:
    # Disabled: pad every score series with the batch score, one entry per
    # pre-training row, so the series line up with the training rows.
    for name in river_models.keys():
        scores['R2'][name] = scores['R2'][name] + [batch_r2[name]] * nrow_train
        scores['R2_rolling'][name] = scores['R2_rolling'][name] + [batch_r2[name]] * nrow_train
        # Was padded with nrow_test entries, which left the MSE series a different
        # length from the other three.
        scores['MSE'][name] = scores['MSE'][name] + [batch_mse[name]] * nrow_train
        scores['MSE_rolling'][name] = scores['MSE_rolling'][name] + [batch_mse[name]] * nrow_train

In [None]:
if False:
    # Disabled: print the current value of every metric, grouped by model.
    for name in river_models:
        for metric_by_model in metrics.values():
            print(metric_by_model[name])

In [None]:
#so far we have established that our metrics and scores are correct
from tqdm.notebook import tqdm, trange

In [None]:
def prequential_evaluate(dataset,models,metrics,pretrain=1000,num_its=2000):
    """
    Test-then-train evaluation over the first num_its samples of dataset.

    For each sample, every model first predicts and then learns from it.
    Predictions are recorded for every sample; metrics are only updated (and
    their values recorded) once the sample index reaches pretrain.

    dataset: sliceable object where dataset[0:n] returns (X, y)
    models: dict name -> object exposing predict_one(x) and learn_one(x, y)
    metrics: dict metric name -> dict model name -> object exposing
             update(y_true, y_pred) and get()
    pretrain: number of leading samples that are predicted and learned but not scored
    num_its: maximum number of samples to process

    Returns (preds, scores):
      preds: dict with one list of predictions per model name, plus the targets under 'y'
      scores: dict metric name -> dict model name -> list of metric values after each scored sample
    """
    X,y = dataset[0:num_its]
    # Do not step past the rows the slice actually returned.
    n_steps = min(num_its, len(X))

    preds = {name:[] for name in models.keys()}
    preds['y'] = []

    # Key everything off the models argument rather than notebook-level globals.
    scores = {metric_k:{name:[] for name in models.keys()} for metric_k in metrics.keys()}

    for i in tqdm(range(0,n_steps)):
        xi = numpy2dict(X[i])
        yi = y[i]

        preds['y'].append(yi)

        for name,model in models.items():
            # predict before learning, so the model has never seen this sample
            pred = model.predict_one(xi)
            preds[name].append(pred)

            # score only once the warm-up period is over
            if i >= pretrain:
                for metric_k,metric_v in metrics.items():
                    metric = metric_v[name]
                    # update and get are separate statements so this does not
                    # depend on update() returning the metric object
                    metric.update(yi, pred)
                    scores[metric_k][name].append(metric.get())

            #learn
            model.learn_one(xi,yi)

    return preds, scores

In [None]:
# Run the prequential evaluation on the stream split with the function's default
# pretrain and num_its values; keeps the per-sample predictions and metric scores.
preds, scores = prequential_evaluate(stream_data,river_models,metrics)

In [None]:
# One column per model plus the targets column 'y'; written to the run's output directory.
preds_df = pd.DataFrame(preds)
preds_df.to_csv(log_dir/"preds_df.csv")


In [None]:
#findings
#1) preprocessing works; the random lr tweaks for lr don't
#2) standardisation acts as regularisation

In [None]:
# Cumulative R^2 of every model over the scored part of the stream.
fig, ax = plt.subplots()

ax.set_xlabel("Stream Index")
ax.set_ylabel("R^2 Score")
ax.set_title("Streaming performance ")

scores_df = pd.DataFrame(scores['R2'])
scores_df.to_csv(log_dir/"r2_scores.csv")
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
for (columnName, columnData) in scores_df.items():
    ax.plot(columnData.index,columnData,'-',label = f"{columnName}")


# Legend is placed outside the axes, to the right of the plot.
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
fig.savefig(log_dir / "r2_plot.png",bbox_inches='tight')
# The y-limit for the second version is disabled, so both files currently hold the same figure.
#ax.set_ylim(0,1)
fig.savefig(log_dir / "r2_plot_v2.png",bbox_inches='tight')


In [None]:
# Rolling-window R^2 of every model over the scored part of the stream.
fig, ax = plt.subplots()

ax.set_xlabel("Stream Index")
ax.set_ylabel("R^2 Score")
ax.set_title("Streaming performance (rolling average) ")

scores_df = pd.DataFrame(scores['R2_rolling'])
scores_df.to_csv(log_dir/"r2_scores_rolling.csv")
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
for (columnName, columnData) in scores_df.items():
    ax.plot(columnData.index,columnData,label = f"{columnName}")


# Legend is placed outside the axes, to the right of the plot.
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
# Separate file names for the rolling figures: these saves used to overwrite
# the cumulative R^2 plots (r2_plot.png / r2_plot_v2.png).
fig.savefig(log_dir / "r2_rolling_plot.png",bbox_inches='tight')
# Second version with the y-axis limited to [-1, 1].
ax.set_ylim(-1,1)
fig.savefig(log_dir / "r2_rolling_plot_v2.png",bbox_inches='tight')

In [None]:
# Cumulative MSE of every model over the scored part of the stream.
fig, ax = plt.subplots()

ax.set_xlabel("Stream Index")
ax.set_ylabel("MSE")
ax.set_title("Streaming performance ")

scores_df = pd.DataFrame(scores['MSE'])
scores_df.to_csv(log_dir/"MSE.csv")
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
for (columnName, columnData) in scores_df.items():
    ax.plot(columnData.index,columnData,'-',label = f"{columnName}")

# Legend is placed outside the axes, to the right of the plot.
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
fig.savefig(log_dir / "mse_plot.png",bbox_inches='tight')

In [None]:
# Rolling-window MSE of every model over the scored part of the stream.
fig, ax = plt.subplots()

ax.set_xlabel("Stream Index")
ax.set_ylabel("MSE")
ax.set_title("Streaming performance (rolling average) ")

scores_df = pd.DataFrame(scores['MSE_rolling'])
scores_df.to_csv(log_dir/"MSE_rolling.csv")
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0.
for (columnName, columnData) in scores_df.items():
    ax.plot(columnData.index,columnData,label = f"{columnName}")

# Legend is placed outside the axes, to the right of the plot.
ax.legend(loc='upper right',bbox_to_anchor=(1.4, 1))
# Separate file names for the rolling figures: these saves used to overwrite
# mse_plot.png and, for the second version, the R^2 figure r2_plot_v2.png.
fig.savefig(log_dir / "mse_rolling_plot.png",bbox_inches='tight')
# Second version with the y-axis limited to [-1, 1000].
ax.set_ylim(-1,1000)
fig.savefig(log_dir / "mse_rolling_plot_v2.png",bbox_inches='tight')