In [1]:
#imports
import logging
# set seed
import pandas as pd
import numpy as np
import utils as ut
import experiment as ex
from evaluation import *
from sklearn.metrics import mean_squared_error
from torch.utils.tensorboard import SummaryWriter
import torch
import random
#define fixed_hyperparams and create a config gen
from configurations import RandomConfigGen, Configuration
from torch import nn
from deep_net import RandomNet
from experiment import run_experiment
import regex as re
from pathlib import *

from river_models import StreamLocalWeightedRegression

from river import stream, metrics
from river.neighbors import KNNRegressor 
from sklearn.metrics import mean_squared_error
from river.utils import dict2numpy
from sk_models import PLSRegression, StandardScaler,LocalWeightedRegression,PLSLWR,LinearRidge


# Seed torch, Python's `random` and NumPy (each with a different offset from `seed`)
# and build a dedicated RandomState that later cells pass around explicitly.
seed = 1
torch.manual_seed(seed)
random.seed(seed + 1)
np.random.seed(seed + 2)
random_state = np.random.RandomState(seed)
import gc
# Drop cached CUDA allocations and collect unreachable Python objects before starting.
torch.cuda.empty_cache()
gc.collect()

# get_device_name(0) raises when no CUDA device is present, so only query it
# when a GPU is actually available; otherwise report that and carry on.
if torch.cuda.is_available():
    print(f"GPU detected is {torch.cuda.get_device_name(0)}")
else:
    print("No GPU detected; running on CPU")

GPU detected is GeForce GTX 970


In [2]:
# Set up input and output directories.

# Set up input and output formats, load data.

# Parameters that need to be set for this run.
file_name = "PLN7.csv" #"mango_684_990.csv" #"mango_729_975.csv" #fitlered=513-1050
id_cols =["db_id","sample_id"] #
output_cols = None
data_path = Path('D:/workspace/lazydeep/data/soil_data/')
log_path = Path("D:/workspace/lazydeep/experiments/5.00") #1.01/")

data_file = data_path / file_name
# Strip a trailing ".csv" so the per-dataset output folder is named after the file.
log_dir = log_path / re.sub(r'\.(?=csv$)[^.]+$', '',file_name)
# parents=True also creates log_path and any missing ancestors;
# exist_ok=True makes re-running the cell a no-op instead of an error.
log_dir.mkdir(parents=True, exist_ok=True)
print(f"Output directory is {log_dir}")

Output directory is D:\workspace\lazydeep\experiments\5.00\PLN7


In [3]:
def batch_predict(models,X,y):
    """Collect one-at-a-time predictions from every model over a dataset.

    Parameters
    ----------
    models : dict
        Maps a name to an object exposing ``predict_one(x)``.
    X, y :
        Passed to ``stream.iter_pandas`` (presumably a pandas DataFrame and
        Series); the targets it yields are not used here.

    Returns
    -------
    dict
        ``{name: [prediction, ...]}`` with one prediction per streamed row,
        in stream order.
    """
    preds = {name:[] for name in models.keys()}
    for xi, _ in stream.iter_pandas(X,y):
        # Use the `models` argument rather than a notebook-level global so the
        # function works with whatever model dict the caller supplies.
        for name,model in models.items():
            preds[name].append(model.predict_one(xi))
    return preds
        
def batch_score(models,X,y):
    """Score every model on a dataset using one-at-a-time predictions.

    Parameters
    ----------
    models : dict
        Maps a name to an object exposing ``predict_one(x)``.
    X, y :
        Passed to ``stream.iter_pandas``; ``y`` is also used as the ground
        truth for the metrics.

    Returns
    -------
    tuple of (dict, dict)
        ``(scores, mse)`` where ``scores[name]`` is ``r2_score(y, preds)`` and
        ``mse[name]`` is ``mean_squared_error(y, preds)`` for that model.
    """
    preds = {name:[] for name in models.keys()}
    for xi, _ in stream.iter_pandas(X,y):
        # Use the `models` argument rather than a notebook-level global so the
        # function works with whatever model dict the caller supplies.
        for name,model in models.items():
            preds[name].append(model.predict_one(xi))

    scores = {name:r2_score(y,pred) for name,pred in preds.items()}
    mse = {name:mean_squared_error(y,pred) for name,pred in preds.items()}
    return scores, mse
        

def batch_learn(models,X,y):
    """Update every model incrementally with each (x, y) pair of a dataset.

    Parameters
    ----------
    models : dict
        Maps a name to an object exposing ``learn_one(x, y)``.
    X, y :
        Passed to ``stream.iter_pandas`` to obtain the (x, y) pairs.

    Returns
    -------
    dict
        The same ``models`` dict that was passed in, after ``learn_one`` has
        been called on each model for each pair.
    """
    for xi, yi in stream.iter_pandas(X,y):
        # Use the `models` argument rather than a notebook-level global so the
        # models that are returned are the ones that were actually trained.
        for model in models.values():
            model.learn_one(xi,yi)
    return models

In [4]:
def sk_predict_one(models,xi,yi):
    """Predict a single sample with every batch-style model.

    Parameters
    ----------
    models : dict
        Maps a name to an object exposing ``predict(X)``.
    xi : dict
        One sample as a feature dict; converted with ``dict2numpy`` and
        wrapped in a list so it is passed as a one-row batch.
    yi :
        Unused; kept so the signature stays the same for callers.

    Returns
    -------
    dict
        ``{name: prediction}`` where the prediction is element 0 of the
        model's output for the one-row batch.
    """
    # The converted sample is the same for every model, so build it once.
    row = [dict2numpy(xi)]
    preds = {}
    for name,model in models.items():
        # Call predict on the individual model, not on the `models` dict.
        preds[name] = model.predict(row)[0]

    return preds

def sk_predict(models,X,y):
    """Run ``predict(X)`` on each model and return the results as plain lists.

    ``y`` is accepted but not used. The result maps each model name to
    ``model.predict(X).tolist()``.
    """
    return {label: estimator.predict(X).tolist() for label, estimator in models.items()}
        
def sk_score(models,X,y):
    """Evaluate each model on (X, y) with batch predictions.

    Returns a pair of dicts keyed by model name: the first holds
    ``r2_score(y, prediction)``, the second ``mean_squared_error(y, prediction)``.
    """
    # Predict once per model, then derive both metrics from the stored outputs.
    outputs = {label: estimator.predict(X) for label, estimator in models.items()}

    scores = {}
    for label, predicted in outputs.items():
        scores[label] = r2_score(y, predicted)

    mse = {}
    for label, predicted in outputs.items():
        mse[label] = mean_squared_error(y, predicted)

    return scores, mse
        

def sk_learn(models,X,y):
    """Fit every model on the full (X, y) batch and hand back the same dict."""
    for estimator in models.values():
        estimator.fit(X, y)
    return models

In [5]:
# Read the dataset and shuffle all rows using the shared random state.
data = pd.read_csv(data_file)
data = data.sample(frac=1, random_state=random_state)

# The first 10000 shuffled rows form the pre-training pool; rows 10000-99999
# are reserved for the stream split.
pre_ind = list(range(0, 10000))
stream_ind = list(range(10000, 100000))
# shuffle=False splits the pool in order: the leading 5/6 for training, the rest for testing.
pretrain_ind, pretest_ind = train_test_split(
    pre_ind, train_size=5/6, random_state=random_state, shuffle=False
)

# Wrap each slice in a TabularDataset with identical column settings.
pretrain_data = ut.TabularDataset(
    data.iloc[pretrain_ind, :],
    id_cols=id_cols, cat_cols=None, output_cols=output_cols, ignore_cols=None,
)
pretest_data = ut.TabularDataset(
    data.iloc[pretest_ind, :],
    id_cols=id_cols, cat_cols=None, output_cols=output_cols, ignore_cols=None,
)
stream_data = ut.TabularDataset(
    data.iloc[stream_ind, :],
    id_cols=id_cols, cat_cols=None, output_cols=output_cols, ignore_cols=None,
)

# Record the sizes of the full frame and of each split for the summary line.
nrow, ncol = data.shape
nrow_train, nrow_test, nrow_stream = len(pretrain_data), len(pretest_data), len(stream_data)

print(f"train: {nrow_train}, test: {nrow_test}, stream: {nrow_stream}")

train: 8333, test: 1667, stream: 90000


In [9]:
# Debug cell: read the saved preprocessing state file and decode it with jsonpickle.
# NOTE(review): `deep_model_dir` is only assigned in a later cell (In [6]) and
# `jsonpickle` is not imported in the import cell, so this cell relies on
# out-of-order execution / leftover kernel state and fails on a fresh kernel.
# NOTE(review): the decoded object is not kept; the recorded run of this cell
# ended with a TypeError raised from __setstate__.
with open(deep_model_dir/'preprocessing'/f"_final",'r') as file:
    text = file.read()
    jsonpickle.decode(text)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



TypeError: __setstate__() argument 1 must be sequence of length 4, not 5

In [6]:
#setup evaluation

def setup_models():
    """Build a fresh dict of baseline regressors keyed by short name.

    Keys, in insertion order: 'lr', 'plsr', 'lwr', 'pls_lwr'.
    """
    baselines = {}
    baselines['lr'] = LinearRidge()
    baselines['plsr'] = PLSRegression(n_components=25)
    baselines['lwr'] = LocalWeightedRegression(n_neighbours=800, normalize=True)
    baselines['pls_lwr'] = PLSLWR(n_components=25, n_neighbours=300)
    return baselines

#pls-deep - random_82, => random 24, random_10
#pls-deep lwr - random_82 - lwr_k=1000 -

#deep random 29 => random 60 => random 63
#deep-lwr - random_13 - lwr_k=800

# --- PLS + deep model, loaded from experiment 1.01 (model "random_82") ---
# NOTE(review): torch.load unpickles the file; only load model files you trust.
deep_model_dir = Path("D:/workspace/lazydeep/experiments/1.01/PLN7")
pls_deep_model = torch.load(deep_model_dir/"models"/"random_82"/"_model")
pls_deep_model.load_state(deep_model_dir/"models"/"random_82"/"_final")
# Rebuild the PLS preprocessing step from the state saved alongside the experiment.
# NOTE(review): the recorded run of this cell ended with a TypeError from
# __setstate__; it is not clear from the output which load call raised it — TODO confirm.
pls_scaler = PLSRegression(n_components=34).from_state(PLSRegression(n_components=34).load_state(deep_model_dir/'preprocessing'/f"_final"))       
                          
# LWR to use with the PLS-deep model: 1000 neighbours, normalisation on.
pls_deep_lwr = LocalWeightedRegression(n_neighbours=1000,normalize=True)
                          
# --- Plain deep model, loaded from experiment 2.00 (model "random_29") ---
# NOTE: `deep_model_dir` is reassigned here, so after this cell it points at the
# 2.00 experiment; anything that reads it later sees this second path.
deep_model_dir = Path("D:/workspace/lazydeep/experiments/2.00/PLN7")
deep_model = torch.load(deep_model_dir/"models"/"random_29"/"_model")
deep_model.load_state(deep_model_dir/"models"/"random_29"/"_final")
# Rebuild the StandardScaler preprocessing step from its saved state.
deep_scaler = StandardScaler().from_state(StandardScaler().load_state(deep_model_dir/'preprocessing'/f"_final"))                
                      
# LWR to use with the plain deep model: 1000 neighbours, normalisation off.
deep_lwr = LocalWeightedRegression(n_neighbours=1000,normalize=False)

#we have learnt - don't standardise lwr after deep
#we can view features after pls

{'n_components': 34, 'deflation_mode': 'regression', 'mode': 'A', 'scale': True, 'algorithm': 'nipals', 'max_iter': 500, 'tol': 1e-06, 'copy': True}
{'n_components': 34, 'deflation_mode': 'regression', 'mode': 'A', 'scale': True, 'algorithm': 'nipals', 'max_iter': 500, 'tol': 1e-06, 'copy': True}


TypeError: __setstate__() argument 1 must be sequence of length 4, not 5