In [1]:
import logging
# set seed
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
import utils as ut
import experiment as ex
from evaluation import *
from sk_models import CustomWrapper
from sklearn.metrics import mean_squared_error
from torch.utils.tensorboard import SummaryWriter
import torch
import random
#define fixed_hyperparams and create a config gen
from configurations import RandomConfigGen, Configuration
from torch import nn
from deep_net import RandomNet
from experiment import run_experiment
import regex as re
from pathlib import *

# Seed every random number generator this notebook uses. The three global
# generators get distinct seeds (seed, seed + 1, seed + 2); `random_state` is a
# separate NumPy generator that is passed explicitly to project helpers.
seed = 1
torch.manual_seed(seed)
random.seed(seed + 1)
np.random.seed(seed + 2)
random_state = np.random.RandomState(seed)

# Drop cached CUDA allocations and collect unreachable Python objects before
# the experiment starts.
import gc
torch.cuda.empty_cache()
gc.collect()

# get_device_name(0) raises when no CUDA device is present, which would abort
# the very first cell on a CPU-only machine, so only query it when CUDA is usable.
if torch.cuda.is_available():
    print(f"GPU detected is {torch.cuda.get_device_name(0)}")
else:
    print("No GPU detected; running on CPU")

GPU detected is GeForce GTX 970


In [2]:
# Set up input and output locations, then make sure the log directories exist.
# NOTE(review): the paths below are absolute and machine-specific.

file_name = "A_AL_RT.csv" # "PLN7.csv"
id_cols =[]#["sample_id"]

data_path = Path('D:/workspace/lazydeep/data/soil_data/')
log_path = Path("D:/workspace/lazydeep/experiments/1.01") #1.01/")
# parents=True creates missing intermediate directories; exist_ok=True makes the
# call a no-op when the directory is already there, so the cell can be re-run.
log_path.mkdir(parents=True, exist_ok=True)

data_file = data_path / file_name
# The per-dataset log directory is named after the file with its ".csv" suffix removed.
log_dir = log_path / re.sub(r'\.(?=csv$)[^.]+$', '',file_name)
log_dir.mkdir(parents=True, exist_ok=True)
print(f"Output directory is {log_dir}")

Output directory is D:\workspace\lazydeep\experiments\1.01\A_AL_RT


In [3]:
# Read the CSV and shuffle its rows (frac=1 keeps every row).
data = pd.read_csv(data_file).sample(frac=1)

# Shape bookkeeping is taken from the shuffled frame, before ut.sample_data runs.
# n_features is the column count minus one and minus the id columns
# (presumably the "one" is the target column — confirm).
nrow, ncol = data.shape
n_features = ncol - 1 - len(id_cols)

data = ut.sample_data(data, random_state)

dataset = TabularDataset(
    data,
    id_cols=id_cols,
    cat_cols=None,
    output_cols=None,
    ignore_cols=None,
)

# NOTE(review): `eval` shadows the Python builtin of the same name; later cells
# rely on this name, so it is kept as-is here.
eval = CrossValEvaluation(
    preprocessing=None,
    tensorboard=None,
    time=True,
    random_state=random_state,
)
print(f"Dataset shape is {data.shape}")

Dataset shape is (1438, 1702)




In [4]:
from collections import defaultdict

# Register the named loggers used below ("log", "test_log", "summary"), each
# with its own file under log_dir, plus a TensorBoard writer.
ut.setup_logger(logger_name="log", file_name=log_dir/"log.txt")
ut.setup_logger(logger_name="test_log", file_name=log_dir/"test_log.txt")
ut.setup_logger(logger_name="summary", file_name=log_dir/"summary.txt")
tb = SummaryWriter(log_dir/"tb")
summary_logger = logging.getLogger("summary")

# Step 1: evaluate PLS regressors with 1..100 components and keep the
# component count that scores the lowest MSE.
n_comps = list(range(1, 101))
pls_models = {
    k: CustomWrapper(PLSRegression(n_components=k))
    for k in n_comps
}

pls_scheme = SKLearnScheme(logger="log")
(
    scores_pls,
    preds_pls,
    model_states_pls,
    train_time_pls,
    test_time_pls,
) = eval.evaluate(pls_models, dataset, pls_scheme, logger_name="log")

summary_logger.info(f"Train times: {train_time_pls}")
summary_logger.info(f"Test times: {test_time_pls}")
summary_logger.info(f"Scores: {scores_pls}")
for key, value in flip_dicts(scores_pls).items():
    summary_logger.info(f"{key}: {value}")

selected_comps = min(scores_pls["MSE"], key=scores_pls["MSE"].get)
summary_logger.info(f"Selected pls preprocessing with {selected_comps} components")

# Replace the evaluator with one that applies PLS preprocessing using the
# selected number of components.
eval = CrossValEvaluation(
    preprocessing=Preprocess_PLS(n_components=selected_comps),
    tensorboard=None,
    time=True,
    random_state=random_state,
)

Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 718 - Val 240 - Test 240-----------------------------------'
Finished training SKLearn with a train loss of 1:280.4753,2:155.1537,3:110.6075,4:86.7992,5:67.4852,6:57.3901,7:52.0549,8:46.7996,9:46.0455,10:44.4836,11:42.2159,12:40.9681,13:39.2844,14:37.4114,15:36.3399,16:34.6181,17:34.0436,18:33.3473,19:32.059,20:31.6785,21:30.5849,22:30.1173,23:29.2096,24:28.29,25:27.8088,26:27.4381,27:27.0151,28:26.6836,29:26.0971,30:25.3753,31:25.1276,32:24.5871,33:24.2003,34:23.8514,35:23.541,36:23.1484,37:22.6938,38:22.1608,39:21.7682,40:21.3909,41:20.9326,42:20.5528,43:20.3091,44:19.9852,45:19.6581,46:19.3644,47:19.0281,48:18.7686,49:18.5219,50:18.3216,51:18.0052,52:17.6862,53:17.5097,54:17.263,55:16.9711,56:16.7658,57:16.5221,58:16.3184,59:16.0678,60:15.8015,61:15.6393,62:15.3913,63:15.1271,64:14.884,65:14.7138,66:14.5242,67:14.32,68:14.0954,69:13.8822,70:13.7865,71:13.629,72:13.4658,73:13.2582,74:13.1048,75:1



### Training the deep learners
The following cells set up our models and run a train-test evaluation.

In [5]:
# Settings shared by every sampled network.
n_models = 100
epochs = 100
bs = 32
fixed_hyperparams = dict(bs=bs, loss=nn.MSELoss(), epochs=epochs)
device = "cpu" #torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Generator for random model configurations. The network input size is the
# number of PLS components selected earlier.
config_gen = RandomConfigGen(
    lr=(0,1),
    allow_increase_size=False,
    n_features=selected_comps,
    opt=[
        torch.optim.SGD,
        torch.optim.Adam,
    ],
    lr_update=[
        None,
        torch.optim.lr_scheduler.ReduceLROnPlateau,
        torch.optim.lr_scheduler.ExponentialLR,
        torch.optim.lr_scheduler.CosineAnnealingLR,
    ],
    dropout=[True,False],
    batch_norm=[True,False],
)

# Draw n_models configurations, keyed "random_0", "random_1", ...
configs = {
    f"random_{idx}": config_gen.sample()
    for idx in range(n_models)
}
config_gen.save(log_dir/'config_gen.txt')

# One network per sampled configuration, in the same order as `configs`.
deep_models = {
    model_name: RandomNet(
        input_size=selected_comps,
        n_layers=cfg.n_layers,
        act_function=cfg.act_function,
        n_features=cfg.n_features,
        dropout=cfg.dropout,
        batch_norm=cfg.batch_norm,
        device=device,
        dtype=torch.float,
    )
    for model_name, cfg in configs.items()
}

ex.write_summary_head(seed, fixed_hyperparams)
ex.save_models(deep_models, configs, log_dir)

# NOTE(review): `datetime` is not imported in this notebook's import cell;
# presumably it arrives through one of the star imports — confirm.
start = datetime.datetime.now()
deep_scheme = DeepScheme(
    configs,
    fixed_hyperparams=fixed_hyperparams,
    logger="log",
    device=device,
    adaptive_lr=True,
)
(
    scores_deep,
    preds_deep,
    model_states_deep,
    train_time_deep,
    test_time_deep,
) = eval.evaluate(deep_models, dataset, deep_scheme, logger_name="log")



Starting Experiment'
Seed: 1'
bs: 32'
epochs: 100'
--------------------'
Running Cross Evaluation with 5 folds'
-----------------------------------Fold 0 - Train 718 - Val 240 - Test 240-----------------------------------'
Training extractors on 718 instances, validating on 240 instances, for 100 epochs'

--- EPOCH 0---'
Extractor Train Losses are random_0:nan(-0.091585),random_1:nan(-0.606802),random_2:1505.4774(-0.901208),random_3:nan(-0.56051),random_4:nan(-0.510167),random_5:772.2684(-0.980977),random_6:828.7778(-0.924832),random_7:nan(-0.940597),random_8:2797.7557(-0.73093),random_9:8424441863.4945(-0.907654),random_10:nan(-0.682478),random_11:1210.8557(-0.897968),random_12:1007.4779(-0.082903),random_13:nan(-0.813354),random_14:nan(-0.15142),random_15:308385.6618(-0.488761),random_16:2158.3577(-0.862446),random_17:nan(-0.447822),random_18:408.3297(-0.259096),random_19:432.9679(-0.033556),random_20:nan(-0.232474),random_21:2460.8316(-0.684561),random_22:nan(-0.111175),random_23:na

ValueError: operands could not be broadcast together with shapes (1438,1701) (22,) (1438,1701) 

In [None]:
# Run the evaluator's build step for the deep models, logging to "test_log".
# The returned scores are used later as the test-set scores.
(
    scores_final,
    preds_final,
    model_states_ls_final,
    train_time_deep_final,
    test_time_deep_final,
) = eval.build(
    deep_models,
    dataset,
    deep_scheme,
    logger_name="test_log",
)

In [None]:
from collections import defaultdict

# Persist the cross-validation model states, predictions, configs and scores.
ex.save_results(
    model_states_deep,
    preds_deep,
    configs,
    scores_deep,
    log_dir,
    tb,
    prefix="",
)

# Store each final model's state under <log_dir>/models/<model name>/_final.
for model, state_dict in model_states_ls_final.items():
    torch.save(
        state_dict.state(),
        log_dir.joinpath("models", f"{model}", "_final"),
    )

summary_logger.info(f"Train times: {train_time_deep}")
summary_logger.info(f"Test times: {test_time_deep}")

#summary_logger.info(f"Scores: {scores_deep}")
#for key,value in flip_dicts(scores_deep).items():
#    summary_logger.info(f"{key}: {value}")

# Wall-clock time elapsed since `start` was recorded before deep training began.
end = datetime.datetime.now()
diff = end - start
ex.write_summary(diff, deep_models, scores_deep, prefix="")
ex.save_pred_plots(preds_deep, deep_models, log_dir)


In [None]:
# Tabulate the cross-validation scores. scores_deep is indexed as
# scores_deep[metric][model_name] elsewhere in this notebook, so the frame has
# one column per metric and the model names as its index.
scores_df = pd.DataFrame(scores_deep)

# Write the index too: with index=False the model names were dropped and the
# CSV rows could no longer be attributed to a model.
scores_df.to_csv(log_dir / "scores.csv", index=True, index_label="model")


### Plotting deep results as a function of number of features

In [None]:
from matplotlib import pyplot as plt


def _plot_r2_histogram(scores, out_file, title="Distribution of R2 Scores", bins=100):
    """Draw a density-normalised histogram of R2 scores and save it.

    Args:
        scores: sequence of R2 values, one per model.
        out_file: path the figure is written to.
        title: figure title.
        bins: number of histogram bins.

    Returns:
        The (figure, axes) pair that was drawn.
    """
    fig, ax = plt.subplots()
    ax.hist(scores, bins=bins, density=True)
    # The histogram is over R2 values, so R2 belongs on the x-axis; with
    # density=True the y-axis is a density, not a model count.
    ax.set_xlabel("R2")
    ax.set_ylabel("Density")
    ax.set_title(title)
    fig.savefig(out_file, bbox_inches='tight')
    return fig, ax


# One row per model: name, cross-validation R2 and the configured n_features.
n_features_dict = {name: config.n_features for name, config in configs.items()}
to_plot = pd.DataFrame(
    [[name, score, n_features_dict[name]] for name, score in scores_deep["R2"].items()],
    columns=["name", "score", "n_features"],
)

# Full distribution, including models with negative R2.
fig, ax = _plot_r2_histogram(to_plot["score"], log_dir / "dist_plot.png")

# "Compressed" view: only models with a non-negative R2.
to_plot = to_plot[to_plot["score"] >= 0]
fig, ax = _plot_r2_histogram(to_plot["score"], log_dir / "dist_plot_compressed.png")

In [None]:
# Log the five models with the lowest cross-validation MSE next to the MSE
# recorded for the same model in scores_final.
print(scores_deep['MSE'])
summary_logger.info("------------------\n Top 5 performance on Test Set")
# Four columns, matching the four values logged per row below.
summary_logger.info("Index - Model - Val Score - Test Score")
# Slice to the first five entries rather than walking every model and skipping the rest.
for i, key in enumerate(sorted(scores_deep['MSE'], key=scores_deep['MSE'].get)[:5]):
    summary_logger.info(f"{i} - {key} - {scores_deep['MSE'][key]} - {scores_final['MSE'][key]}")