In [None]:

from logging import root
from models.SupervisedAutoEncoder import create_hidden_rep
from operator import mod
import os
import random
import re
from typing import List, Tuple
import dotenv
import datatable as dt
from dotenv.main import load_dotenv
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from torch.utils.data import Dataset, Subset, BatchSampler, SequentialSampler, DataLoader


# from lightning_nn import Classifier


class FinData(Dataset):
    """Map-style dataset over pre-processed feature, target and era columns.

    Items are dicts of tensors with the keys ``'target'``, ``'data'`` and
    ``'era'`` (plus ``'hidden'`` when a hidden representation was supplied).
    ``index`` may be a single position, a list of positions (a whole batch
    at once) or a tensor holding either.

    Parameters
    ----------
    data
        Feature matrix; indexed as ``data[index]`` on the default path and
        as ``data.iloc[index]`` when ``transform`` is set.
    target, era
        Per-row values.  List indexing calls ``.values`` on the selection,
        so these are presumably pandas Series -- TODO confirm with callers.
    hidden
        Optional mapping whose ``'hidden'`` entry is indexed per sample.
    mode, cache_dir
        Stored on the instance; not used by this class itself.
    transform
        Optional callable; when set, ``__getitem__`` returns only
        ``transform(row_values)``.
    """

    def __init__(self, data, target, era, hidden=None, mode='train', transform=None, cache_dir=None):
        self.data = data
        self.target = target
        self.mode = mode
        self.transform = transform
        self.cache_dir = cache_dir
        self.era = era
        self.hidden = hidden

    def __getitem__(self, index):
        if torch.is_tensor(index):
            # Tensors expose ``tolist()`` (there is no ``to_list()``), and the
            # converted value must be kept for the indexing below.
            index = index.tolist()
        if self.transform:
            return self.transform(self.data.iloc[index].values)
        if isinstance(index, list):
            # A list of positions selects a whole batch in one call.
            sample = {
                'target': torch.Tensor(self.target[index].values),
                'data':   torch.LongTensor(self.data[index]),
                'era':    torch.Tensor(self.era[index].values),
            }
        else:
            # A single position is wrapped so the tensors keep a leading axis.
            sample = {
                'target': torch.Tensor([self.target[index]]),
                'data':   torch.LongTensor([self.data[index]]),
                'era':    torch.Tensor([self.era[index]]),
            }
        if self.hidden is not None:
            sample['hidden'] = torch.Tensor(self.hidden['hidden'][index])
        return sample

    def __len__(self):
        return len(self.data)


def get_data_path(root_dir):
    """Return the dataset directory for the configured tournament round.

    Loads the dotenv file ``num_config.env``, reads ``LATEST_ROUND`` from the
    environment and appends ``/numerai_dataset_<round>`` to ``root_dir``.
    When ``LATEST_ROUND`` is unset the suffix contains the text ``None``.
    """
    load_dotenv(dotenv_path='num_config.env')
    latest_round = os.getenv('LATEST_ROUND')
    return root_dir + f'/numerai_dataset_{latest_round}'


def load_data(root_dir, mode, overide=None):
    """Read a Numerai CSV into a pandas DataFrame.

    Parameters
    ----------
    root_dir
        Directory containing the ``numerai_dataset_<round>`` folder.
    mode
        ``'train'`` for the training file or ``'test'`` for the tournament
        file.  Ignored when ``overide`` is given.
    overide
        Explicit file path to read instead of the round's standard files.

    Raises
    ------
    ValueError
        If no ``overide`` is given and ``mode`` is not ``'train'``/``'test'``
        (previously this surfaced as an unbound-variable error).
    """
    if overide:
        return dt.fread(overide).to_pandas()
    file_names = {
        'train': 'numerai_training_data.csv',
        'test': 'numerai_tournament_data.csv',
    }
    if mode not in file_names:
        raise ValueError(
            f"mode must be 'train' or 'test' when no overide is given, got {mode!r}")
    data_path = get_data_path(root_dir=root_dir)
    return dt.fread(data_path + '/' + file_names[mode]).to_pandas()


def preprocess_data(data: pd.DataFrame, scale: bool = False, nn: bool = False, test=False, ordinal=False):
    """
    Split a raw frame into features, target and era.

    Parameters
    ----------
    data
        Pandas DataFrame with an ``era`` column, a ``target`` column and
        feature columns whose names contain ``'feature'``.
    scale
        fit a ``StandardScaler`` on the features and transform them
    nn
        return the features as np.array
    test
        keep ``era`` as strings (lower-case letters stripped) instead of
        casting it to int
    ordinal
        fit an ``OrdinalEncoder`` on the features and transform them

    Returns
    -------
    data, target, features, era
        The (optionally transformed) feature values, the ``target`` column,
        the list of feature column names and the cleaned ``era`` column.
    """
    features = [col for col in data.columns if 'feature' in col]
    # Strip lower-case letters, e.g. 'era12' -> '12'.
    era = data['era'].transform(lambda x: re.sub(r'[a-z]', '', x))
    if not test:
        era = era.astype('int')
    target = data['target']
    data = data[features]
    if scale:
        data = StandardScaler().fit_transform(data)
    if ordinal:
        data = OrdinalEncoder().fit_transform(data)
    # After scale/ordinal the features are normally already an array with no
    # ``.values`` attribute, so only unwrap when a DataFrame is still held.
    if nn and isinstance(data, pd.DataFrame):
        data = data.values
    return data, target, features, era


def calc_data_mean(array, cache_dir=None, fold=None, train=True, mode='mean'):
    """Replace NaNs in ``array`` with a per-column statistic or zero.

    Parameters
    ----------
    array
        Numeric array; statistics are taken along axis 0.
    cache_dir
        Directory where the fitted statistic is saved (``train=True``) or
        loaded from (``train=False``).
    fold
        Optional fold label.  When truthy, the cache file is named
        ``f_<fold>_<mode>.npy``; a falsy fold (``None`` or ``0``) uses
        ``f_<mode>.npy``.
    train
        Compute the statistic from ``array`` (True) or load it (False).
    mode
        ``'mean'``, ``'median'`` or ``'zero'``.  Any other value returns
        ``array`` unchanged.

    Returns
    -------
    The array with NaNs filled.  Non-NaN entries go through
    ``np.nan_to_num``, as before.

    Notes
    -----
    The median used to be saved as ``f_median.npy`` but loaded from
    ``f_med.npy``, so loading always failed.  Loading now prefers the
    fold-specific file, then the plain file, then the legacy ``f_med.npy``.
    """
    if mode == 'zero':
        fill = 0
    elif mode in ('mean', 'median'):
        plain_name = f'f_{mode}.npy'
        fold_name = f'f_{fold}_{mode}.npy' if fold else None
        if train:
            if mode == 'mean':
                fill = np.nanmean(array, axis=0)
            else:
                fill = np.nanmedian(array, axis=0)
            if cache_dir:
                np.save(f'{cache_dir}/{fold_name or plain_name}', fill)
        else:
            names = [fold_name, plain_name] if fold_name else [plain_name]
            if mode == 'median':
                names.append('f_med.npy')
            default_path = f'{cache_dir}/{plain_name}'
            candidates = (f'{cache_dir}/{name}' for name in names)
            # When nothing is cached, np.load on the default path raises the
            # same FileNotFoundError callers saw before.
            chosen = next((p for p in candidates if os.path.exists(p)), default_path)
            fill = np.load(chosen)
    else:
        return array
    return np.nan_to_num(array) + np.isnan(array) * fill


def weighted_mean(scores, sizes):
    """Average ``scores`` with each weight being its size over the largest size."""
    relative_sizes = np.asarray(sizes) / np.max(sizes)
    return np.average(scores, weights=relative_sizes)


def create_dataloaders(dataset: Dataset, indexes: dict, batch_size):
    """Build one DataLoader per non-empty split in ``indexes``.

    ``indexes`` may hold ``'train'``, ``'val'`` and ``'test'`` index lists;
    missing or empty splits are left out of the result.  Each split's indices
    are chunked in order by a ``BatchSampler`` that is passed as the loader's
    ``sampler``, so the dataset receives a whole list of indices per item.
    With the loader's own default batching this presumably adds a leading
    axis of size 1 to every tensor, which consumers in this file undo with
    ``.view(x.size(1), -1)`` -- TODO confirm.
    """
    loaders = {}
    for split in ('train', 'val', 'test'):
        split_idx = indexes.get(split, None)
        if not split_idx:
            continue
        chunked = BatchSampler(
            split_idx, batch_size=batch_size, drop_last=False)
        loaders[split] = DataLoader(
            dataset, sampler=chunked, num_workers=2, pin_memory=False, shuffle=False)
    return loaders


def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and ask cuDNN to be deterministic.

    Also exports ``PYTHONHASHSEED`` into ``os.environ``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed,
               torch.manual_seed, torch.cuda.manual_seed)
    for seeder in seeders:
        seeder(seed)
    torch.backends.cudnn.deterministic = True


def load_model(path, p, pl_lightning, model):
    """Load one model from a file, or every model found in a directory.

    Parameters
    ----------
    path
        Checkpoint file, or a directory whose entries are all checkpoints.
    p
        Parameter dict passed to the model class as ``params``.
    pl_lightning
        When truthy, restore through ``model.load_from_checkpoint``;
        otherwise instantiate ``model(params=p)`` and load a state dict read
        with ``torch.load``.
    model
        The model class to instantiate.

    Returns
    -------
    A single model for a file path, or a list of models for a directory.
    The list follows sorted file names so its order is reproducible
    (``os.listdir`` order is arbitrary).

    Raises
    ------
    FileNotFoundError
        If ``path`` is neither a file nor a directory (previously ``None``
        was returned silently).
    """
    def _load_one(checkpoint):
        if pl_lightning:
            return model.load_from_checkpoint(
                checkpoint_path=checkpoint, params=p)
        instance = model(params=p)
        instance.load_state_dict(torch.load(checkpoint))
        return instance

    if os.path.isdir(path):
        return [_load_one(os.path.join(path, name))
                for name in sorted(os.listdir(path))]
    if os.path.isfile(path):
        return _load_one(path)
    raise FileNotFoundError(f'No model file or directory at {path!r}')


def init_weights(m, func):
    """Xavier-normal initialise the weight of a plain ``nn.Linear`` module.

    ``func`` names the nonlinearity whose gain scales the initialisation.
    Modules of any other exact type are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    gain = nn.init.calculate_gain(func)
    nn.init.xavier_normal_(m.weight, gain)


def create_predictions(root_dir: str = './data', models: dict = None, hidden=True, ae=True):
    """Run the supplied models on every per-era test file and save the results.

    Parameters
    ----------
    root_dir
        Data root handed to ``check_test_files``, ``load_data`` and
        ``get_data_path``.
    models
        Mapping from model key (``'ae'``, ``'ResNet'``, ``'xgboost'``,
        ``'lgb'``) to a ``(model, params)`` pair.  Missing keys are skipped.
    hidden
        Unused by this function.
    ae
        The auto-encoder hidden representation is computed only when this is
        falsy (see the review note in the body).

    One CSV per test file is written to ``<data_path>/predictions/`` and
    named after the era of the file's first row.
    """
    models = {} if models is None else models
    test_files_path, _ = check_test_files(root_dir)
    pred_dir = f'{get_data_path(root_dir)}/predictions'
    # The output directory is not created anywhere else in this function.
    os.makedirs(pred_dir, exist_ok=True)
    for file in sorted(os.listdir(test_files_path)):
        df = load_data(root_dir=root_dir, mode='test',
                       overide=f'{test_files_path}/{file}')
        # Placeholder column so preprocess_data can select 'target'.
        df['target'] = 0
        data, target, features, era = preprocess_data(data=df, ordinal=True)
        t_idx = list(range(len(era)))
        data_dict = {'data': data, 'target': target,
                     'features': features, 'era': era}
        if models.get('ae', None):
            p_ae = models['ae'][1]
            p_ae['input_size'] = len(features)
            p_ae['output_size'] = 1
            model = models['ae'][0]
            model.eval()
        # NOTE(review): this runs only when ``ae`` is falsy and needs ``model``
        # from the branch above; the condition may be inverted -- confirm.
        if not ae:
            hidden_pred = create_hidden_rep(
                model=model, data_dict=data_dict)
            data_dict['hidden_true'] = True
            df['prediction_ae'] = hidden_pred['preds']
        if models.get('ResNet', None):
            p_res = models['ResNet'][1]
            p_res['input_size'] = len(features)
            p_res['output_size'] = 1
            # NOTE(review): nothing above stores a 'hidden' entry in data_dict,
            # so this lookup raises KeyError as written -- confirm intent.
            p_res['hidden_len'] = data_dict['hidden'].shape[-1]
            dataset = FinData(
                data=data_dict['data'], target=data_dict['target'], era=data_dict['era'], hidden=data_dict.get('hidden', None))
            dataloaders = create_dataloaders(
                dataset, indexes={'train': t_idx}, batch_size=p_res['batch_size'])
            model = models['ResNet'][0]
            model.eval()
            predictions = []
            # Pure inference: no autograd graph is needed.
            with torch.no_grad():
                for batch in dataloaders['train']:
                    pred = model(batch['data'].view(
                        batch['data'].shape[1], -1), hidden=batch['hidden'].view(batch['hidden'].shape[1], -1))
                    predictions.extend(pred.cpu().detach().numpy().tolist())
            df['prediction_resnet'] = np.array(predictions)
        if models.get('xgboost', None):
            model_xgboost = models['xgboost'][0]
            df['prediction_xgb'] = model_xgboost.predict(data_dict['data'])
        if models.get('lgb', None):
            model_lgb = models['lgb'][0]
            df['prediction_lgb'] = model_lgb.predict(data_dict['data'])
            # Only the id and LightGBM prediction are kept from here on.
            df = df[['id', 'prediction_lgb']]
        df.to_csv(f'{pred_dir}/{era.iloc[0]}.csv')


def check_test_files(root_dir='./data'):
    """Ensure the tournament data has been split into one CSV per era.

    Returns ``(test_files_path, True)``.  When the ``test_files`` directory
    is missing or empty, the tournament file is loaded, era ``'eraX'`` is
    renamed to ``'era999'`` and one ``<era>.csv`` is written per era value.
    """
    data_path = get_data_path(root_dir)
    test_files_path = f'{data_path}/test_files'
    # TODO Check to make sure all era's present
    # An empty directory (e.g. left by an interrupted run) is not ready.
    if os.path.isdir(test_files_path) and os.listdir(test_files_path):
        return test_files_path, True
    os.makedirs(test_files_path, exist_ok=True)
    df = load_data(root_dir=root_dir, mode='test')
    # Single .loc assignment: chained ``df['era'][mask] = ...`` may write to a
    # temporary copy rather than to ``df``.
    df.loc[df['era'] == 'eraX', 'era'] = 'era999'
    for era in df['era'].unique():
        path = f'{test_files_path}/{era}'
        df[df['era'] == era].to_csv(f'{path}.csv')
    return test_files_path, True


def create_prediction_file(root_dir='./data', eras=None):
    """Concatenate the per-era prediction CSVs into ``predictions.csv``.

    Files in ``<data_path>/predictions/`` are read in sorted name order,
    skipping ``predictions.csv`` itself.  When ``eras`` is truthy, only file
    names contained in it are read.  The ``id`` and ``prediction_lgb``
    columns are kept, renamed to ``id``/``prediction``, written out and
    returned.
    """
    pred_path = f'{get_data_path(root_dir)}/predictions/'
    files = sorted(os.listdir(pred_path))
    wanted = eras if eras else None
    frames = []
    for file in files:
        if file == 'predictions.csv':
            continue
        if wanted is not None and file not in wanted:
            continue
        frames.append(pd.read_csv(f'{pred_path}{file}'))
    df = pd.concat(frames)
    df = df[['id', 'prediction_lgb']]
    df.columns = ['id', 'prediction']
    df.to_csv(f'{pred_path}predictions.csv')

    return df

In [None]:
import torch
import joblib
import torch.nn as nn
import numpy as np
import pytorch_lightning as pl
from sklearn.metrics import mean_squared_error, mean_absolute_error
from data_loading import utils
from pytorch_lightning.callbacks.early_stopping import EarlyStopping


class SupAE(pl.LightningModule):
    """Supervised auto-encoder over embedded categorical features.

    Each of the ``params['input_size']`` input columns gets its own embedding
    table (5 categories each).  The concatenated embeddings are encoded to a
    hidden vector of size ``params['hidden']``, which feeds both a regressor
    head and a decoder that reconstructs the embeddings.

    ``params`` must provide: ``lr``, ``loss_recon`` and ``loss_sup_ae`` (loss
    classes, instantiated here), ``recon_loss_factor``, ``activation`` (a
    module class), ``dropout``, ``input_size``, ``output_size``, ``dim_1``,
    ``dim_2``, ``dim_3`` and ``hidden``.
    """

    def __init__(self, params):
        super().__init__()
        self.lr = params['lr']
        self.loss_recon = params['loss_recon']()
        self.recon_loss_factor = params['recon_loss_factor']
        self.loss_sup_ae = params['loss_sup_ae']()
        self.activation = params['activation']
        self.drop = params['dropout']
        # Every input column is treated as a categorical with 5 levels.
        cat_dims = [5 for i in range(params['input_size'])]
        emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]
        self.embedding_layers = nn.ModuleList(
            [nn.Embedding(x, y) for x, y in emb_dims]).to(self.device)
        self.num_embeddings = sum([y for x, y in emb_dims])
        # Layer order inside each Sequential is kept as-is so that existing
        # state dicts keep loading.
        self.encoder = nn.Sequential(
            nn.Linear(self.num_embeddings, params['dim_1']),
            nn.BatchNorm1d(params['dim_1']),
            self.activation(),
            nn.Dropout(self.drop),
            nn.Linear(params['dim_1'], params['dim_2']),
            nn.BatchNorm1d(params['dim_2']),
            self.activation(),
            nn.Dropout(self.drop),
            nn.Linear(params['dim_2'], params['dim_3']),
            nn.BatchNorm1d(params['dim_3']),
            self.activation(),
            nn.Dropout(self.drop),
            nn.Linear(params['dim_3'], params['hidden'])
        )
        self.regressor = nn.Sequential(
            nn.BatchNorm1d(params['hidden']),
            self.activation(),
            nn.Dropout(self.drop),
            nn.Linear(params['hidden'], params['output_size'])
        )
        self.decoder = nn.Sequential(
            nn.Linear(params['hidden'], params['dim_3']),
            nn.BatchNorm1d(params['dim_3']),
            self.activation(),
            nn.Dropout(self.drop),
            nn.Linear(params['dim_3'], params['dim_2']),
            nn.BatchNorm1d(params['dim_2']),
            self.activation(),
            nn.Dropout(self.drop),
            nn.Linear(params['dim_2'], params['dim_1']),
            nn.BatchNorm1d(params['dim_1']),
            self.activation(),
            nn.Dropout(self.drop),
            nn.Linear(params['dim_1'], self.num_embeddings)
        )

    def forward(self, x):
        """Return ``(embeddings, hidden, regression_output, reconstruction)``.

        Column ``i`` of ``x`` is looked up in embedding table ``i``.
        """
        x = [emb_lay(x[:, i])
             for i, emb_lay in enumerate(self.embedding_layers)]
        emb = torch.cat(x, 1)
        hidden = self.encoder(emb)
        reg_out = self.regressor(hidden)
        decoder_out = self.decoder(hidden)
        return emb, hidden, reg_out, decoder_out

    def _shared_step(self, batch):
        """Compute ``(total, supervised, reconstruction)`` losses for a batch."""
        x, y = batch['data'], batch['target']
        # Assumes the loader yields tensors with a leading axis of size 1
        # (x reshaped by its second axis, y transposed) -- TODO confirm.
        x = x.view(x.size(1), -1)
        y = y.T
        emb, _, reg_out, decoder_out = self(x)
        sup_loss = self.loss_sup_ae(reg_out, y)
        # Stack the per-row losses so they stay in the autograd graph; building
        # a new tensor from them with torch.tensor(...) detached the
        # reconstruction term, so it never influenced training.
        # NOTE(review): the target ``emb`` is itself trainable; consider
        # ``emb.detach()`` if only the decoder should chase it.
        recon_loss = torch.stack(
            [self.loss_recon(out_row, emb_row)
             for out_row, emb_row in zip(decoder_out, emb)]).mean()
        loss = sup_loss + self.recon_loss_factor * recon_loss
        return loss, sup_loss, recon_loss

    def training_step(self, batch, batch_idx):
        loss, sup_loss, recon_loss = self._shared_step(batch)
        self.log('sup_loss', sup_loss, on_step=False,
                 on_epoch=True, prog_bar=True)
        self.log('recon_loss', recon_loss, on_step=False,
                 on_epoch=True, prog_bar=True)
        self.log('train_loss', loss, prog_bar=True)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        loss, sup_loss, _ = self._shared_step(batch)
        return {'val_loss': loss, 'val_sup_loss': sup_loss}

    def validation_epoch_end(self, outputs) -> None:
        """Log the mean validation losses over the epoch's step outputs."""
        epoch_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        epoch_sup_loss = torch.stack(
            [x['val_sup_loss'] for x in outputs]).mean()
        self.log('val_loss', epoch_loss, prog_bar=True)
        self.log('val_sup_loss', epoch_sup_loss, prog_bar=True)

    def configure_optimizers(self):
        """Adam with a plateau scheduler that monitors ``val_loss``."""
        optimizer = torch.optim.Adam(
            self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=5, factor=0.1, min_lr=1e-7, eps=1e-08
        )
        return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'val_loss'}


def train_ae_model(data_dict):
    """Train a ``SupAE`` on ``data_dict`` with a fixed row split and save it.

    ``data_dict`` must provide ``'data'``, ``'target'``, ``'era'`` and
    ``'features'``.  The hyper-parameters are dumped to
    ``./saved_models/parameters/ae_params.pkl`` (before ``input_size`` and
    ``output_size`` are added) and the trained weights are written to
    ``./saved_models/trained/trained_ae.pth``.  Returns the trained model.
    """
    # TODO Dynamic
    p = {'dim_1': 675, 'dim_2': 400, 'dim_3': 224, 'hidden': 162,
         'activation': nn.ReLU, 'dropout': 0.2916447561918717, 'lr': 0.030272591341587315,
         'recon_loss_factor': 0.4447516076774931, 'batch_size': 1252, 'loss_sup_ae': nn.MSELoss,
         'loss_recon': nn.MSELoss,
         'embedding': True}
    # TODO Fix this
    joblib.dump(p, './saved_models/parameters/ae_params.pkl')
    # ``np.int`` no longer exists in current NumPy; ``range`` produces the
    # same lists of Python ints without it.
    train_idx = list(range(0, 452205))
    # NOTE(review): validation starts at 452206, so row 452205 lands in
    # neither split -- confirm whether the gap is intended.
    val_idx = list(range(452206, len(data_dict['data'])))
    p['input_size'] = len(data_dict['features'])
    p['output_size'] = 1
    dataset = utils.FinData(
        data=data_dict['data'], target=data_dict['target'], era=data_dict['era'])
    dataloaders = utils.create_dataloaders(dataset=dataset, indexes={
                                           'train': train_idx, 'val': val_idx}, batch_size=p['batch_size'])
    model = SupAE(p)
    es = EarlyStopping(monitor='val_loss', patience=10,
                       min_delta=0.005, mode='min')
    trainer = pl.Trainer(max_epochs=100,
                         gpus=0,
                         callbacks=[es])
    trainer.fit(
        model, train_dataloader=dataloaders['train'], val_dataloaders=dataloaders['val'])
    torch.save(model.state_dict(), './saved_models/trained/trained_ae.pth')
    return model


def create_hidden_rep(model, data_dict):
    """Run ``model`` over all rows and collect hidden vectors and predictions.

    ``data_dict`` must provide ``'data'`` (with a ``shape``), ``'target'`` and
    ``'era'``.  Rows are processed in order in batches of 5000.

    Returns
    -------
    dict
        ``'hidden'`` and ``'preds'``: float64 arrays with one row per input
        row, built from the model's second and third outputs.
    """
    model.eval()
    # range() gives exact integer positions; np.linspace cast to int relies
    # on float rounding.
    index = list(range(data_dict['data'].shape[0]))
    dataset = utils.FinData(
        data_dict['data'], target=data_dict['target'], era=data_dict['era'])
    batch_size = 5000
    dataloaders = utils.create_dataloaders(
        dataset, {'train': index}, batch_size=batch_size)
    hiddens = []
    predictions = []
    # Inference only: skip autograd bookkeeping for every batch.
    with torch.no_grad():
        for batch in dataloaders['train']:
            _, hidden, pred, _ = model(batch['data'].view(
                batch['data'].size(1), -1))
            hiddens.append(hidden.cpu().numpy())
            predictions.append(pred.cpu().numpy())
    # The previous list-of-lists conversion produced float64 arrays; keep that.
    hiddens = np.concatenate(hiddens).astype(np.float64)
    preds = np.concatenate(predictions).astype(np.float64)
    return {'hidden': hiddens, 'preds': preds}

In [None]:
from logging import root
from models.SupervisedAutoEncoder import create_hidden_rep
from operator import mod
import os
import random
import re
from typing import List, Tuple
import dotenv
import datatable as dt
from dotenv.main import load_dotenv
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from torch.utils.data import Dataset, Subset, BatchSampler, SequentialSampler, DataLoader


# from lightning_nn import Classifier


class FinData(Dataset):
    """Map-style dataset over pre-processed feature, target and era columns.

    Items are dicts of tensors with the keys ``'target'``, ``'data'`` and
    ``'era'`` (plus ``'hidden'`` when a hidden representation was supplied).
    ``index`` may be a single position, a list of positions (a whole batch
    at once) or a tensor holding either.

    Parameters
    ----------
    data
        Feature matrix; indexed as ``data[index]`` on the default path and
        as ``data.iloc[index]`` when ``transform`` is set.
    target, era
        Per-row values.  List indexing calls ``.values`` on the selection,
        so these are presumably pandas Series -- TODO confirm with callers.
    hidden
        Optional mapping whose ``'hidden'`` entry is indexed per sample.
    mode, cache_dir
        Stored on the instance; not used by this class itself.
    transform
        Optional callable; when set, ``__getitem__`` returns only
        ``transform(row_values)``.
    """

    def __init__(self, data, target, era, hidden=None, mode='train', transform=None, cache_dir=None):
        self.data = data
        self.target = target
        self.mode = mode
        self.transform = transform
        self.cache_dir = cache_dir
        self.era = era
        self.hidden = hidden

    def __getitem__(self, index):
        if torch.is_tensor(index):
            # Tensors expose ``tolist()`` (there is no ``to_list()``), and the
            # converted value must be kept for the indexing below.
            index = index.tolist()
        if self.transform:
            return self.transform(self.data.iloc[index].values)
        if isinstance(index, list):
            # A list of positions selects a whole batch in one call.
            sample = {
                'target': torch.Tensor(self.target[index].values),
                'data':   torch.LongTensor(self.data[index]),
                'era':    torch.Tensor(self.era[index].values),
            }
        else:
            # A single position is wrapped so the tensors keep a leading axis.
            sample = {
                'target': torch.Tensor([self.target[index]]),
                'data':   torch.LongTensor([self.data[index]]),
                'era':    torch.Tensor([self.era[index]]),
            }
        if self.hidden is not None:
            sample['hidden'] = torch.Tensor(self.hidden['hidden'][index])
        return sample

    def __len__(self):
        return len(self.data)


def get_data_path(root_dir):
    """Return the dataset directory for the configured tournament round.

    Loads the dotenv file ``num_config.env``, reads ``LATEST_ROUND`` from the
    environment and appends ``/numerai_dataset_<round>`` to ``root_dir``.
    When ``LATEST_ROUND`` is unset the suffix contains the text ``None``.
    """
    load_dotenv(dotenv_path='num_config.env')
    latest_round = os.getenv('LATEST_ROUND')
    return root_dir + f'/numerai_dataset_{latest_round}'


def load_data(root_dir, mode, overide=None):
    """Read a Numerai CSV into a pandas DataFrame.

    Parameters
    ----------
    root_dir
        Directory containing the ``numerai_dataset_<round>`` folder.
    mode
        ``'train'`` for the training file or ``'test'`` for the tournament
        file.  Ignored when ``overide`` is given.
    overide
        Explicit file path to read instead of the round's standard files.

    Raises
    ------
    ValueError
        If no ``overide`` is given and ``mode`` is not ``'train'``/``'test'``
        (previously this surfaced as an unbound-variable error).
    """
    if overide:
        return dt.fread(overide).to_pandas()
    file_names = {
        'train': 'numerai_training_data.csv',
        'test': 'numerai_tournament_data.csv',
    }
    if mode not in file_names:
        raise ValueError(
            f"mode must be 'train' or 'test' when no overide is given, got {mode!r}")
    data_path = get_data_path(root_dir=root_dir)
    return dt.fread(data_path + '/' + file_names[mode]).to_pandas()


def preprocess_data(data: pd.DataFrame, scale: bool = False, nn: bool = False, test=False, ordinal=False):
    """
    Split a raw frame into features, target and era.

    Parameters
    ----------
    data
        Pandas DataFrame with an ``era`` column, a ``target`` column and
        feature columns whose names contain ``'feature'``.
    scale
        fit a ``StandardScaler`` on the features and transform them
    nn
        return the features as np.array
    test
        keep ``era`` as strings (lower-case letters stripped) instead of
        casting it to int
    ordinal
        fit an ``OrdinalEncoder`` on the features and transform them

    Returns
    -------
    data, target, features, era
        The (optionally transformed) feature values, the ``target`` column,
        the list of feature column names and the cleaned ``era`` column.
    """
    features = [col for col in data.columns if 'feature' in col]
    # Strip lower-case letters, e.g. 'era12' -> '12'.
    era = data['era'].transform(lambda x: re.sub(r'[a-z]', '', x))
    if not test:
        era = era.astype('int')
    target = data['target']
    data = data[features]
    if scale:
        data = StandardScaler().fit_transform(data)
    if ordinal:
        data = OrdinalEncoder().fit_transform(data)
    # After scale/ordinal the features are normally already an array with no
    # ``.values`` attribute, so only unwrap when a DataFrame is still held.
    if nn and isinstance(data, pd.DataFrame):
        data = data.values
    return data, target, features, era


def calc_data_mean(array, cache_dir=None, fold=None, train=True, mode='mean'):
    """Replace NaNs in ``array`` with a per-column statistic or zero.

    Parameters
    ----------
    array
        Numeric array; statistics are taken along axis 0.
    cache_dir
        Directory where the fitted statistic is saved (``train=True``) or
        loaded from (``train=False``).
    fold
        Optional fold label.  When truthy, the cache file is named
        ``f_<fold>_<mode>.npy``; a falsy fold (``None`` or ``0``) uses
        ``f_<mode>.npy``.
    train
        Compute the statistic from ``array`` (True) or load it (False).
    mode
        ``'mean'``, ``'median'`` or ``'zero'``.  Any other value returns
        ``array`` unchanged.

    Returns
    -------
    The array with NaNs filled.  Non-NaN entries go through
    ``np.nan_to_num``, as before.

    Notes
    -----
    The median used to be saved as ``f_median.npy`` but loaded from
    ``f_med.npy``, so loading always failed.  Loading now prefers the
    fold-specific file, then the plain file, then the legacy ``f_med.npy``.
    """
    if mode == 'zero':
        fill = 0
    elif mode in ('mean', 'median'):
        plain_name = f'f_{mode}.npy'
        fold_name = f'f_{fold}_{mode}.npy' if fold else None
        if train:
            if mode == 'mean':
                fill = np.nanmean(array, axis=0)
            else:
                fill = np.nanmedian(array, axis=0)
            if cache_dir:
                np.save(f'{cache_dir}/{fold_name or plain_name}', fill)
        else:
            names = [fold_name, plain_name] if fold_name else [plain_name]
            if mode == 'median':
                names.append('f_med.npy')
            default_path = f'{cache_dir}/{plain_name}'
            candidates = (f'{cache_dir}/{name}' for name in names)
            # When nothing is cached, np.load on the default path raises the
            # same FileNotFoundError callers saw before.
            chosen = next((p for p in candidates if os.path.exists(p)), default_path)
            fill = np.load(chosen)
    else:
        return array
    return np.nan_to_num(array) + np.isnan(array) * fill


def weighted_mean(scores, sizes):
    """Average ``scores`` with each weight being its size over the largest size."""
    relative_sizes = np.asarray(sizes) / np.max(sizes)
    return np.average(scores, weights=relative_sizes)


def create_dataloaders(dataset: Dataset, indexes: dict, batch_size):
    """Build one DataLoader per non-empty split in ``indexes``.

    ``indexes`` may hold ``'train'``, ``'val'`` and ``'test'`` index lists;
    missing or empty splits are left out of the result.  Each split's indices
    are chunked in order by a ``BatchSampler`` that is passed as the loader's
    ``sampler``, so the dataset receives a whole list of indices per item.
    With the loader's own default batching this presumably adds a leading
    axis of size 1 to every tensor, which consumers in this file undo with
    ``.view(x.size(1), -1)`` -- TODO confirm.
    """
    loaders = {}
    for split in ('train', 'val', 'test'):
        split_idx = indexes.get(split, None)
        if not split_idx:
            continue
        chunked = BatchSampler(
            split_idx, batch_size=batch_size, drop_last=False)
        loaders[split] = DataLoader(
            dataset, sampler=chunked, num_workers=2, pin_memory=False, shuffle=False)
    return loaders


def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and ask cuDNN to be deterministic.

    Also exports ``PYTHONHASHSEED`` into ``os.environ``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed,
               torch.manual_seed, torch.cuda.manual_seed)
    for seeder in seeders:
        seeder(seed)
    torch.backends.cudnn.deterministic = True


def load_model(path, p, pl_lightning, model):
    """Load one model from a file, or every model found in a directory.

    Parameters
    ----------
    path
        Checkpoint file, or a directory whose entries are all checkpoints.
    p
        Parameter dict passed to the model class as ``params``.
    pl_lightning
        When truthy, restore through ``model.load_from_checkpoint``;
        otherwise instantiate ``model(params=p)`` and load a state dict read
        with ``torch.load``.
    model
        The model class to instantiate.

    Returns
    -------
    A single model for a file path, or a list of models for a directory.
    The list follows sorted file names so its order is reproducible
    (``os.listdir`` order is arbitrary).

    Raises
    ------
    FileNotFoundError
        If ``path`` is neither a file nor a directory (previously ``None``
        was returned silently).
    """
    def _load_one(checkpoint):
        if pl_lightning:
            return model.load_from_checkpoint(
                checkpoint_path=checkpoint, params=p)
        instance = model(params=p)
        instance.load_state_dict(torch.load(checkpoint))
        return instance

    if os.path.isdir(path):
        return [_load_one(os.path.join(path, name))
                for name in sorted(os.listdir(path))]
    if os.path.isfile(path):
        return _load_one(path)
    raise FileNotFoundError(f'No model file or directory at {path!r}')


def init_weights(m, func):
    """Xavier-normal initialise the weight of a plain ``nn.Linear`` module.

    ``func`` names the nonlinearity whose gain scales the initialisation.
    Modules of any other exact type are left untouched.
    """
    if type(m) is not nn.Linear:
        return
    gain = nn.init.calculate_gain(func)
    nn.init.xavier_normal_(m.weight, gain)


def create_predictions(root_dir: str = './data', models: dict = None, hidden=True, ae=True):
    """Run the supplied models on every per-era test file and save the results.

    Parameters
    ----------
    root_dir
        Data root handed to ``check_test_files``, ``load_data`` and
        ``get_data_path``.
    models
        Mapping from model key (``'ae'``, ``'ResNet'``, ``'xgboost'``,
        ``'lgb'``) to a ``(model, params)`` pair.  Missing keys are skipped.
    hidden
        Unused by this function.
    ae
        The auto-encoder hidden representation is computed only when this is
        falsy (see the review note in the body).

    One CSV per test file is written to ``<data_path>/predictions/`` and
    named after the era of the file's first row.
    """
    models = {} if models is None else models
    test_files_path, _ = check_test_files(root_dir)
    pred_dir = f'{get_data_path(root_dir)}/predictions'
    # The output directory is not created anywhere else in this function.
    os.makedirs(pred_dir, exist_ok=True)
    for file in sorted(os.listdir(test_files_path)):
        df = load_data(root_dir=root_dir, mode='test',
                       overide=f'{test_files_path}/{file}')
        # Placeholder column so preprocess_data can select 'target'.
        df['target'] = 0
        data, target, features, era = preprocess_data(data=df, ordinal=True)
        t_idx = list(range(len(era)))
        data_dict = {'data': data, 'target': target,
                     'features': features, 'era': era}
        if models.get('ae', None):
            p_ae = models['ae'][1]
            p_ae['input_size'] = len(features)
            p_ae['output_size'] = 1
            model = models['ae'][0]
            model.eval()
        # NOTE(review): this runs only when ``ae`` is falsy and needs ``model``
        # from the branch above; the condition may be inverted -- confirm.
        if not ae:
            hidden_pred = create_hidden_rep(
                model=model, data_dict=data_dict)
            data_dict['hidden_true'] = True
            df['prediction_ae'] = hidden_pred['preds']
        if models.get('ResNet', None):
            p_res = models['ResNet'][1]
            p_res['input_size'] = len(features)
            p_res['output_size'] = 1
            # NOTE(review): nothing above stores a 'hidden' entry in data_dict,
            # so this lookup raises KeyError as written -- confirm intent.
            p_res['hidden_len'] = data_dict['hidden'].shape[-1]
            dataset = FinData(
                data=data_dict['data'], target=data_dict['target'], era=data_dict['era'], hidden=data_dict.get('hidden', None))
            dataloaders = create_dataloaders(
                dataset, indexes={'train': t_idx}, batch_size=p_res['batch_size'])
            model = models['ResNet'][0]
            model.eval()
            predictions = []
            # Pure inference: no autograd graph is needed.
            with torch.no_grad():
                for batch in dataloaders['train']:
                    pred = model(batch['data'].view(
                        batch['data'].shape[1], -1), hidden=batch['hidden'].view(batch['hidden'].shape[1], -1))
                    predictions.extend(pred.cpu().detach().numpy().tolist())
            df['prediction_resnet'] = np.array(predictions)
        if models.get('xgboost', None):
            model_xgboost = models['xgboost'][0]
            df['prediction_xgb'] = model_xgboost.predict(data_dict['data'])
        if models.get('lgb', None):
            model_lgb = models['lgb'][0]
            df['prediction_lgb'] = model_lgb.predict(data_dict['data'])
            # Only the id and LightGBM prediction are kept from here on.
            df = df[['id', 'prediction_lgb']]
        df.to_csv(f'{pred_dir}/{era.iloc[0]}.csv')


def check_test_files(root_dir='./data'):
    """Ensure the tournament data has been split into one CSV per era.

    Returns ``(test_files_path, True)``.  When the ``test_files`` directory
    is missing or empty, the tournament file is loaded, era ``'eraX'`` is
    renamed to ``'era999'`` and one ``<era>.csv`` is written per era value.
    """
    data_path = get_data_path(root_dir)
    test_files_path = f'{data_path}/test_files'
    # TODO Check to make sure all era's present
    # An empty directory (e.g. left by an interrupted run) is not ready.
    if os.path.isdir(test_files_path) and os.listdir(test_files_path):
        return test_files_path, True
    os.makedirs(test_files_path, exist_ok=True)
    df = load_data(root_dir=root_dir, mode='test')
    # Single .loc assignment: chained ``df['era'][mask] = ...`` may write to a
    # temporary copy rather than to ``df``.
    df.loc[df['era'] == 'eraX', 'era'] = 'era999'
    for era in df['era'].unique():
        path = f'{test_files_path}/{era}'
        df[df['era'] == era].to_csv(f'{path}.csv')
    return test_files_path, True


def create_prediction_file(root_dir='./data', eras=None):
    """Concatenate the per-era prediction CSVs into ``predictions.csv``.

    Files in ``<data_path>/predictions/`` are read in sorted name order,
    skipping ``predictions.csv`` itself.  When ``eras`` is truthy, only file
    names contained in it are read.  The ``id`` and ``prediction_lgb``
    columns are kept, renamed to ``id``/``prediction``, written out and
    returned.
    """
    pred_path = f'{get_data_path(root_dir)}/predictions/'
    files = sorted(os.listdir(pred_path))
    wanted = eras if eras else None
    frames = []
    for file in files:
        if file == 'predictions.csv':
            continue
        if wanted is not None and file not in wanted:
            continue
        frames.append(pd.read_csv(f'{pred_path}{file}'))
    df = pd.concat(frames)
    df = df[['id', 'prediction_lgb']]
    df.columns = ['id', 'prediction']
    df.to_csv(f'{pred_path}predictions.csv')

    return df

In [None]:
import torch
import copy
import os
import numpy as np
import pytorch_lightning as pl
import torch.nn as nn
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from data_loading.purged_group_time_series import PurgedGroupTimeSeriesSplit
from data_loading.utils import load_data, preprocess_data, FinData, create_dataloaders, calc_data_mean, init_weights


class ResNet(pl.LightningModule):
    """Five-block dense network whose blocks are chained by concatenation.

    Every block is ``Linear -> BatchNorm1d -> activation -> Dropout``.
    Block 1 reads the network input concatenated with the output of block 0;
    each later block reads the outputs of the two preceding blocks
    concatenated; the output layer reads the outputs of blocks 3 and 4.

    Args:
        input_size: Number of input columns.
        output_size: Width of the final linear layer.
        params: Hyper-parameter mapping.  Required keys: ``dim_1`` ..
            ``dim_5``, ``dropout``, ``lr``, ``activation`` and ``loss`` (the
            last two are callables instantiated here without arguments).
            Optional keys: ``embedding`` (truthy to pass every input column
            through its own ``nn.Embedding``) and ``hidden_len`` (width of an
            additional ``hidden`` input; only used together with
            ``embedding``).
    """

    def __init__(self, input_size, output_size, params):
        super(ResNet, self).__init__()
        dim_1 = params['dim_1']
        dim_2 = params['dim_2']
        dim_3 = params['dim_3']
        dim_4 = params['dim_4']
        dim_5 = params['dim_5']
        self.drop_prob = params['dropout']
        self.drop = nn.Dropout(self.drop_prob)
        self.lr = params['lr']
        self.activation = params['activation']()
        self.input_size = input_size
        self.output_size = output_size
        self.loss = params['loss']()
        # Both keys are optional; a missing key means "feature disabled".
        use_embedding = params.get('embedding')
        hidden_len = params.get('hidden_len', None)
        if use_embedding:
            # Every input column is treated as a categorical with 5 levels,
            # which yields an embedding width of 3 per column.
            cat_dims = [5 for i in range(input_size)]
            emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]
            self.embedding_layers = nn.ModuleList(
                [nn.Embedding(x, y) for x, y in emb_dims]).to(self.device)
            self.num_embeddings = sum([y for x, y in emb_dims])
            if hidden_len:
                # The first block sees embeddings plus the extra hidden input.
                self.input_size = self.num_embeddings + hidden_len
                self.d0 = nn.Linear(self.input_size, dim_1)
                self.d1 = nn.Linear(dim_1 + self.input_size, dim_2)
            else:
                self.d0 = nn.Linear(self.num_embeddings, dim_1)
                self.d1 = nn.Linear(dim_1 + self.num_embeddings, dim_2)

        else:
            self.d0 = nn.Linear(self.input_size, dim_1)
            self.d1 = nn.Linear(dim_1+self.input_size, dim_2)

        self.d2 = nn.Linear(dim_2 + dim_1, dim_3)
        self.d3 = nn.Linear(dim_3 + dim_2, dim_4)
        self.d4 = nn.Linear(dim_4 + dim_3, dim_5)
        self.out = nn.Linear(dim_4+dim_5, output_size)

        # Batch Norm
        # NOTE(review): bn0 is created but never applied in forward().
        if use_embedding:
            if hidden_len:
                self.bn_hidden = nn.BatchNorm1d(hidden_len)
            self.bn0 = nn.BatchNorm1d(self.num_embeddings)
        else:
            self.bn0 = nn.BatchNorm1d(self.input_size)
        self.bn1 = nn.BatchNorm1d(dim_1)
        self.bn2 = nn.BatchNorm1d(dim_2)
        self.bn3 = nn.BatchNorm1d(dim_3)
        self.bn4 = nn.BatchNorm1d(dim_4)
        self.bn5 = nn.BatchNorm1d(dim_5)

    def forward(self, x, hidden=None):
        """Run the network.

        Args:
            x: 2-D input batch.  When embeddings are enabled, column ``i`` is
                looked up in embedding layer ``i``.
            hidden: Optional extra 2-D input; it is batch-normalised with
                ``bn_hidden`` and concatenated to ``x`` before block 0.

        Returns:
            Output of the final linear layer (no activation applied).
        """
        if getattr(self, 'num_embeddings', None):
            x = [emb_lay(x[:, i])
                 for i, emb_lay in enumerate(self.embedding_layers)]
            x = torch.cat(x, 1)
        if hidden is not None:
            hidden = self.bn_hidden(hidden)
            x = torch.cat([x, hidden], 1)
        # normalise inputs

        # block 0
        x1 = self.d0(x)
        x1 = self.bn1(x1)
        x1 = self.activation(x1)
        x1 = self.drop(x1)

        x = torch.cat([x, x1], 1)

        # block 1
        x2 = self.d1(x)
        x2 = self.bn2(x2)
        x2 = self.activation(x2)
        x2 = self.drop(x2)

        x = torch.cat([x1, x2], 1)

        # block 2
        x3 = self.d2(x)
        x3 = self.bn3(x3)
        x3 = self.activation(x3)
        x3 = self.drop(x3)

        x = torch.cat([x2, x3], 1)

        # block 3
        x4 = self.d3(x)
        x4 = self.bn4(x4)
        x4 = self.activation(x4)
        x4 = self.drop(x4)

        x = torch.cat([x3, x4], 1)

        # block 4
        x5 = self.d4(x)
        x5 = self.bn5(x5)
        x5 = self.activation(x5)
        x5 = self.drop(x5)

        x = torch.cat([x4, x5], 1)

        out = self.out(x)
        return out

    def _shared_step(self, batch):
        """Reshape a batch, run the model and return ``(y, logits, loss, mse)``."""
        x, hidden, y = batch['data'], batch.get(
            'hidden', None), batch['target']
        # The second dimension of 'data' is used as the row count, i.e. the
        # loader is expected to add a leading dimension in front of the rows.
        x = x.view(x.size(1), -1)
        y = y.T
        if hidden is not None:
            hidden = hidden.view(hidden.size(1), -1)
        logits = self(x, hidden)
        loss = self.loss(input=logits, target=y)
        mse = mean_squared_error(y_true=y.cpu().numpy(),
                                 y_pred=logits.cpu().detach().numpy())
        return y, logits, loss, mse

    def training_step(self, batch, batch_idx):
        """Compute the training loss and log ``train_loss`` / ``train_mse``."""
        _, _, loss, mse = self._shared_step(batch)
        self.log('train_mse', mse, on_step=False,
                 on_epoch=True, prog_bar=True)
        self.log('train_loss', loss, prog_bar=True)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Return loss, targets, predictions and MSE for one validation batch."""
        y, logits, loss, mse = self._shared_step(batch)
        return {'loss': loss, 'y': y, 'logits': logits, 'mse': mse}

    def validation_epoch_end(self, val_step_outputs):
        """Log the mean per-batch loss and MSE as ``val_loss`` / ``val_mse``."""
        epoch_loss = torch.tensor([x['loss'] for x in val_step_outputs]).mean()
        epoch_mse = torch.tensor([x['mse'] for x in val_step_outputs]).mean()
        self.log('val_loss', epoch_loss, prog_bar=True)
        self.log('val_mse', epoch_mse, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Test batches are processed exactly like validation batches."""
        return self.validation_step(batch, batch_idx)

    def test_epoch_end(self, outputs):
        """Log the mean per-batch loss and MSE of the test run."""
        epoch_loss = torch.tensor([x['loss'] for x in outputs]).mean()
        epoch_mse = torch.tensor([x['mse'] for x in outputs]).mean()
        self.log('test_loss', epoch_loss)
        self.log('test_mse', epoch_mse)
        # Legacy key: the value is an MSE, not an AUC.  Kept so that existing
        # consumers of 'test_auc' keep working.
        self.log('test_auc', epoch_mse)

    def configure_optimizers(self):
        """Adam plus ReduceLROnPlateau driven by the logged ``val_mse``."""
        optimizer = torch.optim.Adam(
            self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=5, factor=0.1, min_lr=1e-7, eps=1e-08
        )
        return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'val_mse'}


def cross_val(p) -> object:
    """Train one ``ResNet`` per purged group time-series fold.

    Args:
        p: Hyper-parameter mapping forwarded to ``ResNet``.  ``p['activation']``
            and ``p['batch_size']`` are additionally read here.

    Returns:
        ``(models, features)``: the list of fitted models (one per fold) and
        the feature list returned by ``preprocess_data``.
    """
    data_ = load_data(root_dir='./data/', mode='train')
    data_, target_, features, date = preprocess_data(
        data_, nn=True, action='multi')
    input_size = data_.shape[-1]
    output_size = target_.shape[-1]
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)
    models = []
    tb_logger = pl_loggers.TensorBoardLogger('logs/multiclass_')
    # ResNet only logs 'val_loss' and 'val_mse'; monitoring a metric that is
    # never logged ('val_auc') makes the callbacks fail.  MSE is minimised.
    monitor, monitor_mode = 'val_mse', 'min'
    for i, (train_idx, val_idx) in enumerate(gts.split(data_, groups=date)):
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_[idx])
        target = copy.deepcopy(target_[idx])
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            os.path.join('models/', 'multi_class_fold_{}'.format(i)), monitor=monitor, save_top_k=1, period=10,
            mode=monitor_mode
        )
        model = ResNet(input_size, output_size, p)
        if p['activation'] == nn.ReLU:
            model.apply(lambda m: init_weights(m, 'relu'))
        elif p['activation'] == nn.LeakyReLU:
            model.apply(lambda m: init_weights(m, 'leaky_relu'))
        # Re-express the fold as positions inside the freshly sliced arrays:
        # training rows first, validation rows directly after them.
        train_idx = list(range(0, max(train_idx) + 1))
        val_idx = list(range(len(train_idx), len(idx)))
        data[train_idx] = calc_data_mean(
            data[train_idx], './cache', train=True, mode='mean')
        data[val_idx] = calc_data_mean(
            data[val_idx], './cache', train=False, mode='mean')
        # NOTE(review): `date` is passed unsliced while data/target were
        # restricted to `idx` - confirm FinData copes with that.
        dataset = FinData(data=data, target=target, date=date, multi=True)
        dataloaders = create_dataloaders(
            dataset, indexes={'train': train_idx, 'val': val_idx}, batch_size=p['batch_size'])
        es = EarlyStopping(monitor=monitor, patience=10,
                           min_delta=0.0005, mode=monitor_mode)
        trainer = pl.Trainer(logger=tb_logger,
                             max_epochs=100,
                             gpus=1,
                             callbacks=[checkpoint_callback, es],
                             precision=16)
        trainer.fit(
            model, train_dataloader=dataloaders['train'], val_dataloaders=dataloaders['val'])
        torch.save(model.state_dict(), f'models/fold_{i}_state_dict.pth')
        models.append(model)
    return models, features


def fillna_npwhere(array, values):
    """Replace the NaN entries of ``array`` with the matching ``values``.

    Args:
        array: Numeric NumPy array, possibly containing NaN.
        values: Fill values, broadcastable against ``array``.

    Returns:
        ``array`` itself when it holds no NaN; otherwise a new array in which
        every NaN cell is taken from ``values`` and every other cell is left
        untouched.
    """
    missing = np.isnan(array)
    if not missing.any():
        return array
    # np.where selects per cell.  The arithmetic form
    # nan_to_num(array) + isnan(array) * values turns 0 * nan into nan when
    # `values` itself has a NaN, and also clips infinities in `array`.
    return np.where(missing, values, array)


def test_model(models, features, cache_dir='cache'):
    """Run the models over the ``janestreet`` test iterator and submit actions.

    For every test row with positive ``weight`` the feature values are
    NaN-filled from ``<cache_dir>/f_mean.npy``, pushed through the model(s),
    passed through a sigmoid and thresholded at 0.5.  Rows whose weight is
    not positive get action 0.

    Args:
        models: A single model or a list of models.  For a list, the sigmoid
            outputs are averaged over models and then over outputs.
        features: Column names selected from each test frame.
        cache_dir: Directory containing ``f_mean.npy``.
    """
    # NOTE(review): `janestreet` is not imported in the visible part of this
    # file; it has to be in scope when this function runs.
    env = janestreet.make_env()
    iter_test = env.iter_test()
    is_ensemble = isinstance(models, list)
    if is_ensemble:
        models = [model.eval() for model in models]
    else:
        models.eval()
    f_mean = np.load(f'{cache_dir}/f_mean.npy')
    for (test_df, sample_prediction_df) in tqdm(iter_test):
        if test_df['weight'].item() > 0:
            vals = torch.FloatTensor(
                fillna_npwhere(test_df[features].values, f_mean))
            row = vals.view(1, -1)
            # Pure inference: do not build an autograd graph per row.
            with torch.no_grad():
                if is_ensemble:
                    # calc mean of each models prediction of each response rather than mean of all predicted responses by each model
                    preds = [torch.sigmoid(model(row)).numpy()
                             for model in models]
                    pred = np.mean(np.mean(preds, axis=0))
                else:
                    pred = torch.sigmoid(models(row)).item()
            sample_prediction_df.action = int(pred > 0.5)
        else:
            sample_prediction_df.action = 0
        env.predict(sample_prediction_df)


def main():
    """Cross-validate with a fixed hyper-parameter set, then run the test loop."""
    # NOTE(review): ResNet.__init__ also reads params['loss'] and
    # params['embedding'], neither of which is set here - confirm the
    # intended values before running.
    hyper_params = dict(
        dim_1=167,
        dim_2=454,
        dim_3=371,
        dim_4=369,
        dim_5=155,
        activation=nn.LeakyReLU,
        dropout=0.21062362698532755,
        lr=0.0022252024054478523,
        label_smoothing=0.05564974140461841,
        weight_decay=0.04106097088288333,
        amsgrad=True,
        batch_size=10072,
    )
    fitted = cross_val(hyper_params)
    models, features = fitted
    test_model(models, features)


if __name__ == '__main__':
    main()

In [None]:
import copy
import datetime
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score, roc_curve
from torch.nn import functional as F
from torch.utils.data import DataLoader, Subset
from torch.utils.data.sampler import BatchSampler, SequentialSampler
from tqdm import tqdm
from purged_group_time_series import PurgedGroupTimeSeriesSplit
from group_time_split import GroupTimeSeriesSplit
from utils import load_data, preprocess_data, FinData


class Classifier(nn.Module):
    """Fully connected binary classifier with a hand-written training loop.

    The network is one ``Linear -> BatchNorm1d`` pair per entry of ``dims``
    followed by an output ``Linear``.  Leaky ReLU and dropout (p=0.2) are
    applied after every layer except the output layer, whose result goes
    through a sigmoid.  The module owns its Adam optimizer, a
    ReduceLROnPlateau scheduler and per-epoch train/validation logs.

    Args:
        input_size: Number of input features.
        output_size: Width of the output layer.
        dims: Sequence of hidden layer widths.
        batch_size: Batch size used by the data loaders (stored only).
        learning_rate: Adam learning rate.
        early_stopping: Number of epochs without a better validation AUC
            after which ``training_loop`` stops.
        model_file: Base file name for the saved weights.
        fold: Fold identifier embedded in the checkpoint file name.
    """

    def __init__(self, input_size, output_size, dims, batch_size, learning_rate=0.05, early_stopping=10,
                 model_file='model.pth', fold=None):
        super(Classifier, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.dims = dims
        self.layer_list = nn.ModuleList()
        self.learning_rate = learning_rate
        self.loss = nn.BCELoss()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.early_stopping = early_stopping
        self.model_file = self.create_model_file(model_file, fold)
        self.batch_size = batch_size
        self.train_log = pd.DataFrame({'auc': [0], 'loss': [0]})
        self.val_log = pd.DataFrame({'auc': [0], 'loss': [0]})
        widths = [self.input_size] + list(self.dims)
        for in_width, out_width in zip(widths[:-1], widths[1:]):
            self.layer_list.append(nn.Linear(in_width, out_width))
            self.layer_list.append(nn.BatchNorm1d(out_width))
        self.layer_list.append(nn.Linear(widths[-1], self.output_size))
        self.optimizer = torch.optim.Adam(
            self.parameters(), lr=self.learning_rate)
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, patience=5, factor=0.1, min_lr=1e-7, eps=1e-08
        )

    def create_model_file(self, model_file, fold):
        """Create the ``models`` directory if needed and return the checkpoint path."""
        os.makedirs('models', exist_ok=True)
        return f'models/nn_fold_{fold}_{model_file}'

    def forward(self, x):
        """Return sigmoid probabilities for a 2-D batch ``x``."""
        last = len(self.layer_list) - 1
        for i, layer in enumerate(self.layer_list):
            x = layer(x)
            if i < last:
                # training=self.training switches dropout off in eval mode;
                # the output layer gets neither activation nor dropout.
                x = F.dropout(F.leaky_relu(x), p=0.2, training=self.training)
        return torch.sigmoid(x)

    def _prepare(self, batch):
        """Move a batch to the device and reshape it to (rows, features) / (rows, 1)."""
        x, y = batch['data'].to(
            self.device), batch['target'].to(self.device)
        # The second dimension of 'data' is used as the row count.
        x = x.reshape(x.size(1), -1)
        y = y.reshape(-1, 1)
        return x, y

    def training_step(self, batch):
        """Run one optimisation step and return detached loss/preds/target."""
        self.optimizer.zero_grad()

        with torch.set_grad_enabled(True):
            x, y = self._prepare(batch)
            logits = self(x)
            loss = self.loss(input=logits,
                             target=y)
            loss.backward()
            self.optimizer.step()
            # Detach so that collected outputs do not keep autograd history.
            return {'loss': loss.detach(), 'preds': logits.detach(), 'target': y}

    def validation_step(self, batch):
        """Evaluate one batch without gradients.

        The LR scheduler is stepped once per epoch in ``training_loop``, not
        here.
        """
        with torch.set_grad_enabled(False):
            x, y = self._prepare(batch)
            logits = self(x)
            loss = self.loss(logits,
                             target=y)
            return {'loss': loss, 'preds': logits, 'target': y}

    def eval_step(self, data):
        """Return ``(target, predictions)`` for one batch without gradients."""
        with torch.set_grad_enabled(False):
            x, y = self._prepare(data)
            preds = self(x)
            return y, preds

    def batch_step_end_metrics(self, num_samples, batch_number, output, running_loss, running_metric):
        """Add this batch's loss and ROC AUC to the running totals."""
        running_loss += output['loss'].item()
        running_metric += roc_auc_score(
            output['target'].detach().cpu().numpy(),
            output['preds'].detach().cpu().numpy())
        return running_loss, running_metric

    def epoch_end_metrics(self, outputs):
        """Return the mean per-batch ROC AUC and the mean per-batch loss."""
        auc = torch.tensor([roc_auc_score(
            out['target'].detach().cpu().numpy(),
            out['preds'].detach().cpu().numpy()) for out in outputs])
        loss = torch.stack([out['loss'] for out in outputs])
        return torch.mean(auc), torch.mean(loss)

    def log_results(self, phase, auc, loss):
        """Append one ``{'auc', 'loss'}`` row to the log of ``phase``."""
        # DataFrame.append no longer exists in pandas 2; concat is equivalent.
        row = pd.DataFrame([{'auc': auc.item(), 'loss': loss.item()}])
        if phase == 'train':
            self.train_log = pd.concat(
                [self.train_log, row], ignore_index=True)
        if phase == 'val':
            self.val_log = pd.concat(
                [self.val_log, row], ignore_index=True)

    def training_loop(self, epochs, dataloaders):
        """Train for up to ``epochs`` epochs with early stopping on validation AUC.

        Args:
            epochs: Maximum number of epochs.
            dataloaders: Mapping with a ``'train'`` and a ``'val'`` loader.

        The weights with the best validation AUC are saved to
        ``self.model_file``.
        """
        es_counter = 0
        auc = {'train': -np.inf, 'val': -np.inf}
        loss = {'train': np.inf, 'val': np.inf}
        best_auc = -np.inf
        for epoch in range(epochs):
            for phase in ['train', 'val']:
                if phase == 'train':
                    self.train()
                    step = self.training_step
                else:
                    self.eval()
                    step = self.validation_step
                bar = tqdm(dataloaders[phase])
                outs = []
                running_loss = 0.0
                running_auc = 0.0
                for b, batch in enumerate(bar, 1):
                    bar.set_description(f'Epoch {epoch} {phase}'.ljust(20))
                    out = step(batch)
                    outs.append(out)
                    num_samples = self.batch_size * b
                    running_loss, running_auc = self.batch_step_end_metrics(
                        num_samples, b, out, running_loss, running_auc)
                    bar.set_postfix(loss=f'{running_loss / b:0.5f}',
                                    auc=f'{running_auc / b:0.5f}')
                auc[phase], loss[phase] = self.epoch_end_metrics(outs)
                self.log_results(phase, auc[phase], loss[phase])
                if phase == 'val':
                    # One scheduler step per epoch so that `patience` counts
                    # epochs rather than batches.
                    self.scheduler.step(loss['val'].item())
                    if auc['val'] > best_auc:
                        best_auc = auc['val']
                        best_model_weights = copy.deepcopy(self.state_dict())
                        torch.save(best_model_weights, self.model_file)
                        es_counter = 0
            es_counter += 1
            if es_counter > self.early_stopping:
                print(
                    f'Early Stopping limit reached. Best Model saved to {self.model_file}')
                print(f'Best Metric achieved: {best_auc}')
                break


def init_weights(m):
    """Initialise a plain ``nn.Linear``; every other module is left as is.

    Weights are drawn with Xavier-normal initialisation using the
    ``'leaky_relu'`` gain, and the bias is filled with ones.
    """
    if type(m) != nn.Linear:
        return
    gain = nn.init.calculate_gain('leaky_relu')
    nn.init.xavier_normal_(m.weight, gain)
    m.bias.data.fill_(1)


In [None]:
# Train and/or evaluate one Classifier per GroupTimeSeriesSplit fold.
data = load_data('data/', mode='train', overide='filtered_train.csv')
data, target, features, date = preprocess_data(data, scale=True)
# %%
dataset = FinData(data=data, target=target, date=date)
# %%
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dims = [384, 896, 896, 394]
batch_size = 500
epochs = 100
gts = GroupTimeSeriesSplit(n_splits=5)
#gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=31)
train_model = True
eval_model = False
# Output directories used below; create them up front.
os.makedirs('logs', exist_ok=True)
os.makedirs('plots', exist_ok=True)
for i, (train_idx, val_idx) in enumerate(gts.split(data, groups=date)):
    # Batch the fold's own row indices.  SequentialSampler(Subset(...)) only
    # yields positions 0..len-1, which would then index the *full* dataset
    # and make the validation loader read the first rows, not val_idx.
    train_rows = [int(j) for j in train_idx]
    val_rows = [int(j) for j in val_idx]
    if train_model:
        model = Classifier(input_size=len(features), output_size=1,
                           dims=dims, batch_size=batch_size,
                           model_file=f'nn_model_fold_{i}.pth').to(device=device)

        # model.apply(init_weights)
        train_sampler = BatchSampler(
            train_rows, batch_size=batch_size, drop_last=False)
        val_sampler = BatchSampler(
            val_rows, batch_size=batch_size, drop_last=False)
        # Each sampler item is a whole list of row indices, so the dataset is
        # queried once per batch.
        dataloaders = {'train': DataLoader(dataset, sampler=train_sampler, num_workers=6),
                       'val': DataLoader(dataset, sampler=val_sampler, num_workers=6)}
        model.training_loop(epochs=epochs, dataloaders=dataloaders)
        model.train_log.to_csv(
            f'logs/train_fold_{i}_{str(datetime.datetime.now())}.csv')
        model.val_log.to_csv(
            f'logs/val_fold_{i}_{str(datetime.datetime.now())}.csv')
    if eval_model:
        # Same file name as in the training branch; a timestamped name can
        # never match a previously saved checkpoint.
        model = Classifier(input_size=len(features), output_size=1,
                           dims=dims, batch_size=batch_size,
                           model_file=f'nn_model_fold_{i}.pth').to(device=device)
        checkpoint = torch.load(model.model_file, map_location=device)
        model.load_state_dict(checkpoint)
        model.eval()
        val_sampler = BatchSampler(
            val_rows, batch_size=batch_size, drop_last=False)
        val_loader = DataLoader(dataset, sampler=val_sampler, num_workers=6)
        bar = tqdm(val_loader)
        all_preds = []
        all_y_true = []
        for b, batch in enumerate(bar, 1):
            bar.set_description(f'Evaluating Model')
            y_true, preds = model.eval_step(batch)
            all_y_true.append(y_true.cpu().numpy())
            all_preds.append(preds.cpu().numpy())
        all_preds = np.concatenate(all_preds, axis=0)
        all_y_true = np.concatenate(all_y_true, axis=0)
        fpr, tpr, _ = roc_curve(all_y_true, all_preds)
        plt.plot(fpr, tpr, label='nn')
        plt.savefig(
            f'plots/val_fold_{i}_roc_curve.png')
        plt.close()
# %%


In [None]:
from numpy.lib.function_base import _parse_input_dimensions
from sklearn.metrics import roc_auc_score, roc_curve, plot_roc_curve
import torch
import numpy as np
from torch._C import dtype
import torch.nn as nn
from torch.nn import functional as F
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning import Trainer, trainer
import datatable as dt
import pandas as pd
from torch.nn.modules.loss import BCEWithLogitsLoss
from group_time_split import GroupTimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader, Subset, dataloader
from torch.utils.data.sampler import BatchSampler, SequentialSampler
from tqdm import tqdm
import copy
import datetime
import matplotlib.pyplot as plt


class Classifier(nn.Module):
    """Fully connected binary classifier with a hand-written training loop.

    The network is one ``Linear -> BatchNorm1d`` pair per entry of ``dims``
    followed by an output ``Linear``.  Leaky ReLU and dropout (p=0.2) are
    applied after every layer except the output layer, whose result goes
    through a sigmoid.  The module owns its Adam optimizer, a
    ReduceLROnPlateau scheduler and per-epoch train/validation logs.

    Args:
        input_size: Number of input features.
        output_size: Width of the output layer.
        dims: Sequence of hidden layer widths.
        batch_size: Batch size used by the data loaders (stored only).
        learning_rate: Adam learning rate.
        early_stopping: Number of epochs without a better validation AUC
            after which ``training_loop`` stops.
        model_file: Path the best weights are saved to.
    """

    def __init__(self, input_size, output_size, dims, batch_size, learning_rate=0.05, early_stopping=10,
                 model_file='model.pth'):
        super(Classifier, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.dims = dims
        self.layer_list = nn.ModuleList()
        self.learning_rate = learning_rate
        self.loss = nn.BCELoss()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.early_stopping = early_stopping
        self.model_file = model_file
        self.batch_size = batch_size
        self.train_log = pd.DataFrame({'auc': [0], 'loss': [0]})
        self.val_log = pd.DataFrame({'auc': [0], 'loss': [0]})
        widths = [self.input_size] + list(self.dims)
        for in_width, out_width in zip(widths[:-1], widths[1:]):
            self.layer_list.append(nn.Linear(in_width, out_width))
            self.layer_list.append(nn.BatchNorm1d(out_width))
        self.layer_list.append(nn.Linear(widths[-1], self.output_size))
        self.optimizer = torch.optim.Adam(
            self.parameters(), lr=self.learning_rate)
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, patience=5, factor=0.1, min_lr=1e-7, eps=1e-08
        )

    def forward(self, x):
        """Return sigmoid probabilities for a 2-D batch ``x``."""
        last = len(self.layer_list) - 1
        for i, layer in enumerate(self.layer_list):
            x = layer(x)
            if i < last:
                # training=self.training switches dropout off in eval mode;
                # the output layer gets neither activation nor dropout.
                x = F.dropout(F.leaky_relu(x), p=0.2, training=self.training)
        return torch.sigmoid(x)

    def _prepare(self, batch):
        """Move a batch to the device and reshape it to (rows, features) / (rows, 1)."""
        x, y = batch['data'].to(
            self.device), batch['target'].to(self.device)
        # The second dimension of 'data' is used as the row count.
        x = x.reshape(x.size(1), -1)
        y = y.reshape(-1, 1)
        return x, y

    def training_step(self, batch):
        """Run one optimisation step and return detached loss/preds/target."""
        self.optimizer.zero_grad()

        with torch.set_grad_enabled(True):
            x, y = self._prepare(batch)
            logits = self(x)
            loss = self.loss(input=logits,
                             target=y)
            loss.backward()
            self.optimizer.step()
            # Detach so that collected outputs do not keep autograd history.
            return {'loss': loss.detach(), 'preds': logits.detach(), 'target': y}

    def validation_step(self, batch):
        """Evaluate one batch without gradients.

        The LR scheduler is stepped once per epoch in ``training_loop``, not
        here.
        """
        with torch.set_grad_enabled(False):
            x, y = self._prepare(batch)
            logits = self(x)
            loss = self.loss(logits,
                             target=y)
            return {'loss': loss, 'preds': logits, 'target': y}

    def eval_step(self, data):
        """Return ``(target, predictions)`` for one batch without gradients."""
        with torch.set_grad_enabled(False):
            x, y = self._prepare(data)
            preds = self(x)
            return y, preds

    def batch_step_end_metrics(self, num_samples, batch_number, output, running_loss, running_metric):
        """Add this batch's loss and ROC AUC to the running totals."""
        running_loss += output['loss'].item()
        running_metric += roc_auc_score(
            output['target'].detach().cpu().numpy(),
            output['preds'].detach().cpu().numpy())
        return running_loss, running_metric

    def epoch_end_metrics(self, outputs):
        """Return the mean per-batch ROC AUC and the mean per-batch loss."""
        auc = torch.tensor([roc_auc_score(
            out['target'].detach().cpu().numpy(),
            out['preds'].detach().cpu().numpy()) for out in outputs])
        loss = torch.stack([out['loss'] for out in outputs])
        return torch.mean(auc), torch.mean(loss)

    def log_results(self, phase, auc, loss):
        """Append one ``{'auc', 'loss'}`` row to the log of ``phase``."""
        # DataFrame.append no longer exists in pandas 2; concat is equivalent.
        row = pd.DataFrame([{'auc': auc.item(), 'loss': loss.item()}])
        if phase == 'train':
            self.train_log = pd.concat(
                [self.train_log, row], ignore_index=True)
        if phase == 'val':
            self.val_log = pd.concat(
                [self.val_log, row], ignore_index=True)

    def training_loop(self, epochs, dataloaders):
        """Train for up to ``epochs`` epochs with early stopping on validation AUC.

        Args:
            epochs: Maximum number of epochs.
            dataloaders: Mapping with a ``'train'`` and a ``'val'`` loader.

        The weights with the best validation AUC are saved to
        ``self.model_file``.
        """
        es_counter = 0
        auc = {'train': -np.inf, 'val': -np.inf}
        loss = {'train': np.inf, 'val': np.inf}
        best_auc = -np.inf
        for epoch in range(epochs):
            for phase in ['train', 'val']:
                if phase == 'train':
                    self.train()
                    step = self.training_step
                else:
                    self.eval()
                    step = self.validation_step
                bar = tqdm(dataloaders[phase])
                outs = []
                running_loss = 0.0
                running_auc = 0.0
                for b, batch in enumerate(bar, 1):
                    bar.set_description(f'Epoch {epoch} {phase}'.ljust(20))
                    out = step(batch)
                    outs.append(out)
                    num_samples = self.batch_size*b
                    running_loss, running_auc = self.batch_step_end_metrics(
                        num_samples, b, out, running_loss, running_auc)
                    bar.set_postfix(loss=f'{running_loss/b:0.5f}',
                                    auc=f'{running_auc/b:0.5f}')
                auc[phase], loss[phase] = self.epoch_end_metrics(outs)
                self.log_results(phase, auc[phase], loss[phase])
                if phase == 'val':
                    # One scheduler step per epoch so that `patience` counts
                    # epochs rather than batches.
                    self.scheduler.step(loss['val'].item())
                    if auc['val'] > best_auc:
                        best_auc = auc['val']
                        best_model_weights = copy.deepcopy(self.state_dict())
                        torch.save(best_model_weights, self.model_file)
                        es_counter = 0
            es_counter += 1
            if es_counter > self.early_stopping:
                print(
                    f'Early Stopping limit reached. Best Model saved to {self.model_file}')
                print(f'Best Metric achieved: {best_auc}')
                break


class FinData(Dataset):
    """Dataset that serves ``target`` / ``data`` / ``date`` tensors per index.

    Args:
        data: Feature container.  It is indexed with ``data[index]`` on the
            normal path and with ``data.iloc[index].values`` when a
            ``transform`` is set.
        target: Object supporting ``.iloc`` (e.g. a pandas Series).
        mode: Stored as-is; not used by this class.
        transform: Optional callable; when set, ``__getitem__`` returns only
            ``transform(row_values)``.
        cache_dir: Stored as-is; not used by this class.
        date: Object supporting ``.iloc`` with one date per row.  When
            omitted, a module-level variable named ``date`` is used if one
            exists (legacy behaviour of this class).
    """

    def __init__(self, data, target, mode='train', transform=None, cache_dir=None, date=None):
        self.data = data
        self.target = target
        self.mode = mode
        self.transform = transform
        self.cache_dir = cache_dir
        if date is None:
            # Legacy fallback: earlier versions silently read the global.
            date = globals().get('date')
        self.date = date

    def __getitem__(self, index):
        """Return the sample(s) at ``index`` (an int, a list of ints or a tensor)."""
        if torch.is_tensor(index):
            index = index.tolist()
        if self.transform:
            return self.transform(self.data.iloc[index].values)
        if isinstance(index, list):
            sample = {
                'target': torch.Tensor(self.target.iloc[index].values),
                'data': torch.FloatTensor(self.data[index]),
            }
            if self.date is not None:
                sample['date'] = torch.Tensor(self.date.iloc[index].values)
            return sample
        # Scalars are wrapped in a list: torch.Tensor(<int>) would allocate an
        # uninitialised tensor of that length instead of storing the value.
        sample = {
            'target': torch.tensor([float(self.target.iloc[index])]),
            'data': torch.FloatTensor(self.data[index]),
        }
        if self.date is not None:
            sample['date'] = torch.tensor([float(self.date.iloc[index])])
        return sample

    def __len__(self):
        return len(self.data)


def init_weights(m):
    """Re-initialise ``m`` when it is exactly an ``nn.Linear``.

    The weight matrix gets Xavier-normal values scaled by the gain for
    ``'leaky_relu'``; the bias vector is set to 1.  Other module types are
    not touched.
    """
    is_plain_linear = type(m) == nn.Linear
    if is_plain_linear:
        leaky_gain = nn.init.calculate_gain('leaky_relu')
        nn.init.xavier_normal_(m.weight, gain=leaky_gain)
        m.bias.data.fill_(1)

In [None]:


def load_data(root_dir, mode, overide=None):
    """Read one of the competition CSV files into a pandas DataFrame.

    Args:
        root_dir: Directory holding the standard files.
        mode: ``'train'`` (``train.csv``), ``'test'`` (``example_test.csv``)
            or ``'sub'`` (``example_sample_submission.csv``).
        overide: Optional explicit file path.  When given it is read instead
            and both ``root_dir`` and ``mode`` are ignored.

    Returns:
        The file contents as a pandas DataFrame.

    Raises:
        ValueError: If ``overide`` is not given and ``mode`` is unknown.
    """
    import os

    if overide:
        return dt.fread(overide).to_pandas()
    file_names = {
        'train': 'train.csv',
        'test': 'example_test.csv',
        'sub': 'example_sample_submission.csv',
    }
    if mode not in file_names:
        raise ValueError(
            f'Unknown mode {mode!r}; expected one of {sorted(file_names)}')
    # os.path.join also works when root_dir has no trailing separator.
    return dt.fread(os.path.join(root_dir, file_names[mode])).to_pandas()


def preprocess_data(data):
    """Build the binary target and the scaled feature matrix.

    An ``action`` column (1.0 where ``resp > 0``, else 0.0) is added to
    ``data``; missing values in the feature columns are replaced by the
    column mean; the features are then standardised with ``StandardScaler``.
    The feature columns are every column whose name contains ``'feature'``
    except ``'feature_0'``, plus ``'weight'``.

    Args:
        data: DataFrame with ``resp``, ``date``, ``weight`` and feature
            columns.  It is modified in place (``action`` added, NaNs filled).

    Returns:
        ``(data, target, features, date)``: the scaled feature array, the
        ``action`` Series, the list of feature names and the ``date`` Series.
    """
    # data = data.query('weight > 0').reset_index(drop=True)
    data['action'] = ((data['resp'].values) > 0).astype('float32')
    features = [
        col for col in data.columns if 'feature' in col and col != 'feature_0']+['weight']
    # Assign the filled block back: data[col].fillna(..., inplace=True) works
    # on a temporary column object and may leave `data` unchanged.
    data[features] = data[features].fillna(data[features].mean())
    target = data['action']
    date = data['date']
    data = data[features]
    scaler = StandardScaler()
    data = scaler.fit_transform(data)

    return data, target, features, date


In [None]:
# Train and/or evaluate one Classifier per GroupTimeSeriesSplit fold.
data = load_data('data/', mode='train', overide='filtered_train.csv')
data, target, features, date = preprocess_data(data)
# %%
dataset = FinData(data, target, features)
# %%
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dims = [384, 896, 896, 394]
batch_size = 1000
epochs = 100
gts = GroupTimeSeriesSplit()
train_model = False
eval_model = True
for i, (train_idx, val_idx) in enumerate(gts.split(data, groups=date)):
    # Batch the fold's own row indices.  SequentialSampler(Subset(...)) only
    # yields positions 0..len-1, which would then index the *full* dataset
    # and make the validation loader read the first rows, not val_idx.
    train_rows = [int(j) for j in train_idx]
    val_rows = [int(j) for j in val_idx]
    if train_model:
        model = Classifier(input_size=len(features), output_size=1,
                           dims=dims, batch_size=batch_size,
                           model_file=f'nn_model_fold_{i}.pth').to(device=device)

        # model.apply(init_weights)
        train_sampler = BatchSampler(
            train_rows, batch_size=batch_size, drop_last=False)
        val_sampler = BatchSampler(
            val_rows, batch_size=batch_size, drop_last=False)
        # Each sampler item is a whole list of row indices, so the dataset is
        # queried once per batch.
        dataloaders = {'train': DataLoader(dataset, sampler=train_sampler, num_workers=6),
                       'val': DataLoader(dataset, sampler=val_sampler, num_workers=6)}
        model.training_loop(epochs=epochs, dataloaders=dataloaders)
        model.train_log.to_csv(
            f'logs/train_fold_{i}_{str(datetime.datetime.now())}.csv')
        model.val_log.to_csv(
            f'logs/val_fold_{i}_{str(datetime.datetime.now())}.csv')
    if eval_model:
        model = Classifier(input_size=len(features), output_size=1,
                           dims=dims, batch_size=batch_size,
                           model_file=f'nn_model_fold_{i}.pth').to(device=device)
        checkpoint = torch.load(model.model_file, map_location=device)
        model.load_state_dict(checkpoint)
        model.eval()
        # Kept for the interactive inspection at the end of this cell.
        val_set = Subset(dataset, val_idx)
        val_sampler = BatchSampler(
            val_rows, batch_size=batch_size, drop_last=False)
        val_loader = DataLoader(dataset, sampler=val_sampler, num_workers=6)
        bar = tqdm(val_loader)
        all_preds = []
        all_y_true = []
        for b, batch in enumerate(bar, 1):
            bar.set_description(f'Evaluating Model')
            y_true, preds = model.eval_step(batch)
            all_y_true.append(y_true.cpu().numpy())
            all_preds.append(preds.cpu().numpy())
        all_preds = np.concatenate(all_preds, axis=0)
        all_y_true = np.concatenate(all_y_true, axis=0)
        fpr, tpr, _ = roc_curve(all_y_true, all_preds)
        plt.plot(fpr, tpr, label='nn')
        plt.savefig(
            f'plots/val_fold_{i}_roc_curve.png')
        # Start a fresh figure per fold so curves do not pile up.
        plt.close()
# %%
val_set[:]['target']
# %%


In [None]:
import copy
import os

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from numba import njit
from pytorch_lightning import Callback
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from sklearn.metrics import roc_auc_score
from torch.nn.modules import linear
from torch.nn.modules.batchnorm import BatchNorm1d
from tqdm import tqdm
from sklearn.metrics import mean_squared_error


class MetricsCallback(Callback):
    """Callback that keeps a history of the trainer's callback metrics.

    Each time a validation run ends, the trainer's ``callback_metrics``
    object is appended to ``self.metrics``.
    """

    def __init__(self):
        super(MetricsCallback, self).__init__()
        self.metrics = list()

    def on_validation_end(self, trainer, pl_module):
        """Store ``trainer.callback_metrics`` for the run that just finished."""
        latest = trainer.callback_metrics
        self.metrics.append(latest)


class Classifier(pl.LightningModule):
    """Feed-forward network trained through PyTorch Lightning.

    Two stacks are built: ``encoder`` (five hidden layers) and ``encoder_1l``
    (one hidden layer). ``forward`` only uses ``encoder_1l``; ``encoder`` is
    still constructed so the module's parameter names stay unchanged.

    Args:
        input_size: number of input features.
        output_size: number of output units.
        params: dict providing ``dim_1`` .. ``dim_5``, ``dropout``, ``lr``,
            ``activation`` (a module class, instantiated with no arguments),
            ``loss`` (called as ``loss(input=..., target=...)``),
            ``weight_decay``, ``amsgrad`` and ``label_smoothing``.
        model_path: stored on the instance as ``self.model_path``.
    """

    def __init__(self, input_size, output_size, params=None,
                 model_path='models/'):
        super(Classifier, self).__init__()
        dim_1 = params['dim_1']
        dim_2 = params['dim_2']
        dim_3 = params['dim_3']
        dim_4 = params['dim_4']
        dim_5 = params['dim_5']
        self.dropout_prob = params['dropout']
        self.lr = params['lr']
        self.activation = params['activation']
        self.input_size = input_size
        self.output_size = output_size
        self.loss = params['loss']
        self.weight_decay = params['weight_decay']
        self.amsgrad = params['amsgrad']
        self.label_smoothing = params['label_smoothing']
        self.train_log = pd.DataFrame({'auc': [0], 'loss': [0]})
        self.val_log = pd.DataFrame({'auc': [0], 'loss': [0]})
        self.model_path = model_path
        # Deep stack; not used by forward().
        self.encoder = nn.Sequential(
            nn.BatchNorm1d(input_size),
            nn.Linear(input_size, dim_1, bias=False),
            nn.BatchNorm1d(dim_1),
            self.activation(),
            nn.Dropout(p=self.dropout_prob),
            nn.Linear(dim_1, dim_2, bias=False),
            nn.BatchNorm1d(dim_2),
            self.activation(),
            nn.Dropout(p=self.dropout_prob),
            nn.Linear(dim_2, dim_3, bias=False),
            nn.BatchNorm1d(dim_3),
            self.activation(),
            nn.Dropout(p=self.dropout_prob),
            nn.Linear(dim_3, dim_4, bias=False),
            nn.BatchNorm1d(dim_4),
            self.activation(),
            nn.Dropout(p=self.dropout_prob),
            nn.Linear(dim_4, dim_5, bias=False),
            nn.BatchNorm1d(dim_5),
            self.activation(),
            nn.Dropout(p=self.dropout_prob),
            nn.Linear(dim_5, self.output_size, bias=False)
        )
        # Single-hidden-layer stack; this is the one forward() runs.
        self.encoder_1l = nn.Sequential(
            nn.BatchNorm1d(input_size),
            nn.Linear(input_size, dim_1, bias=False),
            nn.BatchNorm1d(dim_1),
            self.activation(),
            nn.Dropout(p=self.dropout_prob),
            nn.Linear(dim_1, self.output_size, bias=False)
        )

    def forward(self, x):
        """Run ``x`` through the single-hidden-layer stack."""
        out = self.encoder_1l(x)
        return out

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch and log ``train_mse``/``train_loss``."""
        x, y = batch['data'], batch['target']
        # NOTE(review): presumably batches arrive with a leading singleton
        # dimension (1, batch, features) — confirm against the dataloader.
        x = x.view(x.size(1), -1)
        y = y.T
        logits = self(x)
        loss = self.loss(input=logits, target=y)
        mse = mean_squared_error(y_true=y.cpu().numpy(),
                                 y_pred=logits.cpu().detach().numpy())
        self.log('train_mse', mse, on_step=False,
                 on_epoch=True, prog_bar=True)
        self.log('train_loss', loss, prog_bar=True)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Return loss, targets, raw outputs and MSE for one validation batch."""
        x, y = batch['data'], batch['target']
        x = x.view(x.size(1), -1)
        y = y.T
        logits = self(x)
        loss = self.loss(input=logits,
                         target=y)
        mse = mean_squared_error(y_true=y.cpu().numpy(),
                                 y_pred=logits.cpu().detach().numpy())
        return {'loss': loss, 'y': y, 'logits': logits, 'mse': mse}

    def validation_epoch_end(self, val_step_outputs):
        """Log the unweighted mean of the per-batch loss and MSE."""
        epoch_loss = torch.tensor([x['loss'] for x in val_step_outputs]).mean()
        epoch_mse = torch.tensor([x['mse'] for x in val_step_outputs]).mean()
        self.log('val_loss', epoch_loss, prog_bar=True)
        self.log('val_mse', epoch_mse, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Test batches are evaluated exactly like validation batches."""
        return self.validation_step(batch, batch_idx)

    def test_epoch_end(self, outputs):
        """Log the unweighted mean of the per-batch test loss and MSE.

        The step outputs come from ``validation_step``, which provides
        ``'mse'`` and no ``'auc'`` key, so the MSE is what gets aggregated.
        """
        epoch_loss = torch.tensor([x['loss'] for x in outputs]).mean()
        epoch_mse = torch.tensor([x['mse'] for x in outputs]).mean()
        self.log('test_loss', epoch_loss)
        self.log('test_mse', epoch_mse)

    def predict(self, batch):
        """Return the sigmoid of the flattened network output for ``batch``.

        Switches the module to eval mode (and leaves it there). Gradients
        are disabled so inference does not build an autograd graph.
        """
        self.eval()
        x = batch['data']
        x = x.view(x.size(1), -1)
        with torch.no_grad():
            x = self(x)
        return torch.sigmoid(x.view(-1))

    def prediction_loop(self, dataloader, return_tensor=True):
        """Predict every batch of ``dataloader``.

        Returns:
            One concatenated tensor when ``return_tensor`` is true, otherwise
            the list of per-batch prediction tensors.
        """
        bar = tqdm(dataloader)
        preds = []
        for batch in bar:
            preds.append(self.predict(batch))
        if return_tensor:
            return torch.cat(preds, dim=0)
        else:
            return preds

    def configure_optimizers(self):
        """Adam with a ReduceLROnPlateau schedule driven by ``val_mse``."""
        # NOTE: self.weight_decay is stored but not passed to the optimizer.
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr,
                                     amsgrad=self.amsgrad)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=5, factor=0.1, min_lr=1e-7, eps=1e-08
        )
        return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'val_mse'}


def init_weights(m, func):
    """Xavier-normal initialise the weight of an ``nn.Linear`` module in place.

    Meant to be used with ``module.apply``: any module whose exact type is
    not ``nn.Linear`` is left untouched. The bias is never modified.

    Args:
        m: module to (possibly) initialise.
        func: nonlinearity name forwarded to ``nn.init.calculate_gain``.
    """
    if type(m) != nn.Linear:
        return
    gain = nn.init.calculate_gain(func)
    nn.init.xavier_normal_(m.weight, gain)


def train_cross_val(p):
    """Train one ``Classifier`` per purged time-series fold.

    Checkpointing and early stopping follow ``val_mse`` (lower is better),
    which is the validation metric ``Classifier`` logs.

    Args:
        p: parameter dict for ``Classifier``; ``p['activation']`` selects the
            weight initialisation and ``p['batch_size']`` sizes the loaders.

    Returns:
        ``(models, features)``: the fitted model of every fold and the
        feature list returned by ``preprocess_data``.
    """
    data_ = load_data(root_dir='./data/', mode='train')
    data_, target_, features, date = preprocess_data(data_, nn=True)

    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)

    input_size = data_.shape[-1]
    output_size = 1
    tb_logger = pl_loggers.TensorBoardLogger('logs/')
    models = []
    for i, (train_idx, val_idx) in enumerate(gts.split(data_, groups=date)):
        # Work on a private copy restricted to this fold's rows.
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_[idx])
        target = copy.deepcopy(target_[idx])
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            os.path.join('models/', "fold_{}".format(i)), monitor="val_mse", mode='min', save_top_k=1, period=10)
        model = Classifier(input_size=input_size,
                           output_size=output_size, params=p)
        if p['activation'] == nn.ReLU:
            model.apply(lambda m: init_weights(m, 'relu'))
        elif p['activation'] == nn.LeakyReLU:
            model.apply(lambda m: init_weights(m, 'leaky_relu'))
        # Re-base the indices onto the fold-local copy: train rows first,
        # validation rows after them.
        train_idx = list(range(0, max(train_idx) + 1))
        val_idx = list(range(len(train_idx), len(idx)))
        data[train_idx] = calc_data_mean(
            data[train_idx], './cache', train=True, mode='mean')
        data[val_idx] = calc_data_mean(
            data[val_idx], './cache', train=False, mode='mean')
        # Third argument passed positionally, as final_train does, so the call
        # does not depend on the parameter being named ``date`` or ``era``.
        dataset = FinData(data, target, date)
        dataloaders = create_dataloaders(
            dataset, indexes={'train': train_idx, 'val': val_idx}, batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_mse', patience=10,
                           min_delta=0.0005, mode='min')
        trainer = pl.Trainer(logger=tb_logger,
                             max_epochs=500,
                             gpus=1,
                             callbacks=[checkpoint_callback, es],
                             precision=16)
        trainer.fit(
            model, train_dataloader=dataloaders['train'], val_dataloaders=dataloaders['val'])
        torch.save(model.state_dict(), f'models/fold_{i}_state_dict.pth')
        models.append(model)
    return models, features


def final_train(p, load=False):
    """Train a single ``Classifier`` on a fixed date split.

    Rows with ``date <= 450`` are used for training and the remaining rows
    for validation. Checkpointing and early stopping follow ``val_mse``
    (lower is better), which is the validation metric ``Classifier`` logs.

    Args:
        p: parameter dict for ``Classifier``; ``p['activation']`` selects the
            weight initialisation and ``p['batch_size']`` sizes the loaders.
        load: accepted for compatibility; not used by this function.

    Returns:
        ``(model, features)``: the fitted model and the feature list returned
        by ``preprocess_data``.
    """
    data_ = load_data(root_dir='./data/', mode='train')
    data, target, features, date = preprocess_data(data_, nn=True)
    input_size = data.shape[-1]
    output_size = 1
    train_idx = date[date <= 450].index.values.tolist()
    val_idx = date[date > 450].index.values.tolist()
    data[train_idx] = calc_data_mean(data[train_idx], './cache', train=True)
    data[val_idx] = calc_data_mean(data[val_idx], './cache', train=False)
    checkpoint_callback = pl.callbacks.ModelCheckpoint(filepath='models/full_train',
                                                       monitor="val_mse", mode='min', save_top_k=1, period=10)
    model = Classifier(input_size=input_size,
                       output_size=output_size, params=p)
    if p['activation'] == nn.ReLU:
        model.apply(lambda m: init_weights(m, 'relu'))
    elif p['activation'] == nn.LeakyReLU:
        model.apply(lambda m: init_weights(m, 'leaky_relu'))
    dataset = FinData(data, target, date)
    dataloaders = create_dataloaders(
        dataset, indexes={'train': train_idx, 'val': val_idx}, batch_size=p['batch_size'])
    es = EarlyStopping(monitor='val_mse', patience=10,
                       min_delta=0.0005, mode='min')
    trainer = pl.Trainer(max_epochs=500,
                         gpus=1,
                         callbacks=[checkpoint_callback, es],
                         precision=16)
    trainer.fit(
        model, train_dataloader=dataloaders['train'], val_dataloaders=dataloaders['val'])
    torch.save(model.state_dict(), 'models/final_train.pth')
    return model, features


def fillna_npwhere(array, values):
    """Replace NaN entries of ``array`` with the matching entries of ``values``.

    When the sum of ``array`` is not NaN the input object is returned
    unchanged. Otherwise a new array is built from ``np.nan_to_num(array)``
    plus ``values`` at the NaN positions (``values`` is broadcast against
    ``array``).
    """
    if not np.isnan(array.sum()):
        return array
    nan_mask = np.isnan(array)
    zero_filled = np.nan_to_num(array)
    return zero_filled + nan_mask * values


def test_model(models, features, cache_dir='cache'):
    """Feed the ``janestreet`` test environment with model predictions.

    Every row yielded by the environment's test iterator gets an ``action``:
    0 for rows whose weight is not positive, otherwise 1 when the sigmoid of
    the model output exceeds 0.5 (the median over models when a list is
    given) and 0 if not. NaN features are filled from
    ``{cache_dir}/f_mean.npy`` first.

    Args:
        models: a single model or a list of models; all are put in eval mode.
        features: column selection applied to each test frame.
        cache_dir: directory that contains ``f_mean.npy``.
    """
    env = janestreet.make_env()
    iter_test = env.iter_test()
    is_ensemble = type(models) == list
    if is_ensemble:
        models = [model.eval() for model in models]
    else:
        models.eval()
    f_mean = np.load(f'{cache_dir}/f_mean.npy')
    for test_df, sample_prediction_df in tqdm(iter_test):
        if not test_df['weight'].item() > 0:
            # Rows without positive weight are never acted on.
            sample_prediction_df.action = 0
            env.predict(sample_prediction_df)
            continue
        filled = fillna_npwhere(test_df[features].values, f_mean)
        row = torch.FloatTensor(filled).view(1, -1)
        if is_ensemble:
            pred = np.median(
                [torch.sigmoid(model.forward(row)).item() for model in models])
        else:
            pred = torch.sigmoid(models.forward(row)).item()
        sample_prediction_df.action = int(pred > 0.5)
        env.predict(sample_prediction_df)


In [None]:
import datetime
import gc

import datatable as dt
import joblib
import lightgbm as lgb
import neptune
import neptunecontrib.monitoring.optuna as opt_utils
import numpy as np
import optuna
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from data_loading import utils
from data_loading import purged_group_time_series as pgs


def optimize(trial: optuna.trial.Trial, datasets):
    """Optuna objective: cross-validated MSE of an XGBoost regressor.

    Uses XGBoost's own identifiers for the squared-error objective
    (``reg:squarederror``) and its evaluation metric (``rmse``).

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        datasets: dict with ``'data'``, ``'target'`` and ``'era'`` entries;
            ``'era'`` provides the groups for the purged time-series split.

    Returns:
        The fold MSEs averaged by ``utils.weighted_mean``, weighted by each
        fold's training-set size.
    """
    p = {'learning_rate': trial.suggest_uniform('learning_rate', 1e-4, 1e-1),
         'max_depth': trial.suggest_int('max_depth', 5, 30),
         'max_leaves': trial.suggest_int('max_leaves', 5, 50),
         'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3, 1.0),
         'min_child_weight': trial.suggest_int('min_child_weight', 5, 100),
         'lambda': trial.suggest_uniform('lambda', 0.05, 0.2),
         'alpha': trial.suggest_uniform('alpha', 0.05, 0.2),
         'objective': 'reg:squarederror',
         'booster': 'gbtree',
         'tree_method': 'gpu_hist',
         'verbosity': 1,
         'n_jobs': 10,
         'eval_metric': 'rmse'}
    print('Choosing parameters:', p)
    scores = []
    sizes = []
    gts = pgs.PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=31)
    for tr_idx, val_idx in gts.split(datasets['data'], groups=datasets['era']):
        sizes.append(len(tr_idx))
        x_tr, x_val = datasets['data'][tr_idx], datasets['data'][val_idx]
        y_tr, y_val = datasets['target'][tr_idx], datasets['target'][val_idx]
        d_tr = xgb.DMatrix(x_tr, label=y_tr)
        d_val = xgb.DMatrix(x_val, label=y_val)
        clf = xgb.train(p, d_tr, 1000, [
            (d_val, 'eval')], early_stopping_rounds=50, verbose_eval=True)
        val_pred = clf.predict(d_val)
        score = mean_squared_error(y_val, val_pred)
        scores.append(score)
        # Drop the fold's large objects before the next fold allocates its own.
        del clf, val_pred, d_tr, d_val, x_tr, x_val, y_tr, y_val, score
        gc.collect()
    print(scores)
    avg_score = utils.weighted_mean(scores, sizes)
    print('Avg Score:', avg_score)
    return avg_score


def loptimize(trial, datasets):
    """Optuna objective: cross-validated MSE of a LightGBM regressor.

    The model is trained with the ``regression`` objective and ``mse``
    metric so that training, early stopping and the returned score (an MSE
    that the study minimises) all measure the same thing.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        datasets: dict with ``'data'``, ``'target'`` and ``'era'`` entries;
            ``'era'`` provides the groups for the purged time-series split.

    Returns:
        The fold MSEs averaged by ``utils.weighted_mean``, weighted by each
        fold's training-set size.
    """
    p = {'learning_rate': trial.suggest_uniform('learning_rate', 1e-4, 1e-1),
         'max_leaves': trial.suggest_int('max_leaves', 5, 100),
         'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 0.99),
         'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
         'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 0.99),
         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 1000),
         'lambda_l1': trial.suggest_uniform('lambda_l1', 0.005, 0.05),
         'lambda_l2': trial.suggest_uniform('lambda_l2', 0.005, 0.05),
         'boosting': trial.suggest_categorical('boosting', ['gbdt', 'goss', 'rf']),
         'objective': 'regression',
         'verbose': 1,
         'n_jobs': 10,
         'metric': 'mse'}
    if p['boosting'] == 'goss':
        # Bagging is switched off whenever GOSS is selected.
        p['bagging_freq'] = 0
        p['bagging_fraction'] = 1.0
    scores = []
    sizes = []
    gts = pgs.PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=31)
    for tr_idx, val_idx in gts.split(datasets['data'], groups=datasets['era']):
        sizes.append(len(tr_idx))
        x_tr, x_val = datasets['data'][tr_idx], datasets['data'][val_idx]
        y_tr, y_val = datasets['target'][tr_idx], datasets['target'][val_idx]
        train = lgb.Dataset(x_tr, label=y_tr)
        val = lgb.Dataset(x_val, label=y_val)
        clf = lgb.train(p, train, 1000, valid_sets=[
            val], early_stopping_rounds=50, verbose_eval=True)
        preds = clf.predict(x_val)
        score = mean_squared_error(y_val, preds)
        scores.append(score)
        # Drop the fold's large objects before the next fold allocates its own.
        del clf, preds, train, val, x_tr, x_val, y_tr, y_val, score
        gc.collect()
    print(scores)
    avg_score = utils.weighted_mean(scores, sizes)
    print('Avg Score:', avg_score)
    return avg_score


def main():
    """Run the XGBoost and LightGBM Optuna studies and pickle both results.

    Each study runs 10 trials, reports to its own Neptune experiment and is
    dumped to ``HPO/<model>_hpo_<today>.pkl``.
    """
    # NOTE(review): the Neptune API token is hard-coded in source control;
    # consider rotating it and loading it from the environment instead.
    api_token = 'eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIzMGQ1MWZiNy1iYjNlLTQ3NDctOTE4OS1lNzhlNmVlYmUwMzYifQ=='
    neptune.init(api_token=api_token,
                 project_qualified_name='kramerji/Numerai')
    data = utils.load_data('data/', mode='train')
    data, target, features, era = utils.preprocess_data(data, nn=True)
    datasets = {'data': data, 'target': target,
                'features': features, 'era': era}
    print('creating XGBoost Trials')
    xgb_exp = neptune.create_experiment('XGBoost_HPO')
    xgb_neptune_callback = opt_utils.NeptuneCallback(experiment=xgb_exp)
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: optimize(trial, datasets), n_trials=10,
                   callbacks=[xgb_neptune_callback])
    joblib.dump(
        study, f'HPO/xgb_hpo_{str(datetime.datetime.now().date())}.pkl')
    print('Creating LightGBM Trials')
    lgb_exp = neptune.create_experiment('LGBM_HPO')
    lgbm_neptune_callback = opt_utils.NeptuneCallback(experiment=lgb_exp)
    study = optuna.create_study(direction='minimize')
    # Optuna calls the objective with the trial only, so the datasets have to
    # be bound here; passing ``loptimize`` directly leaves ``datasets`` unset.
    study.optimize(lambda trial: loptimize(trial, datasets), n_trials=10,
                   callbacks=[lgbm_neptune_callback])
    joblib.dump(
        study, f'HPO/lgb_hpo_{str(datetime.datetime.now().date())}.pkl')


if __name__ == '__main__':
    main()


In [None]:
import datetime
import os
import copy
import joblib
import neptune
import neptunecontrib.monitoring.optuna as opt_utils
import optuna
import pytorch_lightning as pl
import torch.nn as nn
from optuna.integration import PyTorchLightningPruningCallback
from pytorch_lightning import Callback
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.utils.data import Subset, BatchSampler, SequentialSampler, DataLoader
import torch
import numpy as np
from models.resnet import ResNet as Classifier
from data_loading.purged_group_time_series import PurgedGroupTimeSeriesSplit
from data_loading.utils import load_data, preprocess_data, FinData, weighted_mean, seed_everything, calc_data_mean, \
    create_dataloaders, load_model
from models.SupervisedAutoEncoder import SupAE, train_ae_model, create_hidden_rep


class MetricsCallback(Callback):
    """Collects ``trainer.callback_metrics`` each time validation finishes.

    The collected objects are available, oldest first, in ``self.metrics``.
    """

    def __init__(self):
        super(MetricsCallback, self).__init__()
        self.metrics = list()

    def on_validation_end(self, trainer, pl_module):
        """Append the trainer's current callback metrics (stored by reference)."""
        snapshot = trainer.callback_metrics
        self.metrics += [snapshot]


def init_weights(m):
    """Initialise an ``nn.Linear`` module in place.

    The weight gets Xavier-normal values with the ``leaky_relu`` gain and the
    bias is filled with ones. Modules whose exact type is not ``nn.Linear``
    are left untouched, which makes the function usable with ``module.apply``.
    """
    if type(m) != nn.Linear:
        return
    leaky_gain = nn.init.calculate_gain('leaky_relu')
    nn.init.xavier_normal_(m.weight, leaky_gain)
    m.bias.data.fill_(1)


def create_param_dict(trial, trial_file=None):
    """Build the model parameter dict for one Optuna trial.

    Without ``trial_file`` the layer sizes, activation, dropout and learning
    rate are sampled from ``trial``. With ``trial_file`` the best parameters
    of the study pickled in that file are loaded instead, missing ``dim_5``
    and ``label_smoothing`` entries are filled with defaults, and the stored
    activation name is mapped to its module class.

    Args:
        trial: Optuna trial; required.
        trial_file: optional path to a joblib-pickled Optuna study.

    Returns:
        The parameter dict; ``'activation'`` holds an ``nn.Module`` class.

    Raises:
        ValueError: if ``trial`` is missing.
    """
    if not trial:
        # Both modes need a trial; without this check ``p`` would be unbound.
        raise ValueError('create_param_dict requires an Optuna trial')
    # One mapping for both modes, so every name the search can pick
    # (including 'silu') can also be resolved when reloading a finished study.
    act_dict = {'relu': nn.ReLU, 'leaky_relu': nn.LeakyReLU,
                'gelu': nn.GELU, 'silu': nn.SiLU}
    if not trial_file:
        dim_1 = trial.suggest_int('dim_1', 500, 2000)
        dim_2 = trial.suggest_int('dim_2', 1000, 3000)
        dim_3 = trial.suggest_int('dim_3', 1000, 3000)
        dim_4 = trial.suggest_int('dim_4', 500, 1000)
        dim_5 = trial.suggest_int('dim_5', 100, 250)
        act_func = act_dict[trial.suggest_categorical(
            'activation', ['relu', 'leaky_relu', 'gelu', 'silu'])]
        dropout = trial.suggest_uniform('dropout', 0.1, 0.5)
        lr = trial.suggest_uniform('lr', 0.00005, 0.05)
        return {'dim_1': dim_1, 'dim_2': dim_2, 'dim_3': dim_3,
                'dim_4': dim_4, 'dim_5': dim_5, 'activation': act_func, 'dropout': dropout,
                'lr': lr, 'loss': nn.MSELoss, 'embedding': True}
    # NOTE: joblib.load unpickles the file; only use trusted study files.
    p = joblib.load(trial_file).best_params
    if not p.get('dim_5', None):
        p['dim_5'] = 75
    if not p.get('label_smoothing', None):
        p['label_smoothing'] = 0.094
    # The sampled value is not used (the loaded activation wins); the call is
    # kept so the trial still records an 'activation' parameter.
    trial.suggest_categorical('activation', ['leaky_relu', 'gelu'])
    p['activation'] = act_dict[p['activation']]
    return p


def optimize(trial: optuna.Trial, data_dict):
    """Optuna objective: cross-validated ``val_loss`` of the ResNet classifier.

    One model is trained per purged time-series fold; the last ``val_loss``
    recorded for each fold is averaged with ``weighted_mean``, weighted by
    the fold's training-set size.

    Args:
        trial: Optuna trial used for sampling and for pruning.
        data_dict: dict with ``'data'``, ``'target'`` and ``'era'``; when
            ``'hidden_true'`` is set, ``'hidden'`` is used as well.

    Returns:
        The weighted mean of the per-fold validation losses.
    """
    splitter = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)
    input_size = data_dict['data'].shape[-1]
    output_size = 1
    checkpoint_path = os.path.join(
        'hpo/trials/', "trial_resnet_{}".format(trial.number))
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        checkpoint_path, monitor="val_mse", mode='min')
    logger = MetricsCallback()
    fold_losses = []
    fold_sizes = []
    # Set to a pickled study path (e.g. 'HPO/nn_hpo_2021-01-05.pkl') to reuse
    # earlier best parameters instead of sampling.
    trial_file = None
    p = create_param_dict(trial, trial_file)
    p['batch_size'] = trial.suggest_int('batch_size', 8000, 15000)
    if data_dict.get('hidden_true', None):
        p['hidden_len'] = data_dict['hidden']['hidden'].shape[-1]
    folds = splitter.split(data_dict['data'], groups=data_dict['era'])
    for train_idx, val_idx in folds:
        model = Classifier(input_size, output_size, params=p)
        dataset = FinData(
            data=data_dict['data'],
            target=data_dict['target'],
            era=data_dict['era'],
            hidden=data_dict.get('hidden', None))
        dataloaders = create_dataloaders(
            dataset,
            indexes={'train': train_idx, 'val': val_idx},
            batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_mse', patience=10,
                           min_delta=0.0005, mode='min')
        print("pl.Trainer...")
        pruner = PyTorchLightningPruningCallback(trial, monitor='val_mse')
        trainer = pl.Trainer(logger=False,
                             max_epochs=500,
                             gpus=0,
                             callbacks=[checkpoint_callback, logger, pruner, es],
                             precision=32)
        print("Trainer.fit...")
        trainer.fit(
            model,
            train_dataloader=dataloaders['train'],
            val_dataloaders=dataloaders['val'])
        # The most recent entry belongs to the fold that just finished.
        fold_losses.append(logger.metrics[-1]['val_loss'].item())
        fold_sizes.append(len(train_idx))
    return weighted_mean(fold_losses, fold_sizes)


def main(train_ae=False):
    """Run the ResNet hyper-parameter search on top of autoencoder features.

    Args:
        train_ae: when true, a new supervised autoencoder is trained with
            ``train_ae_model``; otherwise the saved one is loaded from
            ``./saved_models``. Defaults to ``False`` so the script entry
            point, which passes no arguments, can call ``main``.
    """
    seed_everything(0)
    data = load_data(root_dir='./data/', mode='train')
    data, target, features, era = preprocess_data(
        data, ordinal=True)
    data_dict = {'data': data, 'target': target,
                 'features': features, 'era': era}
    if train_ae:
        print("Train AE...")
        model = train_ae_model(data_dict=data_dict)
    else:
        print("Load AE Model...")
        p = joblib.load('./saved_models/parameters/ae_params.pkl')
        p['input_size'] = len(data_dict['features'])
        p['output_size'] = 1
        model = load_model('./saved_models/trained/trained_ae.pth',
                           p=p, pl_lightning=False, model=SupAE)

    # The autoencoder's hidden representation is handed to the search as an
    # additional input; 'hidden_true' tells ``optimize`` that it is present.
    data_dict['hidden'] = create_hidden_rep(model, data_dict)
    data_dict['hidden_true'] = True
    # NOTE(review): the Neptune API token is hard-coded in source control;
    # consider rotating it and loading it from the environment instead.
    api_token = 'eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIzMGQ1MWZiNy1iYjNlLTQ3NDctOTE4OS1lNzhlNmVlYmUwMzYifQ=='
    neptune.init(api_token=api_token,
                 project_qualified_name='kramerji/Numerai')
    nn_exp = neptune.create_experiment('Resnet_HPO')
    nn_neptune_callback = opt_utils.NeptuneCallback(experiment=nn_exp)
    study = optuna.create_study(direction='minimize')
    print("Optuna Study...")
    study.optimize(lambda trial: optimize(trial, data_dict=data_dict), n_trials=100,
                   callbacks=[nn_neptune_callback])
    joblib.dump(
        study, f'./hpo/params/nn_hpo_train_{train_ae}_{str(datetime.datetime.now().date())}.pkl')


if __name__ == '__main__':
    main(train_ae=False)


In [None]:
import datetime
import gc
import copy
import datatable as dt
import joblib
import lightgbm as lgb
import neptune
import neptunecontrib.monitoring.optuna as opt_utils
import numpy as np
import optuna
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from data_loading import utils
from data_loading import purged_group_time_series as pgs


def optimize(trial: optuna.trial.Trial, data_dict: dict):
    """Optuna objective: cross-validated MSE of an XGBoost regressor.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        data_dict: dict with ``'data'``, ``'target'`` and ``'era'`` entries;
            ``'era'`` provides the groups for the purged time-series split.

    Returns:
        The fold MSEs averaged by ``utils.weighted_mean``; each fold is
        weighted by its number of training plus validation rows.
    """
    # Sampled hyper-parameters first (the sampling order is kept), fixed
    # settings afterwards.
    p = {
        'learning_rate': trial.suggest_uniform('learning_rate', 1e-4, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'max_leaves': trial.suggest_int('max_leaves', 5, 50),
        'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 100),
        'lambda': trial.suggest_uniform('lambda', 0.05, 0.2),
        'alpha': trial.suggest_uniform('alpha', 0.05, 0.2),
    }
    p.update({
        'objective': 'reg:squarederror',
        'booster': 'gbtree',
        'tree_method': 'gpu_hist',
        'verbosity': 1,
        'n_jobs': 10,
        'eval_metric': 'rmse',
    })
    print('Choosing parameters:', p)

    scores, sizes = [], []
    splitter = pgs.PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=10)
    for tr_idx, val_idx in splitter.split(data_dict['data'], groups=data_dict['era']):
        y_val = data_dict['target'][val_idx]
        d_tr = xgb.DMatrix(data_dict['data'][tr_idx],
                           label=data_dict['target'][tr_idx])
        d_val = xgb.DMatrix(data_dict['data'][val_idx], label=y_val)
        booster = xgb.train(p, d_tr, 500, [(d_val, 'eval')],
                            early_stopping_rounds=50, verbose_eval=True)
        scores.append(mean_squared_error(y_val, booster.predict(d_val)))
        sizes.append(len(tr_idx) + len(val_idx))
        # Release the fold's objects before the next fold is materialised.
        del booster, d_tr, d_val, y_val
        gc.collect()
    print(scores)
    avg_score = utils.weighted_mean(scores, sizes)
    print('Avg Score:', avg_score)
    return avg_score


def loptimize(trial, data_dict: dict):
    """Optuna objective: cross-validated MSE of a LightGBM regressor.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        data_dict: dict with ``'data'``, ``'target'`` and ``'era'`` entries;
            ``'era'`` provides the groups for the purged time-series split.

    Returns:
        The fold MSEs averaged by ``utils.weighted_mean``; each fold is
        weighted by its number of training plus validation rows.
    """
    # Sampled hyper-parameters first (the sampling order is kept), fixed
    # settings afterwards.
    p = {
        'learning_rate': trial.suggest_uniform('learning_rate', 1e-4, 1e-1),
        'max_leaves': trial.suggest_int('max_leaves', 5, 100),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 0.99),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 0.99),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 1000),
        'lambda_l1': trial.suggest_uniform('lambda_l1', 0.005, 0.05),
        'lambda_l2': trial.suggest_uniform('lambda_l2', 0.005, 0.05),
        'boosting': trial.suggest_categorical('boosting', ['gbdt', 'goss', 'rf']),
    }
    p.update({
        'objective': 'regression',
        'verbose': 1,
        'n_jobs': 10,
        'metric': 'mse',
    })
    if p['boosting'] == 'goss':
        # Bagging is switched off whenever GOSS is selected.
        p.update({'bagging_freq': 0, 'bagging_fraction': 1.0})

    scores, sizes = [], []
    splitter = pgs.PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=10)
    for tr_idx, val_idx in splitter.split(data_dict['data'], groups=data_dict['era']):
        sizes.append(len(tr_idx) + len(val_idx))
        x_val = data_dict['data'][val_idx]
        y_val = data_dict['target'][val_idx]
        train = lgb.Dataset(data_dict['data'][tr_idx],
                            label=data_dict['target'][tr_idx])
        val = lgb.Dataset(x_val, label=y_val)
        booster = lgb.train(p, train, 500, valid_sets=[val],
                            early_stopping_rounds=50, verbose_eval=True)
        scores.append(mean_squared_error(y_val, booster.predict(x_val)))
        # Release the fold's objects before the next fold is materialised.
        del booster, train, val, x_val, y_val
        gc.collect()
    print(scores)
    avg_score = utils.weighted_mean(scores, sizes)
    print('Avg Score:', avg_score)
    return avg_score


def main():
    """Run the XGBoost and LightGBM Optuna studies and pickle both results.

    Each study runs 100 trials, reports to its own Neptune experiment and is
    dumped to ``hpo/params/<model>_hpo_<today>.pkl``.
    """
    # NOTE(review): the Neptune API token is hard-coded in source control.
    api_token = 'eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIzMGQ1MWZiNy1iYjNlLTQ3NDctOTE4OS1lNzhlNmVlYmUwMzYifQ=='
    neptune.init(api_token=api_token,
                 project_qualified_name='kramerji/Numerai')
    data = utils.load_data('data/', mode='train')
    data, target, features, era = utils.preprocess_data(data, nn=True)
    data_dict = {'data': data, 'target': target,
                 'features': features, 'era': era}

    def run_study(banner, experiment_name, objective, file_prefix):
        # One full search: announce, attach Neptune, optimise, persist.
        print(banner)
        experiment = neptune.create_experiment(experiment_name)
        neptune_callback = opt_utils.NeptuneCallback(experiment=experiment)
        study = optuna.create_study(direction='minimize')
        study.optimize(lambda trial: objective(trial, data_dict),
                       n_trials=100, callbacks=[neptune_callback])
        joblib.dump(
            study, f'hpo/params/{file_prefix}_hpo_{str(datetime.datetime.now().date())}.pkl')

    run_study('creating XGBoost Trials', 'XGBoost_HPO', optimize, 'xgb')
    run_study('Creating LightGBM Trials', 'LGBM_HPO', loptimize, 'lgb')


if __name__ == '__main__':
    main()


In [None]:
import datetime
import os
import copy
import joblib
import neptune
import neptunecontrib.monitoring.optuna as opt_utils
import optuna
import pytorch_lightning as pl
import torch.nn as nn
from optuna.integration import PyTorchLightningPruningCallback
from pytorch_lightning import Callback
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.utils.data import Subset, BatchSampler, SequentialSampler, DataLoader
import torch
import numpy as np
from models.SupervisedAutoEncoder import SupAE
from data_loading.purged_group_time_series import PurgedGroupTimeSeriesSplit
from data_loading.utils import load_data, preprocess_data, FinData, weighted_mean, seed_everything, calc_data_mean, \
    create_dataloaders


class MetricsCallback(Callback):
    """Keeps a history of ``trainer.callback_metrics``, one item per validation end.

    The history is exposed as the ``metrics`` list, oldest item first.
    """

    def __init__(self):
        super(MetricsCallback, self).__init__()
        self.metrics = list()

    def on_validation_end(self, trainer, pl_module):
        """Record the trainer's callback metrics (the object itself, not a copy)."""
        current = trainer.callback_metrics
        self.metrics += [current]


def create_param_dict(trial, trial_file=None):
    """Build the supervised-autoencoder parameter dict for one Optuna trial.

    Without ``trial_file`` the layer sizes, hidden size, activation, dropout,
    learning rate and reconstruction-loss factor are sampled from ``trial``.
    With ``trial_file`` the best parameters of the study pickled in that file
    are loaded instead, missing ``dim_5`` and ``label_smoothing`` entries are
    filled with defaults, and the stored activation name is mapped to its
    module class.

    Args:
        trial: Optuna trial; required.
        trial_file: optional path to a joblib-pickled Optuna study.

    Returns:
        The parameter dict; ``'activation'`` holds an ``nn.Module`` class.

    Raises:
        ValueError: if ``trial`` is missing.
    """
    if not trial:
        # Both modes need a trial; without this check ``p`` would be unbound.
        raise ValueError('create_param_dict requires an Optuna trial')
    # One mapping for both modes, so every name the search can pick
    # (including 'silu') can also be resolved when reloading a finished study.
    act_dict = {'relu': nn.ReLU, 'leaky_relu': nn.LeakyReLU,
                'gelu': nn.GELU, 'silu': nn.SiLU}
    if not trial_file:
        dim_1 = trial.suggest_int('dim_1', 500, 1000)
        dim_2 = trial.suggest_int('dim_2', 250, 500)
        dim_3 = trial.suggest_int('dim_3', 100, 250)
        hidden = trial.suggest_int('hidden', 50, 200)
        act_func = act_dict[trial.suggest_categorical(
            'activation', ['relu', 'leaky_relu', 'gelu', 'silu'])]
        dropout = trial.suggest_uniform('dropout', 0.1, 0.5)
        lr = trial.suggest_uniform('lr', 0.00005, 0.05)
        recon_loss_factor = trial.suggest_uniform('recon_loss_factor', 0.1, 1)
        return {'dim_1': dim_1, 'dim_2': dim_2, 'dim_3': dim_3, 'hidden': hidden,
                'activation': act_func, 'dropout': dropout,
                'lr': lr, 'recon_loss_factor': recon_loss_factor, 'loss_sup_ae': nn.MSELoss,
                'loss_recon': nn.MSELoss,
                'embedding': True}
    # NOTE: joblib.load unpickles the file; only use trusted study files.
    p = joblib.load(trial_file).best_params
    if not p.get('dim_5', None):
        p['dim_5'] = 75
    if not p.get('label_smoothing', None):
        p['label_smoothing'] = 0.094
    # The sampled value is not used (the loaded activation wins); the call is
    # kept so the trial still records an 'activation' parameter.
    trial.suggest_categorical('activation', ['leaky_relu', 'gelu'])
    p['activation'] = act_dict[p['activation']]
    return p


def optimize(trial: optuna.Trial, data_dict):
    """Optuna objective: cross-validated ``val_sup_loss`` of the autoencoder.

    One ``SupAE`` is trained per purged time-series fold; the last
    ``val_sup_loss`` recorded for each fold is averaged with
    ``weighted_mean``, weighted by the fold's training-set size.

    Args:
        trial: Optuna trial used for sampling and for pruning.
        data_dict: dict with ``'data'``, ``'target'`` and ``'era'`` entries.

    Returns:
        The weighted mean of the per-fold supervised validation losses.
    """
    splitter = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)
    input_size = data_dict['data'].shape[-1]
    checkpoint_path = os.path.join(
        'hpo/checkpoints/', "trial_ae_{}".format(trial.number))
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        checkpoint_path, monitor="val_sup_loss", mode='min')
    logger = MetricsCallback()
    fold_losses = []
    fold_sizes = []
    # Set to a pickled study path (e.g. 'HPO/nn_hpo_2021-01-05.pkl') to reuse
    # earlier best parameters instead of sampling.
    trial_file = None
    p = create_param_dict(trial, trial_file)
    p['batch_size'] = trial.suggest_int('batch_size', 500, 2000)
    p['input_size'] = input_size
    p['output_size'] = 1
    print(f'Running Trail with params: {p}')
    folds = splitter.split(data_dict['data'], groups=data_dict['era'])
    for train_idx, val_idx in folds:
        model = SupAE(params=p)
        dataset = FinData(
            data=data_dict['data'],
            target=data_dict['target'],
            era=data_dict['era'])
        dataloaders = create_dataloaders(
            dataset,
            indexes={'train': train_idx, 'val': val_idx},
            batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_loss', patience=10,
                           min_delta=0.0005, mode='min')
        pruner = PyTorchLightningPruningCallback(trial, monitor='val_sup_loss')
        trainer = pl.Trainer(logger=False,
                             max_epochs=100,
                             gpus=1,
                             callbacks=[checkpoint_callback, logger, pruner, es],
                             precision=16)
        trainer.fit(
            model,
            train_dataloader=dataloaders['train'],
            val_dataloaders=dataloaders['val'])
        # The most recent entry belongs to the fold that just finished.
        fold_losses.append(logger.metrics[-1]['val_sup_loss'].item())
        fold_sizes.append(len(train_idx))
    return weighted_mean(fold_losses, fold_sizes)


def main():
    """Run the SupAE hyperparameter search and persist the finished study.

    Loads and preprocesses the training data, opens a Neptune experiment for
    logging, runs 100 Optuna trials that minimise ``optimize`` and dumps the
    study to ``hpo/params/SupAEnn_hpo_<date>.pkl``.

    Raises:
        RuntimeError: If the ``NEPTUNE_API_TOKEN`` environment variable is
            not set.
    """
    # Secrets must not live in source control: the Neptune token is taken
    # from the environment.  Any token that was ever committed to this file
    # should be treated as leaked and revoked.
    api_token = os.getenv('NEPTUNE_API_TOKEN')
    if not api_token:
        raise RuntimeError(
            'NEPTUNE_API_TOKEN is not set; export it before running the '
            'hyperparameter search.')

    # Create the output folder before the search starts, so a missing or
    # unwritable directory is discovered now and not after 100 trials.
    params_dir = 'hpo/params'
    os.makedirs(params_dir, exist_ok=True)

    seed_everything(0)
    data = load_data(root_dir='./data/', mode='train')
    data, target, features, era = preprocess_data(
        data, ordinal=True)

    neptune.init(api_token=api_token,
                 project_qualified_name='kramerji/Numerai')
    nn_exp = neptune.create_experiment('SupAE_HPO')
    nn_neptune_callback = opt_utils.NeptuneCallback(experiment=nn_exp)

    study = optuna.create_study(direction='minimize')
    data_dict = {'data': data, 'target': target,
                 'features': features, 'era': era}
    study.optimize(lambda trial: optimize(trial, data_dict=data_dict), n_trials=100,
                   callbacks=[nn_neptune_callback])

    # The file name carries today's date (ISO format).
    joblib.dump(
        study, f'{params_dir}/SupAEnn_hpo_{datetime.datetime.now().date()}.pkl')


# Start the hyperparameter search only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()


In [None]:
from hpo import gbm_hpo, nn_hpo, ae_hpo
from dotenv import load_dotenv
import os
import numerapi


def credentials():
    """Read the Numerai settings from ``num_config.env``.

    The file is loaded with ``load_dotenv`` and the settings are then looked
    up in the process environment.

    Returns:
        Dict with the keys ``'PUBLIC_ID'``, ``'PRIVATE_KEY'`` and
        ``'LATEST_ROUND'``; a setting that is not defined maps to ``None``.
    """
    # Keys and the round number belong in the env file, never in this source.
    load_dotenv(dotenv_path='num_config.env')
    setting_names = ('PUBLIC_ID', 'PRIVATE_KEY', 'LATEST_ROUND')
    return {name: os.getenv(name) for name in setting_names}


def download_data(api: numerapi.NumerAPI, keys):
    """Download the current dataset unless the stored round is up to date.

    Args:
        api: Client used to query the current round and to download the
            dataset into ``./data``.
        keys: Mapping whose ``'LATEST_ROUND'`` entry holds the round that was
            downloaded last.  A missing, ``None``, empty or non-numeric entry
            is treated as "nothing downloaded yet".

    Returns:
        The current round number as reported by ``api``.
    """
    # Ask the API once so the value that is compared is also the value that
    # is returned.
    current_round = api.get_current_round()

    try:
        stored_round = int(keys['LATEST_ROUND'])
    except (KeyError, TypeError, ValueError):
        # First run or damaged config: fall through to a download instead of
        # crashing on int(None) / int('').
        stored_round = None

    if stored_round != current_round:
        api.download_current_dataset('./data')
    return current_round


def update_env_file(env_vars, env_path='num_config.env'):
    """Rewrite the dotenv file with the latest round and the API credentials.

    Args:
        env_vars: Mapping with the keys ``'LATEST_ROUND'``, ``'PUBLIC_ID'``
            and ``'PRIVATE_KEY'``.  Entries whose value is ``None`` are left
            out of the file instead of being stored as the text ``None``.
        env_path: File to overwrite; defaults to ``num_config.env``.

    Raises:
        KeyError: If one of the three keys is missing.  This is raised before
            the file is opened, so the existing file is left untouched.
    """
    setting_names = ('LATEST_ROUND', 'PUBLIC_ID', 'PRIVATE_KEY')
    # Render everything first: opening with mode 'w' truncates the file
    # immediately, so a lookup failure after that point would wipe the
    # stored credentials.
    lines = [
        f'{name}={env_vars[name]}\n'
        for name in setting_names
        if env_vars[name] is not None
    ]
    with open(env_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)

def create_preds():
    """Placeholder for prediction generation: does nothing and returns None."""
def main():
    """Refresh the local Numerai data, then run the neural-net search.

    Steps: read the stored settings, build an API client from the key pair,
    let ``download_data`` decide whether a download is needed, write the
    resulting round number back to the env file and finally call
    ``nn_hpo.main(train_ae=False)``.
    """
    env_vars = credentials()
    client = numerapi.NumerAPI(
        verbosity='INFO',
        public_id=env_vars['PUBLIC_ID'],
        secret_key=env_vars['PRIVATE_KEY'],
    )
    env_vars['LATEST_ROUND'] = download_data(client, env_vars)
    update_env_file(env_vars)

    # The other searches are currently switched off:
    # gbm_hpo.main()
    # ae_hpo.main()
    print("nn_hpo...")
    nn_hpo.main(train_ae=False)

# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()
