In [None]:
import pickle 

import random
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm import tqdm
import tensorflow as tf

import shutil
import os
os.chdir('..')

from performance_forecasting.data_creator import download_statement_data_for_exchanges

In [None]:
def get_predictions(regressor, input_data, targets, sess, batch_size, ops, dtype=np.float32):
    """Run ``ops`` over ``input_data``/``targets`` in consecutive mini-batches.

    Parameters
    ----------
    regressor : object exposing ``x_placeholder`` and ``y_placeholder``, used
        as the feed keys for each batch.
    input_data, targets : sliceable sequences of equal length (sliced with
        ``[start:stop]``); ``len(targets)`` drives the batching.
    sess : object with a ``run(fetches, feed_dict)`` method.
    batch_size : int, number of rows fed per ``sess.run`` call.
    ops : list of fetches evaluated on every batch.
    dtype : scalar type that marks a per-batch scalar result. Kept for
        backward compatibility; any zero-dimensional result is now treated as
        a scalar regardless of its exact type.

    Returns
    -------
    list with one entry per op, in the order of ``ops``:
      * scalar-valued ops -> Python list holding one scalar per batch,
      * array-valued ops  -> the batch results concatenated along axis 0,
      * no batches at all (empty ``targets``) -> an empty list.
    """
    per_op_batches = [[] for _ in ops]

    for start in range(0, len(targets), batch_size):
        stop = start + batch_size
        batch_values = sess.run(
            ops,
            {
                regressor.x_placeholder: input_data[start:stop],
                regressor.y_placeholder: targets[start:stop]
            }
        )
        for collected, value in zip(per_op_batches, batch_values):
            collected.append(value)

    merged = []
    for collected in per_op_batches:
        if not collected:
            # Empty input: there is nothing to inspect or concatenate.
            merged.append([])
            continue

        first = collected[0]
        # Zero-dimensional values cannot be concatenated, so scalars of any
        # float width are returned as a plain per-batch list.
        if type(first) == dtype or np.ndim(first) == 0:
            merged.append(collected)
        else:
            merged.append(np.concatenate(collected, axis=0))

    return merged
            

def plot_predictions(regressor, input_data, targets, sess, batch_size, ops, dtype=np.float32):
    """Evaluate the model and show one prediction-vs-label scatter per output.

    ``ops`` must contain exactly two fetches, in this order: a per-batch loss
    and the prediction tensor (the result of ``get_predictions`` is unpacked
    into ``losses, preds``).

    Parameters
    ----------
    regressor, input_data, targets, sess, batch_size, ops, dtype :
        forwarded unchanged to ``get_predictions``. ``targets`` and the
        predictions are indexed as 2-D arrays (``[:, k]``).

    Returns
    -------
    numpy.ndarray holding the per-batch loss values.
    """
    losses, preds = get_predictions(
        regressor, input_data, targets, sess, batch_size, ops, dtype
    )

    # One pred_k / label_k column pair per output, instead of assuming two.
    num_outputs = targets.shape[1]
    columns = {f'pred_{k}': preds[:, k] for k in range(num_outputs)}
    columns.update({f'label_{k}': targets[:, k] for k in range(num_outputs)})

    results = pd.DataFrame(columns).sort_values(by='label_0')

    for k in range(num_outputs):
        # Named `fig` so it is not mistaken for the matplotlib `plt` alias.
        fig = px.scatter(
            results,
            x=f'pred_{k}',
            y=f'label_{k}'
        )
        fig.show()

    return np.array(losses)

In [None]:
def save_model(sess, save_dir, regressor, save_info, scores):
    """Export the current session as a SavedModel and log its scores.

    Parameters
    ----------
    sess : session handed to ``simple_save``.
    save_dir : export directory; any previous content is deleted first.
    regressor : object whose ``save_params`` dict provides the ``'input'`` and
        ``'output'`` tensor mappings for the export.
    save_info : pandas.DataFrame with the history of saved models; it is not
        modified.
    scores : sequence ``[test, val, val_test]`` of loss values.

    Returns
    -------
    pandas.DataFrame: ``save_info`` with one extra row containing the columns
    ``'test'``, ``'val'`` and ``'val_test'``.
    """
    # Clear the previous export so the exporter gets a fresh directory
    # (presumably it rejects a populated one -- confirm against TF docs).
    # ignore_errors lets the first save work even if the directory is missing.
    shutil.rmtree(save_dir, ignore_errors=True)

    tf.compat.v1.saved_model.simple_save(
        sess, save_dir,
        inputs=regressor.save_params['input'],
        outputs=regressor.save_params['output']
    )

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    new_row = pd.DataFrame([{
        'test': scores[0],
        'val': scores[1],
        'val_test': scores[2]
    }])
    return pd.concat([save_info, new_row], ignore_index=True)
           
    
# layer_sizes, activations, dropouts, use_batch_bools,
def train_model_with_settings(
    regressor,
    params,
    iterations,
    plot=False,
    savedir='performance_forecasting/models/',
    print_losses=True,
    save_options=None
):
    """Train ``regressor`` on random mini-batches and track test/val losses.

    A new numbered run directory is created under ``savedir`` (one above the
    highest existing integer-named entry). ``params`` is pickled there as
    ``model.pkl`` and a ``tf/`` sub-directory receives SavedModel exports and
    ``best_model_info.csv``.

    Every 500 iterations the absolute loss is evaluated on the test and
    validation sets. When saving is enabled the model is exported each time
    the metric chosen by ``save_options['save_by']`` beats every earlier
    evaluation.

    Parameters
    ----------
    regressor : model object providing ``X_train``/``y_train``,
        ``X_test``/``y_test``, ``X_val``/``y_val``, the placeholders, and the
        ``loss``, ``abs_loss``, ``optim_op`` and ``y_pred`` graph nodes.
    params : dict of hyper-parameters; must contain ``'activations'``. It is
        pickled but no longer modified.
    iterations : int, number of optimisation steps.
    plot : bool, show prediction scatter plots every 10000 iterations.
    savedir : str, root directory for numbered runs (with trailing slash).
    print_losses : bool, print the losses at every evaluation.
    save_options : dict with keys ``'save'`` (bool) and ``'save_by'``
        (``'test'``, ``'val'`` or ``'both'``). ``None`` disables saving.

    Returns
    -------
    tuple ``(train_loss, test_loss, val_loss, val_test_loss)`` taken at the
    evaluation with the lowest mean test loss.

    Notes
    -----
    ``batch_size`` is not a parameter; it is read from the enclosing module
    scope and must be defined before this function is called.
    """
    if save_options is None:
        # The old default of None crashed on save_options['save'].
        save_options = {'save': False, 'save_by': 'test'}

    save_info = pd.DataFrame(columns=['best_val', 'best_test'], data=np.zeros([1, 2]))

    os.makedirs(savedir, exist_ok=True)

    runs = []
    for path in os.listdir(savedir):
        try:
            runs.append(int(path))
        except ValueError:
            # Entries that are not plain integers are not run directories.
            pass

    # default=0 makes the very first run land in "<savedir>1/".
    save_dir = savedir + str(max(runs, default=0) + 1) + "/"
    os.makedirs(save_dir)

    # Stringify the activations on a copy: the caller keeps using `params`
    # after training and expects its original values.
    params_to_save = dict(params)
    params_to_save['activations'] = str(params['activations'])

    filename = save_dir + "model"
    with open(filename + ".pkl", 'wb') as params_file:
        pickle.dump(params_to_save, params_file)

    save_dir = save_dir + 'tf/'
    os.makedirs(save_dir)

    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())

        epoch_loss = 0
        steps_since_eval = 0
        mean_test_losses = []
        mean_epoch_losses = []
        mean_val_losses = []
        min_val_test_losses = []

        for i in tqdm(range(iterations)):
            indexes = np.random.randint(0, high=len(regressor.y_train), size=batch_size)

            loss_val, _ = sess.run(
                [regressor.loss, regressor.optim_op],
                {
                    regressor.x_placeholder: regressor.X_train[indexes],
                    regressor.y_placeholder: regressor.y_train[indexes]
                }
            )

            epoch_loss += loss_val
            steps_since_eval += 1

            if i % 500 != 0:
                continue

            # Average over the steps actually accumulated; the evaluation at
            # i == 0 has seen one step, not 500.
            train_loss = epoch_loss / steps_since_eval
            mean_epoch_losses.append(train_loss)
            epoch_loss = 0
            steps_since_eval = 0

            if plot and i % 10000 == 0:
                losses = plot_predictions(
                    regressor, regressor.X_test, regressor.y_test,
                    sess, batch_size,
                    [regressor.abs_loss, regressor.y_pred],
                    dtype=np.float32
                )
                val_losses = plot_predictions(
                    regressor, regressor.X_val, regressor.y_val,
                    sess, batch_size,
                    [regressor.abs_loss, regressor.y_pred],
                    dtype=np.float32
                )
            else:
                losses = np.array(get_predictions(
                    regressor, regressor.X_test, regressor.y_test,
                    sess, batch_size,
                    [regressor.abs_loss],
                    dtype=np.float32
                )[0])
                val_losses = np.array(get_predictions(
                    regressor, regressor.X_val, regressor.y_val,
                    sess, batch_size,
                    [regressor.abs_loss],
                    dtype=np.float32
                )[0])

            val_loss, test_loss = np.mean(val_losses), np.mean(losses)

            # Mean over the per-batch losses of both held-out sets together.
            val_test_loss = np.mean(
                np.concatenate([val_losses, losses], axis=0)
            )

            if print_losses:
                print(f"Losses: train={train_loss}, test={test_loss}, val={val_loss}, val_test={val_test_loss}")

            # The first evaluation only seeds the history and never saves.
            if len(mean_test_losses) > 0 and save_options['save']:
                candidates = {
                    'test': ('test', test_loss, mean_test_losses),
                    'val': ('val', val_loss, mean_val_losses),
                    'both': ('val_test', val_test_loss, min_val_test_losses),
                }
                chosen = candidates.get(save_options['save_by'])

                if chosen is not None:
                    label, current, history = chosen
                    if current < np.amin(history):
                        print(f'New best {label} loss ({current}) saving model..')
                        save_info = save_model(
                            sess, save_dir, regressor, save_info,
                            [test_loss, val_loss, val_test_loss]
                        )

            mean_val_losses.append(val_loss)
            mean_test_losses.append(test_loss)
            min_val_test_losses.append(val_test_loss)

        save_info.to_csv(save_dir+'best_model_info.csv')

        # Report every metric at the evaluation where the test loss was lowest.
        test_min_indx = np.argmin(mean_test_losses)
        return (
            mean_epoch_losses[test_min_indx],
            mean_test_losses[test_min_indx],
            mean_val_losses[test_min_indx],
            min_val_test_losses[test_min_indx]
        )

In [None]:
def absolute_diff(t0, t1):
    """Return the mean of the element-wise absolute difference ``|t0 - t1|``."""
    elementwise_gap = tf.compat.v1.abs(t0 - t1)
    return tf.compat.v1.reduce_mean(elementwise_gap)

def linear(t):
    """Identity activation: hand the input back unchanged."""
    return t

class TF_vOne_Regressor():
    """Bias-free feed-forward regressor built on the TF v1 graph API.

    Construction loads the train/test/val CSV files selected by
    ``params['dataset_affix']`` and builds the placeholders, weights, hidden
    layers, prediction, losses and optimiser op in the current default graph.
    """

    def __init__(
        self,
        params,
        dtype,
        seed
    ):
        """Load the data sets and build the graph.

        Parameters
        ----------
        params : dict with the keys ``'dataset_affix'``, ``'layer_sizes'``,
            ``'activations'``, ``'dropouts'``, ``'use_batch_bools'``,
            ``'optimizer'``, ``'learning_rate'`` and ``'loss_function'``.
            ``'in_shape'`` and ``'out_shape'`` are written into it from the
            training data.
        dtype : dtype used for the placeholders and the weight variables.
        seed : seed handed to the Glorot-normal weight initializer.
        """
        dataset_names = ['train', 'test', 'val']
        dataset = {}
        dataset_affix = params['dataset_affix']

        for key in dataset_names:
            # Both files are read with their first column as index and have
            # the 'symbol' column removed before conversion to arrays.
            dataset['x_'+key] = pd.read_csv(
                f'data/performance_forecasting/{key}_input{dataset_affix}.csv',
                index_col=0
            ).drop('symbol', axis=1).values
            dataset['y_'+key] = pd.read_csv(
                f'data/performance_forecasting/{key}_output{dataset_affix}.csv',
                index_col=0
            ).drop('symbol', axis=1).values

        self.X_train, self.y_train = dataset['x_train'], dataset['y_train']
        self.X_test, self.y_test = dataset['x_test'], dataset['y_test']
        self.X_val, self.y_val = dataset['x_val'], dataset['y_val']

        # Layer widths at both ends come from the data, not from the caller.
        params['in_shape'] = dataset['x_train'].shape[1]
        params['out_shape'] = dataset['y_train'].shape[1]

        self.x_placeholder = tf.compat.v1.placeholder(
            dtype=dtype, shape=(None, params['in_shape']), name='x_placeholder_saved'
        )
        self.y_placeholder = tf.compat.v1.placeholder(
            dtype=dtype, shape=(None, params['out_shape']), name='y_placeholder_saved'
        )
        self.initializer = tf.keras.initializers.GlorotNormal(seed=seed)

        self.layer_weights = []
        self.build_weights(params, dtype)

        self.layer = self.x_placeholder
        self.build_layers(params)

        # The last weight matrix maps the final hidden layer to the outputs;
        # no activation is applied to the prediction.
        self.y_pred = tf.identity(tf.linalg.matmul(
            self.layer, self.layer_weights[-1]
        ), name="prediction")

        optim = params['optimizer'](
            learning_rate=params['learning_rate']
        )

        self.loss = params['loss_function'](self.y_placeholder, self.y_pred)
        self.abs_loss = tf.compat.v1.losses.absolute_difference(self.y_placeholder, self.y_pred)
        self.optim_op = optim.minimize(self.loss)

        self.dataset = dataset

        # Tensor mappings used when the model is exported.
        self.save_params = {
            "input": {
                "x_placeholder_saved": self.x_placeholder,
                "y_placeholder_saved": self.y_placeholder
            },
            "output": {
                "prediction": self.y_pred
            }
        }

    def build_weights(self, params, dtype):
        """Append one weight matrix per hidden layer plus one for the output.

        Matrix ``i`` has shape ``[previous_width, width_i]``, starting from
        ``params['in_shape']`` and ending at ``params['out_shape']``; the
        variables are named ``l1``, ``l2``, ...
        """
        last_out_size = params['in_shape']

        for i, size in enumerate(list(params['layer_sizes'])+[params['out_shape']]):
            self.layer_weights.append(tf.Variable(
                self.initializer([last_out_size, size], dtype=dtype),
                name='l'+str(i+1),
                dtype=dtype
            ))
            last_out_size = size

    def build_layers(self, params):
        """Stack the hidden layers onto ``self.layer``.

        For each entry of ``params['activations']`` the current layer is
        multiplied by the matching weight matrix, optionally batch-normalised
        (``params['use_batch_bools'][i]``), passed through the activation and
        then through dropout with rate ``params['dropouts'][i]``.
        """
        for i, activation in enumerate(params['activations']):
            self.layer = tf.linalg.matmul(self.layer, self.layer_weights[i])

            if params['use_batch_bools'][i]:
                self.layer = tf.compat.v1.layers.batch_normalization(self.layer)

            self.layer = activation(self.layer)
            # The rate must come from 'dropouts'; the activation callable was
            # passed here before.
            # NOTE(review): neither dropout nor batch_normalization is given a
            # `training` argument, so both use the library default -- confirm
            # that inference-mode behaviour during training is intended.
            self.layer = tf.compat.v1.layers.dropout(self.layer, rate=params['dropouts'][i])
            print(self.layer.shape)

In [None]:
# TF static tensor graph build (tf v1)
tf.compat.v1.disable_eager_execution()

# ---- configuration ---------------------------------------------------------
neural_width_scaling = 55 # default was 60
batch_size = 50           # also read by train_model_with_settings
seed = 1221               # seed for the weight initializer
dtype = tf.float32


# Candidate pools sampled by the random hyper-parameter search.
optim_functions = [
    tf.compat.v1.train.RMSPropOptimizer,
    tf.compat.v1.train.AdamOptimizer,
    tf.compat.v1.train.AdagradOptimizer
]

loss_functions = [
    tf.compat.v1.losses.absolute_difference,
    tf.compat.v1.losses.mean_squared_error,
    tf.compat.v1.losses.huber_loss
]

activation_functions = [
    tf.nn.relu6,
    tf.nn.relu,
    tf.nn.gelu,
    tf.nn.tanh,
    tf.nn.sigmoid,
    linear
]

learning_rates = [0.1, 0.05, 0.01, 0.001, 0.0005]

dataset_affix = "bn_NO10EMA_UP21bn"
runs_filename = f"performance_forecasting/runs/runs_{dataset_affix}.csv"

num_searches = 10000
search_run = False        # True: random search, False: train one preset model
iterations = 30000 if search_run else 200000
run_model = 1             # preset to train when search_run is False


# Resume the search log if a previous notebook run already wrote one.
if os.path.isfile(runs_filename):
    runs_df = pd.read_csv(runs_filename, index_col=0)
else:
    runs_df = pd.DataFrame()


if search_run:
    runs_dir = os.path.dirname(runs_filename)
    if runs_dir:
        os.makedirs(runs_dir, exist_ok=True)

    # Per-layer settings that are flattened to a comma-separated string in the log.
    list_valued_keys = ('layer_sizes', 'activations', 'dropouts', 'use_batch_bools')

    for _ in range(num_searches):
        # Every regressor adds its nodes to the default graph; start from an
        # empty graph so earlier candidates do not accumulate.
        tf.compat.v1.reset_default_graph()

        num_layers = random.randint(0, 7) # 8)
        params = {
            'layer_sizes': np.clip(np.clip(
                    np.random.f(10, 50, size=num_layers), 0, 5
                )*neural_width_scaling, 3, 300).astype(int),
            'activations': np.random.choice(activation_functions, size=num_layers, replace=True),
            'dropouts': np.random.uniform(0, 0.9, num_layers),
            'use_batch_bools': np.random.choice([True, False], size=num_layers, replace=True),
            'loss_function': random.sample(loss_functions, 1)[0],
            'optimizer': random.sample(optim_functions, 1)[0],
            'learning_rate': random.sample(learning_rates, 1)[0],
            'dataset_affix': '_'+dataset_affix
        }

        regressor = TF_vOne_Regressor(
            params,
            dtype=dtype,
            seed=seed
        )

        train_loss, test_loss, val_loss, val_test_loss = train_model_with_settings(
            regressor, params, iterations,
            plot=False, print_losses=False,
            save_options={
                'save_by': 'both', 'save': False
            }
        )

        print(
            f"Train Loss = {train_loss}, Test loss = {test_loss},"
            f"Val loss = {val_loss}, Val Test Loss {val_test_loss}"
        )

        # Build the log record without touching `params`. A value may already
        # be a string (e.g. stringified activations), so only join sequences.
        record = {}
        for key, val in params.items():
            if key in list_valued_keys and not isinstance(val, str):
                record[key] = ', '.join(np.asarray(val).astype(str))
            else:
                record[key] = val

        record.update({
            'train_loss': train_loss, 'test_loss': test_loss,
            'val_loss': val_loss, 'val_test_loss': val_test_loss
        })

        # DataFrame.append was removed in pandas 2.0.
        runs_df = pd.concat([runs_df, pd.DataFrame([record])], ignore_index=True)
        # Written after every candidate so an interrupted search keeps its log.
        runs_df.sort_values(by='test_loss').to_csv(runs_filename)

else:
    # run_model -> (hyper-parameters, metric that triggers a save, plot flag)
    model_presets = {
        # best (didn't search that long), billion, no 10 EMA
        0: (
            {
                'layer_sizes': [55, 37],
                'activations': [tf.nn.gelu, tf.nn.relu],
                'dropouts': [
                    0.807198580715569, 0.26161063031293397
                ],
                'use_batch_bools': [False, True],
                'loss_function': tf.compat.v1.losses.absolute_difference,
                'optimizer': tf.compat.v1.train.RMSPropOptimizer,
                'learning_rate': 0.001,
                'dataset_affix': '_bn_NO10EMA_UP21bn'
            },
            'test',
            False
        ),
        # 2nd best, log (could do a better search)
        1: (
            {
                'layer_sizes': [98, 47, 8, 125, 28],
                'activations': [tf.nn.sigmoid, tf.nn.gelu, linear, linear, tf.nn.tanh],
                'dropouts': [
                    0.1995318542098317, 0.8236512364029288, 0.6098110059963907,
                    0.6208227090084751, 0.8886921709358674
                ],
                'use_batch_bools': [False, True, True, True, True],
                'loss_function': tf.compat.v1.losses.absolute_difference,
                'optimizer': tf.compat.v1.train.RMSPropOptimizer,
                'learning_rate': 0.001,
                'dataset_affix': '_sqrt_NO10EMA_UP21bn'
            },
            'val',
            True
        ),
    }

    if run_model not in model_presets:
        raise ValueError(
            f"run_model must be one of {sorted(model_presets)}, got {run_model!r}"
        )

    params, save_by, plot_during_training = model_presets[run_model]

    # Re-running this cell must not stack a second model onto the old graph.
    tf.compat.v1.reset_default_graph()

    regressor = TF_vOne_Regressor(
        params,
        dtype=dtype,
        seed=seed
    )

    # The training function returns four losses; unpacking three raised a
    # ValueError only after the whole training run had finished.
    train_loss, test_loss, val_loss, val_test_loss = train_model_with_settings(
        regressor, params, iterations, plot=plot_during_training, save_options={
            'save_by': save_by, 'save': True
        }
    )

    print(f"Train Loss = {train_loss}, Test loss = {test_loss}, Val loss = {val_loss}")