In [None]:
import os

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

import plotly.express as px

In [None]:
def one_hot_encoding(df, columns):
    """Append integer one-hot columns for ``columns`` to ``df``.

    Args:
        df: Source DataFrame.
        columns: List of column names to encode together.

    Returns:
        A new DataFrame made of ``df`` followed by the encoded columns, named
        ``encoded_0``, ``encoded_1``, ... in the encoder's output order.
    """
    # Fit and transform on the same plain array so sklearn does not warn about
    # feature names being present at transform time but absent at fit time.
    raw_values = df[columns].values
    encodings = OneHotEncoder(drop=None).fit_transform(raw_values).toarray()

    # Reuse df's index: with a fresh RangeIndex, concat(axis=1) would align on
    # labels and produce NaN-padded rows whenever df is not 0..n-1 indexed.
    encodings = pd.DataFrame(
        data=encodings,
        columns=['encoded_'+str(n) for n in range(encodings.shape[1])],
        index=df.index
    ).astype(np.int32)

    return pd.concat(
        [df, encodings],
        axis=1
    )


def add_scaled_vars(df, scale_columns, write_to_new_df=True):
    """Min-max scale columns of ``df`` into ``<column>_scaled`` columns.

    Args:
        df: Source DataFrame.
        scale_columns: The string ``'all'`` (every column of ``df``), a single
            column name, or any iterable of column names (list, Index, array).
        write_to_new_df: When True the scaled columns go into a fresh
            DataFrame; otherwise they are added to ``df`` itself.

    Returns:
        ``(write_df, min_maxes)`` where ``min_maxes`` has one column per scaled
        column and the rows ``'max'`` and ``'min'``.
    """
    # Compare against 'all' only for real strings: `Index == 'all'` is an
    # element-wise comparison whose truth value is ambiguous.
    if isinstance(scale_columns, str):
        scale_columns = df.columns if scale_columns == 'all' else [scale_columns]
    scale_columns = list(scale_columns)

    min_maxes = []
    write_df = pd.DataFrame() if write_to_new_df else df

    for column in scale_columns:
        max_val, min_val = df[column].max(), df[column].min()

        # A constant column has zero range; divide by 1 so it scales to 0
        # instead of becoming NaN (0 / 0).
        value_range = max_val - min_val
        if value_range == 0:
            value_range = 1

        write_df[column+'_scaled'] = (df[column] - min_val) / value_range

        min_maxes.append([max_val, min_val])

    # reshape(-1, 2) keeps the (2, n) layout valid even when nothing was scaled.
    return write_df, pd.DataFrame(
        data=np.array(min_maxes).reshape(-1, 2).T,
        columns=scale_columns,
        index=['max', 'min']
    )
    
    
def plot_data(df, columns, ticker):
    """Draw a plotly line chart of ``<ticker>_<column>`` series over ``timestamp``.

    Args:
        df: DataFrame holding a ``timestamp`` column and the prefixed columns.
        columns: Column name suffixes; each is prefixed with ``ticker + '_'``.
        ticker: Prefix shared by the columns to plot.
    """
    prefix = ticker + '_'
    prefixed_columns = [prefix + name for name in columns]

    fig = px.line(df, x='timestamp', y=prefixed_columns)
    fig.show()

In [None]:
def get_test_train(df, columns):
    """Split ``df`` on its ``is_test`` flag, keeping only ``columns``.

    Despite the name, the return order is ``(train, test)``: first the rows
    where ``is_test == 0``, then the rows where ``is_test == 1``.
    """
    is_test = df['is_test']
    train_part = df.loc[is_test == 0, columns]
    test_part = df.loc[is_test == 1, columns]
    return train_part, test_part

In [None]:
# Directory whose CSV files are listed and read by the data-loading cell below.
dir_path = "../data/streams/XRPEUR/numeric/preprocessed/"

In [None]:
use_smooth_data = True # test preprocessed, smooth preprocessed, unprocessed & maybe smooth unprocessed

# os.listdir returns entries in arbitrary order; sort so that the row order of
# `data` and `targets` is the same on every run.
# NOTE(review): rows of `data` and `targets` are paired purely by concatenation
# order, which assumes the sorted data files and sorted target files line up
# one-to-one — confirm against the file naming scheme.
paths = sorted(os.listdir(dir_path))

if not use_smooth_data:
    paths = [path for path in paths if 'smooth' not in path]

else:
    paths = [path for path in paths if 'smooth' in path or 'target' in path]


# Collect the frames first and concatenate once: DataFrame.append was removed
# in pandas 2.0 and growing a frame in a loop is quadratic.
data_frames = []
target_frames = []

for path in paths:
    df = pd.read_csv(os.path.join(dir_path, path), index_col=0)

    if 'data' in path:
        data_frames.append(df)

    elif 'targets' in path:
        target_frames.append(df)

data = pd.concat(data_frames, ignore_index=True) if data_frames else pd.DataFrame()
targets = pd.concat(target_frames, ignore_index=True) if target_frames else pd.DataFrame()

if len(data) != len(targets):
    raise ValueError(
        f"data has {len(data)} rows but targets has {len(targets)}; "
        "they must match row-for-row"
    )

# remove data with nan targets
non_nan_mask = ~np.isnan(targets['label'].values)
targets = targets[non_nan_mask].reset_index(drop=True)
data = data[non_nan_mask].reset_index(drop=True)

In [None]:
# Min-max scale every column of `data` into a fresh frame; `scaled_min_maxes`
# keeps the per-column max/min that were used for the scaling.
scaled_data, scaled_min_maxes = add_scaled_vars(
    data, 'all', write_to_new_df=True
)
# NOTE(review): `is_test` is stored alongside the scaled features, so any later
# selection of all `scaled_data` columns also picks up this split flag —
# confirm it is meant to be a model input.
scaled_data['is_test'] = targets['is_test']

# encode label: appends integer columns encoded_0, encoded_1, ... to `targets`
targets = one_hot_encoding(targets, ['label'])

In [None]:
# OVERSAMPLING:
# 0: 10076  rows , 1: 4345  rows, 2: 2149  rows, 3: 133 rows
# NOTE(review): the counts above list four classes while `oversamps` has six
# entries; a missing `encoded_<i>` column raises KeyError below — confirm the
# list matches the number of encoded columns.
encoding_cols = targets.columns[targets.columns.str.contains('encoded')]

masks = targets[encoding_cols].astype(bool)
# oversamps[i] = how many times the rows of class `encoded_<i>` are repeated.
oversamps = [2, 1, 1, 5, 35, 3]

# Gather the repeated slices and concatenate once: DataFrame.append was removed
# in pandas 2.0 and appending in a loop copies the whole frame each time.
oversamp_target_parts = []
oversamp_data_parts = []

for i, oversamp in enumerate(oversamps):
    mask = masks['encoded_'+str(i)].values
    print(f"Class {i} num rows = {len(targets.iloc[mask])}")

    oversamp_target_parts.extend([targets[mask]] * oversamp)
    oversamp_data_parts.extend([scaled_data[mask]] * oversamp)

oversamp_targets = (
    pd.concat(oversamp_target_parts, ignore_index=True)
    if oversamp_target_parts else pd.DataFrame()
)
oversamp_data = (
    pd.concat(oversamp_data_parts, ignore_index=True)
    if oversamp_data_parts else pd.DataFrame()
)


print('After..')
masks = oversamp_targets[encoding_cols].astype(bool)

for i in range(len(encoding_cols)):
    mask = masks['encoded_'+str(i)].values
    print(f"Class {i} num rows = {len(oversamp_targets.iloc[mask])}")

In [None]:
#y_train, y_test = get_test_train(oversamp_targets, ['label'])

# Split the oversampled features on `is_test`; every column of `oversamp_data`
# is kept, so the `is_test` column itself is part of X_train / X_test.
X_train, X_test = get_test_train(
    oversamp_data,
    oversamp_data.columns
)

In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

"""
   Having errors trying to save model from notebook, rerun modeling for now.
   Errors wont be present when experiment is converted to python codebase (out of notebook)
"""

rerun_modeling = True
rerun_grid_search = False

model_file_name = "catboost" 

best_params = {
    'depth': 15, 'l2_leaf_reg': 35, 'learning_rate': 0.01
}

# The label split is needed by both fits below. It is computed here because the
# cell that assigns y_train sits further down the notebook, so on a fresh
# kernel (Restart & Run All) this cell would otherwise hit a NameError.
y_train, y_test = get_test_train(oversamp_targets, ['label'])


# find the best params for CatBoost 
if rerun_grid_search:
    parameters = {
        'depth': [3, 6, 12, 15],
        'learning_rate': [0.001, 0.01, 0.1],
        'l2_leaf_reg': [2, 15, 25, 35]
    }

    # NOTE: despite the variable name this is a classifier (MultiClass loss).
    catboost_regressor = CatBoostClassifier(loss_function='MultiClass', iterations=3000)

    Grid_CBC = HalvingGridSearchCV(
        estimator=catboost_regressor,
        param_grid=parameters,
        cv=5, n_jobs=-1
    )

    search = Grid_CBC.fit(X_train, y_train)
    
    best_params = search.best_params_
    catboost_regressor = search.best_estimator_
    
    #catboost_regressor.save_model(model_file_name)
    
    
# Train CatBoost if best params are known
if rerun_modeling:
    catboost_regressor = CatBoostClassifier(
        loss_function='MultiClass',
        iterations=3000,
        **best_params
    )
    catboost_regressor.fit(X_train, y_train)
    #catboost_regressor.save_model(model_file_name)
    

# else:
#     catboost_regressor = CatBoostClassifier().load_model(model_file_name)


In [None]:
# Alternative kept for reference: use the one-hot `encoded_*` columns as targets.
# y_train, y_test = get_test_train(
#     oversamp_targets,
#     list(oversamp_targets.columns[oversamp_targets.columns.str.contains('encoded')])
# )

# Targets are the single raw `label` column, split on `is_test`.
y_train, y_test = get_test_train(oversamp_targets, ['label'])

In [None]:
from torch.utils.data import Dataset

class salesDataset(Dataset):
    """Map-style dataset that pairs ``x[i]`` with ``y[i]``."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # Only the features determine the length; `y` is not consulted.
        return len(self.x)

    def __getitem__(self, item):
        features = self.x[item]
        target = self.y[item]
        return features, target
    
    
class compDataset(Dataset):
    """Dataset of random sample pairs for a comparison (contrastive-style) loss.

    Indexing returns ``(x[item], x[p], differs, y[item])`` where ``p`` is a
    uniformly drawn index (it may equal ``item``) and ``differs`` is 1 when the
    two labels are different and 0 otherwise.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, item):
        p = np.random.randint(0, self.__len__(), 1)[0]

        # Labels may be scalars or arrays (e.g. one row of a 2-D label array).
        # Reduce the element-wise comparison with np.any instead of calling
        # int() on an array, which is deprecated for ndim > 0 and fails
        # outright for labels with more than one element.
        differs = int(np.any(np.asarray(self.y[item]) != np.asarray(self.y[p])))

        return self.x[item], self.x[p], differs, self.y[item]

In [1]:
# Scratch check: shows the string form of a boolean.
str(True)

'True'

In [None]:
import torch
import torch.nn as nn

import torch.nn.functional as F


class NN(nn.Module):
    """Feed-forward network stored as one flat ``nn.Sequential``.

    ``layers_sizes`` is a mapping whose keys are ignored; each value is an
    ``(in_size, out_size, use_bn, activation, dropout)`` sequence describing
    one stage: a Linear layer, then optionally BatchNorm1d, an activation
    (a module class, instantiated with no arguments) and Dropout.
    """

    def __init__(self, layers_sizes):
        super().__init__()

        modules = []
        for in_size, out_size, use_bn, activation, dropout in layers_sizes.values():
            modules.append(nn.Linear(in_size, out_size))

            if use_bn:
                modules.append(nn.BatchNorm1d(out_size))

            if activation is not None:
                modules.append(activation())

            # None and 0 both mean "no dropout".
            if dropout and dropout > 0:
                modules.append(nn.Dropout(p=dropout))

        self.layers = nn.Sequential(*modules)

    def forward(self, batch):
        """Run ``batch`` through every stage in order."""
        return self.layers(batch)
    
    
class NN_split(nn.Module):
    """Feed-forward network stored as a ``ModuleList`` of per-stage Sequentials.

    ``layers_sizes`` is a mapping whose keys are ignored; each value is an
    ``(in_size, out_size, use_bn, activation, dropout)`` sequence. Every stage
    becomes its own ``nn.Sequential`` (Linear, then optional BatchNorm1d,
    activation and Dropout), so parameters are addressed as
    ``layers.<stage>.<index>.<name>``.
    """

    def __init__(self, layers_sizes):
        super().__init__()

        self.layers = nn.ModuleList()

        for spec in layers_sizes.values():
            in_size, out_size, use_bn, activation, dropout = spec

            stage = [nn.Linear(in_size, out_size)]

            if use_bn:
                stage.append(nn.BatchNorm1d(out_size))

            if activation is not None:
                stage.append(activation())

            # None and 0 both mean "no dropout".
            if dropout and dropout > 0:
                stage.append(nn.Dropout(p=dropout))

            self.layers.append(nn.Sequential(*stage))

    def forward(self, batch):
        """Feed ``batch`` through the stages one after another."""
        out = batch
        for stage in self.layers:
            out = stage(out)
        return out

In [15]:
# Scratch cell: tries the pair loss on hand-written numbers using numpy.
import numpy as np 

# Per-pair target: 0 or 1 for each of the four (a, b) row pairs below.
l = np.array([0, 0, 1, 1])

# Four 3-dimensional vectors for the first element of each pair ...
a = np.array([
    [0.1, 0.1, 0.1],
    [0.5, 0.5, 0.5],
    [0.9, 0.9, 0.9],
    [0.89, 0.87, 0.88]
])

# ... and for the second element of each pair.
b = np.array([
    [0, 0, 0],
    [0.51, 0.52, 0.49],
    [0.89, 0.87, 0.88],
    [0.02, 0, 0.05]
])

# One scalar per row, used only in the experimental expressions further down.
a_d = np.array([-2, -0.89, 1, 1.07])
b_d = np.array([-1, -0.94, -1.1, 0.9])

# |target - mean absolute difference of the two vectors|, one value per pair.
# This is the same expression CompTrainer.compute_loss evaluates with torch.
l0 = np.abs(l - np.mean(np.abs(a-b), axis=1))
print(l0)

#vec_diff = np.mean(np.abs(a-b), axis=1)
#print(vec_diff, np.abs(a_d-b_d))
#l1 = (1 - l)*np.abs(vec_diff - np.abs(a_d-b_d)) #+ l*np.maximum((np.abs(a_d)+np.abs(b_d))-vec_diff)
#print(l1)

# Experimental variant: compares the scalar difference with the vector distance.
np.abs((a_d - b_d) - np.mean(np.abs(a-b), axis=1))

[0.1        0.01333333 0.98       0.14333333]


array([1.1       , 0.03666667, 2.08      , 0.68666667])

In [None]:
from  torch.nn import functional as f

class Trainer:
    """Training loop for a model fed by a loader that yields ``(x, y)`` batches.

    Args:
        model: The ``nn.Module`` to train.
        optimiser: Optimiser built over ``model``'s parameters.
        loss_function: Callable ``(prediction, target) -> scalar tensor``.
        device: Device that batches are moved to.
        batch_size: Stored for reference; not used by the loop itself.
        epochs: Number of passes over the training loader.
    """

    def __init__(
        self, model, optimiser, loss_function,
        device, batch_size, epochs
    ):
        self.model = model
        self.optimiser = optimiser
        self.loss_function = loss_function
        self.device = device
        self.batch_size = batch_size
        self.epochs = epochs

    def validate(self, val_loader):
        """Return the mean per-batch loss over ``val_loader``.

        The model is switched to eval mode for the pass and is always put back
        in training mode afterwards, even when a batch raises.

        Raises:
            ValueError: if ``val_loader`` yields no batches.
        """
        batch_losses = []

        self.model.eval()  # dropout, batchnorm in inference mode
        try:
            with torch.no_grad():  # disable autograd engine (no backprop)
                for x, y in val_loader:
                    x = x.to(self.device)
                    y = y.to(self.device)
                    batch_losses.append(self.compute_loss(x, y).item())
        finally:
            self.model.train()

        if not batch_losses:
            raise ValueError("val_loader yielded no batches; cannot compute a validation loss")

        return sum(batch_losses) / len(batch_losses)

    def compute_loss(self, x, y):
        """Apply softmax over dim 1 of the model output and score it against ``y``."""
        h = f.softmax(self.model(x), dim=1)
        return self.loss_function(h, y)

    def train(self, train_loader, val_loader, model_save_path):
        """Train for ``self.epochs`` epochs, saving the best model by validation loss.

        Returns:
            DataFrame with one row per epoch and the columns ``training_loss``
            (sum of the batch losses of that epoch) and ``vaidation_loss``
            (mean batch loss on ``val_loader``; the column name's spelling is
            kept because saved CSVs and plots use it).
        """
        best_valid_loss = np.inf
        train_losses = []
        val_losses = []

        # Make sure the first epoch runs in training mode even if the model was
        # left in eval mode by earlier code (e.g. an inference cell).
        self.model.train()

        for epoch in range(self.epochs):
            epoch_loss = 0

            for x, y in train_loader:
                self.optimiser.zero_grad()

                x = x.to(self.device)
                y = y.to(self.device)

                loss = self.compute_loss(x, y)
                epoch_loss += loss.item()

                loss.backward()  # compute gradients

                self.optimiser.step()  # Update params based on gradients

            valid_loss = self.validate(val_loader)
            train_losses.append(epoch_loss)
            val_losses.append(valid_loss)

            if epoch % 10 == 0:
                print('Epoch {} of {}'.format(epoch, self.epochs))
                print(f"\tEpoch Loss={epoch_loss}, Validation Loss={valid_loss}")

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                print(f'\tEpoch({epoch}), Val Loss({valid_loss}), Saving new best model')
                torch.save(
                    self.model.state_dict(),
                    model_save_path
                )
        return pd.DataFrame(data=np.array([train_losses, val_losses]).T, columns=['training_loss', 'vaidation_loss'])
    
    
    
class CompTrainer:
    """Training loop for pair batches ``(x_0, x_1, y, cls)``.

    The loss pushes the mean absolute difference between the model outputs of
    ``x_0`` and ``x_1`` towards ``y`` (1 when the pair's labels differ, else 0).

    Args:
        model: The ``nn.Module`` to train.
        optimiser: Optimiser built over ``model``'s parameters.
        loss_function: Stored for interface parity with ``Trainer``; the pair
            loss in ``compute_loss`` does not use it.
        device: Device that batches are moved to.
        batch_size: Stored for reference; not used by the loop itself.
        epochs: Number of passes over the training loader.
    """

    def __init__(
        self, model, optimiser, loss_function,
        device, batch_size, epochs
    ):
        self.model = model
        self.optimiser = optimiser
        self.loss_function = loss_function
        self.device = device
        self.batch_size = batch_size
        self.epochs = epochs

    def compute_loss(self, x_0, x_1, y):
        """Return mean over the batch of ``|y - mean(|f(x_0) - f(x_1)|, dim=1)|``."""
        vec_0 = self.model(x_0.to(self.device))
        vec_1 = self.model(x_1.to(self.device))

        pair_distance = torch.mean(torch.abs(vec_0 - vec_1), dim=1)
        loss = torch.abs(y - pair_distance)

        return loss.mean()

    def get_vecs(self, train_loader, val_loader):
        """Embed every first-of-pair sample of both loaders.

        Returns:
            DataFrame with one row per sample: ``v0 .. v{d-1}`` (the model
            output), ``class`` (the sample's label; a scalar when the label has
            one element, otherwise a list) and ``set`` (``'train'`` or
            ``'val'``).
        """
        records = []

        self.model.eval()  # dropout, batchnorm in inference mode
        try:
            with torch.no_grad():  # disable autograd engine (no backprop)
                for set_name, loader in (('train', train_loader), ('val', val_loader)):
                    for x, _, _, cls in loader:
                        vecs = self.model(x.to(self.device)).cpu().numpy()
                        vecs = vecs.reshape(len(vecs), -1)

                        labels = cls.cpu() if torch.is_tensor(cls) else cls
                        labels = np.asarray(labels).reshape(len(vecs), -1)

                        # One record per sample, not per batch.
                        for vec, label in zip(vecs, labels):
                            record = {f'v{k}': float(val) for k, val in enumerate(vec)}
                            record['class'] = label.item() if label.size == 1 else label.tolist()
                            record['set'] = set_name
                            records.append(record)
        finally:
            self.model.train()

        return pd.DataFrame(records)

    def validate(self, val_loader):
        """Return the mean per-batch pair loss over ``val_loader``.

        The model is always put back in training mode afterwards.

        Raises:
            ValueError: if ``val_loader`` yields no batches.
        """
        batch_losses = []

        self.model.eval()  # dropout, batchnorm in inference mode
        try:
            with torch.no_grad():  # disable autograd engine (no backprop)
                for x_0, x_1, y, _ in val_loader:
                    y = y.to(self.device)
                    batch_losses.append(self.compute_loss(x_0, x_1, y).item())
        finally:
            self.model.train()

        if not batch_losses:
            raise ValueError("val_loader yielded no batches; cannot compute a validation loss")

        return sum(batch_losses) / len(batch_losses)

    def train(self, train_loader, val_loader, model_save_path):
        """Train for ``self.epochs`` epochs, saving the best model by validation loss.

        Returns:
            DataFrame with one row per epoch and the columns ``training_loss``
            (sum of the batch losses of that epoch) and ``vaidation_loss``
            (mean batch loss on ``val_loader``; the column name's spelling is
            kept because saved CSVs and plots use it).
        """
        best_valid_loss = np.inf
        train_losses = []
        val_losses = []

        # Make sure the first epoch runs in training mode even if the model was
        # left in eval mode by earlier code.
        self.model.train()

        for epoch in range(self.epochs):
            epoch_loss = 0

            for x_0, x_1, y, _ in train_loader:
                self.optimiser.zero_grad()

                y = y.to(self.device)

                loss = self.compute_loss(x_0, x_1, y)
                epoch_loss += loss.item()

                loss.backward()  # compute gradients

                self.optimiser.step()  # Update params based on gradients

            valid_loss = self.validate(val_loader)
            train_losses.append(epoch_loss)
            val_losses.append(valid_loss)

            if epoch % 10 == 0:
                print('Epoch {} of {}'.format(epoch, self.epochs))
                print(f"\tEpoch Loss={epoch_loss}, Validation Loss={valid_loss}")

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                print(f'\tEpoch({epoch}), Val Loss({valid_loss}), Saving new best model')
                torch.save(
                    self.model.state_dict(),
                    model_save_path
                )
        return pd.DataFrame(data=np.array([train_losses, val_losses]).T, columns=['training_loss', 'vaidation_loss'])

In [None]:
"""
TODO:

    * Get validation score per class
    
    * Visualize learning:
        * visualize 2d weights & loss
        * visualize 2d prediction values & see if classes are clustered
            * maybe introduce some contrastive loss
    
    * Continue ensemble research
        * Try random search & train for 10-30 epochs
        
    * Implement param search
    
    * Compare processed, unprocessed & smooth data
        
    * make more classes e.g. >1% change, >5% change


"""
from torch import nn, optim
from torch.utils.data import DataLoader

# Fall back to CPU so the notebook still runs on machines without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 32
epochs = 200
learning_rate = 0.0001
weight_reg = 0.0001

model_save_path = 'model/model.pth'
losses_save_path = 'data/losses.csv'

retrain = False #False, True
load_pretrained_model = False 

# Each entry: [in_size, out_size, use_batchnorm, activation class, dropout].
layer_sizes = {
    'layer_0': [X_train.shape[1], 125, False, nn.RReLU, 0.166],
    'layer_1': [125, 500, False, nn.ELU, 0.33],
    'layer_2': [500, 5, False, nn.GELU, 0],
    'layer_3': [5, 33, False, nn.GELU, None], # RReLU, GELU
    #'layer_4': [2, 1, None, nn.GELU, None],
    'layer_5': [33, 3, False, nn.Tanh, None], # y_train.shape[1]
}

# Passed to the trainer; CompTrainer computes its own pair loss and ignores it.
loss_function = nn.L1Loss() #nn.MSELoss() #nn.BCELoss() #nn.MSELoss() #nn.L1Loss()  

train_loader = DataLoader(
    compDataset(  #compDataset,  salesDataset
        X_train.values.astype(np.float32),
        y_train.values.astype(np.float32),
    ),
    batch_size=batch_size, shuffle=True
)

val_loader = DataLoader(
    compDataset(
        X_test.values.astype(np.float32),
        y_test.values.astype(np.float32),
    ),
    batch_size=batch_size, shuffle=True
)

model = NN_split(layer_sizes).to(device)
#print(model)
optimiser = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_reg)

trainer = CompTrainer(  # CompTrainer, Trainer
    model, optimiser, loss_function, device, batch_size, epochs
)

# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
if retrain:
    if load_pretrained_model:
        model.load_state_dict(torch.load(model_save_path, map_location=device))
        
    losses = trainer.train(train_loader, val_loader, model_save_path)
    losses.to_csv(losses_save_path, index=False)
    
else:
    model.load_state_dict(torch.load(model_save_path, map_location=device))
    losses = pd.read_csv(losses_save_path)

In [None]:
# Display the model outputs for the samples of both loaders.
trainer.get_vecs(train_loader, val_loader)

In [None]:
# Flattened weights stored under the state-dict key 'layers.4.0.weight'.
# NOTE(review): this prints every weight as a Python list — consider showing
# only a slice or summary statistics.
list(model.state_dict()['layers.4.0.weight'].cpu().detach().numpy().flatten())

In [None]:
epochs = 30
weight_reg = 0.0001  #0.005
model_2_save_path = 'model/model_2.pth'
losses_2_save_path = 'data/losses_2.csv'

retrain = False #False, True

model_2 = NN(layer_sizes).to(device)

optimiser_2 = optim.Adam(model_2.parameters(), lr=learning_rate, weight_decay=weight_reg)

trainer_2 = Trainer(
    model_2, optimiser_2, loss_function, device, batch_size, epochs
)

# NOTE(review): Trainer unpacks `(x, y)` batches, while train_loader/val_loader
# are built from compDataset, whose items have four elements — retraining here
# looks like it needs loaders built from salesDataset. Confirm before enabling.
if retrain:
    losses_2 = trainer_2.train(train_loader, val_loader, model_2_save_path)
    losses_2.to_csv(losses_2_save_path, index=False)
    
else:
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
    model_2.load_state_dict(torch.load(model_2_save_path, map_location=device))
    losses_2 = pd.read_csv(losses_2_save_path)

In [None]:
## Try random search & train for 10-30 epochs

from collections import OrderedDict


def blend_state_dicts(state_0, state_1, weight_0, weight_1):
    """Return ``state_0*weight_0 + state_1*weight_1`` for every key both dicts share.

    Keys keep the order of ``state_0``; keys present in only one dict are dropped.
    """
    blended = OrderedDict()
    for key, val_0 in state_0.items():
        if key in state_1:
            blended[key] = val_0*weight_0 + state_1[key]*weight_1
    return blended


#epochs = 30
weight_reg = 0.01
# Mirrored weight pairs. The second 0.2/0.8 entry was a duplicate of the first;
# it is now the missing mirror [0.8, 0.2].
join_weights = [
    [0, 1], [1, 0],
    [0.1, 0.9], [0.9, 0.1], 
    [0.2, 0.8], [0.8, 0.2], 
    [0.4, 0.6], [0.6, 0.4],
    [0.5, 0.5]
]

model_join_save_path = 'model/model_join.pth'
losses_join_save_path = 'data/losses_join.csv'


model_join = NN(layer_sizes).to(device)
optimiser_join = optim.Adam(model_join.parameters(), lr=learning_rate, weight_decay=weight_reg)

# Load each checkpoint once instead of re-reading both files for every key.
# NOTE(review): model_save_path is written from an NN_split model and
# model_2_save_path from an NN model; their state-dict key names appear to
# differ (nested vs. flat Sequential), in which case no key is shared and
# load_state_dict reports missing keys — confirm both checkpoints come from the
# same architecture.
state_0 = torch.load(model_save_path, map_location=device)
state_1 = torch.load(model_2_save_path, map_location=device)

# NOTE(review): Trainer.validate/train unpack `(x, y)` batches while the loaders
# used below are built from compDataset (four-element items) — confirm the
# intended loaders.
#for _ in range(100):
for join_weight in join_weights:
    #r = np.clip(np.random.normal(0.5, scale=0.4, size=1), 0, 1)[0]
    #print("Split: ", r, r-1)
    print("Split: ", join_weight[0], join_weight[1])

    join_state_dict = blend_state_dicts(state_0, state_1, join_weight[0], join_weight[1])

    model_join.load_state_dict(join_state_dict)    

    trainer_join = Trainer(
        model_join, optimiser_join, loss_function, device, batch_size, epochs
    )

    val_loss = trainer_join.validate(val_loader)
    print("Val Score:", val_loss, "\n")
    
    

# Continue training from the even blend of the two models.
join_weight = [0.5, 0.5]
join_state_dict = blend_state_dicts(state_0, state_1, join_weight[0], join_weight[1])

model_join.load_state_dict(join_state_dict)    

trainer_join = Trainer(
    model_join, optimiser_join, loss_function, device, batch_size, epochs
)

val_loss = trainer_join.validate(val_loader)
print("Val Score:", val_loss, "\n")

losses_join = trainer_join.train(train_loader, val_loader, model_join_save_path)

In [None]:
# plot training and validation losses, one figure each
for loss_column, plot_title in (
    ("training_loss", "Neural Net Training Losses"),
    ("vaidation_loss", "Neural Net Validation Losses"),
):
    fig = px.line(
        losses,
        x=losses.index,
        y=loss_column,
        title=plot_title,
    )
    fig.show()

In [None]:
def groupped_plot(group):
    """Scatter ``is_test``, ``pred`` and ``class`` of ``group`` against ``timestamp``."""
    series_to_plot = ["is_test", "pred", "class"]
    px.scatter(group, x="timestamp", y=series_to_plot).show()

def plot_predictions(df, predictions):
    """Add ``pred`` / ``class`` columns to ``df`` (in place) and plot them per class.

    ``pred`` is the argmax over axis 1 of ``predictions``; ``class`` is the
    argmax over the columns of ``df`` whose name contains ``'encoded'``.
    """
    predicted_class = np.argmax(predictions, axis=1)
    df['pred'] = predicted_class

    # Computed after `pred` is added, so the mask covers the current columns.
    encoded_values = df.loc[:, df.columns.str.contains('encoded')].values
    df['class'] = np.argmax(encoded_values, axis=1)

    display(df)
    df.groupby(['class']).apply(groupped_plot)
    
    
def plot_predictions_2(df, predictions):
    """Show one scatter figure per ``encoded`` column: the column vs. its prediction.

    For the i-th column whose name contains ``'encoded'``, ``predictions[:, i]``
    is written to ``df['pred']`` (in place) and both are plotted over the index.
    """
    encoded_columns = df.columns[df.columns.str.contains('encoded')]

    for position, encoded_column in enumerate(encoded_columns):
        df['pred'] = predictions[:, position]
        px.scatter(
            df,
            x=df.index,
            y=[encoded_column, 'pred'],
        ).show()

In [None]:
# Display Model Predictions

potting_model = model # model, model_2, model_join


potting_model.eval()
all_data = torch.Tensor(scaled_data[scaled_data.columns].values.astype(np.float32)).to(device)

# Inference only: skip autograd so no graph is kept for the whole dataset.
with torch.no_grad():
    predictions = f.softmax(potting_model.forward(all_data), dim=1).cpu().detach().numpy()

# Plain boolean array so it indexes both the DataFrame and the numpy predictions.
mask = (targets['is_test'] == 1).values

plot_predictions_2(
    targets.loc[mask].reset_index(drop=True),
    predictions[mask]
)

In [None]:
potting_model = model # model, model_2, model_join


potting_model.eval()
all_data = torch.Tensor(scaled_data[scaled_data.columns].values.astype(np.float32)).to(device)

# Inference only: skip autograd so no graph is kept for the whole dataset.
with torch.no_grad():
    embeddings = potting_model.forward(all_data).cpu().detach().numpy()

# The 3-D scatter below needs exactly three output dimensions.
vec = pd.DataFrame(
    columns=['v0', 'v1', 'v2'],
    data=embeddings
)
vec['is_test'] = targets['is_test']
vec['label'] = targets['label']


# Test rows first, then training rows, as two separate 3-D scatter plots.
for is_test_value in (1, 0):
    fig = px.scatter_3d(vec[vec['is_test'] == is_test_value], x='v0', y='v1', z='v2',
                  color='label')
    fig.show()

In [None]:
# layer_sizes = {
#     'layer_0': [X_train.shape[1], 300, True, nn.RReLU],
#     'layer_1': [300, 100, True, nn.RReLU],
#     'layer_2': [100, 7, True, nn.RReLU],
#     'layer_3': [7, y_train.shape[1], False, None],
# }


# layer_sizes = {
#     'layer_0': [X_train.shape[1], 100, True, nn.RReLU, 0],
#     'layer_1': [100, 300, False, nn.RReLU, 0.33],
#     'layer_2': [300, 25, True, nn.RReLU, None],
#     'layer_3': [25, y_train.shape[1], False, None, None],
# } nn.L1Loss()

# layer_sizes = {
#     'layer_0': [X_train.shape[1], 133, True, nn.Tanh, 0],
#     'layer_1': [133, 175, True, nn.RReLU, 0.15],
#     'layer_2': [175, 25, True, nn.RReLU, None],
#     'layer_3': [25, y_train.shape[1], False, None, None],
# } nn.L1Loss()



# layer_sizes = {
#     'layer_0': [X_train.shape[1], 66, True, nn.Tanh, 0],
#     'layer_1': [66, 175, True, nn.RReLU, 0.15],
#     'layer_2': [175, 25, True, nn.RReLU, None],
#     'layer_3': [25, 2, True, nn.RReLU, None],
#     'layer_4': [2, 15, True, nn.RReLU, None],
#     'layer_5': [15, y_train.shape[1], False, None, None],
# }


# for param, val in model.state_dict().items():
#     print(param, val.shape)