In [1]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Callable, List, Dict, Tuple, Optional
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import itertools

In [2]:
import warnings
# Silences every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides numpy/pandas RuntimeWarnings such as divide-by-zero;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

#### Loading Data
We've provided a training set and a validation set.

In [3]:
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# files that come from a trusted source.
# Both paths are relative to the current working directory.
with open("train.pkl", "rb") as f:
    train_dataset = pickle.load(f)

with open("validation.pkl", "rb") as f:
    val_dataset = pickle.load(f)

#### Model Evaluation
To evaluate the performance of a model, we compare the forecasted OAS of the model to the actual OAS of the trade. The signed bias (actual minus forecast) is clipped to [-25, 25] and the absolute error is clipped to [0, 25].

In [4]:
def evaluate_model(
    model: Callable[[pd.DataFrame], float],
    dataset: List[Dict],
    clip_limit: float = 25,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Score a model by comparing its predicted OAS with the actual OAS of each observation.

    Model should be a function that takes a dataframe of historical trades and returns a predicted OAS.
    Observations whose ``prev_trades`` is empty are skipped and do not appear in the results.

    Args:
        model: callable mapping an observation's ``prev_trades`` to a predicted OAS.
        dataset: observations; each one must provide ``"oas"`` and ``"prev_trades"``.
        clip_limit: non-negative bound for the reported values. ``bias`` is clipped to
            ``[-clip_limit, clip_limit]`` and ``error`` to ``[0, clip_limit]``.

    Returns:
        ``(summary, errors_df)`` where ``errors_df`` is indexed by the observation's
        position in ``dataset`` (index name ``"id"``) with columns ``bias``
        (actual minus predicted) and ``error`` (absolute difference), and ``summary``
        is ``errors_df.describe()``.

    Raises:
        ValueError: if ``clip_limit`` is negative.
    """
    if clip_limit < 0:
        raise ValueError("clip_limit must be non-negative.")

    errors = []
    for i, observation in enumerate(tqdm(dataset)):
        prev_trades = observation["prev_trades"]
        if len(prev_trades) == 0:
            continue
        # Actual minus predicted: positive means the model under-predicted.
        residual = observation["oas"] - model(prev_trades)
        errors.append(
            {
                "id": i,
                "bias": np.clip(residual, -clip_limit, clip_limit),
                "error": np.clip(abs(residual), 0, clip_limit),
            }
        )

    if errors:
        errors_df = pd.DataFrame(errors).set_index("id")
    else:
        # With nothing scored there is no "id" column to promote to the index,
        # so build the empty frame with the expected layout explicitly.
        errors_df = pd.DataFrame(
            columns=["bias", "error"],
            index=pd.Index([], name="id"),
            dtype=float,
        )
    return errors_df.describe(), errors_df

#### Example model: latest trade
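Here are two example models. The first returns the OAS of the most recent `dealer_dealer` trade, falling back to the most recent trade of any type when there is none. The second simply returns the OAS of the most recent trade.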

In [33]:
def last_dealer_trade_model(prev_trades: pd.DataFrame) -> float:
    """
    Baseline that prefers 'dealer_dealer' trades.

    Returns the ``oas`` of the final 'dealer_dealer' row of ``prev_trades``;
    when no such row exists, returns the ``oas`` of the final row instead.
    """
    dealer_mask = prev_trades['trade_type'] == 'dealer_dealer'
    dealer_rows = prev_trades[dealer_mask]
    # Use the dealer-only subset when it has rows, otherwise the full history.
    source = dealer_rows if len(dealer_rows) > 0 else prev_trades
    return source.iloc[-1]['oas']

In [5]:
def last_trade_model(prev_trades: pd.DataFrame) -> float:
    """Baseline: predict the ``oas`` found in the final row of ``prev_trades``."""
    final_row = prev_trades.iloc[-1]
    return final_row["oas"]

In [6]:
# evaluate_model returns (summary, per-observation errors); [0] displays the describe() summary.
evaluate_model(last_trade_model, val_dataset)[0]

100%|██████████| 52062/52062 [00:06<00:00, 8469.43it/s] 


Unnamed: 0,bias,error
count,52062.0,52062.0
mean,0.575663,3.818279
std,6.486537,5.275133
min,-25.0,0.0
25%,-1.591775,0.8022
50%,0.24135,1.92335
75%,2.242475,4.2594
max,25.0,25.0


#### Your Model Here

In [7]:
class DataModelFormatter:
    """
    Convert raw observations into flat records of aggregated trade groups.

    Each observation's ``prev_trades`` frame is walked from its last row to its
    first. Consecutive rows that share both ``ts_diff_hrs`` and ``quantity`` are
    collapsed into one entry with an averaged OAS and a group label.
    """

    def format_dataset(self, dataset: List[Dict]) -> List[Dict]:
        """
        Format every observation that has at least one previous trade.

        Observations whose ``prev_trades`` is missing, ``None`` or empty are skipped,
        so the result can be shorter than ``dataset``.

        Returns:
            One dict per kept observation, as produced by ``_process_trades``.
        """
        formatted_data = []
        for observation in tqdm(dataset):
            prev_trades = observation.get("prev_trades")
            if prev_trades is None or len(prev_trades) == 0:
                continue
            formatted_data.append(self._process_trades(prev_trades, observation['oas']))
        return formatted_data

    def _process_trades(self, trades: pd.DataFrame, target_oas: float) -> Dict:
        """
        Aggregate one observation's trades into parallel lists, last row first.

        Args:
            trades: frame providing ``ts_diff_hrs``, ``quantity``, ``trade_type`` and
                ``oas`` (as columns after ``reset_index()``).
            target_oas: value stored unchanged under ``'target_oas'``.

        Returns:
            Dict with ``target_oas`` plus four equally long lists: ``data_oas``,
            ``data_type``, ``data_time`` and ``data_quantity``. Element 0 describes
            the group containing the last row of ``trades``.
        """
        frame = trades.reset_index()

        aggregated = []
        if len(frame) > 0:
            # Read the four columns once and iterate plain arrays; this avoids
            # building a Series per row as iterrows() does.
            reversed_rows = zip(
                frame['ts_diff_hrs'].to_numpy()[::-1],
                frame['quantity'].to_numpy()[::-1],
                frame['trade_type'].to_numpy()[::-1],
                frame['oas'].to_numpy()[::-1],
            )
            for run in self._iter_runs(reversed_rows):
                agg = self._finalize_group(*run)
                if agg is not None:
                    aggregated.append(agg)

        # Each aggregate is (avg_oas, type, quantity, time).
        return {
            'target_oas': target_oas,
            'data_oas': [agg[0] for agg in aggregated],
            'data_type': [agg[1] for agg in aggregated],
            'data_time': [agg[3] for agg in aggregated],
            'data_quantity': [agg[2] for agg in aggregated],
        }

    @staticmethod
    def _iter_runs(rows):
        """
        Yield ``(dealer_oas, cust_oas, ts, quantity, last_type)`` for each run of
        consecutive rows that share the same ``ts_diff_hrs`` and ``quantity``.

        ``rows`` is an iterable of ``(ts_diff_hrs, quantity, trade_type, oas)``.
        """
        dealer_oas, cust_oas = [], []
        # Sentinel "previous" values. When the first row differs from them an
        # empty run is yielded first, which _finalize_group turns into None.
        prev_ts, prev_quantity, prev_type = 0, 0, ''

        for ts, quantity, trade_type, oas in rows:
            if prev_ts != ts or prev_quantity != quantity:
                yield dealer_oas, cust_oas, prev_ts, prev_quantity, prev_type
                dealer_oas, cust_oas = [], []

            # Anything that is not 'dealer_dealer' counts as a customer trade.
            if trade_type != 'dealer_dealer':
                cust_oas.append(oas)
            else:
                dealer_oas.append(oas)

            prev_ts, prev_quantity, prev_type = ts, quantity, trade_type

        yield dealer_oas, cust_oas, prev_ts, prev_quantity, prev_type

    def _finalize_group(
        self,
        curr_dealer_oas: List[float],
        curr_cust_oas: List[float],
        last_ts: float,
        last_quantity: float,
        last_type: str
    ) -> Optional[Tuple[float, str, float, float]]:
        """
        Collapse one run of trades into ``(avg_oas, type, quantity, time)``.

        Labels:
            * dealer and customer trades both present -> ``'combo_dc'``
            * two or more customer trades only        -> ``'combo_cc'``
            * two or more dealer trades only          -> ``'combo_dd'``
            * a single dealer trade                   -> ``'dealer_dealer'``
            * a single customer trade                 -> ``last_type``

        Returns ``None`` when both lists are empty.
        """
        if curr_dealer_oas and curr_cust_oas:
            # NOTE(review): the average uses the dealer prices only; customer
            # prices in a mixed group are ignored. Confirm this is intended.
            return np.mean(curr_dealer_oas), 'combo_dc', last_quantity, last_ts

        if curr_dealer_oas:
            label = 'combo_dd' if len(curr_dealer_oas) >= 2 else 'dealer_dealer'
            return np.mean(curr_dealer_oas), label, last_quantity, last_ts

        if curr_cust_oas:
            label = 'combo_cc' if len(curr_cust_oas) >= 2 else last_type
            return np.mean(curr_cust_oas), label, last_quantity, last_ts

        return None


In [8]:
formatter = DataModelFormatter()
# Formatting of the training split is currently disabled; uncomment to build formatted_train.
#formatted_train = formatter.format_dataset(train_dataset)
formatted_val = formatter.format_dataset(val_dataset)

100%|██████████| 156186/156186 [01:48<00:00, 1445.56it/s]
100%|██████████| 52062/52062 [00:38<00:00, 1340.70it/s]


In [None]:
# One row per formatted observation; each dict key becomes a column and the
# list-valued fields (data_oas, data_type, ...) are stored as one list per cell.
#df_train = pd.DataFrame(formatted_train)
df_val = pd.DataFrame(formatted_val)

### Model

In [13]:
def fit_weighted_average(
    values: List[float],
    weights: List[float],
    half_life: float = 1
) -> float:
    """
    Weighted average of ``values`` with an extra exponential decay by position.

    The element at position ``i`` gets weight ``weights[i] * 0.5 ** (i / half_life)``,
    so position 0 counts fully and the positional factor halves every
    ``half_life`` positions.

    Args:
        values: observations, in order of decreasing importance. Lists and
            numpy arrays are both accepted.
        weights: per-observation base weights, same length as ``values``.
        half_life: number of positions after which the positional factor is 0.5.
            Must be positive.

    Returns:
        The weighted average as a float.

    Raises:
        ValueError: if the lengths differ, ``values`` is empty, ``half_life`` is not
            positive, or the combined weights sum to zero.
    """
    if len(values) != len(weights):
        raise ValueError("The length of values and weights must be the same.")
    if len(values) == 0:
        raise ValueError("values must not be empty.")
    if half_life <= 0:
        raise ValueError("half_life must be positive.")

    value_arr = np.asarray(values, dtype=float)
    weight_arr = np.asarray(weights, dtype=float)

    decay_factor = 0.5 ** (1 / half_life)
    combined_weights = weight_arr * decay_factor ** np.arange(len(value_arr))

    total_weight = combined_weights.sum()
    if total_weight == 0:
        # Without this guard numpy inputs would produce NaN silently.
        raise ValueError("The combined weights sum to zero; the average is undefined.")

    return float(np.dot(value_arr, combined_weights) / total_weight)

In [14]:
def f_weight_quantity(i, cutoff=1000000, low_val=0.65):
    """Return 1 when quantity ``i`` is strictly greater than ``cutoff``, otherwise ``low_val``."""
    if i > cutoff:
        return 1
    return low_val

def f_offset_type(data_type: str, offsets: dict):
    """Look up the offset registered for ``data_type``; types absent from ``offsets`` map to 0."""
    if data_type in offsets:
        return offsets[data_type]
    return 0

def f_weight_type(data_type: str, trade_weights: dict):
    """Look up the weight registered for ``data_type``; types absent from ``trade_weights`` map to 0."""
    if data_type in trade_weights:
        return trade_weights[data_type]
    return 0

In [15]:
def get_prediction(row, half_life, offsets, trade_weights, low_val, cutoff=1000000):
    """
    Predict an OAS for one formatted observation.

    Every entry of ``row['data_oas']`` is shifted by the offset of its trade type
    and weighted by the product of its size weight and its type weight; the
    result is the decayed weighted average from ``fit_weighted_average``.

    Args:
        row: mapping with ``'data_quantity'``, ``'data_type'`` and ``'data_oas'``
            sequences of equal length.
        half_life: forwarded to ``fit_weighted_average``.
        offsets: trade type -> additive OAS offset.
        trade_weights: trade type -> weight multiplier.
        low_val: size weight for quantities not above ``cutoff``.
        cutoff: quantity threshold used by ``f_weight_quantity``.
    """
    trade_kinds = row['data_type']

    size_w = np.array([f_weight_quantity(qty, cutoff, low_val) for qty in row['data_quantity']])
    kind_w = np.array([f_weight_type(kind, trade_weights) for kind in trade_kinds])
    shifts = np.array([f_offset_type(kind, offsets) for kind in trade_kinds])

    shifted_oas = shifts + np.array(row['data_oas'])
    combined_w = size_w * kind_w

    return fit_weighted_average(values=shifted_oas, weights=combined_w, half_life=half_life)

def compute_score(true_values, predictions):
    """Mean absolute difference between ``true_values`` and ``predictions``."""
    residuals = np.array(true_values) - np.array(predictions)
    return np.mean(abs(residuals))

In [None]:
# Settings passed to get_prediction when scoring the validation set.
params = {
    # Added to each observed OAS according to its trade type; types not listed get 0.
    'offsets': {
        'dealer_buy': 1.1,
        'dealer_dealer': -0.15,
        'dealer_sell': -1.25,
    },
    # Per-type multiplier on each observation's weight; types not listed get 0.
    'trade_weights': {
        'dealer_dealer': 1.1,
        'dealer_buy': 0.65,
        'dealer_sell': 0.55,
        'combo_dd': 1.2,
        'combo_dc': 1.0,
        'combo_cc': 0.85,
    },
    # Size weight for trades whose quantity does not exceed the cutoff.
    'low_val': 0.5,
    # Number of positions after which the positional decay weight reaches 0.5.
    'half_life': 1.0,
}

In [62]:
# Row-wise prediction: apply() hands each row to get_prediction as the first
# argument and forwards the keyword arguments unchanged.
df_val['prediction'] = df_val.apply(
    get_prediction,
    axis=1,
    half_life=params['half_life'],
    offsets=params['offsets'],
    trade_weights=params['trade_weights'],
    low_val=params['low_val'],
    cutoff=1000000,
)

In [63]:
# Absolute error (clipped to [0, 25]) and signed bias (clipped to [-25, 25]) of the prediction.
# NOTE(review): bias here is prediction - target, the opposite sign of the bias
# reported by evaluate_model (actual - predicted); account for that when comparing tables.
df_val['error_prediction'] = np.clip(abs(df_val['prediction'] - df_val['target_oas']), 0, 25)
df_val['bias_prediction'] = np.clip(df_val['prediction'] - df_val['target_oas'], -25, 25)

In [64]:
# First aggregated entry of each observation. _process_trades walks prev_trades
# from its last row backwards, so element 0 comes from the group holding the last row.
df_val['last_oas_adj'] = df_val['data_oas'].apply(lambda x: x[0])

# Same clipping as for the prediction columns.
# NOTE(review): bias here is last_oas_adj - target, the opposite sign of the bias
# reported by evaluate_model (actual - predicted).
df_val['error_last_adj'] = np.clip(abs(df_val['last_oas_adj'] - df_val['target_oas']), 0, 25)
df_val['bias_last_adj'] = np.clip(df_val['last_oas_adj'] - df_val['target_oas'], -25, 25)

# Evaluation

In [72]:
# Baseline that prefers the last 'dealer_dealer' trade; [0] displays the describe() summary.
evaluate_model(last_dealer_trade_model, val_dataset)[0]

  0%|          | 0/52062 [00:00<?, ?it/s]

100%|██████████| 52062/52062 [00:08<00:00, 6278.14it/s]


Unnamed: 0,bias,error
count,52062.0,52062.0
mean,0.08591,3.679828
std,6.117173,4.887309
min,-25.0,0.0
25%,-1.8406,0.837725
50%,0.1356,1.98595
75%,2.112175,4.298075
max,25.0,25.0


In [73]:
# Plain last-trade baseline (same call as the earlier baseline cell); [0] displays the describe() summary.
evaluate_model(last_trade_model, val_dataset)[0]

100%|██████████| 52062/52062 [00:02<00:00, 21036.11it/s]


Unnamed: 0,bias,error
count,52062.0,52062.0
mean,0.575663,3.818279
std,6.486537,5.275133
min,-25.0,0.0
25%,-1.591775,0.8022
50%,0.24135,1.92335
75%,2.242475,4.2594
max,25.0,25.0


In [74]:
# Summary statistics of the weighted-average prediction next to the first-aggregated-entry baseline.
# NOTE(review): these bias columns are computed as estimate - target, whereas
# evaluate_model reports actual - predicted, so their signs are opposite.
df_val[['error_prediction', 'bias_prediction', 'error_last_adj', 'bias_last_adj']].describe()

Unnamed: 0,error_prediction,bias_prediction,error_last_adj,bias_last_adj
count,52062.0,52062.0,52062.0,52062.0
mean,3.396038,-0.503067,3.634983,-0.489189
std,4.539121,5.646577,5.016305,6.175545
min,3.8e-05,-25.0,0.0,-25.0
25%,0.823066,-2.284634,0.7856,-2.1724
50%,1.882608,-0.389449,1.88325,-0.2311
75%,3.917811,1.439982,4.076575,1.571175
max,25.0,25.0,25.0,25.0
