In [1]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Callable, List, Dict, Tuple, Optional
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import itertools



In [2]:
import warnings
# NOTE(review): with no category or module argument this suppresses every warning
# for the rest of the session, including any that pandas or numpy would emit
# in later cells. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

#### Loading Data
We've provided a training set and a validation set.

In [3]:
# Load the training and validation sets from pickle files in the working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open files that come from a trusted source.
with open("train.pkl", "rb") as f:
    train_dataset = pickle.load(f)

with open("validation.pkl", "rb") as f:
    val_dataset = pickle.load(f)

#### Model Evaluation
To evaluate the performance of a model, we compare the forecasted OAS of the model to the actual OAS of the trade. The signed error (bias) is clipped to be between -25 and 25, and the absolute error to be between 0 and 25.

In [4]:
def evaluate_model(
    model: Callable[[pd.DataFrame], float],
    dataset: List[Dict],
    clip_limit: float = 25,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Score a forecasting model against the actual OAS of every observation.

    Args:
        model: function that takes a dataframe of historical trades and
            returns a predicted OAS.
        dataset: list of observations; each one is read through its
            "prev_trades" and "oas" keys.
        clip_limit: bound applied to the errors (default 25).

    Returns:
        A tuple ``(summary, errors_df)``. ``errors_df`` is indexed by "id"
        (the position of the observation in ``dataset``) and has two columns:
        "bias" = actual - predicted, clipped to [-clip_limit, clip_limit], and
        "error" = |actual - predicted|, clipped to [0, clip_limit].
        ``summary`` is ``errors_df.describe()``.

    Observations with no previous trades are skipped.
    """
    errors = []
    for i, observation in enumerate(tqdm(dataset)):
        prev_trades = observation["prev_trades"]
        if len(prev_trades) == 0:
            continue
        # Compute the residual once and derive both metrics from it.
        residual = observation["oas"] - model(prev_trades)
        errors.append(
            {
                "id": i,
                "bias": np.clip(residual, -clip_limit, clip_limit),
                "error": np.clip(abs(residual), 0, clip_limit),
            }
        )
    # Naming the columns keeps set_index("id") valid even when every
    # observation was skipped and `errors` is empty.
    errors_df = pd.DataFrame(errors, columns=["id", "bias", "error"]).set_index("id")
    return errors_df.describe(), errors_df

#### Example model: latest trade
Here is an example model that simply returns the OAS of the most recent trade.

In [5]:
def last_dealer_trade_model(prev_trades: pd.DataFrame) -> float:
    """Return the OAS of the last dealer-to-dealer trade.

    Falls back to the OAS of the last trade of any type when the history
    contains no 'dealer_dealer' rows.
    """
    dealer_mask = prev_trades['trade_type'] == 'dealer_dealer'
    candidates = prev_trades[dealer_mask] if dealer_mask.any() else prev_trades
    return candidates.iloc[-1]['oas']

def mean_dealer_trade_model(prev_trades: pd.DataFrame) -> float:
    """Return the mean OAS over all dealer-to-dealer trades in the history.

    Falls back to the OAS of the last trade of any type when the history
    contains no 'dealer_dealer' rows.
    """
    dealer_rows = prev_trades[prev_trades['trade_type'] == 'dealer_dealer']
    if dealer_rows.empty:
        return prev_trades.iloc[-1]["oas"]
    return dealer_rows['oas'].mean()



In [6]:
def last_trade_model(prev_trades: pd.DataFrame) -> float:
    """Return the OAS of the last row of the trade history, whatever its type."""
    latest_trade = prev_trades.iloc[-1]
    return latest_trade["oas"]

In [7]:
# Baseline: last-trade model on the validation set; [0] keeps only the describe() summary.
evaluate_model(last_trade_model, val_dataset)[0]

100%|██████████| 52062/52062 [00:05<00:00, 9668.57it/s] 


Unnamed: 0,bias,error
count,52062.0,52062.0
mean,0.575663,3.818279
std,6.486537,5.275133
min,-25.0,0.0
25%,-1.591775,0.8022
50%,0.24135,1.92335
75%,2.242475,4.2594
max,25.0,25.0


In [8]:
# Baseline: last dealer-to-dealer trade (falls back to the last trade of any type).
evaluate_model(last_dealer_trade_model, val_dataset)[0]

100%|██████████| 52062/52062 [00:10<00:00, 5159.30it/s]


Unnamed: 0,bias,error
count,52062.0,52062.0
mean,0.08591,3.679828
std,6.117173,4.887309
min,-25.0,0.0
25%,-1.8406,0.837725
50%,0.1356,1.98595
75%,2.112175,4.298075
max,25.0,25.0


In [9]:
# Baseline: mean OAS of all dealer-to-dealer trades (same fallback as above).
evaluate_model(mean_dealer_trade_model, val_dataset)[0]

100%|██████████| 52062/52062 [00:11<00:00, 4701.90it/s]


Unnamed: 0,bias,error
count,52062.0,52062.0
mean,-0.267138,4.186084
std,6.816734,5.386612
min,-25.0,0.0
25%,-2.3987,0.964684
50%,-0.011836,2.26686
75%,2.150884,4.899404
max,25.0,25.0


#### Your Model Here

In [10]:
class DataModelFormatter:
    """Convert raw observations into list-valued features for the weighted-average model.

    Each observation's trade history is walked from its last row to its first.
    Consecutive rows that share both ``ts_diff_hrs`` and ``quantity`` are merged
    into one group, and each group is reduced to a single (OAS, type label,
    quantity, time) entry.
    """

    def format_dataset(self, dataset: List[Dict]) -> List[Dict]:
        """Format every observation that has a non-empty "prev_trades" frame.

        Args:
            dataset: list of observations read through "prev_trades" and "oas".

        Returns:
            One dict per retained observation (see ``_process_trades``).
            Observations whose "prev_trades" is missing or empty are skipped.
        """
        formatted_data = []
        for observation in tqdm(dataset):
            prev_trades = observation.get("prev_trades")
            if prev_trades is None or len(prev_trades) == 0:
                continue
            formatted_data.append(self._process_trades(prev_trades, observation['oas']))
        return formatted_data

    def _process_trades(self, trades: pd.DataFrame, target_oas: float) -> Dict:
        """Group a trade history and return the per-group feature lists.

        Args:
            trades: frame with "oas", "trade_type", "quantity" and "ts_diff_hrs" columns.
            target_oas: value stored unchanged under "target_oas".

        Returns:
            Dict with "target_oas" plus four parallel lists ("data_oas",
            "data_type", "data_time", "data_quantity"), one entry per group,
            ordered from the last row of ``trades`` backwards.
        """
        obs_oas: List[float] = []
        trade_types: List[str] = []
        quantities: List[float] = []
        times: List[float] = []

        def flush(dealer_oas, cust_oas, ts, quantity, trade_type) -> None:
            # Reduce the finished group and append it to the output lists.
            agg = self._finalize_group(dealer_oas, cust_oas, ts, quantity, trade_type)
            if agg is None:
                return
            avg_oas, label, group_quantity, time_val = agg
            obs_oas.append(avg_oas)
            trade_types.append(label)
            quantities.append(group_quantity)
            times.append(time_val)

        curr_dealer_oas: List[float] = []
        curr_cust_oas: List[float] = []
        last_ts = 0
        last_quantity = 0
        last_type = ''

        # Iterate plain Python values column-wise instead of DataFrame.iterrows():
        # same rows in the same (reversed) order, without building a Series per row.
        reversed_rows = zip(
            reversed(trades['oas'].tolist()),
            reversed(trades['trade_type'].tolist()),
            reversed(trades['quantity'].tolist()),
            reversed(trades['ts_diff_hrs'].tolist()),
        )
        for oas, trade_type, quantity, ts in reversed_rows:
            # A change in timestamp or quantity closes the current group.
            if last_ts != ts or last_quantity != quantity:
                flush(curr_dealer_oas, curr_cust_oas, last_ts, last_quantity, last_type)
                curr_dealer_oas, curr_cust_oas = [], []

            if trade_type != 'dealer_dealer':
                curr_cust_oas.append(oas)
            else:
                curr_dealer_oas.append(oas)

            last_ts, last_quantity, last_type = ts, quantity, trade_type

        # Flush the group that was still open when the rows ran out.
        flush(curr_dealer_oas, curr_cust_oas, last_ts, last_quantity, last_type)

        return {
            'target_oas': target_oas,
            'data_oas': obs_oas,
            'data_type': trade_types,
            'data_time': times,
            'data_quantity': quantities
        }

    def _finalize_group(
        self,
        curr_dealer_oas: List[float],
        curr_cust_oas: List[float],
        last_ts: float,
        last_quantity: float,
        last_type: str
    ) -> Optional[Tuple[float, str, float, float]]:
        """Reduce one group to ``(mean OAS, label, quantity, time)``.

        Returns None for an empty group. Labels:
            - 'combo_dc': dealer and non-dealer trades both present; only the
              dealer OAS values are averaged.
            - 'combo_cc': two or more non-dealer trades.
            - 'combo_dd': two or more dealer trades.
            - 'dealer_dealer': a single dealer trade.
            - ``last_type``: a single non-dealer trade keeps its own type.
        """
        if not (curr_dealer_oas or curr_cust_oas):
            return None
        if curr_dealer_oas and curr_cust_oas:
            return np.mean(curr_dealer_oas), 'combo_dc', last_quantity, last_ts
        if len(curr_cust_oas) >= 2:
            return np.mean(curr_cust_oas), 'combo_cc', last_quantity, last_ts
        if len(curr_dealer_oas) >= 2:
            return np.mean(curr_dealer_oas), 'combo_dd', last_quantity, last_ts
        if curr_dealer_oas:
            return np.mean(curr_dealer_oas), 'dealer_dealer', last_quantity, last_ts
        # Only remaining case: exactly one non-dealer trade.
        return np.mean(curr_cust_oas), last_type, last_quantity, last_ts


In [11]:
# Build one feature dict per training observation that has previous trades.
# Slow cell: the recorded run took about 1m49s for 156,186 observations.
formatter = DataModelFormatter()
formatted_data = formatter.format_dataset(train_dataset)

  0%|          | 0/156186 [00:00<?, ?it/s]

100%|██████████| 156186/156186 [01:49<00:00, 1428.09it/s]


In [12]:
# One row per formatted observation. Columns: target_oas plus the list-valued
# data_oas / data_type / data_time / data_quantity (one entry per trade group).
df_train = pd.DataFrame(formatted_data)

# Clean for modeling

In [None]:
# Element 0 of each list is the first group emitted by the formatter, i.e. the
# one built from the last rows of the trade history.
df_train['last_data_type'] = df_train['data_type'].map(lambda groups: groups[0])
df_train['last_oas'] = df_train['data_oas'].map(lambda groups: groups[0])

# Naive "last group" forecast errors, clipped at 25.
# NOTE(review): bias here is prediction - target, the opposite sign of the
# `bias` column produced by evaluate_model (actual - predicted).
df_train['error_last'] = np.clip((df_train['last_oas'] - df_train['target_oas']).abs(), 0, 25)
df_train['bias_last'] = np.clip(df_train['last_oas'] - df_train['target_oas'], -25, 25)

In [14]:
# error_last is clipped at 25, so `< 25` drops the rows whose last-group OAS is
# 25 or more away from the target.
# .copy() gives an independent frame: later cells add columns to
# df_train_cleaned, and assigning into a filtered slice of df_train is
# chained assignment (the resulting warning is hidden by the global filter).
df_train_cleaned = df_train[df_train['error_last'] < 25].copy()

# Modeling

In [15]:
def fit_weighted_average(
    values: List[float],
    weights: List[float],
    half_life: float = 1
) -> float:
    """Weighted average with an extra exponential decay by position.

    The value at position ``i`` gets weight ``weights[i] * 0.5 ** (i / half_life)``,
    so position 0 counts most and the weight halves every ``half_life`` positions.

    Args:
        values: observations (list or array).
        weights: base weight per observation; same length as ``values``.
        half_life: number of positions over which the decay halves; must be > 0.

    Returns:
        The weighted average as a float, or NaN when the combined weights sum
        to zero (including empty input), since the average is then undefined.

    Raises:
        ValueError: if the lengths differ or ``half_life`` is not positive.
    """
    if len(values) != len(weights):
        raise ValueError("The length of values and weights must be the same.")
    if half_life <= 0:
        # Previously a ZeroDivisionError for 0 and growing weights for negatives.
        raise ValueError("half_life must be positive.")

    values_arr = np.asarray(values, dtype=float)
    weights_arr = np.asarray(weights, dtype=float)

    decay_factor = 0.5 ** (1 / half_life)
    combined_weights = weights_arr * decay_factor ** np.arange(len(values_arr))

    total_weight = combined_weights.sum()
    if total_weight == 0:
        # Same result for list and array inputs; no division-by-zero error or warning.
        return float('nan')

    return float(np.dot(values_arr, combined_weights) / total_weight)

In [16]:
def f_weight_quantity(i, cutoff=1000000, low_val=0.65):
    """Weight by trade size: 1 when quantity ``i`` is strictly above ``cutoff``, else ``low_val``."""
    if i > cutoff:
        return 1
    return low_val

def f_offset_type(data_type: str, offsets: dict):
    """Look up the OAS offset for a trade type; types missing from ``offsets`` get 0."""
    no_offset = 0
    return offsets.get(data_type, no_offset)

def f_weight_type(data_type: str, trade_weights: dict):
    """Look up the weight for a trade type; types missing from ``trade_weights`` get weight 0."""
    excluded = 0
    return trade_weights.get(data_type, excluded)

In [17]:
def get_prediction(row, half_life, offsets, trade_weights, low_val, cutoff=1000000):
    """Predict the OAS for one formatted observation.

    Each trade group contributes its OAS plus a per-type offset. Its weight is
    the product of a size weight (``f_weight_quantity``) and a type weight
    (``f_weight_type``); ``fit_weighted_average`` then applies a decay by list
    position controlled by ``half_life``.

    Args:
        row: mapping with list-valued 'data_quantity', 'data_type' and 'data_oas'.
        half_life: decay half-life passed to ``fit_weighted_average``.
        offsets: trade type -> additive OAS offset.
        trade_weights: trade type -> weight.
        low_val: weight used for quantities not above ``cutoff``.
        cutoff: quantity threshold for the size weight.
    """
    size_weights = np.array(
        [f_weight_quantity(qty, cutoff, low_val) for qty in row['data_quantity']]
    )
    group_types = row['data_type']
    type_weights = np.array([f_weight_type(kind, trade_weights) for kind in group_types])
    type_offsets = np.array([f_offset_type(kind, offsets) for kind in group_types])
    adjusted_oas = type_offsets + np.array(row['data_oas'])

    return fit_weighted_average(
        values=adjusted_oas,
        weights=size_weights * type_weights,
        half_life=half_life,
    )

def compute_score(true_values, predictions, clip=None):
    """Mean absolute error between ``true_values`` and ``predictions``.

    Args:
        true_values: array-like of targets.
        predictions: array-like of forecasts, same length.
        clip: optional cap applied to each absolute error before averaging.
            Passing 25 matches the clipping used by ``evaluate_model``.
            The default (None) keeps the unclipped mean absolute error.

    Returns:
        The (optionally clipped) mean absolute error.
    """
    abs_errors = np.abs(
        np.asarray(true_values, dtype=float) - np.asarray(predictions, dtype=float)
    )
    if clip is not None:
        abs_errors = np.minimum(abs_errors, clip)
    return np.mean(abs_errors)

In [18]:
def grid_search(data, true_values, offsets_grid, trade_weights_grid, low_vals, half_life_values, cutoff=1000000):
    """Exhaustive search for the parameter combination with the lowest score.

    Every combination of offsets, trade weights, low_val and half_life is
    scored by predicting each row of ``data`` with ``get_prediction`` and
    passing the result to ``compute_score``.

    Returns:
        ``(best_params, best_score)``. ``best_params`` is None if no
        combination scored below infinity.

    NOTE(review): a later cell defines ``grid_search`` again with the same
    signature (adding a progress bar); once that cell runs it replaces this one.
    """
    winner = None
    lowest = float('inf')

    for offsets in offsets_grid:
        for trade_weights in trade_weights_grid:
            for low_val in low_vals:
                for half_life in half_life_values:
                    candidate = {
                        'offsets': offsets,
                        'trade_weights': trade_weights,
                        'low_val': low_val,
                        'half_life': half_life,
                        'cutoff': cutoff
                    }
                    predictions = data.apply(
                        lambda row: get_prediction(
                            row, half_life, offsets, trade_weights, low_val, cutoff
                        ),
                        axis=1,
                    ).values
                    score = compute_score(true_values, predictions)

                    # Strict '<' keeps the first of several tied combinations.
                    if score < lowest:
                        lowest, winner = score, candidate
    return winner, lowest



In [19]:
def grid_search(data, true_values, offsets_grid, trade_weights_grid, low_vals, half_life_values, cutoff=1000000):
    """Exhaustive search with a progress bar over all parameter combinations.

    Each (offsets, trade_weights, low_val, half_life) combination is scored by
    predicting every row of ``data`` with ``get_prediction`` and passing the
    result to ``compute_score``.

    Returns:
        ``(best_params, best_score)``. ``best_params`` is None if no
        combination scored below infinity.
    """
    def score_combination(offsets, trade_weights, low_val, half_life):
        # Row-wise prediction over the whole frame for one parameter combination.
        predictions = data.apply(
            lambda row: get_prediction(row, half_life, offsets, trade_weights, low_val, cutoff),
            axis=1,
        ).values
        return compute_score(true_values, predictions)

    combinations = list(itertools.product(offsets_grid, trade_weights_grid, low_vals, half_life_values))

    best_params, best_score = None, float('inf')
    for offsets, trade_weights, low_val, half_life in tqdm(combinations, total=len(combinations)):
        score = score_combination(offsets, trade_weights, low_val, half_life)

        # Strict '<' keeps the first of several tied combinations.
        if score < best_score:
            best_score = score
            best_params = dict(
                offsets=offsets,
                trade_weights=trade_weights,
                low_val=low_val,
                half_life=half_life,
                cutoff=cutoff,
            )
    return best_params, best_score



In [20]:
# Empty container for the tuned parameters. The "Final results" cell rebinds
# `params` to a literal dict, and the search cells assign individual keys.
params = {}

## Final results

In [21]:
# Reference parameter set; used for the `prediction_benchmark` columns below.
# 'offsets' are added to a group's OAS by trade type (types without an entry get 0);
# 'trade_weights' weight a group by trade type (types without an entry get 0).
params_benchmark = {'offsets': {'dealer_buy': 1.25, 'dealer_dealer': -0.2, 'dealer_sell': -1.45},
  'trade_weights': {'dealer_dealer': 1,
   'dealer_buy': 0.65,
   'dealer_sell': 0.65,
   'combo_dd': 1.1,
   'combo_dc': 0.9,
   'combo_cc': 0.75},
  'low_val': 0.6,
  'half_life': 1,
  'cutoff': 1000000}

# Hand-entered tuned parameters used for the `prediction` columns below.
# NOTE(review): unlike params_benchmark there is no 'cutoff' key here; the
# prediction cells pass 1000000 explicitly.
params = {
  'offsets': 
  {
    'dealer_buy': 1.1,
    'dealer_dealer': -0.15,
    'dealer_sell': -1.25
  },
  'trade_weights': 
  {
    'dealer_dealer': 1.1,
    'dealer_buy': 0.65,
    'dealer_sell': 0.55,
    'combo_dd': 1.2,
    'combo_dc': 1.0,
    'combo_cc': 0.85
  },
  'low_val': 0.5,
  'half_life': 1.0
}


In [22]:
# Predictions on the cleaned training rows using the tuned parameters.
df_train_cleaned['prediction'] = df_train_cleaned.apply(
    lambda row: get_prediction(
        row,
        half_life=params['half_life'],
        offsets=params['offsets'],
        trade_weights=params['trade_weights'],
        low_val=params['low_val'],
        cutoff=1000000,
    ),
    axis=1,
)

# Same rows, benchmark parameters.
df_train_cleaned['prediction_benchmark'] = df_train_cleaned.apply(
    lambda row: get_prediction(
        row,
        half_life=params_benchmark['half_life'],
        offsets=params_benchmark['offsets'],
        trade_weights=params_benchmark['trade_weights'],
        low_val=params_benchmark['low_val'],
        cutoff=1000000,
    ),
    axis=1,
)

In [32]:
# Clipped absolute error and signed bias (prediction - target) for the tuned parameters.
df_train_cleaned['error_prediction'] = np.clip(
    (df_train_cleaned['prediction'] - df_train_cleaned['target_oas']).abs(), 0, 25
)
df_train_cleaned['bias_prediction'] = np.clip(
    df_train_cleaned['prediction'] - df_train_cleaned['target_oas'], -25, 25
)

# Same two metrics for the benchmark parameters.
df_train_cleaned['error_prediction_benchmark'] = np.clip(
    (df_train_cleaned['prediction_benchmark'] - df_train_cleaned['target_oas']).abs(), 0, 25
)
df_train_cleaned['bias_prediction_benchmark'] = np.clip(
    df_train_cleaned['prediction_benchmark'] - df_train_cleaned['target_oas'], -25, 25
)

In [33]:
# Clipped absolute error on the cleaned training rows: tuned vs benchmark vs naive last-group OAS.
df_train_cleaned[['error_prediction', 'error_prediction_benchmark', 'error_last']].describe()

Unnamed: 0,error_prediction,error_prediction_benchmark,error_last
count,151246.0,151246.0,151246.0
mean,3.012738,3.085823,3.049101
std,3.815835,3.863435,3.926786
min,0.0,0.0,0.0
25%,0.770284,0.802027,0.7068
50%,1.748013,1.809229,1.6866
75%,3.620874,3.707556,3.631775
max,25.0,25.0,24.9996


# All data

In [34]:
# Repeat the prediction and error computation on every training row,
# including the rows removed from df_train_cleaned.
df_train['prediction'] = df_train.apply(
    lambda row: get_prediction(
        row,
        half_life=params['half_life'],
        offsets=params['offsets'],
        trade_weights=params['trade_weights'],
        low_val=params['low_val'],
        cutoff=1000000,
    ),
    axis=1,
)
df_train['prediction_benchmark'] = df_train.apply(
    lambda row: get_prediction(
        row,
        half_life=params_benchmark['half_life'],
        offsets=params_benchmark['offsets'],
        trade_weights=params_benchmark['trade_weights'],
        low_val=params_benchmark['low_val'],
        cutoff=1000000,
    ),
    axis=1,
)

# Clipped absolute error and signed bias (prediction - target), tuned parameters.
df_train['error_prediction'] = np.clip(
    (df_train['prediction'] - df_train['target_oas']).abs(), 0, 25
)
df_train['bias_prediction'] = np.clip(
    df_train['prediction'] - df_train['target_oas'], -25, 25
)

# Same two metrics, benchmark parameters.
df_train['error_prediction_benchmark'] = np.clip(
    (df_train['prediction_benchmark'] - df_train['target_oas']).abs(), 0, 25
)
df_train['bias_prediction_benchmark'] = np.clip(
    df_train['prediction_benchmark'] - df_train['target_oas'], -25, 25
)

In [35]:
# Clipped absolute error on all training rows: tuned vs benchmark vs naive last-group OAS.
df_train[['error_prediction', 'error_prediction_benchmark', 'error_last']].describe()

Unnamed: 0,error_prediction,error_prediction_benchmark,error_last
count,156186.0,156186.0,156186.0
mean,3.610279,3.694306,3.743385
std,5.088859,5.148312,5.448868
min,0.0,0.0,0.0
25%,0.796581,0.831635,0.7333
50%,1.825873,1.891763,1.76945
75%,3.945334,4.03902,3.981175
max,25.0,25.0,25.0


## Search offsets

In [101]:
# Candidate offsets per trade type, in steps of 0.1.
# linspace with an explicit point count plus rounding gives exact grid values;
# np.arange with a fractional step accumulates floating-point error
# (e.g. 1.2000000000000002), which would otherwise end up in the stored parameters.
dealer_buy_vals = np.round(np.linspace(1.1, 1.4, 4), 2)
dealer_dealer_vals = np.round(np.linspace(-0.35, -0.15, 3), 2)
dealer_sell_vals = np.round(np.linspace(-1.25, -1.65, 5), 2)

# Cartesian product: 4 * 3 * 5 = 60 offset dictionaries.
offset_grid = [
    {'dealer_buy': db, 'dealer_dealer': dd, 'dealer_sell': ds}
    for db, dd, ds in itertools.product(dealer_buy_vals, dealer_dealer_vals, dealer_sell_vals)
]

In [102]:
# Stage 1 of a one-parameter-group-at-a-time search: vary the offsets only,
# with trade weights, low_val and half_life held fixed.
# NOTE(review): the trade-weights literal repeats params_benchmark['trade_weights'].
# The recorded run took about 2.5 minutes for 60 combinations.
grid_search_results = grid_search(
    df_train_cleaned, 
    df_train_cleaned['target_oas'], 
    offset_grid,
    [{
        'dealer_dealer': 1,
        'dealer_buy': .65,
        'dealer_sell': .65,
        'combo_dd': 1.1,
        'combo_dc': 0.9,
        'combo_cc': .75
    }],
    low_vals = [0.6],
    half_life_values = [1]
)

100%|██████████| 60/60 [02:31<00:00,  2.53s/it]


In [None]:
# grid_search returns (best_params, best_score); keep the winning offsets for the next stages.
params['offsets'] = grid_search_results[0]['offsets']

## Search weights

In [115]:
# Centre of the trade-weight search.
base_params = {
    'dealer_dealer': 1,
    'dealer_buy': 0.65,
    'dealer_sell': 0.65,
    'combo_dd': 1.1,
    'combo_dc': 0.9,
    'combo_cc': 0.75
}

# Three candidates per trade type: base - 0.1, base, base + 0.1.
# Rounding to 2 decimals gives exact grid values; np.arange with a fractional
# step yields values such as 1.2000000000000002.
param_grid = {key: [round(value + delta, 2) for delta in (-0.1, 0.0, 0.1)]
              for key, value in base_params.items()}

# Cartesian product over the six trade types: 3 ** 6 = 729 weight dictionaries.
combinations_weights = [dict(zip(param_grid.keys(), combination))
                for combination in itertools.product(*param_grid.values())]

In [119]:
# Stage 2: vary the trade weights with the offsets fixed to params['offsets'].
# The recorded run took about 30 minutes for 729 combinations.
grid_search_results = grid_search(
    df_train_cleaned, 
    df_train_cleaned['target_oas'], 
    [params['offsets']],
    combinations_weights,
    low_vals = [0.6],
    half_life_values = [1]
)

100%|██████████| 729/729 [30:35<00:00,  2.52s/it]


In [123]:
# Keep the winning trade weights (element 0 of the result is best_params).
params['trade_weights'] = grid_search_results[0]['trade_weights']

## Search low-quantity weight (`low_val`)

In [125]:
# Stage 3: vary low_val (the weight given to quantities not above the cutoff)
# with offsets and trade weights fixed.
# NOTE(review): the hard-coded params['low_val'] in the final-results cell is 0.5,
# the lower edge of this grid. If that value came from this search, consider
# extending the grid downward.
grid_search_results = grid_search(
    df_train_cleaned, 
    df_train_cleaned['target_oas'], 
    [params['offsets']],
    [params['trade_weights']],
    low_vals = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9],
    half_life_values = [1]
)

100%|██████████| 9/9 [00:22<00:00,  2.53s/it]


In [129]:
# Keep the winning low_val (element 0 of the result is best_params).
params['low_val'] = grid_search_results[0]['low_val']

## Search half_life val

In [130]:
# Stage 4: vary half_life from 1.0 to 2.0 in steps of 0.1 with everything else fixed
# (the + 0.001 keeps the 2.0 endpoint in the range; the recorded run had 11 values).
# NOTE(review): the hard-coded params['half_life'] in the final-results cell is 1.0,
# the lower edge of this grid. If that value came from this search, consider
# testing values below 1.
grid_search_results = grid_search(
    df_train_cleaned, 
    df_train_cleaned['target_oas'], 
    [params['offsets']],
    [params['trade_weights']],
    low_vals = [params['low_val']],
    half_life_values = np.arange(1, 2 + 0.001, 0.1)
)

100%|██████████| 11/11 [00:28<00:00,  2.55s/it]


In [134]:
# Keep the winning half_life (element 0 of the result is best_params).
params['half_life'] = grid_search_results[0]['half_life']