# Compare Datasets

Created by Mitas Ray on 2024-11-16.

This notebook is used to compare two datasets. The procedure is to 
1. restrict the datasets to the same datetime window
2. perform high-level analysis on the values in the dataset
3. train a model on each dataset and check that the two sets of accuracy results are similar

To run the notebook,
1. on Linux: install the packages in `ficc_python/requirements_py310.txt`, and then run `pip install jupyter`
2. on Mac: install the packages in `ficc_python/requirements_py310_mac_jupyter.txt`

In [18]:
# loads the autoreload extension
%load_ext autoreload
# automatically reloads all imported modules when their source code changes
%autoreload 2

In [20]:
from datetime import datetime

import pandas as pd


# importing from parent directory: https://stackoverflow.com/questions/714063/importing-modules-from-parent-folder
import sys
sys.path.insert(0,'../')


from ficc.utils.auxiliary_functions import get_ys_trade_history_features
from ficc.utils.gcp_storage_functions import download_data

from automated_training_auxiliary_variables import CATEGORICAL_FEATURES, BINARY, NON_CAT_FEATURES, NUM_TRADES_IN_HISTORY_YIELD_SPREAD_MODEL, BUCKET_NAME, MODEL_TO_CUMULATIVE_DATA_PICKLE_FILENAME
from automated_training_auxiliary_functions import STORAGE_CLIENT, MODEL_NAME_TO_KERAS_MODEL, check_that_model_is_supported, fit_encoders, create_input, train_and_evaluate_model, create_summary_of_results, get_optional_arguments_for_process_data
from set_random_seed import set_seed


# NOTE(review): presumably fixes the random seeds so that the training runs in this notebook are repeatable; confirm in `set_random_seed.py`
set_seed()

In [2]:
# Model name used throughout the notebook: it selects which cumulative data pickle is downloaded and is passed to 
# the encoder / input-creation helpers and the Keras model lookup inside `train_model(...)`
MODEL = 'yield_spread_with_similar_trades'

Get data from Google Cloud Storage.

In [3]:
# `old_data`: the cumulative dataset registered for `MODEL`, stored in bucket `BUCKET_NAME`
old_data_pickle_filename = MODEL_TO_CUMULATIVE_DATA_PICKLE_FILENAME[MODEL]
old_data = download_data(STORAGE_CLIENT, BUCKET_NAME, old_data_pickle_filename)

# `new_data`: the dataset that is being compared against `old_data`
new_data_bucket_name = 'sp_data_for_modeling'
new_data_pickle_filename = 'processed_data_from_jesse_tests_trade_history_same_issue_5_yr_mat_bucket_1_materialized_2024-10.pkl'
new_data = download_data(STORAGE_CLIENT, new_data_bucket_name, new_data_pickle_filename)

File processed_data_yield_spread_with_similar_trades.pkl downloaded from Google cloud bucket: automated_training
File processed_data_from_jesse_tests_trade_history_same_issue_5_yr_mat_bucket_1_materialized_2024-10.pkl downloaded from Google cloud bucket: sp_data_for_modeling


Restrict the data between a start and end datetime.

In [4]:
def string_to_datetime(datetime_as_string: datetime | str) -> datetime:
    if isinstance(datetime_as_string, datetime): return datetime_as_string
    string_format = '%Y-%m-%d %H:%M:%S'
    try:
        return datetime.strptime(datetime_as_string, string_format)
    except Exception as e:
        print(f'{datetime_as_string} must be in {string_format} format')
        raise e


def restrict_data_to_specified_time_window(data: pd.DataFrame, 
                                           datetime_column_name: str, 
                                           start_datetime: datetime | str, 
                                           end_datetime: datetime | str) -> pd.DataFrame:
    '''Return the rows of `data` whose `datetime_column_name` value lies between `start_datetime` and 
    `end_datetime`, with both bounds included. Each bound may be a `datetime` or a string accepted by 
    `string_to_datetime(...)`. Prints how many rows were removed and how many remain.'''
    window_start = string_to_datetime(start_datetime)
    window_end = string_to_datetime(end_datetime)
    inside_window = data[datetime_column_name].between(window_start, window_end)    # `between` includes both endpoints by default
    num_rows_total = len(data)
    num_rows_kept = inside_window.sum()
    print(f'{num_rows_total - num_rows_kept} rows removed from the original {num_rows_total} rows. {num_rows_kept} rows remain.')
    return data[inside_window]

In [5]:
# Bounds of the comparison window (both are included when filtering), written in the `%Y-%m-%d %H:%M:%S` 
# format that `string_to_datetime(...)` parses
october_1_start_of_day = '2024-10-01 00:00:00'
october_31_end_of_day = '2024-10-31 23:59:59'

In [6]:
def restrict_data_to_october_on_trade_datetime(data):
    '''Return the rows of `data` whose `trade_datetime` lies between `october_1_start_of_day` and 
    `october_31_end_of_day`.'''
    return restrict_data_to_specified_time_window(data=data, 
                                                  datetime_column_name='trade_datetime', 
                                                  start_datetime=october_1_start_of_day, 
                                                  end_datetime=october_31_end_of_day)

In [7]:
# NOTE: rebinds `old_data` to the October-only subset; re-run the download cell to get the full dataset back
old_data = restrict_data_to_october_on_trade_datetime(old_data)

8211991 rows removed from the original 9374027 rows. 1162036 rows remain.


In [8]:
# NOTE: rebinds `new_data` to the October-only subset; re-run the download cell to get the full dataset back
new_data = restrict_data_to_october_on_trade_datetime(new_data)

0 rows removed from the original 1167232 rows. 1167232 rows remain.


Compare datasets.

In [None]:
def compare_shapes(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
    '''Print the shape, i.e., `(number of rows, number of columns)`, of each dataset.'''
    print('\n=== Dataset Shapes ===')
    for dataset_number, dataset in enumerate((df1, df2), start=1):
        print(f'Dataset {dataset_number} Shape: {dataset.shape}')


def compare_columns(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
    '''Print a column-level comparison of the two datasets: the column names of each dataset, the data 
    type of every column of each dataset, and which columns are shared by both datasets versus present 
    in only one of them.

    The common / unique columns are printed as lists that follow the column order of the originating 
    dataset (Dataset 1 for the common columns), so the output is identical from run to run. Printing 
    `set` objects directly would order the names by their string hash, which changes between kernel 
    restarts and makes two runs of the notebook hard to compare.'''
    print('\n=== Column Comparison ===')
    print(f'Dataset 1 Columns: {df1.columns.tolist()}')
    print(f'Dataset 2 Columns: {df2.columns.tolist()}')
    
    print('\n=== Data Types ===')
    print('Dataset 1 Data Types:')
    print(df1.dtypes)
    print('\nDataset 2 Data Types:')
    print(df2.dtypes)

    print('\n=== Common and Unique Columns ===')
    # sets are used only for fast membership tests; `dict.fromkeys(...)` drops repeated column names while keeping their order
    df1_column_set, df2_column_set = set(df1.columns), set(df2.columns)
    df1_columns_in_order, df2_columns_in_order = list(dict.fromkeys(df1.columns)), list(dict.fromkeys(df2.columns))
    common_cols = [column for column in df1_columns_in_order if column in df2_column_set]
    unique_to_df1 = [column for column in df1_columns_in_order if column not in df2_column_set]
    unique_to_df2 = [column for column in df2_columns_in_order if column not in df1_column_set]
    print(f'Common Columns: {common_cols}')
    print(f'Columns only in Dataset 1: {unique_to_df1}')
    print(f'Columns only in Dataset 2: {unique_to_df2}')


def missing_values(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
    '''Print, for each dataset, the number of null entries in every column.'''
    print('\n=== Missing Values ===')
    for dataset_number, dataset in enumerate((df1, df2), start=1):
        separator = '' if dataset_number == 1 else '\n'    # blank line between the two reports
        print(f'{separator}Dataset {dataset_number} Missing Values:')
        null_count_per_column = dataset.isnull().sum()
        print(null_count_per_column)


def statistical_summary(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
    '''Print, for each dataset, the output of `describe(include='all')`.'''
    print('\n=== Statistical Summary ===')
    for dataset_number, dataset in enumerate((df1, df2), start=1):
        separator = '' if dataset_number == 1 else '\n'    # blank line between the two summaries
        print(f'{separator}Dataset {dataset_number} Summary:')
        summary = dataset.describe(include='all')
        print(summary)

In [None]:
# Print the high-level comparison; `old_data` is reported as "Dataset 1" and `new_data` as "Dataset 2"
compare_shapes(old_data, new_data)
compare_columns(old_data, new_data)
missing_values(old_data, new_data)
# The statistical summary is disabled; uncomment the line below to print `describe(include='all')` for both datasets
# statistical_summary(old_data, new_data)


=== Dataset Shapes ===
Dataset 1 Shape: (1162036, 141)
Dataset 2 Shape: (1167232, 139)

=== Column Comparison ===
Dataset 1 Columns: ['rtrs_control_number', 'cusip', 'yield', 'is_callable', 'refund_date', 'accrual_date', 'dated_date', 'next_sink_date', 'coupon', 'delivery_date', 'trade_date', 'trade_datetime', 'par_call_date', 'interest_payment_frequency', 'is_called', 'is_non_transaction_based_compensation', 'is_general_obligation', 'callable_at_cav', 'extraordinary_make_whole_call', 'make_whole_call', 'has_unexpired_lines_of_credit', 'escrow_exists', 'incorporated_state_code', 'trade_type', 'par_traded', 'maturity_date', 'settlement_date', 'next_call_date', 'issue_amount', 'maturity_amount', 'issue_price', 'orig_principal_amount', 'max_amount_outstanding', 'dollar_price', 'calc_date', 'purpose_sub_class', 'called_redemption_type', 'calc_day_cat', 'previous_coupon_payment_date', 'instrument_primary_name', 'purpose_class', 'call_timing', 'call_timing_in_part', 'sink_frequency', 'sink_

Train yield spread with similar trades model.

In [11]:
def get_num_features_for_each_trade_in_history() -> int:
    '''Return the number of trade history features listed by `get_ys_trade_history_features(...)` for 
    the `use_treasury_spread` setting found in the optional process-data arguments of `MODEL`.'''
    process_data_kwargs = get_optional_arguments_for_process_data(MODEL)
    treasury_spread_enabled = process_data_kwargs.get('use_treasury_spread', False)    # treated as `False` when the key is absent
    return len(get_ys_trade_history_features(treasury_spread_enabled))

In [12]:
def train_model(data: pd.DataFrame, 
                last_trade_date_for_training_dataset: str):
    '''Train and evaluate the `MODEL` model on `data` and return the summary of results on the test set. 
    Heavily inspired by `automated_training_auxiliary_functions::train_model(...)`. The main changes are: 
    (1) assume that we are using the yield spread with similar trades model,
    (2) do not have an exclusions function
    (3) do not restrict the test set to just a single day

    `data` is split on its `trade_date` column: trades on or before `last_trade_date_for_training_dataset` 
    form the training set and trades after it form the test set. Note that the encoders are fit on all of 
    `data`, i.e., before the split. The sizes and `trade_datetime` ranges of both sets are printed. 
    Returns the output of `create_summary_of_results(...)` for the test set.
    '''
    check_that_model_is_supported(MODEL)
    encoders, fmax = fit_encoders(data, CATEGORICAL_FEATURES, MODEL)
    test_data = data[data.trade_date > last_trade_date_for_training_dataset]    # `test_data` can only contain trades after `last_trade_date_for_training_dataset`
    train_data = data[data.trade_date <= last_trade_date_for_training_dataset]    # `train_data` only contains trades before and including `last_trade_date_for_training_dataset`
    training_set_info = f'Training set contains {len(train_data)} trades ranging from trade datetimes of {train_data.trade_datetime.min()} to {train_data.trade_datetime.max()}'
    test_set_info = f'Test set contains {len(test_data)} trades ranging from trade datetimes of {test_data.trade_datetime.min()} to {test_data.trade_datetime.max()}'
    print(training_set_info)
    print(test_set_info)

    x_train, y_train = create_input(train_data, encoders, MODEL)
    x_test, y_test = create_input(test_data, encoders, MODEL)

    keras_model = MODEL_NAME_TO_KERAS_MODEL[MODEL]
    untrained_model = keras_model(x_train, 
                                  NUM_TRADES_IN_HISTORY_YIELD_SPREAD_MODEL, 
                                  get_num_features_for_each_trade_in_history(), 
                                  CATEGORICAL_FEATURES, 
                                  NON_CAT_FEATURES, 
                                  BINARY, 
                                  fmax)
    # only the trained model is needed here; the other two returned values (previously bound to `mae` and `history`) are unused
    trained_model, _, _ = train_and_evaluate_model(untrained_model, x_train, y_train, x_test, y_test)
    return create_summary_of_results(trained_model, test_data, x_test, y_test)

In [23]:
# Train and evaluate on the old dataset; trades with `trade_date` on or before 2024-10-25 form the training set
train_model(old_data, '2024-10-25')    # Monday 2024-10-28 - Thursday 2024-10-31 is the test set

INFO: Pandarallel will run on 5 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Initialized pandarallel with 5 cores
Training set contains 927394 trades ranging from trade datetimes of 2024-10-01 00:00:00 to 2024-10-25 19:06:49
Test set contains 234642 trades ranging from trade datetimes of 2024-10-28 06:00:01 to 2024-10-31 18:35:10
BEGIN create_input
END create_input. Execution time: 0:00:03.103
BEGIN create_input
END create_input. Execution time: 0:00:00.760




Epoch 1/100


2024-11-15 18:22:03.544234: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" frequency: 2400 num_cores: 10 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }




2024-11-15 18:22:56.603387: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" frequency: 2400 num_cores: 10 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
  1/235 [..............................] - ETA: 4:26

2024-11-15 19:05:43.876133: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" frequency: 2400 num_cores: 10 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


|                                 |   Mean Absolute Error |   Trade Count |
|:--------------------------------|----------------------:|--------------:|
| Entire set                      |                12.523 |        234642 |
| Dealer-Dealer                   |                13.17  |         86072 |
| Bid Side / Dealer-Purchase      |                13.827 |         63992 |
| Offered Side / Dealer-Sell      |                10.877 |         84578 |
| AAA                             |                10.867 |         38685 |
| Investment Grade                |                12.116 |        189164 |
| Trade size >= 100k              |                10.47  |         52245 |
| Last trade <= 7 days            |                10.705 |        165565 |
| 7 days < Last trade <= 14 days  |                14.084 |         20191 |
| 14 days < Last trade <= 28 days |                16.423 |         15173 |
| 28 days < Last trade            |                18.756 |         33713 |


Unnamed: 0,Mean Absolute Error,Trade Count
Entire set,12.523,234642
Dealer-Dealer,13.17,86072
Bid Side / Dealer-Purchase,13.827,63992
Offered Side / Dealer-Sell,10.877,84578
AAA,10.867,38685
Investment Grade,12.116,189164
Trade size >= 100k,10.47,52245
Last trade <= 7 days,10.705,165565
7 days < Last trade <= 14 days,14.084,20191
14 days < Last trade <= 28 days,16.423,15173


In [24]:
# Train and evaluate on the new dataset with the same split date, so the results are comparable to those of the old dataset
train_model(new_data, '2024-10-25')    # Monday 2024-10-28 - Thursday 2024-10-31 is the test set

In PRODUCTION mode (to change to TESTING mode, set `TESTING` to `True`); all files and models will be saved and NUM_EPOCHS=100
INFO: Pandarallel will run on 5 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Initialized pandarallel with 5 cores
Training set contains 932190 trades ranging from trade datetimes of 2024-10-01 06:00:01 to 2024-10-25 19:06:49
Test set contains 235042 trades ranging from trade datetimes of 2024-10-28 06:00:01 to 2024-10-31 18:35:10
BEGIN create_input
END create_input. Execution time: 0:00:03.438
BEGIN create_input
END create_input. Execution time: 0:00:00.758




Epoch 1/100


2024-11-15 19:06:03.463104: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" frequency: 2400 num_cores: 10 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }




2024-11-15 19:06:54.531013: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" frequency: 2400 num_cores: 10 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
  1/236 [..............................] - ETA: 4:24

2024-11-15 19:51:48.763461: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" model: "0" frequency: 2400 num_cores: 10 environment { key: "cpu_instruction_set" value: "ARM NEON" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 16384 l2_cache_size: 524288 l3_cache_size: 524288 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }


|                                 |   Mean Absolute Error |   Trade Count |
|:--------------------------------|----------------------:|--------------:|
| Entire set                      |                12.636 |        235042 |
| Dealer-Dealer                   |                13.21  |         86165 |
| Bid Side / Dealer-Purchase      |                13.844 |         63984 |
| Offered Side / Dealer-Sell      |                11.144 |         84893 |
| AAA                             |                10.754 |         36581 |
| Investment Grade                |                12.295 |        174979 |
| Trade size >= 100k              |                10.899 |         52581 |
| Last trade <= 7 days            |                10.871 |        166053 |
| 7 days < Last trade <= 14 days  |                14.163 |         20158 |
| 14 days < Last trade <= 28 days |                16.323 |         15160 |
| 28 days < Last trade            |                18.768 |         33671 |


Unnamed: 0,Mean Absolute Error,Trade Count
Entire set,12.636,235042
Dealer-Dealer,13.21,86165
Bid Side / Dealer-Purchase,13.844,63984
Offered Side / Dealer-Sell,11.144,84893
AAA,10.754,36581
Investment Grade,12.295,174979
Trade size >= 100k,10.899,52581
Last trade <= 7 days,10.871,166053
7 days < Last trade <= 14 days,14.163,20158
14 days < Last trade <= 28 days,16.323,15160
