# Compare Datasets

Created by Mitas Ray on 2024-11-16.

This notebook compares two datasets. The procedure is:
1. restrict the datasets to the same datetime window
2. perform high-level analysis on the values in the dataset
3. train a model on each dataset and check that the two runs produce similar accuracy results

To run the notebook, install the dependencies listed in `ficc_python/requirements_py310.txt`, and then install Jupyter by running `pip install jupyter` in a shell.

In [None]:
import os
from datetime import datetime

import pandas as pd

from google.cloud import storage


# importing from parent directory: https://stackoverflow.com/questions/714063/importing-modules-from-parent-folder
import sys
sys.path.insert(0,'../')

from ficc.utils.auxiliary_functions import get_ys_trade_history_features
from ficc.utils.gcp_storage_functions import download_data

from automated_training_auxiliary_variables import CATEGORICAL_FEATURES, BINARY, NON_CAT_FEATURES, NUM_TRADES_IN_HISTORY_YIELD_SPREAD_MODEL, BUCKET_NAME, MODEL_TO_CUMULATIVE_DATA_PICKLE_FILENAME
from automated_training_auxiliary_functions import MODEL_NAME_TO_KERAS_MODEL, check_that_model_is_supported, fit_encoders, create_input, train_and_evaluate_model, create_summary_of_results, get_optional_arguments_for_process_data
from set_random_seed import set_seed


# Call the project's `set_seed` helper (imported from `set_random_seed`) before any data is loaded or any model is trained.
# NOTE(review): presumably this fixes the random seeds so that training runs are reproducible — confirm in `set_random_seed`.
set_seed()

In [None]:
def get_creds(credentials_path: str = '/Users/mitas/ficc/ficc/mitas_creds.json') -> None:
    '''Point the `GOOGLE_APPLICATION_CREDENTIALS` environment variable at `credentials_path`.

    The default is the path that used to be hard-coded in the function body, so existing calls to 
    `get_creds()` behave exactly as before. Pass a different path to run the notebook on another 
    machine without editing this function. Any value previously stored in the environment variable 
    is overwritten.'''
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path


def get_storage_client():
    '''Return a new `storage.Client`, created only after `get_creds()` has set the credentials 
    environment variable.'''
    get_creds()
    client = storage.Client()
    return client


# Single client, created once when this cell runs, and passed to every `download_data(...)` call in this notebook.
STORAGE_CLIENT = get_storage_client()

In [None]:
# Name of the model used throughout the notebook: it selects the cumulative data pickle to download and is passed to 
# the encoder fitting, input creation, and Keras model lookup during training.
MODEL = 'yield_spread_with_similar_trades'

Get data from Google Cloud Storage.

In [None]:
# `old_data`: the cumulative data pickle registered for `MODEL`, downloaded from `BUCKET_NAME`.
# `new_data`: a hard-coded pickle downloaded from the `sp_data_for_modeling` bucket.
# NOTE(review): both are used as `pd.DataFrame`s in the cells below — confirm that `download_data` returns a DataFrame.
old_data = download_data(STORAGE_CLIENT, BUCKET_NAME, MODEL_TO_CUMULATIVE_DATA_PICKLE_FILENAME[MODEL])
new_data = download_data(STORAGE_CLIENT, 'sp_data_for_modeling', 'processed_data_from_jesse_tests_trade_history_same_issue_5_yr_mat_bucket_1_materialized_2024-10.pkl')

Restrict the data between a start and end datetime.

In [None]:
def string_to_datetime(datetime_as_string: datetime | str) -> datetime:
    if isinstance(datetime_as_string, datetime): return datetime_as_string
    string_format = '%Y-%m-%d %H:%M:%S'
    try:
        return datetime.strptime(datetime_as_string, string_format)
    except Exception as e:
        print(f'{datetime_as_string} must be in {string_format} format')
        raise e


def restrict_data_to_specified_time_window(data: pd.DataFrame, 
                                           datetime_column_name: str, 
                                           start_datetime: datetime | str, 
                                           end_datetime: datetime | str) -> pd.DataFrame:
    '''Keep only the rows of `data` whose `datetime_column_name` value lies between `start_datetime` 
    and `end_datetime`, with both endpoints included. Each bound may be a `datetime` or a string 
    accepted by `string_to_datetime(...)`. Prints how many rows were removed and how many remain.'''
    window_start = string_to_datetime(start_datetime)
    window_end = string_to_datetime(end_datetime)
    rows_to_keep = data[datetime_column_name].between(window_start, window_end)    # `between` includes both endpoints by default
    rows_remaining = rows_to_keep.sum()
    total_rows = len(data)
    print(f'{total_rows - rows_remaining} rows removed from the original {total_rows} rows. {rows_remaining} rows remain.')
    return data[rows_to_keep]

In [None]:
# Bounds of the comparison window, written in the `%Y-%m-%d %H:%M:%S` format that `string_to_datetime(...)` parses.
# The window is applied with inclusive comparisons, so a timestamp later than 23:59:59 on October 31 
# (e.g., one with a fractional second) falls outside of it.
october_1_start_of_day = '2024-10-01 00:00:00'
october_31_end_of_day = '2024-10-31 23:59:59'

In [None]:
def restrict_data_to_october_on_trade_datetime(data):
    '''Return the rows of `data` whose `trade_datetime` lies between `october_1_start_of_day` and 
    `october_31_end_of_day`, with both endpoints included.'''
    return restrict_data_to_specified_time_window(data=data, 
                                                  datetime_column_name='trade_datetime', 
                                                  start_datetime=october_1_start_of_day, 
                                                  end_datetime=october_31_end_of_day)

In [None]:
# Rebinds `old_data` to the October-only rows; the full download is no longer referenced by this name after this cell runs.
old_data = restrict_data_to_october_on_trade_datetime(old_data)

In [None]:
# Rebinds `new_data` to the October-only rows; the full download is no longer referenced by this name after this cell runs.
new_data = restrict_data_to_october_on_trade_datetime(new_data)

Compare datasets.

In [None]:
def compare_shapes(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
    '''Print the `.shape` of each dataset under a section header.'''
    print('\n=== Dataset Shapes ===')
    for dataset_number, df in enumerate((df1, df2), start=1):
        print(f'Dataset {dataset_number} Shape: {df.shape}')


def compare_columns(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
    '''Print a column-level comparison of the two datasets: the column names, then the dtype and the 
    number of unique values of every column, and finally which columns are shared and which appear 
    in only one of the datasets.'''
    print('\n=== Column Comparison ===')
    for dataset_number, df in enumerate((df1, df2), start=1):
        print(f'Dataset {dataset_number} Columns: {df.columns.tolist()}')

    # each entry: (section heading, per-dataset label, function producing the per-column summary)
    per_column_sections = (
        ('Data Types', 'Data Types', lambda df: df.dtypes),
        ('Unique Values per Column', 'Unique Values', lambda df: df.nunique()),
    )
    for heading, label, summarize in per_column_sections:
        print(f'\n=== {heading} ===')
        print(f'Dataset 1 {label}:')
        print(summarize(df1))
        print(f'\nDataset 2 {label}:')
        print(summarize(df2))

    print('\n=== Common and Unique Columns ===')
    columns_1, columns_2 = set(df1.columns), set(df2.columns)
    print(f'Common Columns: {columns_1 & columns_2}')
    print(f'Columns only in Dataset 1: {columns_1 - columns_2}')
    print(f'Columns only in Dataset 2: {columns_2 - columns_1}')


def statistical_summary_and_missing_values(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
    '''Print, for each dataset, the number of null values in every column followed by the output of 
    `describe(include='all')`.'''
    def print_section(heading: str, label: str, summarize) -> None:
        '''Print one section: the heading, then `summarize(...)` of dataset 1 and of dataset 2.'''
        print(f'\n=== {heading} ===')
        print(f'Dataset 1 {label}:')
        print(summarize(df1))
        print(f'\nDataset 2 {label}:')
        print(summarize(df2))

    print_section('Missing Values', 'Missing Values', lambda df: df.isnull().sum())
    print_section('Statistical Summary', 'Summary', lambda df: df.describe(include='all'))

Train the yield spread with similar trades model.

In [None]:
def get_num_features_for_each_trade_in_history() -> int:
    '''Return the number of features returned by `get_ys_trade_history_features(...)`. The 
    `use_treasury_spread` flag is read from the optional `process_data` arguments for `MODEL` and 
    defaults to `False` when it is not present.'''
    process_data_kwargs = get_optional_arguments_for_process_data(MODEL)
    use_treasury_spread = process_data_kwargs.get('use_treasury_spread', False)
    return len(get_ys_trade_history_features(use_treasury_spread))

In [None]:
def train_model(data: pd.DataFrame, 
                last_trade_date_for_training_dataset: str):
    '''Train and evaluate the `MODEL` Keras model on `data`, split on `trade_date`: trades on or before 
    `last_trade_date_for_training_dataset` form the training set and all later trades form the test set. 
    Returns the output of `create_summary_of_results(...)` for the test set.

    Heavily inspired by `automated_training_auxiliary_functions::train_model(...)`. The main changes are: 
    (1) assume that we are using the yield spread with similar trades model,
    (2) do not have an exclusions function,
    (3) do not restrict the test set to just a single day.
    '''
    check_that_model_is_supported(MODEL)
    encoders, fmax = fit_encoders(data, CATEGORICAL_FEATURES, MODEL)    # fit on all of `data`, i.e., before the train / test split

    on_or_before_cutoff = data.trade_date <= last_trade_date_for_training_dataset
    after_cutoff = data.trade_date > last_trade_date_for_training_dataset
    train_data = data[on_or_before_cutoff]    # trades before and including `last_trade_date_for_training_dataset`
    test_data = data[after_cutoff]    # trades strictly after `last_trade_date_for_training_dataset`

    # build both messages before printing anything so that either both lines or neither line is printed
    split_descriptions = [f'{set_name} set contains {len(subset)} trades ranging from trade datetimes of {subset.trade_datetime.min()} to {subset.trade_datetime.max()}' 
                          for set_name, subset in (('Training', train_data), ('Test', test_data))]
    print(*split_descriptions, sep='\n')

    x_train, y_train = create_input(train_data, encoders, MODEL)
    x_test, y_test = create_input(test_data, encoders, MODEL)

    build_keras_model = MODEL_NAME_TO_KERAS_MODEL[MODEL]
    num_features_per_trade = get_num_features_for_each_trade_in_history()
    untrained_model = build_keras_model(x_train, 
                                        NUM_TRADES_IN_HISTORY_YIELD_SPREAD_MODEL, 
                                        num_features_per_trade, 
                                        CATEGORICAL_FEATURES, 
                                        NON_CAT_FEATURES, 
                                        BINARY, 
                                        fmax)
    # only the trained model is needed below; the second and third returned values are not used
    trained_model, _mae, _history = train_and_evaluate_model(untrained_model, x_train, y_train, x_test, y_test)
    return create_summary_of_results(trained_model, test_data, x_test, y_test)

In [None]:
# Run on the old dataset: train on trades with `trade_date` through 2024-10-25 and test on the later trades in the October window.
train_model(old_data, '2024-10-25')    # Monday 2024-10-28 - Thursday 2024-10-31 is the test set

In [None]:
# Run on the new dataset with the same train / test cutoff as the `old_data` run, so that the two summaries of results 
# can be compared directly. (This cell previously trained on `old_data` a second time, so `new_data` was never evaluated.)
train_model(new_data, '2024-10-25')    # Monday 2024-10-28 - Thursday 2024-10-31 is the test set