In [80]:
import os
import dotenv
import sys
dotenv.load_dotenv()
# Make the project's local library importable (needed for `finance` below).
# os.getenv returns None when LIBRARY_PATH is not defined; appending None to
# sys.path is silently ignored by the import system, so only a real, non-empty
# path is appended. The membership check keeps re-runs of this cell from
# stacking duplicate entries.
library_path = os.getenv('LIBRARY_PATH')
if library_path and library_path not in sys.path:
    sys.path.append(library_path)

import pandas as pd
import numpy as np

# Custom imports
from finance import implied_volatility, black_scholes

# Initial EDA

In [None]:
# Load the dataset from the CSV file whose location is given by the
# DATA_PATH environment variable.
csv_path = os.getenv('DATA_PATH')
data = pd.read_csv(csv_path)
# Summarize the full frame: column dtypes, non-null counts, memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 641103 entries, 0 to 641102
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Timestamp        641103 non-null  object 
 1   Stock Price      613990 non-null  float64
 2   Symbol           641103 non-null  object 
 3   Option Type      641103 non-null  object 
 4   Strike Price     641103 non-null  float64
 5   Ask Price        641103 non-null  float64
 6   Bid Price        641103 non-null  float64
 7   Expiration Date  641103 non-null  object 
dtypes: float64(4), object(4)
memory usage: 44.0+ MB


## Removing all non-trading days

In [None]:
# Keep only rows stamped on NYSE trading sessions (drops weekends & market holidays).
import pandas_market_calendars as mcal

# Parse Timestamp to datetime so the .dt accessor can be used below.
data['Timestamp'] = pd.to_datetime(data['Timestamp'])

# Calendar date of every row, derived once and reused for the range and the filter.
row_dates = data['Timestamp'].dt.date
start, end = row_dates.min(), row_dates.max()
print(f"Data covers {start} through {end}")

# NYSE sessions between the first and last date present in the data.
nyse = mcal.get_calendar('NYSE')
schedule = nyse.schedule(start_date=start, end_date=end)
trading_days = pd.to_datetime(schedule.index).date

# Boolean mask of rows whose date appears in the schedule; copy so later
# edits to data_trading do not touch `data`.
on_trading_day = row_dates.isin(trading_days)
data_trading = data.loc[on_trading_day].copy()

rows_before, rows_after = len(data), len(data_trading)
print('Rows before:', rows_before)
print('Rows after (NYSE trading days):', rows_after)
print('Rows removed:', rows_before - rows_after)
print("NaNs in 'Stock Price' after filtering:", data_trading['Stock Price'].isna().sum())

# Per-column null counts and their share of the filtered rows, worst first.
null_counts = data_trading.isnull().sum()
missing_summary_trading = pd.DataFrame({
    'Missing Values': null_counts,
    'Percentage Missing': (null_counts / len(data_trading)) * 100
}).sort_values('Missing Values', ascending=False)

print('\nMissing summary after filtering to trading days:')
print(missing_summary_trading)

Data covers 2024-08-09 through 2025-10-21
Rows before: 1198749
Rows after (NYSE trading days): 1122660
Rows removed: 76089
NaNs in 'Stock Price' after filtering: 0

Missing summary after filtering to trading days:
                 Missing Values  Percentage Missing
Timestamp                     0                 0.0
Stock Price                   0                 0.0
Symbol                        0                 0.0
Option Type                   0                 0.0
Strike Price                  0                 0.0
Ask Price                     0                 0.0
Bid Price                     0                 0.0
Expiration Date               0                 0.0


## Validating data

In [79]:
# Check for Stock Price values encoded as empty arrays or empty strings
import inspect

def is_empty_array(x):
    """Return True if ``x`` is a list, tuple or NumPy array of length zero.

    Any other type (numbers, strings, None, ...) yields False. For NumPy
    arrays the length is the size of the first axis, as with ``len``.
    """
    if isinstance(x, np.ndarray):
        # A 0-d array has no len() (it raises TypeError); it wraps a single
        # value, so it is reported as not empty instead of crashing.
        return x.ndim > 0 and len(x) == 0
    return isinstance(x, (list, tuple)) and len(x) == 0

def is_empty_string(x):
    """Return True if ``x`` is a str that is empty or contains only whitespace.

    Non-string values always yield False.
    """
    if not isinstance(x, str):
        return False
    # strip() leaves '' exactly when the string had no non-whitespace characters.
    return not x.strip()

# Flag rows whose 'Stock Price' cell is an empty container or a blank string.
stock_price = data_trading['Stock Price']
mask_empty_array = stock_price.apply(is_empty_array)
mask_empty_string = stock_price.apply(is_empty_string)

count_empty_array, count_empty_string = mask_empty_array.sum(), mask_empty_string.sum()

print(f"Empty-array encoded Stock Price count: {count_empty_array}")
print(f"Empty-string encoded Stock Price count: {count_empty_string}")

# Preview up to ten offending rows per category, or report that none exist.
if not count_empty_array:
    print('\nNo empty-array encoded rows found.')
else:
    print('\nExamples of empty-array encoded rows:')
    display(data_trading[mask_empty_array].head(10))

if not count_empty_string:
    print('No empty-string encoded rows found.')
else:
    print('Examples of empty-string encoded rows:')
    display(data_trading[mask_empty_string].head(10))

# Rows flagged by either check.
mask_combined = mask_empty_array | mask_empty_string
print('\nTotal problematic rows:', mask_combined.sum())

Empty-array encoded Stock Price count: 0
Empty-string encoded Stock Price count: 0

No empty-array encoded rows found.
No empty-string encoded rows found.

Total problematic rows: 0
