# Exploration notebook
Experiments and test of code related to the trading platform development

In [1]:
from defusedxml.lxml import tostring
from pandas import isna

# Test: invalid ticker
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
df = fetch_single_ticker('invalid','2020-01-01','2020-04-01')
print(df.head())
print(len(df))

Empty DataFrame
Columns: [(Adj Close, INVALID), (Close, INVALID), (High, INVALID), (Low, INVALID), (Open, INVALID), (Volume, INVALID), (ticker, )]
Index: []
0


In [2]:
df

Price,Adj Close,Close,High,Low,Open,Volume,ticker
Ticker,INVALID,INVALID,INVALID,INVALID,INVALID,INVALID,Unnamed: 7_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2


In [2]:
# Testing the MultiIndex flattening
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
df = fetch_single_ticker('AAPL', '2020-01-01', '2020-04-01')
print(df.columns)
print(type(df['Close']))
print(df.head())

Index(['Close', 'High', 'Low', 'Open', 'Volume', 'ticker'], dtype='str', name='Price')
<class 'pandas.Series'>
Price           Close       High        Low       Open     Volume ticker
Date                                                                    
2020-01-02  72.468254  72.528574  71.223252  71.476592  135480400   AAPL
2020-01-03  71.763733  72.523762  71.539345  71.696175  146322800   AAPL
2020-01-06  72.335556  72.374162  70.634539  70.885472  118387200   AAPL
2020-01-07  71.995338  72.600945  71.775773  72.345189  108872000   AAPL
2020-01-08  73.153511  73.455110  71.698596  71.698596  132079200   AAPL


In [4]:
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker, fetch_multiple_tickers
# Single ticker
df_single = fetch_single_ticker('AAPL', '2020-01-01', '2020-04-01')
print(df_single.columns)
print(df_single.head())

# Multiple tickers
df_multi = fetch_multiple_tickers(['AAPL', 'MSFT'], '2020-01-01', '2020-04-01')
print(df_multi.columns)
print(df_multi.head())
print(df_multi.tail())  # Check that MSFT is there too

Index(['Close', 'High', 'Low', 'Open', 'Volume', 'ticker'], dtype='str', name='Price')
Price           Close       High        Low       Open     Volume ticker
Date                                                                    
2020-01-02  72.468254  72.528574  71.223252  71.476592  135480400   AAPL
2020-01-03  71.763733  72.523762  71.539345  71.696175  146322800   AAPL
2020-01-06  72.335556  72.374162  70.634539  70.885472  118387200   AAPL
2020-01-07  71.995338  72.600945  71.775773  72.345189  108872000   AAPL
2020-01-08  73.153511  73.455110  71.698596  71.698596  132079200   AAPL
Price            Close        High         Low        Open    Volume ticker
Date                                                                       
2020-03-25  139.878815  146.933692  137.517673  141.773449  75638200   MSFT
2020-03-26  148.628403  149.152047  141.259338  141.287899  64568100   MSFT
2020-03-27  142.525574  147.466844  142.049536  144.477330  57042300   MSFT
2020-03-30  152.550888

In [6]:
# Test: data validator
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data, ValidationError

df = fetch_single_ticker('AAPL','2020-01-01','2020-04-01')
validate_price_data(df, 'AAPL')

True

In [7]:
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data, ValidationError
df_bad = fetch_single_ticker("INVALIDTICKER123", "2020-01-01", "2024-12-31")
try:
    validate_price_data(df_bad, "INVALIDTICKER123")
except ValidationError as e:
    print(f"Caught error: {e}")

Caught error: INVALIDTICKER123: DataFrame is empty


**Very important:** with try/except, you catch the error and the code continues
Without it, code crashes and stops

In [8]:
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data, ValidationError
df_bad = fetch_single_ticker("INVALIDTICKER123", "2020-01-01", "2024-12-31")
validate_price_data(df_bad, "INVALIDTICKER123")

ValidationError: INVALIDTICKER123: DataFrame is empty

In [5]:
import pandas as pd
import numpy as np
df = pd.DataFrame(data=np.random.randn(100,3),columns=['x1','x2','x3'])
mask = np.random.rand(*df.shape) < .1
df[mask] = np.nan
print(df.head(20))

          x1        x2        x3
0   0.113056 -0.565357  0.909327
1   0.134470       NaN -0.868787
2  -0.620914 -0.054439       NaN
3  -0.023989  2.462723  1.348902
4        NaN  0.346422  1.142800
5   0.078458  1.396542  0.887082
6   2.228574       NaN  2.384298
7  -0.847676  1.131055 -3.241828
8  -0.628673 -0.524610  1.742299
9  -1.143617  1.023033  0.617805
10 -1.016975 -0.916055  2.129524
11  0.685013 -0.096506 -0.432321
12  0.569582  1.427032  0.786720
13 -0.684962  1.109491  2.097108
14       NaN  0.358366  0.297987
15 -0.838806  0.045742 -1.120709
16  0.451549 -1.705134  2.178698
17 -0.940016 -1.215677 -0.278393
18 -2.194852  0.175265  0.655580
19 -1.220247  1.053399 -0.638570


In [9]:
import pandas as pd
(df.isna()).equals(pd.isna(df))

True

In [16]:
df = df.dropna(subset=['x2','x3'])
print(df.head(20))

          x1        x2        x3
0  -0.032694 -0.101761  0.154412
1  -0.891296 -3.557627 -1.600192
3   1.572951  0.751664 -0.291565
4  -0.794763 -0.312585 -0.786671
5        NaN -0.730446  0.939559
6   0.141630  1.431034 -1.456043
7  -0.184637 -0.582920  0.669096
8        NaN  0.315332  0.289813
9   0.531977  0.403473  0.293648
10       NaN -1.173398  0.356709
12  0.714706 -0.693910 -0.179518
13  2.899102 -0.401486  1.299297
14 -1.062255 -1.988981  0.620155
15  0.996751 -2.110202 -0.031035
17  1.886987  0.825758 -0.461981
21  1.144471 -0.288578  1.670863
23       NaN -0.966695 -0.926441
24 -0.310343 -0.782934  0.095891
26  0.062755  1.746890  0.422747
27  0.761371 -0.664287 -1.373694


In [17]:
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data
from src.data_pipelines.processor import process_ticker_data, save_processed_data

df = fetch_single_ticker('AAPL','2020-01-01','2020-04-01')
validate_price_data(df, 'AAPL')
df_processed = process_ticker_data(df , 'AAPL', fill_method='ffill', check_outliers=True)
print(df_processed.head(10))
print(df_processed.columns)

Price           Close       High        Low       Open     Volume ticker  \
Date                                                                       
2020-01-02  72.468262  72.528582  71.223259  71.476600  135480400   AAPL   
2020-01-03  71.763733  72.523762  71.539345  71.696175  146322800   AAPL   
2020-01-06  72.335564  72.374169  70.634547  70.885479  118387200   AAPL   
2020-01-07  71.995361  72.600968  71.775796  72.345212  108872000   AAPL   
2020-01-08  73.153496  73.455095  71.698581  71.698581  132079200   AAPL   
2020-01-09  74.707321  74.900342  73.879735  74.130660  170108400   AAPL   
2020-01-10  74.876244  75.440844  74.374386  74.941394  140644800   AAPL   
2020-01-13  76.475914  76.502459  75.074081  75.192313  121532000   AAPL   
2020-01-14  75.443222  76.623082  75.320175  76.413170  161954400   AAPL   
2020-01-15  75.119911  76.123634  74.688019  75.242966  121923600   AAPL   

Price       is_outlier  daily_return  log_return  
Date                                

In [1]:
windows = {
    'mom_5d': 5,
    'mom_21d': 21,
'mom_63d': 63,
    'mom_252d': 25
}
for name, window in windows.items():
    print([name, window])

['mom_5d', 5]
['mom_21d', 21]
['mom_63d', 63]
['mom_252d', 25]


In [4]:
list(windows.keys())

['mom_5d', 'mom_21d', 'mom_63d', 'mom_252d']

In [18]:
k=5
'mom_'+str(k)

'mom_5'

In [1]:
# Test the full pipeline
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data
from src.data_pipelines.processor import process_ticker_data
from src.data_pipelines.features import build_features

# Full pipeline
ticker = 'AAPL'
df = fetch_single_ticker(ticker, '2020-01-01', '2024-12-31')
validate_price_data(df, ticker)
df = process_ticker_data(df, ticker)
df = build_features(df, ticker)

print(df.columns.tolist())
print(df.tail())

['Close', 'High', 'Low', 'Open', 'Volume', 'ticker', 'is_outlier', 'daily_return', 'log_return', 'volume_ma_21d', 'volume_ratio', 'mom_5d', 'mom_21d', 'mom_63d', 'mom_252d', 'ma_21d', 'ma_50d', 'dist_from_ma_21d', 'dist_from_ma_50d', 'vol_21d', 'vol_63d', 'range_hl', 'mom_21_252', 'mom_1_21']
Price            Close        High         Low        Open    Volume ticker  \
Date                                                                          
2024-12-23  254.120667  254.498945  252.308854  253.622918  40858800   AAPL   
2024-12-24  257.037476  257.047410  254.140559  254.339671  23234700   AAPL   
2024-12-26  257.853760  258.928914  256.470034  257.027510  27237100   AAPL   
2024-12-27  254.439240  257.535253  251.920632  256.669145  42355300   AAPL   
2024-12-30  251.064499  252.358649  249.621030  251.094363  35557500   AAPL   

Price       is_outlier  daily_return  log_return  volume_ma_21d  ...  \
Date                                                             ...   
2024-12-

In [2]:
# Look at correlations between features and future returns
df['future_return_5d'] = df['Close'].pct_change(5).shift(-5)  # 5-day forward return

correlations = df[['mom_21d', 'mom_63d', 'vol_21d', 'dist_from_ma_21d', 'future_return_5d']].corr()
print(correlations['future_return_5d'])

Price
mom_21d             0.014625
mom_63d            -0.076283
vol_21d             0.009191
dist_from_ma_21d    0.003530
future_return_5d    1.000000
Name: future_return_5d, dtype: float64
