# Exploration notebook
Experiments and tests of code related to the development of the trading platform.

In [1]:
# Test: invalid ticker
# Fetch a symbol that does not exist and inspect what comes back
# (a preview of the frame followed by its row count).
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker

df = fetch_single_ticker('invalid', '2020-01-01', '2020-04-01')
for summary in (df.head(), len(df)):
    print(summary)

Empty DataFrame
Columns: [(Adj Close, INVALID), (Close, INVALID), (High, INVALID), (Low, INVALID), (Open, INVALID), (Volume, INVALID), (ticker, )]
Index: []
0


In [2]:
# Rich display of `df` as the cell's last expression, to inspect its column layout.
# NOTE(review): the recorded output shows the empty frame for the invalid ticker
# still carrying multi-level (Price / Ticker) columns, while the valid-ticker
# cells show flat columns — confirm whether the fetcher should flatten here too.
df

Price,Adj Close,Close,High,Low,Open,Volume,ticker
Ticker,INVALID,INVALID,INVALID,INVALID,INVALID,INVALID,Unnamed: 7_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2


In [2]:
# Testing the MultiIndex flattening
# A valid ticker should come back with single-level columns, so that selecting
# one column (e.g. 'Close') yields a 1-D Series rather than a sub-frame.
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker

df = fetch_single_ticker('AAPL', '2020-01-01', '2020-04-01')

# Fail loudly if the flattening regresses instead of relying on reading the printout.
assert df.columns.nlevels == 1, (
    f"expected flat columns, got {df.columns.nlevels} levels"
)
assert df['Close'].ndim == 1, "df['Close'] should be a Series, not a DataFrame"

print(df.columns)
print(type(df['Close']))
print(df.head())

Index(['Close', 'High', 'Low', 'Open', 'Volume', 'ticker'], dtype='str', name='Price')
<class 'pandas.Series'>
Price           Close       High        Low       Open     Volume ticker
Date                                                                    
2020-01-02  72.468254  72.528574  71.223252  71.476592  135480400   AAPL
2020-01-03  71.763733  72.523762  71.539345  71.696175  146322800   AAPL
2020-01-06  72.335556  72.374162  70.634539  70.885472  118387200   AAPL
2020-01-07  71.995338  72.600945  71.775773  72.345189  108872000   AAPL
2020-01-08  73.153511  73.455110  71.698596  71.698596  132079200   AAPL


In [4]:
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker, fetch_multiple_tickers

# Both fetches use the same date window.
date_range = ('2020-01-01', '2020-04-01')

# Single ticker: show the column index and the first rows.
df_single = fetch_single_ticker('AAPL', *date_range)
print(df_single.columns)
print(df_single.head())

# Multiple tickers: same views, plus the tail to check that MSFT is there too.
df_multi = fetch_multiple_tickers(['AAPL', 'MSFT'], *date_range)
for view in (df_multi.columns, df_multi.head(), df_multi.tail()):
    print(view)

Index(['Close', 'High', 'Low', 'Open', 'Volume', 'ticker'], dtype='str', name='Price')
Price           Close       High        Low       Open     Volume ticker
Date                                                                    
2020-01-02  72.468254  72.528574  71.223252  71.476592  135480400   AAPL
2020-01-03  71.763733  72.523762  71.539345  71.696175  146322800   AAPL
2020-01-06  72.335556  72.374162  70.634539  70.885472  118387200   AAPL
2020-01-07  71.995338  72.600945  71.775773  72.345189  108872000   AAPL
2020-01-08  73.153511  73.455110  71.698596  71.698596  132079200   AAPL
Price            Close        High         Low        Open    Volume ticker
Date                                                                       
2020-03-25  139.878815  146.933692  137.517673  141.773449  75638200   MSFT
2020-03-26  148.628403  149.152047  141.259338  141.287899  64568100   MSFT
2020-03-27  142.525574  147.466844  142.049536  144.477330  57042300   MSFT
2020-03-30  152.550888

In [6]:
# Test: data validator
# Happy path: fetch a real ticker and run the validator on it. The call is left
# as the cell's last expression so its return value is shown as the output.
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data, ValidationError

ticker = 'AAPL'
df = fetch_single_ticker(ticker, '2020-01-01', '2020-04-01')
validate_price_data(df, ticker)

True

In [7]:
# Validator on an invalid ticker, with the ValidationError caught and reported.
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data, ValidationError

bad_ticker = "INVALIDTICKER123"
df_bad = fetch_single_ticker(bad_ticker, "2020-01-01", "2024-12-31")

try:
    validate_price_data(df_bad, bad_ticker)
except ValidationError as e:
    # Only the validator's own error type is handled; anything else still propagates.
    print(f"Caught error: {e}")

Caught error: INVALIDTICKER123: DataFrame is empty


**Very important:** with `try`/`except`, the error is caught and the code keeps running.
Without it, the exception propagates, the code crashes, and execution stops.

In [8]:
# Same invalid-ticker validation as the try/except cell, but with no handler:
# the recorded output shows the ValidationError surfacing and stopping the cell.
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data, ValidationError

bad_ticker = "INVALIDTICKER123"
df_bad = fetch_single_ticker(bad_ticker, "2020-01-01", "2024-12-31")
validate_price_data(df_bad, bad_ticker)

ValidationError: INVALIDTICKER123: DataFrame is empty

In [15]:
import pandas as pd
import numpy as np

# Fixed seed so the synthetic values and the NaN positions are identical on
# every run (Restart & Run All reproduces the same frame).
SEED = 42
rng = np.random.default_rng(SEED)

# 100 rows x 3 columns of standard-normal noise.
df = pd.DataFrame(data=rng.standard_normal((100, 3)), columns=['x1', 'x2', 'x3'])

# Each cell is independently blanked out with probability 0.1 to simulate
# missing data; `mask` is True exactly where a NaN is injected.
mask = rng.random(df.shape) < .1
df[mask] = np.nan
print(df.head(20))

          x1        x2        x3
0  -0.032694 -0.101761  0.154412
1  -0.891296 -3.557627 -1.600192
2   0.842677 -0.836937       NaN
3   1.572951  0.751664 -0.291565
4  -0.794763 -0.312585 -0.786671
5        NaN -0.730446  0.939559
6   0.141630  1.431034 -1.456043
7  -0.184637 -0.582920  0.669096
8        NaN  0.315332  0.289813
9   0.531977  0.403473  0.293648
10       NaN -1.173398  0.356709
11 -0.648314 -1.171524       NaN
12  0.714706 -0.693910 -0.179518
13  2.899102 -0.401486  1.299297
14 -1.062255 -1.988981  0.620155
15  0.996751 -2.110202 -0.031035
16       NaN       NaN -0.060624
17  1.886987  0.825758 -0.461981
18 -1.832832  1.819151       NaN
19  0.300410       NaN  0.911268


In [16]:
# Drop rows with a missing value in 'x2' or 'x3'; NaNs in 'x1' are tolerated.
# The result gets its own name so the original frame (with all NaNs) stays
# available and re-running earlier cells' views is not invalidated.
df_clean = df.dropna(subset=['x2', 'x3'])

# Invariant: no missing values remain in the columns we filtered on.
assert df_clean[['x2', 'x3']].notna().all().all()
print(df_clean.head(20))

          x1        x2        x3
0  -0.032694 -0.101761  0.154412
1  -0.891296 -3.557627 -1.600192
3   1.572951  0.751664 -0.291565
4  -0.794763 -0.312585 -0.786671
5        NaN -0.730446  0.939559
6   0.141630  1.431034 -1.456043
7  -0.184637 -0.582920  0.669096
8        NaN  0.315332  0.289813
9   0.531977  0.403473  0.293648
10       NaN -1.173398  0.356709
12  0.714706 -0.693910 -0.179518
13  2.899102 -0.401486  1.299297
14 -1.062255 -1.988981  0.620155
15  0.996751 -2.110202 -0.031035
17  1.886987  0.825758 -0.461981
21  1.144471 -0.288578  1.670863
23       NaN -0.966695 -0.926441
24 -0.310343 -0.782934  0.095891
26  0.062755  1.746890  0.422747
27  0.761371 -0.664287 -1.373694


In [17]:
# End-to-end check: fetch -> validate -> process for one ticker, then inspect
# the first rows and the resulting column set of the processed frame.
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data
from src.data_pipelines.processor import process_ticker_data, save_processed_data

ticker = 'AAPL'
df = fetch_single_ticker(ticker, '2020-01-01', '2020-04-01')
validate_price_data(df, ticker)

df_processed = process_ticker_data(
    df,
    ticker,
    fill_method='ffill',
    check_outliers=True,
)
print(df_processed.head(10))
print(df_processed.columns)

Price           Close       High        Low       Open     Volume ticker  \
Date                                                                       
2020-01-02  72.468262  72.528582  71.223259  71.476600  135480400   AAPL   
2020-01-03  71.763733  72.523762  71.539345  71.696175  146322800   AAPL   
2020-01-06  72.335564  72.374169  70.634547  70.885479  118387200   AAPL   
2020-01-07  71.995361  72.600968  71.775796  72.345212  108872000   AAPL   
2020-01-08  73.153496  73.455095  71.698581  71.698581  132079200   AAPL   
2020-01-09  74.707321  74.900342  73.879735  74.130660  170108400   AAPL   
2020-01-10  74.876244  75.440844  74.374386  74.941394  140644800   AAPL   
2020-01-13  76.475914  76.502459  75.074081  75.192313  121532000   AAPL   
2020-01-14  75.443222  76.623082  75.320175  76.413170  161954400   AAPL   
2020-01-15  75.119911  76.123634  74.688019  75.242966  121923600   AAPL   

Price       is_outlier  daily_return  log_return  
Date                                