# Exploration notebook
Experiments and test of code related to the trading platform development

## Phase 1. Data pipelines

In [19]:
import yfinance as yf
import pandas as pd

tickers = ['AAPL','MSFT', 'AMZN']
dd = yf.Ticker('AAPL')
dd.get_info()

{'address1': 'One Apple Park Way',
 'city': 'Cupertino',
 'state': 'CA',
 'zip': '95014',
 'country': 'United States',
 'phone': '(408) 996-1010',
 'website': 'https://www.apple.com',
 'industry': 'Consumer Electronics',
 'industryKey': 'consumer-electronics',
 'industryDisp': 'Consumer Electronics',
 'sector': 'Technology',
 'sectorKey': 'technology',
 'sectorDisp': 'Technology',
 'longBusinessSummary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple Vision Pro, Apple TV, Apple Watch, Beats products, and HomePod, as well as Apple branded and third-party accessories. It also provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to discover and download app

In [45]:
tel = {'jack': 4098, 'sape': 4139}
tel['jack']

4098

In [46]:
a = 'bond'
b = '17683ed37'
print(a + '_' + b)

bond_17683ed37


In [None]:
# Test: invalid ticker
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
df = fetch_single_ticker('invalid','2020-01-01','2020-04-01')
print(df.head())
print(len(df))

In [None]:
df

In [None]:
# Testing the MultiIndex flattening
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
df = fetch_single_ticker('AAPL', '2020-01-01', '2020-04-01')
print(df.columns)
print(type(df['Close']))
print(df.head())

In [None]:
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker, fetch_multiple_tickers
# Single ticker
df_single = fetch_single_ticker('AAPL', '2020-01-01', '2020-04-01')
print(df_single.columns)
print(df_single.head())

# Multiple tickers
df_multi = fetch_multiple_tickers(['AAPL', 'MSFT'], '2020-01-01', '2020-04-01')
print(df_multi.columns)
print(df_multi.head())
print(df_multi.tail())  # Check that MSFT is there too

In [None]:
# Test: data validator
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data, ValidationError

df = fetch_single_ticker('AAPL','2020-01-01','2020-04-01')
validate_price_data(df, 'AAPL')

In [None]:
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data, ValidationError
df_bad = fetch_single_ticker("INVALIDTICKER123", "2020-01-01", "2024-12-31")
try:
    validate_price_data(df_bad, "INVALIDTICKER123")
except ValidationError as e:
    print(f"Caught error: {e}")

**Very important:** with try/except, you catch the error and the code continues
Without it, code crashes and stops

In [28]:
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data, ValidationError
df_bad = fetch_single_ticker("INVALIDTICKER123", "2020-01-01", "2024-12-31")
validate_price_data(df_bad, "INVALIDTICKER123")

ImportError: cannot import name 'fetch_single_ticker' from 'src.data_pipelines.yahoo_fetcher' (/Users/hugo/quant-research/src/data_pipelines/yahoo_fetcher.py)

In [None]:
import pandas as pd
import numpy as np
df = pd.DataFrame(data=np.random.randn(100,3),columns=['x1','x2','x3'])
mask = np.random.rand(*df.shape) < .1
df[mask] = np.nan
print(df.head(20))

In [None]:
import pandas as pd
(df.isna()).equals(pd.isna(df))

In [None]:
df = df.dropna(subset=['x2','x3'])
print(df.head(20))

In [None]:
import pandas as pd
import numpy as np
df = pd.DataFrame(data=np.random.randint(0,2,10),columns=['x1'])
dfcount = df.value_counts()
dfcount.get(1,0)

In [None]:
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data
from src.data_pipelines.processor import process_ticker_data, save_processed_data

df = fetch_single_ticker('AAPL','2020-01-01','2020-04-01')
validate_price_data(df, 'AAPL')
df_processed = process_ticker_data(df , 'AAPL', fill_method='ffill', check_outliers=True)
print(df_processed.head(10))
print(df_processed.columns)

In [None]:
windows = {
    'mom_5d': 5,
    'mom_21d': 21,
'mom_63d': 63,
    'mom_252d': 25
}
for name, window in windows.items():
    print([name, window])

In [None]:
list(windows.keys())

In [None]:
k=5
'mom_'+str(k)

In [None]:
# Test the full pipeline
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data
from src.data_pipelines.processor import process_ticker_data
from src.data_pipelines.features import build_features

# Full pipeline
ticker = 'AAPL'
df = fetch_single_ticker(ticker, '2020-01-01', '2024-12-31')
validate_price_data(df, ticker)
df = process_ticker_data(df, ticker)
df = build_features(df, ticker)

print(df.columns.tolist())
print(df.tail())

In [None]:
# Look at correlations between features and future returns
df['future_return_5d'] = df['Close'].pct_change(5).shift(-5)  # 5-day forward return

correlations = df[['mom_21d', 'mom_63d', 'vol_21d', 'dist_from_ma_21d', 'future_return_5d']].corr()
print(correlations['future_return_5d'])

## Phase 2. Prediction project

In [None]:
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data
from src.data_pipelines.processor import process_ticker_data
from src.data_pipelines.features import build_features
from src.models.dataset import PredictionDataset

# Build features
ticker = 'AAPL'
df = fetch_single_ticker(ticker, '2015-01-01', '2024-12-31')
validate_price_data(df, ticker)
df = process_ticker_data(df, ticker)
df = build_features(df, ticker)

# Define features to use
feature_cols = ['mom_5d', 'mom_21d', 'mom_63d', 'vol_21d', 'vol_63d',
                'volume_ratio', 'dist_from_ma_21d', 'dist_from_ma_50d','mom_21_252','mom_1_21']

# Create dataset
dataset = PredictionDataset(df, feature_cols, target_horizon=1, target_type='binary')

# Check class balance
dataset.get_class_balance()

# Simple train/test split
X_train, X_test, y_train, y_test = dataset.get_train_test_split(test_size=0.2)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

In [None]:
from src.models.baseline import LogisticRegressionModel, compare_models
import numpy as np
# Train and evaluate
log_model = LogisticRegressionModel(l1_ratio=1, C=0.1)
log_model.fit(X_train, y_train)

train_metrics = log_model.evaluate(X_train, y_train)
test_metrics = log_model.evaluate(X_test, y_test)
print(f"Train accuracy: {train_metrics['accuracy']:.4f}")
print(f"Test accuracy: {test_metrics['accuracy']:.4f}")

# Look at feature importance
print(log_model.get_feature_importance(feature_cols))

In [None]:
models = [
    LogisticRegressionModel(C=np.inf),
    LogisticRegressionModel(C=0.1, l1_ratio=1),
    LogisticRegressionModel(C=1, l1_ratio=.5),
    LogisticRegressionModel(C=1, l1_ratio=1),
    LogisticRegressionModel(C=.1, l1_ratio=.1),
]

results = compare_models(models, X_train, X_test, y_train, y_test)
print(results)

In [None]:
# Now let's try with the walk-forward method
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data
from src.data_pipelines.processor import process_ticker_data
from src.data_pipelines.features import build_features
from src.models.dataset import PredictionDataset
import pandas as pd

# Build features
ticker = 'AAPL'
df = fetch_single_ticker(ticker, '2015-01-01', '2024-12-31')
validate_price_data(df, ticker)
df = process_ticker_data(df, ticker)
df = build_features(df, ticker)

# Define features to use
feature_cols = ['mom_5d', 'mom_21d', 'mom_63d', 'vol_21d', 'vol_63d',
                'volume_ratio', 'dist_from_ma_21d', 'dist_from_ma_50d','mom_21_252','mom_1_21']

dataset = PredictionDataset(df, feature_cols, target_horizon=1, target_type='binary')
splits = dataset.get_walk_forward_splits(n_splits=5)

from src.models.baseline import LogisticRegressionModel

results = []
for i, (X_train, X_test, y_train, y_test) in enumerate(splits):
    model = LogisticRegressionModel(C=1.0, l1_ratio=0)
    model.fit(X_train, y_train)

    metrics = model.evaluate(X_test, y_test)
    metrics['split'] = i + 1
    results.append(metrics)

    print(f"Split {i+1}: accuracy = {metrics['accuracy']:.4f}, AUC = {metrics['roc_auc']:.4f}")

# Summary statistics
results_df = pd.DataFrame(results)
print(f"\nMean accuracy: {results_df['accuracy'].mean():.4f} ± {results_df['accuracy'].std():.4f}")
print(f"Mean AUC:      {results_df['roc_auc'].mean():.4f} ± {results_df['roc_auc'].std():.4f}")


In [None]:
# Test Gradient boosting
from src.data_pipelines.yahoo_fetcher import fetch_single_ticker
from src.data_pipelines.validators import validate_price_data
from src.data_pipelines.processor import process_ticker_data
from src.data_pipelines.features import build_features
from src.models.dataset import PredictionDataset
import pandas as pd

# Build features
ticker = 'AAPL'
df = fetch_single_ticker(ticker, '2015-01-01', '2024-12-31')
validate_price_data(df, ticker)
df = process_ticker_data(df, ticker)
df = build_features(df, ticker)

# Define features to use
feature_cols = ['mom_5d', 'mom_21d', 'mom_63d', 'vol_21d', 'vol_63d',
                'volume_ratio', 'dist_from_ma_21d', 'dist_from_ma_50d','mom_21_252','mom_1_21']

dataset = PredictionDataset(df, feature_cols, target_horizon=1, target_type='binary')
X_train, X_test, y_train, y_test = dataset.get_train_test_split(test_size=0.2)

from src.models.baseline import LogisticRegressionModel, compare_models
from src.models.tree_models import RandomForestModel, GradientBoostingModel

models = [
    LogisticRegressionModel(C=1.0),
    RandomForestModel(n_estimators=100, max_depth=5, min_samples_leaf=50),
    RandomForestModel(n_estimators=100, max_depth=10, min_samples_leaf=20),
    GradientBoostingModel(n_estimators=100, max_depth=3, learning_rate=0.1),
    GradientBoostingModel(n_estimators=200, max_depth=3, learning_rate=0.05),
]

results = compare_models(models, X_train, X_test, y_train, y_test)
print(results)

In [None]:
best_tree = RandomForestModel(n_estimators=100, max_depth=5)
best_tree.fit(X_train, y_train)
print(best_tree.get_feature_importance(feature_cols))

In [None]:
from src.models.tree_models import RandomForestModel, GradientBoostingModel

splits = dataset.get_walk_forward_splits(n_splits=5)

# Test the conservative Random Forest
rf_results = []
for i, (X_train, X_test, y_train, y_test) in enumerate(splits):
    model = RandomForestModel(n_estimators=100, max_depth=3, min_samples_leaf=100)  # Even more conservative
    model.fit(X_train, y_train)

    metrics = model.evaluate(X_test, y_test)
    rf_results.append(metrics)
    print(f"Split {i+1}: accuracy = {metrics['accuracy']:.4f}")

print(f"\nRandom Forest Mean accuracy: {pd.DataFrame(rf_results)['accuracy'].mean():.4f}")

In [None]:
rf_model = RandomForestModel(n_estimators=100, max_depth=3, min_samples_leaf=100)
rf_model.fit(X_train, y_train)

importance = rf_model.get_feature_importance(feature_cols)
print(importance)

In [None]:
# here testing neural net
from src.models.neural_net import NeuralNetModel

# Simple network
nn_model = NeuralNetModel(
    hidden_dims=[32, 16],
    dropout=0.2,
    learning_rate=0.001,
    epochs=100,
    early_stopping_patience=10,
    verbose=True
)

nn_model.fit(X_train, y_train)

train_metrics = nn_model.evaluate(X_train, y_train)
test_metrics = nn_model.evaluate(X_test, y_test)

print(f"Train accuracy: {train_metrics['accuracy']:.4f}")
print(f"Test accuracy: {test_metrics['accuracy']:.4f}")
print(f"Overfit gap: {train_metrics['accuracy'] - test_metrics['accuracy']:.4f}")

# Visualize training
nn_model.plot_training_history()

In [29]:
from src.models.baseline import compare_models
from src.models.tree_models import RandomForestModel

models = [
    LogisticRegressionModel(C=1.0),
    RandomForestModel(n_estimators=100, max_depth=3, min_samples_leaf=100),
    NeuralNetModel(hidden_dims=[16], epochs=50),           # Tiny
    NeuralNetModel(hidden_dims=[32, 16], epochs=100),      # Small
    NeuralNetModel(hidden_dims=[64, 32, 16], epochs=100),  # Medium
]

results = compare_models(models, X_train, X_test, y_train, y_test)
print(results)

NameError: name 'LogisticRegressionModel' is not defined

## Phase 3. Live APIs

In [None]:
import nest_asyncio
nest_asyncio.apply()

In [None]:
import nest_asyncio
nest_asyncio.apply()

from src.execution.ib_connection import IBConnection, get_stock_contract

conn = IBConnection(port=7497)
conn.connect()

print(f"Connected: {conn.is_connected()}")
print(f"Accounts: {conn.ib.managedAccounts()}")

conn.ib.reqMarketDataType(3)

# Fetch a quote
aapl = get_stock_contract('AAPL')
conn.ib.qualifyContracts(aapl)

ticker = conn.ib.reqMktData(aapl)

import time
time.sleep(2)

print(f"\nSymbol: {aapl.symbol}")
print(f"Last price: {ticker.last}")
print(f"Bid: {ticker.bid}")
print(f"Ask: {ticker.ask}")

conn.ib.cancelMktData(aapl)
conn.disconnect()

In [None]:
conn = IBConnection(port=7497)
conn.connect()

aapl = get_stock_contract('AAPL')
conn.ib.qualifyContracts(aapl)

# Fetch historical data
bars = conn.ib.reqHistoricalData(
    aapl,
    endDateTime='',  # Empty = now
    durationStr='5 D',  # Last 5 days
    barSizeSetting='1 hour',
    whatToShow='TRADES',
    useRTH=True  # Regular trading hours only
)

for bar in bars[-5:]:  # Print last 5 bars
    print(f"{bar.date} | O:{bar.open} H:{bar.high} L:{bar.low} C:{bar.close} V:{bar.volume}")

conn.disconnect()