# Libs

In [70]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [39]:
!pip install --upgrade gradio

Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from g

In [40]:
!pip install torch



In [41]:
!pip install torch torch-geometric pandas scikit-learn



# Inital pull of data to see what it looks like

In [42]:
stocks = ["AAPL", "MSFT", "AMZN", "GOOG", "GOOGL", "TSLA", "NVDA", "META", "UNH", "JNJ",
          "V", "JPM", "PG", "HD", "MA", "BAC", "XOM", "CVX", "LLY", "PFE"]

data = yf.download(stocks, period="1mo")
df = pd.DataFrame(data)
df.to_csv("stock_data.csv")

[*********************100%***********************]  20 of 20 completed


In [43]:
df.head()

Price,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Ticker,AAPL,AMZN,BAC,CVX,GOOG,GOOGL,HD,JNJ,JPM,LLY,...,MA,META,MSFT,NVDA,PFE,PG,TSLA,UNH,V,XOM
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2024-08-19 00:00:00+00:00,225.889999,178.220001,39.41304,146.830002,168.178604,166.449127,360.878082,158.427521,215.449997,921.809998,...,1690800,9879700,15234000,318333600,26382500,3885300,76435200,1847900,6555900,12417500
2024-08-20 00:00:00+00:00,226.509995,178.880005,38.439388,144.690002,168.737885,166.95845,365.052704,158.953522,214.520004,949.969971,...,1763300,7944400,16387600,300087400,21301300,4513600,74001200,1677500,6803600,15632000
2024-08-21 00:00:00+00:00,226.399994,180.110001,38.499001,145.320007,167.409637,165.630219,368.22345,160.213943,214.600006,951.969971,...,2105900,13423300,16067300,257883600,16729700,4082000,70146000,2844300,6639700,11752400
2024-08-22 00:00:00+00:00,224.529999,176.130005,38.985828,145.729996,165.272446,163.582932,363.154236,161.127029,216.630005,954.169983,...,1538200,15708300,19361900,376189100,20241500,4097300,79514500,2282000,4425200,10609500
2024-08-23 00:00:00+00:00,226.839996,177.039993,39.512398,147.619995,167.209885,165.400513,373.322479,162.893616,218.309998,952.73999,...,1986300,11323900,18493800,323230300,21885300,4771300,81525200,2376400,3697400,10381400


# Fine tuned data API call

The data  is not tidy. Pull and format data to make it tidy.

In [81]:
def download_and_preprocess_data(tickers, days=30):
    # Download data
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days)
    data = yf.download(tickers, start=start_date, end=end_date)

    # Reshape data
    df_list = []
    for ticker in tickers:
        df = pd.DataFrame({
            'date': data.index,
            'ticker': ticker,
            'open': data['Open'][ticker],
            'high': data['High'][ticker],
            'low': data['Low'][ticker],
            'close': data['Close'][ticker],
            'volume': data['Volume'][ticker],
            'adj_close': data['Adj Close'][ticker]
        })
        df_list.append(df)

    # Combine all dataframes
    combined_df = pd.concat(df_list, ignore_index=True)

    # Sort by date and ticker
    combined_df = combined_df.sort_values(['date', 'ticker'])

    # Reset index
    combined_df = combined_df.reset_index(drop=True)

    return combined_df

# Example usage
tickers = ["AAPL", "MSFT", "AMZN", "GOOG", "GOOGL", "TSLA", "NVDA", "META", "UNH", "JNJ",
           "V", "JPM", "PG", "HD", "MA", "BAC", "XOM", "CVX", "LLY", "PFE"]

preprocessed_data = download_and_preprocess_data(tickers)
print(preprocessed_data.head(10))
print(f"\nShape of the dataframe: {preprocessed_data.shape}")

# Save to CSV (optional)
preprocessed_data.to_csv('preprocessed_stock_data.csv', index=False)

# Additional steps for GNN preparation
def prepare_for_gnn(df):
    # Create a datetime feature
    df['datetime'] = pd.to_datetime(df['date'])

    # Create lagged features (example: 1-day lag)
    for col in ['open', 'high', 'low', 'close', 'volume', 'adj_close']:
        df[f'{col}_lag1'] = df.groupby('ticker')[col].shift(1)

    # Calculate returns
    df['returns'] = df.groupby('ticker')['adj_close'].pct_change()

    # Drop rows with NaN values (first day for each ticker)
    df = df.dropna()

    # Reset index
    df = df.reset_index(drop=True)

    return df

gnn_ready_data = prepare_for_gnn(preprocessed_data)
print(gnn_ready_data.head(10))
print(f"\nShape of the GNN-ready dataframe: {gnn_ready_data.shape}")

# Save GNN-ready data to CSV (optional)
gnn_ready_data.to_csv('gnn_ready_stock_data.csv', index=False)
df = pd.read_csv('gnn_ready_stock_data.csv')

[*********************100%***********************]  20 of 20 completed


                       date ticker        open        high         low  \
0 2024-08-19 00:00:00+00:00   AAPL  225.720001  225.990005  223.039993   
1 2024-08-19 00:00:00+00:00   AMZN  177.639999  178.300003  176.160004   
2 2024-08-19 00:00:00+00:00    BAC   39.480000   39.740002   39.450001   
3 2024-08-19 00:00:00+00:00    CVX  146.009995  147.729996  145.889999   
4 2024-08-19 00:00:00+00:00   GOOG  167.000000  168.470001  166.089996   
5 2024-08-19 00:00:00+00:00  GOOGL  165.279999  166.690002  164.259995   
6 2024-08-19 00:00:00+00:00     HD  362.500000  364.690002  362.100006   
7 2024-08-19 00:00:00+00:00    JNJ  159.460007  160.289993  159.130005   
8 2024-08-19 00:00:00+00:00    JPM  214.000000  215.529999  213.820007   
9 2024-08-19 00:00:00+00:00    LLY  922.119995  926.000000  916.500000   

        close    volume   adj_close  
0  225.889999  40687800  225.889999  
1  178.220001  31129800  178.220001  
2   39.669998  27268900   39.413040  
3  146.830002   5967400  146.8300

Preprocess data

In [82]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [83]:
print(df.columns)

Index(['date', 'ticker', 'open', 'high', 'low', 'close', 'volume', 'adj_close',
       'datetime', 'open_lag1', 'high_lag1', 'low_lag1', 'close_lag1',
       'volume_lag1', 'adj_close_lag1', 'returns'],
      dtype='object')


In [85]:
# Define the features we'll use
feature_columns = ['open', 'high', 'low', 'close', 'volume', 'adj_close',
                   'open_lag1', 'high_lag1', 'low_lag1', 'close_lag1', 'volume_lag1', 'adj_close_lag1']
target_column = 'returns'

# Normalize features
scaler = StandardScaler()
df[feature_columns] = scaler.fit_transform(df[feature_columns])

# Create adjacency matrix (example: based on correlation of returns)
tickers = df['ticker'].unique()
corr_matrix = df.pivot(index='date', columns='ticker', values='returns').corr().abs()
adj_matrix = (corr_matrix > 0.5).astype(int)  # Threshold at 0.5 correlation

# Convert data to PyTorch Geometric format
def create_pyg_data(df, adj_matrix, window_size=10):
    data_list = []
    tickers = df['ticker'].unique()
    n_tickers = len(tickers)

    for i in range(window_size, len(df) // n_tickers):
        start_idx = i * n_tickers - window_size
        end_idx = i * n_tickers

        x = torch.FloatTensor(df[feature_columns].values[start_idx:end_idx])
        y = torch.FloatTensor(df[target_column].values[end_idx:end_idx+n_tickers])

        # Create edge_index from adj_matrix
        edge_index = []
        for i in range(n_tickers):
            for j in range(n_tickers):
                if adj_matrix.iloc[i, j] == 1 and i != j:
                    edge_index.append([i, j])

        edge_index = torch.LongTensor(edge_index).t().contiguous()

        data = Data(x=x, y=y, edge_index=edge_index)
        data_list.append(data)

    return data_list

# The rest of your code remains the same
data_list = create_pyg_data(df, adj_matrix)

# Split data into train and test sets
train_data, test_data = train_test_split(data_list, test_size=0.2, random_state=42)

# Create data loaders
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)




# Modeling

In [86]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [87]:
# Define the GNN model
class TemporalGNN(nn.Module):
    def __init__(self, num_features, hidden_dim, output_dim, num_gcn_layers=2, num_lstm_layers=1):
        super(TemporalGNN, self).__init__()
        self.num_gcn_layers = num_gcn_layers

        # GCN layers
        self.gcn_layers = nn.ModuleList([GCNConv(num_features if i == 0 else hidden_dim, hidden_dim)
                                         for i in range(num_gcn_layers)])

        # LSTM layer
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers=num_lstm_layers, batch_first=True)

        # Output layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        batch_size = data.num_graphs

        # GCN layers
        for i in range(self.num_gcn_layers):
            x = F.relu(self.gcn_layers[i](x, edge_index))

        # Reshape for LSTM: [num_nodes, num_features] -> [batch_size, seq_len, num_features]
        x = x.view(batch_size, -1, x.size(-1))

        # LSTM layer
        x, _ = self.lstm(x)

        # Use the last time step
        x = x[:, -1, :]

        # Output layer
        x = self.fc(x)

        return x.squeeze()

# Set up the model, loss function, and optimizer
num_features = len(feature_columns)
hidden_dim = 64
output_dim = 1  # Predicting a single value (returns or adj_close)

model = TemporalGNN(num_features, hidden_dim, output_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training function
def train(model, train_loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        out = model(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# Evaluation function
def evaluate(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch in test_loader:
            out = model(batch)
            loss = criterion(out, batch.y)
            total_loss += loss.item()
            predictions.extend(out.cpu().numpy())
            actuals.extend(batch.y.cpu().numpy())
    return total_loss / len(test_loader), predictions, actuals

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}')



RuntimeError: index 85 is out of bounds for dimension 0 with size 80

In [None]:
# Final evaluation
test_loss, predictions, actuals = evaluate(model, test_loader, criterion)
print(f'Test Loss: {test_loss:.4f}')

# Calculate metrics
mse = mean_squared_error(actuals, predictions)
mae = mean_absolute_error(actuals, predictions)
r2 = r2_score(actuals, predictions)

print(f'Mean Squared Error: {mse:.4f}')
print(f'Mean Absolute Error: {mae:.4f}')
print(f'R2 Score: {r2:.4f}')

# Visualize results
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.scatter(actuals, predictions, alpha=0.5)
plt.plot([min(actuals), max(actuals)], [min(actuals), max(actuals)], 'r--', lw=2)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs Predicted Stock Values')
plt.tight_layout()
plt.show()

# If you want to make predictions for future dates
def predict_future(model, last_data, num_days=30):
    model.eval()
    predictions = []
    current_data = last_data

    for _ in range(num_days):
        with torch.no_grad():
            prediction = model(current_data)
        predictions.append(prediction.item())

        # Update the data for the next prediction
        # This part depends on how you want to update your features
        # Here's a simple example that just shifts the window:
        new_x = current_data.x[1:].clone()
        new_x[-1] = torch.tensor([prediction.item()] + [0] * (num_features - 1))
        current_data.x = new_x

    return predictions

Display results


In [62]:
import gradio as gr