In [26]:
import os
import time
import requests
import numpy as np
import pandas as pd
import pandas_ta as ta
from dotenv import load_dotenv
from bs4 import BeautifulSoup

# Load API key from .env
load_dotenv()
api_key = os.getenv("POLYGON_API_KEY")


In [None]:
# Step 1: Pull OHLCV Data
symbol = "SPY"
start_date = "2020-01-01"
end_date = "2024-01-01"
filename = f"{symbol}_data.csv"

if os.path.exists(filename):
    df = pd.read_csv(filename, parse_dates=["date"])
else:
    url = f"https://api.polygon.io/v2/aggs/ticker/{symbol}/range/1/day/{start_date}/{end_date}?adjusted=true&sort=asc&limit=50000&apiKey={api_key}"
    response = requests.get(url)
    data = response.json().get('results', [])
    if not data:
        raise Exception("❌ No OHLCV data returned from Polygon.")
    df = pd.DataFrame(data)
    df['t'] = pd.to_datetime(df['t'], unit='ms')
    df.rename(columns={'t': 'date', 'o': 'open', 'h': 'high', 'l': 'low', 'c': 'close', 'v': 'volume'}, inplace=True)
    df = df[['date', 'open', 'high', 'low', 'close', 'volume']]
    df.to_csv(filename, index=False)

df.set_index('date', inplace=True)

0
Empty DataFrame
Columns: [open, high, low, close, volume, rsi, macd, sma50, ema20, atr, p/e, eps_(ttm), dividend_%, market_cap, next_day_return, target]
Index: []


In [28]:
def get_finviz_fundamentals(symbol):
    url = f"https://finviz.com/quote.ashx?t={symbol}"
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find_all('table', class_='snapshot-table2')
    if not table:
        return {}
    data = {}
    rows = table[0].find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        for i in range(0, len(cols), 2):
            key = cols[i].text.strip()
            val = cols[i+1].text.strip()
            data[key] = val
    return data


In [29]:
def parse_val(val):
    if val is None:
        return None
    val = val.replace('%', '').replace('B', 'e9').replace('M', 'e6').replace(',', '')
    try:
        return float(eval(val))
    except:
        return val


In [30]:

df = get_polygon_ohlcv(symbol)
# Add Technical Indicators
df['rsi'] = ta.rsi(df['close'], length=14)
df['macd'] = ta.macd(df['close'])['MACD_12_26_9']
df['sma50'] = ta.sma(df['close'], length=50)
df['ema20'] = ta.ema(df['close'], length=20)
df['atr'] = ta.atr(df['high'], df['low'], df['close'])
df.dropna(inplace=True)

# Add Fundamentals from Finviz
fundamentals = get_finviz_fundamentals(symbol)
keys_of_interest = ['P/E', 'EPS (ttm)', 'Dividend %', 'Market Cap', 'Volume']
selected = {k: fundamentals.get(k) for k in keys_of_interest}

for key, val in selected.items():
    df[key.lower().replace(' ', '_')] = parse_val(val)

# Add target variable for ML
df['next_day_return'] = df['close'].pct_change().shift(-1)
df['target'] = (df['next_day_return'] > 0).astype(int)
df.dropna(inplace=True)
df.head()


Unnamed: 0_level_0,open,high,low,close,volume,rsi,macd,sma50,ema20,atr,p/e,eps_(ttm),dividend_%,market_cap,next_day_return,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1


In [31]:
df.to_csv(f"{symbol}_enriched.csv")
print(f"Saved enriched dataset for {symbol} to CSV.")


Saved enriched dataset for SPY to CSV.


In [32]:
import os
import time
import requests
import numpy as np
import pandas as pd
import pandas_ta as ta
from dotenv import load_dotenv
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load environment variables
load_dotenv()
api_key = os.getenv("POLYGON_API_KEY")

# Step 1: Pull OHLCV Data
symbol = "SPY"
start_date = "2020-01-01"
end_date = "2024-01-01"
filename = f"{symbol}_data.csv"

if os.path.exists(filename):
    df = pd.read_csv(filename, parse_dates=["date"])
else:
    url = f"https://api.polygon.io/v2/aggs/ticker/{symbol}/range/1/day/{start_date}/{end_date}?adjusted=true&sort=asc&limit=50000&apiKey={api_key}"
    response = requests.get(url)
    data = response.json().get('results', [])
    if not data:
        raise Exception("❌ No OHLCV data returned from Polygon.")
    df = pd.DataFrame(data)
    df['t'] = pd.to_datetime(df['t'], unit='ms')
    df.rename(columns={'t': 'date', 'o': 'open', 'h': 'high', 'l': 'low', 'c': 'close', 'v': 'volume'}, inplace=True)
    df = df[['date', 'open', 'high', 'low', 'close', 'volume']]
    df.to_csv(filename, index=False)

df.set_index('date', inplace=True)

# Step 2: Add Technical Indicators
df['rsi'] = ta.rsi(df['close'], length=14)
df['macd'] = ta.macd(df['close'])['MACD_12_26_9']
df['sma50'] = ta.sma(df['close'], length=50)
df['ema20'] = ta.ema(df['close'], length=20)
df['atr'] = ta.atr(df['high'], df['low'], df['close'])

# Step 3: Clean only critical missing values
df['next_day_return'] = df['close'].pct_change().shift(-1)
df['target'] = (df['next_day_return'] > 0).astype(int)
df = df.dropna(subset=['rsi', 'macd', 'sma50', 'ema20', 'atr', 'next_day_return'])

# Step 4: Pull Basic Fundamentals from Polygon (if available)
url = f"https://api.polygon.io/v3/reference/tickers/{symbol}?apiKey={api_key}"
response = requests.get(url)
data = response.json()

if 'results' in data:
    result = data['results']
    fundamentals = {
        'name': result.get('name'),
        'market_cap': result.get('market_cap'),
        'share_class_shares_outstanding': result.get('share_class_shares_outstanding'),
        'weighted_shares_outstanding': result.get('weighted_shares_outstanding'),
        'total_employees': result.get('total_employees'),
        'sector': result.get('sic_description'),
    }
    for key, value in fundamentals.items():
        df[key] = value
    print(f"✅ Polygon fundamentals added for {symbol}:")
    print(fundamentals)
else:
    print("⚠️ No fundamentals found for this ETF in Polygon.")

# Step 5: Add Simulated Sentiment
np.random.seed(42)
df['daily_sentiment'] = np.random.normal(loc=0.02, scale=0.1, size=len(df))

# Final check
print("✅ Final DataFrame shape:", df.shape)
print("✅ Columns:", df.columns.tolist())
df.head()


✅ Polygon fundamentals added for SPY:
{'name': 'SPDR S&P 500 ETF Trust', 'market_cap': None, 'share_class_shares_outstanding': 1016280000, 'weighted_shares_outstanding': None, 'total_employees': None, 'sector': None}
✅ Final DataFrame shape: (134, 19)
✅ Columns: ['open', 'high', 'low', 'close', 'volume', 'rsi', 'macd', 'sma50', 'ema20', 'atr', 'next_day_return', 'target', 'name', 'market_cap', 'share_class_shares_outstanding', 'weighted_shares_outstanding', 'total_employees', 'sector', 'daily_sentiment']


Unnamed: 0_level_0,open,high,low,close,volume,rsi,macd,sma50,ema20,atr,next_day_return,target,name,market_cap,share_class_shares_outstanding,weighted_shares_outstanding,total_employees,sector,daily_sentiment
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2023-06-20 04:00:00,437.45,438.37,435.03,437.18,75935359.0,66.377267,6.470936,417.876,428.89285,4.519668,-0.005124,0,SPDR S&P 500 ETF Trust,,1016280000,,,,0.069671
2023-06-21 04:00:00,436.16,436.99,434.33,434.94,76840751.0,62.218352,6.143438,418.3826,429.468769,4.3974,0.00361,1,SPDR S&P 500 ETF Trust,,1016280000,,,,0.006174
2023-06-22 04:00:00,433.95,436.62,433.6,436.51,70510175.0,63.924472,5.942083,418.9184,430.139362,4.296715,-0.00756,0,SPDR S&P 500 ETF Trust,,1016280000,,,,0.084769
2023-06-23 04:00:00,432.93,435.06,432.47,433.21,91981537.0,57.996211,5.453362,419.4216,430.431804,4.277981,-0.004086,0,SPDR S&P 500 ETF Trust,,1016280000,,,,0.172303
2023-06-26 04:00:00,432.62,434.61,431.19,431.44,72723560.0,55.047438,4.867118,419.781,430.527823,4.215466,0.010963,1,SPDR S&P 500 ETF Trust,,1016280000,,,,-0.003415
