### DOWNLOAD DATA

In [6]:
#!pip install yfinance
import yfinance as yf

# Download S&P 500 index data (^GSPC) from 1990-01-01 to 2018-12-31
sp500 = yf.download("^GSPC", start="1990-01-01", end="2018-12-31", interval="1d")

# Save to CSV
sp500.to_csv("sp500_1990_2018.csv")

print("Saved S&P 500 data to sp500_1990_2018.csv")

import yfinance as yf

# Download S&P 500 index data (^GSPC) from 1990-01-01 to 2018-12-31
sp500 = yf.download("^GSPC", start="1990-01-01", end="2018-12-31", interval="1d")

# Save to CSV
sp500.to_csv("sp500_1990_2018.csv")

print("Saved S&P 500 data to sp500_1990_2018.csv")


  sp500 = yf.download("^GSPC", start="1990-01-01", end="2018-12-31", interval="1d")
[*********************100%***********************]  1 of 1 completed
  sp500 = yf.download("^GSPC", start="1990-01-01", end="2018-12-31", interval="1d")
[*********************100%***********************]  1 of 1 completed

Saved S&P 500 data to sp500_1990_2018.csv
Saved S&P 500 data to sp500_1990_2018.csv





### IMPORT INTO PANDAS & CLEAN

In [10]:
import pandas as pd
import numpy as np

# 1) Load exactly as-is
df = pd.read_csv("sp500_1990_2018.csv")

# 2) Drop the two metadata rows: where 'Price' is 'Ticker' or 'Date'
df = df[~df["Price"].isin(["Ticker", "Date"])].copy()

# 3) Parse dates and set index
df["Date"] = pd.to_datetime(df["Price"])
df = df.drop(columns=["Price"]).set_index("Date").sort_index()

# 4) Make sure numeric cols are numeric
for c in ["Close","High","Low","Open","Volume"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

print(df.head())
print(df.dtypes)


                 Close        High         Low        Open     Volume
Date                                                                 
1990-01-02  359.690002  359.690002  351.980011  353.399994  162070000
1990-01-03  358.760010  360.589996  357.890015  359.690002  192330000
1990-01-04  355.670013  358.760010  352.890015  358.760010  177000000
1990-01-05  352.200012  355.670013  351.350006  355.670013  158530000
1990-01-08  353.790009  354.239990  350.540009  352.200012  140110000
Close     float64
High      float64
Low       float64
Open      float64
Volume      int64
dtype: object


### feature eng & train / test split

In [20]:
# === CONFIG ===
LOOKBACK_LAGS = 5
ROLL_N = 5
TRAIN_FRACTION = 0.70     # time-aware 70/30 split

# === 0) ASSUMPTIONS ===
# df already exists, Date is the DatetimeIndex, and columns include 'Close'
import pandas as pd, numpy as np
assert isinstance(df.index, pd.DatetimeIndex), "df must have a DatetimeIndex"
assert "Close" in df.columns, "df must have a 'Close' column"

# === 1) RETURNS ===
df = df.copy()
df["Return"] = df["Close"].pct_change()
df = df.dropna(subset=["Return"])

# === 2) PRELIM SPLIT (to compute Q1/Q3 only on train) ===
split_idx = int(len(df) * TRAIN_FRACTION)
split_date = df.index[split_idx]   # timestamp boundary

# Quartiles from TRAIN returns only (no look-ahead)
q1 = df.loc[:split_date, "Return"].quantile(0.25)
q3 = df.loc[:split_date, "Return"].quantile(0.75)

# === 3) LABELS (Q1/Q3) ===
def label_by_quartiles(r, q1, q3):
    if r <= q1:  return -1  # Down
    if r >= q3:  return  1  # Up
    return 0                 # Flat

df["Label"] = df["Return"].apply(lambda r: label_by_quartiles(r, q1, q3))

# === 4) FEATURES (past-only) ===
# lags
for lag in range(1, LOOKBACK_LAGS + 1):
    df[f"lag_{lag}"] = df["Return"].shift(lag)

# rolling stats (use only past info -> shift(1) after rolling)
df["rolling_mean_5"] = df["Return"].rolling(ROLL_N).mean().shift(1)
df["rolling_std_5"]  = df["Return"].rolling(ROLL_N).std().shift(1)

# drop rows introduced by shifting/rolling
df_feat = df.dropna().copy()

# === 5) FINAL SPLIT (use the SAME date boundary; safe after drops) ===
FEATURES = [f"lag_{i}" for i in range(1, LOOKBACK_LAGS + 1)] + ["rolling_mean_5", "rolling_std_5"]
TARGET = "Label"

X = df_feat[FEATURES]
y = df_feat[TARGET].astype(int)

X_train, y_train = X.loc[:split_date], y.loc[:split_date]
X_test,  y_test  = X.loc[split_date:],  y.loc[split_date:]

print("Boundary date:", split_date.date())
print("Shapes:", X_train.shape, X_test.shape)
print("Train class dist:", y_train.value_counts().sort_index().to_dict())
print("Test  class dist:", y_test.value_counts().sort_index().to_dict())

# === 6) QUICK BASELINE (multinomial logistic) ===
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(multi_class="multinomial", max_iter=2000)
)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("\nMultinomial Logistic (Q1/Q3 labels)")
print(classification_report(y_test, y_pred, digits=3))
print("Confusion (rows=true, cols=pred) [-1,0,1]:\n",
      confusion_matrix(y_test, y_pred, labels=[-1,0,1]))

# === 7) OPTIONAL: SAVE ARTIFACTS ===
df_feat[FEATURES + [TARGET]].to_csv("sp500_model_table_1990_2018.csv")
X_train.assign(Label=y_train).to_csv("train_sp500_1990_2018.csv")
X_test.assign(Label=y_test).to_csv("test_sp500_1990_2018.csv")


Boundary date: 2010-04-20
Shapes: (5104, 7) (2190, 7)
Train class dist: {-1: 1275, 0: 2552, 1: 1277}
Test  class dist: {-1: 437, 0: 1277, 1: 476}

Multinomial Logistic (Q1/Q3 labels)
              precision    recall  f1-score   support

          -1      0.386     0.039     0.071       437
           0      0.612     0.958     0.747      1277
           1      0.403     0.126     0.192       476

    accuracy                          0.594      2190
   macro avg      0.467     0.374     0.337      2190
weighted avg      0.522     0.594     0.491      2190

Confusion (rows=true, cols=pred) [-1,0,1]:
 [[  17  371   49]
 [  14 1223   40]
 [  13  403   60]]


In [13]:
# Lagged returns (1–5 days)
for lag in range(1, 6):
    df[f"lag_{lag}"] = df["Return"].shift(lag)

# Rolling mean / volatility using only past data (shift(1))
df["rolling_mean_5"] = df["Return"].rolling(5).mean().shift(1)
df["rolling_std_5"]  = df["Return"].rolling(5).std().shift(1)

# Drop rows created by shifting/rolling
df = df.dropna()

feature_cols = [c for c in df.columns if c.startswith("lag_")] + ["rolling_mean_5","rolling_std_5"]
X = df[feature_cols]
y = df["Label"].astype(int)

X.head(), y.value_counts().sort_index()


(               lag_1     lag_2     lag_3     lag_4     lag_5  rolling_mean_5  \
 Date                                                                           
 1990-01-10 -0.011787  0.004514 -0.009756 -0.008613 -0.002586       -0.005645   
 1990-01-11 -0.006607 -0.011787  0.004514 -0.009756 -0.008613       -0.006450   
 1990-01-12  0.003513 -0.006607 -0.011787  0.004514 -0.009756       -0.004025   
 1990-01-15 -0.024675  0.003513 -0.006607 -0.011787  0.004514       -0.007008   
 1990-01-16 -0.008619 -0.024675  0.003513 -0.006607 -0.011787       -0.009635   
 
             rolling_std_5  
 Date                       
 1990-01-10       0.006633  
 1990-01-11       0.006410  
 1990-01-12       0.007575  
 1990-01-15       0.012027  
 1990-01-16       0.010172  ,
 Label
 -1    2976
  0     896
  1    3428
 Name: count, dtype: int64)

### TRAIN TEST SPLIT

In [21]:
df.head()

Unnamed: 0_level_0,Close,High,Low,Open,Volume,Return,Label,lag_1,lag_2,lag_3,lag_4,lag_5,rolling_mean_5,rolling_std_5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1990-01-12,339.929993,348.529999,339.48999,348.529999,183880000,-0.024675,-1,,,,,,,
1990-01-15,337.0,339.940002,336.570007,339.929993,140590000,-0.008619,-1,-0.024675,,,,,,
1990-01-16,340.75,340.75,333.369995,337.0,186070000,0.011128,1,-0.008619,-0.024675,,,,,
1990-01-17,337.399994,342.01001,336.26001,340.769989,170470000,-0.009831,-1,0.011128,-0.008619,-0.024675,,,,
1990-01-18,338.190002,338.380005,333.980011,337.399994,178590000,0.002341,0,-0.009831,0.011128,-0.008619,-0.024675,,,
