In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw price table; the relative path is resolved against the
# kernel's working directory.
DATA_PATH = "sp500_stocks.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Peek at the first five rows to check that the columns parsed as expected.
df.head(5)

Unnamed: 0,Date,Symbol,Adj Close,Close,High,Low,Open,Volume
0,2010-01-04,MMM,59.318886,83.019997,83.449997,82.669998,83.089996,3043700.0
1,2010-01-05,MMM,58.947342,82.5,83.230003,81.699997,82.800003,2847000.0
2,2010-01-06,MMM,59.783295,83.669998,84.599998,83.510002,83.879997,5268500.0
3,2010-01-07,MMM,59.826176,83.730003,83.760002,82.120003,83.32,4470100.0
4,2010-01-08,MMM,60.247749,84.32,84.32,83.300003,83.690002,3405800.0


In [4]:
# Summarise column dtypes and non-null counts to spot missing data early.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1645816 entries, 0 to 1645815
Data columns (total 8 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   Date       1645816 non-null  object 
 1   Symbol     1645816 non-null  object 
 2   Adj Close  1577319 non-null  float64
 3   Close      1577319 non-null  float64
 4   High       1577319 non-null  float64
 5   Low        1577319 non-null  float64
 6   Open       1577319 non-null  float64
 7   Volume     1577319 non-null  float64
dtypes: float64(6), object(2)
memory usage: 100.5+ MB


In [5]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape

(1645816, 8)

In [6]:
# Number of missing values in each column.
df.isnull().sum()

Date             0
Symbol           0
Adj Close    68497
Close        68497
High         68497
Low          68497
Open         68497
Volume       68497
dtype: int64

In [7]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how="any")

In [8]:
# Remove the `Date` column; it is not used as a model feature.
df = df.drop(columns=["Date"])

In [9]:
# Confirm that `Date` is gone and the remaining columns look as expected.
df.head(5)

Unnamed: 0,Symbol,Adj Close,Close,High,Low,Open,Volume
0,MMM,59.318886,83.019997,83.449997,82.669998,83.089996,3043700.0
1,MMM,58.947342,82.5,83.230003,81.699997,82.800003,2847000.0
2,MMM,59.783295,83.669998,84.599998,83.510002,83.879997,5268500.0
3,MMM,59.826176,83.730003,83.760002,82.120003,83.32,4470100.0
4,MMM,60.247749,84.32,84.32,83.300003,83.690002,3405800.0


In [10]:
# Target is the closing price; every other remaining column is a feature.
# NOTE(review): the features include the same-row Adj Close / High / Low / Open,
# which bracket or closely track `Close`, so a near-perfect score is expected
# and says little about forecasting ability — confirm this is the intended task.
y = df["Close"]
X = df.drop(columns=["Close"])

In [11]:
# Handle stock symbols
symbols = X["Symbol"].unique()
symbol_to_int = {symbol: i for i, symbol in enumerate(symbols)}

X["Symbol"] = X["Symbol"].map(symbol_to_int)

X.head()

Unnamed: 0,Symbol,Adj Close,High,Low,Open,Volume
0,0,59.318886,83.449997,82.669998,83.089996,3043700.0
1,0,58.947342,83.230003,81.699997,82.800003,2847000.0
2,0,59.783295,84.599998,83.510002,83.879997,5268500.0
3,0,59.826176,83.760002,82.120003,83.32,4470100.0
4,0,60.247749,84.32,83.300003,83.690002,3405800.0


In [12]:
# Scale the data
# Split first, then scale.
# Fitting the scaler on the full matrix before splitting would let the test
# rows contribute to the mean/variance used to transform the training rows
# (data leakage), so the scaler is fitted on the training split only.
# NOTE(review): the split shuffles rows at random, so train and test mix
# dates for the same ticker — confirm that is acceptable for this analysis.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # learn mean/std from training rows only
X_test = scaler.transform(X_test)  # reuse the training statistics unchanged

In [14]:
# Random forest with the library's default hyperparameters. The fixed seed
# keeps the fit reproducible; n_jobs=-1 lets it use all available cores.
model = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
)

model.fit(X_train, y_train)

In [15]:
# Score the fitted model on the held-out split.
# (`numpy` is imported in the notebook's top import cell.)
y_pred = model.predict(X_test)

# The first line is the mean absolute error, in the same units as `Close`;
# the second is whatever `model.score` reports for the held-out data.
print(f"Mean absolute error: {np.mean(np.abs(y_pred - y_test))}")
print(f"Score: {model.score(X_test, y_test)}")

Mean absolute error: 0.3040858318247055
Score: 0.9999842669150731
