In [None]:
# pip install pandas polars scikit-learn ta matplotlib seaborn pyarrow fastparquet
# pip install xgboost

In [33]:
import time

import pandas as pd
import polars as pl
import ta
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [17]:
# Load the raw price data once with pandas; later cells build on `df`.
df = pd.read_csv("all_stocks_5yr.csv")

# Separate frames for the benchmark. `df_pandas` is an explicit copy so the
# timing run does not add a "daily_return" column to `df` (and so the name is
# actually defined when the notebook is run top to bottom on a fresh kernel).
df_pandas = df.copy()
df_polars = pl.from_pandas(df)

# Time the pandas percentage-change computation.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
# NOTE(review): pct_change() runs over the whole "close" column, not per
# ticker, so values at ticker boundaries mix two symbols. That is fine for a
# speed comparison but should not be used as an analysis feature - confirm.
start = time.perf_counter()
df_pandas["daily_return"] = df_pandas["close"].pct_change()
pandas_time = time.perf_counter() - start

# Time the equivalent polars computation.
start = time.perf_counter()
df_polars = df_polars.with_columns((df_polars["close"].pct_change()).alias("daily_return"))
polars_time = time.perf_counter() - start

print(f"Pandas Time: {pandas_time:.4f} sec")
print(f"⚡ Polars Time: {polars_time:.4f} sec")

Pandas Time: 0.0149 sec
⚡ Polars Time: 0.0080 sec


In [19]:
# Technical indicators on the closing price, computed separately for each
# ticker ("name") so that no window spans two different stocks.
# Each entry maps the new column name to the function applied to one ticker's
# close series; dict order determines the order the columns are added.
indicator_fns = {
    # 14-row simple moving average.
    "SMA_14": lambda prices: prices.rolling(window=14).mean(),
    # 14-span exponential moving average (recursive form, adjust=False).
    "EMA_14": lambda prices: prices.ewm(span=14, adjust=False).mean(),
    # 14-row relative strength index from the `ta` package.
    "RSI_14": lambda prices: ta.momentum.rsi(prices, window=14),
    # MACD line from the `ta` package, using the library's default windows.
    "MACD": lambda prices: ta.trend.macd(prices),
}

for column, compute in indicator_fns.items():
    df[column] = df.groupby("name")["close"].transform(compute)

print(df.head())

         date   open   high    low  close    volume name  SMA_14     EMA_14  \
0  2013-02-08  15.07  15.12  14.63  14.75   8407500  AAL     NaN  14.750000   
1  2013-02-11  14.89  15.01  14.26  14.46   8882000  AAL     NaN  14.711333   
2  2013-02-12  14.45  14.51  14.10  14.27   8126000  AAL     NaN  14.652489   
3  2013-02-13  14.30  14.94  14.25  14.66  10259500  AAL     NaN  14.653490   
4  2013-02-14  14.94  14.96  13.16  13.99  31879900  AAL     NaN  14.565025   

   RSI_14  MACD  
0     NaN   NaN  
1     NaN   NaN  
2     NaN   NaN  
3     NaN   NaN  
4     NaN   NaN  


## train-test split (80-20)

In [23]:
# Prepare the modelling matrices shared by every model below:
# clean rows -> pick features/target -> 80/20 split -> standardise features.

# The indicator columns are NaN during their warm-up period; discard every
# row that has a missing value in any column.
df = df.dropna()

# Value to predict and the columns used to predict it.
# NOTE(review): SMA_14 / EMA_14 are computed from windows that include the
# current row's close, and open/high/low are same-day values - confirm this
# is the intended setup and not unintended target leakage.
target = "close"
features = ["SMA_14", "EMA_14", "RSI_14", "MACD", "open", "high", "low", "volume"]

X, y = df[features], df[target]

# Hold out the last 20% of rows in their existing order (no shuffling).
# NOTE(review): the head() output suggests rows are ordered by ticker and then
# by date; if so, this holds out the last tickers rather than the most recent
# dates - confirm that is the evaluation you want.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both partitions so the test rows do not influence it.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

print(f"Training Data: {X_train.shape}, Testing Data: {X_test.shape}")

Training Data: (485126, 8), Testing Data: (121282, 8)


## optimized random forest

In [28]:
# Random forest regressor: 50 trees, fitted on all CPU cores (n_jobs=-1),
# with a fixed seed so repeated runs build the same forest.
# RandomForestRegressor lives in sklearn.ensemble and must be imported in the
# notebook's import cell; without that import this cell raises NameError on a
# fresh kernel.
rf_model = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
rf_model.fit(X_train, y_train)

# Score on the held-out rows using mean absolute error (same units as "close").
y_pred_rf = rf_model.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print(f"Random Forest MAE: {mae_rf:.4f}")

Random Forest MAE: 0.2978


## xgboost model

In [35]:
# Gradient-boosted trees: 100 boosting rounds with the histogram-based tree
# builder (tree_method="hist", chosen here for training speed) and a fixed seed.
xgb_model = XGBRegressor(
    n_estimators=100,
    tree_method="hist",
    random_state=42,
).fit(X_train, y_train)

# Score on the held-out rows with mean absolute error.
y_pred_xgb = xgb_model.predict(X_test)
mae_xgb = mean_absolute_error(y_true=y_test, y_pred=y_pred_xgb)

print(f"XGBoost MAE: {mae_xgb:.4f}")

XGBoost MAE: 0.5644


In [39]:
# Head-to-head: report whichever of XGBoost / Random Forest has the lower
# test MAE. A tie is reported as Random Forest.
print(
    "\nXGBoost is more accurate than Random Forest!"
    if mae_xgb < mae_rf
    else "\nRandom Forest performed better!"
)


Random Forest performed better!


## linear regression

In [45]:
# Linear-regression baseline, to see how a simple model compares with the
# random forest and XGBoost models fitted above.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
mae_lr = mean_absolute_error(y_true=y_test, y_pred=y_pred_lr)
print(f"Linear Regression MAE: {mae_lr:.4f}")

# Side-by-side summary of all three test MAEs.
print("\nModel Comparison:")
print(f"Linear Regression MAE: {mae_lr:.4f}")
print(f"Random Forest MAE: {mae_rf:.4f}")
print(f"XGBoost MAE: {mae_xgb:.4f}")

# Pick the (label, MAE) pair with the smallest error; on a tie the pair listed
# first wins.
best_model = min(
    (
        ("Linear Regression", mae_lr),
        ("Random Forest", mae_rf),
        ("XGBoost", mae_xgb),
    ),
    key=lambda entry: entry[1],
)
print(f"\nBest Model: {best_model[0]} with MAE: {best_model[1]:.4f}")

Linear Regression MAE: 0.3057

Model Comparison:
Linear Regression MAE: 0.3057
Random Forest MAE: 0.2978
XGBoost MAE: 0.5644

Best Model: Random Forest with MAE: 0.2978
