In [None]:
# pip install pandas polars scikit-learn ta matplotlib seaborn pyarrow fastparquet

In [13]:
import pandas as pd
import polars as pl
import time
import ta
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# load dataset into pandas
df_pandas = pd.read_csv("all_stocks_5yr.csv")

# convert pandas to polars
df_polars = pl.from_pandas(df_pandas)

# measure pandas processing time
start = time.time()
df_pandas["daily_return"] = df_pandas["close"].pct_change()
pandas_time = time.time() - start

# measure polars processing time
start = time.time()
df_polars = df_polars.with_columns((df_polars["close"].pct_change()).alias("daily_return"))
polars_time = time.time() - start

print(f"Pandas Time: {pandas_time:.4f} sec")
print(f"⚡ Polars Time: {polars_time:.4f} sec")

Pandas Time: 0.0216 sec
⚡ Polars Time: 0.0091 sec


In [7]:
# calculate indicators
df_pandas["SMA_14"] = df_pandas.groupby("name")["close"].transform(lambda x: x.rolling(14).mean())
df_pandas["EMA_14"] = df_pandas.groupby("name")["close"].transform(lambda x: x.ewm(span=14, adjust=False).mean())
df_pandas["RSI_14"] = df_pandas.groupby("name")["close"].transform(lambda x: ta.momentum.rsi(x, window=14))
df_pandas["MACD"] = df_pandas.groupby("name")["close"].transform(lambda x: ta.trend.macd(x))

print(df_pandas.head())

         date   open   high    low  close    volume name  daily_return  \
0  2013-02-08  15.07  15.12  14.63  14.75   8407500  AAL           NaN   
1  2013-02-11  14.89  15.01  14.26  14.46   8882000  AAL     -0.019661   
2  2013-02-12  14.45  14.51  14.10  14.27   8126000  AAL     -0.013140   
3  2013-02-13  14.30  14.94  14.25  14.66  10259500  AAL      0.027330   
4  2013-02-14  14.94  14.96  13.16  13.99  31879900  AAL     -0.045703   

   SMA_14     EMA_14  RSI_14  MACD  
0     NaN  14.750000     NaN   NaN  
1     NaN  14.711333     NaN   NaN  
2     NaN  14.652489     NaN   NaN  
3     NaN  14.653490     NaN   NaN  
4     NaN  14.565025     NaN   NaN  


In [15]:
# linear regression and random forest regressor

# drop rows with NaN (from rolling calculations)
df_pandas = df_pandas.dropna()

# define features & target
features = ["SMA_14", "EMA_14", "RSI_14", "MACD", "open", "high", "low", "volume"]
target = "close"

X = df_pandas[features]
y = df_pandas[target]

# train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(f"✅ Training Data: {X_train.shape}, Testing Data: {X_test.shape}")

✅ Training Data: (485126, 8), Testing Data: (121282, 8)
