# Model Training Notebook

In [17]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from numpy.random import default_rng
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from simulator.objects.policies.architectures import ModelTask
from simulator.objects.policies.architectures.perceptron import MultiLayerPerceptron
from simulator.objects.stock import Stock

# One seed for every random source used in this notebook.
# generate_example_stock_features draws from BOTH the legacy global NumPy state
# (np.random.*) and the Generator below, so both must be seeded for
# Restart & Run All to reproduce the same dataset.
SEED = 0
np.random.seed(SEED)
rng = default_rng(SEED)

In [5]:
def generate_example_stock_features(n_stocks: int, n_days: int = 1825) -> np.ndarray:
    """Build a synthetic feature matrix from randomly generated stocks.

    Each stock gets uniformly sampled fundamentals and a price history made of
    a straight line (random start price and slope) plus small Gaussian noise.
    The stock is then turned into a feature vector via
    ``Stock.get_stock_features()`` and a constant ``0`` is appended to it.

    Args:
        n_stocks: Number of synthetic stocks (rows) to generate.
        n_days: Length of each simulated price history. Defaults to 1825,
            the value previously hard-coded here (presumably five years of
            daily prices -- confirm).

    Returns:
        Array with one row per stock: the stock's features followed by a
        trailing ``0``. When ``n_stocks`` is 0 the result is an empty 1-D
        array.
    """
    output = []
    # Loop-invariant time axis shared by every price history.
    day_index = np.arange(0, n_days)

    for _ in range(n_stocks):
        # NOTE(review): the fundamentals and price line use the module-level
        # `rng` Generator, while the noise and leadership draws below use the
        # legacy global `np.random` state. Both sources must be seeded for
        # reproducible data; consider moving everything onto `rng`.
        cash = rng.uniform(-10000, 100000, size=1)[0]
        earning_value_of_assets = rng.uniform(10000, 30000, size=1)[0]
        latest_quarterly_earnings = rng.uniform(10000, 30000, size=1)[0]

        # Linear price trend plus small Gaussian noise.
        start_price = rng.uniform(10.0, 1010.0, size=1)[0]
        price_slope = rng.uniform(-0.005, 0.005, size=1)[0]
        growth_component = start_price + price_slope * day_index
        noise_component = np.random.normal(loc=0, scale=0.001, size=(n_days,))
        price_history = growth_component + noise_component

        quality_of_leadership = np.random.uniform(0.0, 1.0, size=1)[0]
        stock = Stock(
            cash=cash,
            earning_value_of_assets=earning_value_of_assets,
            latest_quarterly_earnings=latest_quarterly_earnings,
            price_history=price_history,
            quality_of_leadership=quality_of_leadership,
            stock_volatility=0.5,
        )

        # The trailing 0 is presumably a placeholder for an extra model input
        # that is not part of the stock itself -- TODO confirm its meaning.
        output.append(np.append(stock.get_stock_features(), 0))

    return np.array(output)

In [8]:
N_SAMPLES = 5000
TEST_RATIO = 0.2

# TODO: switch to a percent-error loss (or some variant) for evaluation.

stock_features = generate_example_stock_features(N_SAMPLES)
stock_dataset = pd.DataFrame(stock_features)

# The regression target is the first feature column.
# NOTE(review): X below still contains column 0, i.e. the target itself, so any
# model fit on (X, y) can simply pass that column through. Confirm this is
# intentional (e.g. a baseline model); if it is not, drop column 0 from X.
stock_labels = stock_features[:, 0]
X = stock_dataset
y = stock_dataset[0]

# Absolute split sizes. Passing the integer test count (instead of the ratio)
# makes train_test_split produce exactly the sizes computed here.
test_length = int(N_SAMPLES * TEST_RATIO)
train_length = N_SAMPLES - test_length

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_length, random_state=0
)

# Sanity checks: the split has the expected sizes and features/labels line up.
assert len(X_train) == len(y_train) == train_length
assert len(X_test) == len(y_test) == test_length


In [None]:
# Fit a 100-tree random forest on the training split (fit() returns the
# estimator itself, so construction and fitting can be chained).
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)

# Score the held-out split with mean squared error.
y_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)

# Report one "<column>: <importance>" line per feature column of X.
importance_by_column = zip(X.columns, model.feature_importances_)
for feature, importance in importance_by_column:
    print(f"{feature}: {importance}")

0.06407196621982704
0: 0.9999982109338125
1: 2.6006722837167284e-07
2: 2.55268183096212e-07
3: 2.826311999634637e-07
4: 1.7250873058182653e-07
5: 1.096431570050497e-07
6: 1.125277086677498e-07
7: 1.0738073347198461e-07
8: 9.734240540278901e-08
9: 9.285058702294144e-08
10: 1.1175513371259248e-07
11: 9.408942040901236e-08
12: 9.300169994394323e-08
13: 0.0


In [18]:
# Persist the fitted model next to the notebook so it can be reloaded later.
# Only unpickle this file from a trusted source: loading a pickle can run
# arbitrary code.
outfile = Path("random_forest_regressor.pkl")
outfile.write_bytes(pickle.dumps(model))