### Imports

In [None]:
from tqdm import tqdm
import random
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import gymnasium as gym
import gym_mtsim
sys.path.append("C:/Users/WilliamFetzner/Documents/Trading/")
from gym_mtsim_forked.gym_mtsim.data import FOREX_DATA_PATH_TRAIN, FOREX_DATA_PATH_TEST, FOREX_DATA_PATH
from gym_mtsim import OrderType, Timeframe, MtEnv, MtSimulator
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, STATUS_FAIL
from stable_baselines3 import A2C, PPO
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
import time
import pickle
import torch

# Parameters

In [None]:
# import pickle
# import time
# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials


# def objective(x):
#     return {
#         'loss': x ** 2,
#         'status': STATUS_OK,
#         # -- store other results like this
#         'eval_time': time.time(),
#         'other_stuff': {'type': None, 'value': x},
#         # -- attachments are handled differently
#         'attachments':
#             {'time_module': pickle.dumps(time.time)}
#         }
# trials = Trials()
# best = fmin(objective,
#             space=hp.uniform('x', -10, 10),
#             algo=tpe.suggest,
#             max_evals=100,
#             trials=trials)

# print(best)

In [None]:
# trials.results

In [None]:
# Load the pickled symbol data from the forked gym_mtsim data directory.
# NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
symbols_pickle_path = 'C:/Users/WilliamFetzner/Documents/Trading/gym_mtsim_forked/gym_mtsim/data/symbols_forex.pkl'
with open(symbols_pickle_path, 'rb') as f:
    symbols = pickle.load(f)

# Row positions at 80% and 90% of the EURUSD history (symbols[1]['EURUSD']).
eurusd_row_count = len(symbols[1]['EURUSD'])
split = int(eurusd_row_count * 0.80)
validation_split = int(eurusd_row_count * 0.90)

In [None]:
# Anchor the evaluation windows on the most recent EURUSD timestamp:
# `two_weeks` is 14 days before it and `one_week` is 7 days before it.
symbols[1]['EURUSD'].index = pd.to_datetime(symbols[1]['EURUSD'].index)
max_date = symbols[1]['EURUSD'].index.max()
one_week, two_weeks = (max_date - pd.DateOffset(days=n_days) for n_days in (7, 14))

In [None]:
# Chronological train / validation / test split of the EURUSD timestamps.
# Label-based `.loc[a:b]` slicing includes BOTH endpoints, so slicing with
# `:two_weeks`, `two_weeks:one_week` and `one_week:` puts a boundary timestamp
# (when it is present in the index) into two adjacent splits. Half-open boolean
# masks assign every timestamp to exactly one split.
eurusd_index = symbols[1]['EURUSD'].index
training_index_slice = eurusd_index[eurusd_index < two_weeks]
validation_index_slice = eurusd_index[(eurusd_index >= two_weeks) & (eurusd_index < one_week)]
testing_index_slice = eurusd_index[eurusd_index >= one_week]

In [None]:
# Notebook display of the validation timestamps.
validation_index_slice

### Create Env

In [None]:
# Simulator backing the training environment: 200,000 USD balance, 100x leverage.
sim_train = MtSimulator(
    symbols_filename=FOREX_DATA_PATH,
    unit='USD',
    balance=200000.,
    leverage=100.,
    stop_out_level=0.2,
    hedge=True,
)

# Training environment: EURUSD only, restricted to the training timestamps.
env_train = MtEnv(
    original_simulator=sim_train,
    trading_symbols=['EURUSD'],
    time_points=list(training_index_slice),
    window_size=10,
    hold_threshold=0.5,
    close_threshold=0.5,
    symbol_max_orders=2,
    multiprocessing_processes=2,
    # The fee is re-sampled from a normal distribution on every call and
    # clipped at zero; the lookup raises KeyError for any other symbol.
    fee=lambda symbol: {
        # 'GBPCAD': max(0., np.random.normal(0.0007, 0.00005)),
        'EURUSD': max(0., np.random.normal(0.0001, 0.00003))
        # 'USDJPY': max(0., np.random.normal(0.02, 0.003)),
    }[symbol],
)

In [None]:
# Simulator backing the validation environment (same account settings as training).
sim_validation = MtSimulator(
    symbols_filename=FOREX_DATA_PATH,
    unit='USD',
    balance=200000.,
    leverage=100.,
    stop_out_level=0.2,
    hedge=True,
)

# Validation environment: EURUSD only, restricted to the validation timestamps.
env_validation = MtEnv(
    original_simulator=sim_validation,
    trading_symbols=['EURUSD'],
    time_points=list(validation_index_slice),
    window_size=10,
    hold_threshold=0.5,
    close_threshold=0.5,
    symbol_max_orders=2,
    multiprocessing_processes=2,
    # The fee is re-sampled from a normal distribution on every call and
    # clipped at zero; the lookup raises KeyError for any other symbol.
    fee=lambda symbol: {
        # 'GBPCAD': max(0., np.random.normal(0.0007, 0.00005)),
        'EURUSD': max(0., np.random.normal(0.0001, 0.00003))
        # 'USDJPY': max(0., np.random.normal(0.02, 0.003)),
    }[symbol],
)

In [None]:
# Simulator backing the testing environment (same account settings as training).
sim_testing = MtSimulator(
    symbols_filename=FOREX_DATA_PATH,
    unit='USD',
    balance=200000.,
    leverage=100.,
    stop_out_level=0.2,
    hedge=True,
)

# Testing environment: EURUSD only, restricted to the testing timestamps.
env_testing = MtEnv(
    original_simulator=sim_testing,
    trading_symbols=['EURUSD'],
    time_points=list(testing_index_slice),
    window_size=10,
    hold_threshold=0.5,
    close_threshold=0.5,
    symbol_max_orders=2,
    multiprocessing_processes=2,
    # The fee is re-sampled from a normal distribution on every call and
    # clipped at zero; the lookup raises KeyError for any other symbol.
    fee=lambda symbol: {
        # 'GBPCAD': max(0., np.random.normal(0.0007, 0.00005)),
        'EURUSD': max(0., np.random.normal(0.0001, 0.00003))
        # 'USDJPY': max(0., np.random.normal(0.02, 0.003)),
    }[symbol],
)

### Define Functions

In [None]:
def print_stats(reward_over_episodes, printing_name):
    """Print and return the minimum, mean and maximum of a sequence of values.

    Args:
        reward_over_episodes: Values to summarise (e.g. one total per episode).
        printing_name: Label inserted into each printed line.

    Returns:
        tuple: ``(minimum, mean, maximum)`` as computed by NumPy.
    """
    mean_value = np.mean(reward_over_episodes)
    lowest, highest = np.min(reward_over_episodes), np.max(reward_over_episodes)

    print (f'Min. {printing_name}          : {lowest:>10.3f}')
    print (f'Avg. {printing_name}          : {mean_value:>10.3f}')
    print (f'Max. {printing_name}          : {highest:>10.3f}')

    return lowest, mean_value, highest


# ProgressBarCallback for model.learn()
class ProgressBarCallback(BaseCallback):
    """Show a tqdm progress bar while ``model.learn()`` runs.

    The bar is advanced by the number of environment timesteps the model has
    actually taken (``num_timesteps``) rather than by the number of callback
    invocations, so it stays correct with more than one parallel environment.
    Whatever has not been reported yet is flushed when training ends, so the
    bar does not stop short when the total is not a multiple of ``check_freq``.

    Args:
        check_freq: Refresh the bar every ``check_freq`` callback calls.
        verbose: Verbosity level forwarded to ``BaseCallback``.

    Raises:
        ValueError: If ``check_freq`` is smaller than 1.
    """

    def __init__(self, check_freq: int, verbose: int = 1):
        super().__init__(verbose)
        if check_freq < 1:
            raise ValueError(f'check_freq must be a positive integer, got {check_freq}')
        self.check_freq = check_freq
        self.progress_bar = None
        # Timesteps already shown on the bar.
        self._reported_timesteps = 0

    def _on_training_start(self) -> None:
        """
        This method is called before the first rollout starts.
        """
        # When learn() continues an earlier run, the model's total already
        # includes past timesteps; show only what this call still has to do.
        self._reported_timesteps = self.model.num_timesteps
        self.progress_bar = tqdm(
            total=self.model._total_timesteps - self._reported_timesteps,
            desc="model.learn()",
        )

    def _advance_bar(self) -> None:
        """Push the timesteps taken since the last refresh onto the bar."""
        current_timesteps = self.model.num_timesteps
        pending = current_timesteps - self._reported_timesteps
        if pending > 0:
            self.progress_bar.update(pending)
            self._reported_timesteps = current_timesteps

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:
            self._advance_bar()
        return True

    def _on_training_end(self) -> None:
        """
        This event is triggered before exiting the `learn()` method.
        """
        # Flush the remainder so the bar reflects every timestep taken.
        self._advance_bar()
        self.progress_bar.close()


In [None]:
# Hyperopt search space for the PPO hyperparameters.
# hp.loguniform(label, low, high) draws exp(uniform(low, high)): the bounds are
# NATURAL logs, not base-10 exponents. Passing -5 and -2 directly samples
# learning rates in roughly [0.0067, 0.135], not [1e-5, 1e-2], and many of those
# draws are rejected by the objective's upper limits. Converting the intended
# bounds with np.log gives the base-10 ranges.
space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-5), np.log(1e-2)), # Learning rate in [1e-5, 1e-2]
    'gamma': hp.uniform('gamma', 0.97, 0.99), # Discount factor
    'ent_coef': hp.loguniform('ent_coef', np.log(1e-5), np.log(1.0)) # Entropy coefficient in [1e-5, 1]
    # 'learning_timesteps': hp.choice('learning_timesteps', [25, 50, 100, 250, 500]),
}

In [None]:
# # create a graph that shows the distribution of values created by 10_000 iterations of 10 ** np.random.uniform(-5, -2)
# # and then plot the graph
# learning_rate_dist = [10 ** np.random.uniform(-5, 0) for _ in range(10_000)]
# sns.histplot(learning_rate_dist, kde=True)
# plt.xscale('log')
# plt.xlabel('Learning Rate')
# plt.title('Learning Rate Distribution')
# plt.show()



In [None]:
def env_walkthrough(env, model, seed, testing_training_env=False, run_count=10):
    """Roll out ``model`` in ``env`` for up to ``run_count`` episodes.

    Two environment APIs are supported, selected by ``testing_training_env``:

    * ``False`` (default): ``env.reset(seed=seed)`` returns ``(obs, info)`` and
      ``env.step`` returns ``(obs, reward, terminated, truncated, info)``.
      After each episode the number of orders is read from
      ``env.render()['orders']`` (0 if that lookup fails).
    * ``True``: ``env.reset()`` returns only the observation and ``env.step``
      returns ``(obs, reward, done, info)`` (the form used for the environment
      obtained from ``model.get_env()``). No orders are recorded, and the loop
      stops as soon as the running average reward becomes positive.

    Args:
        env: Environment to step through.
        model: Object exposing ``predict(obs) -> (action, state)``.
        seed: Seed passed to ``env.reset`` in the default mode.
        testing_training_env (bool): Selects the API described above.
        run_count (int): Maximum number of episodes to run.

    Returns:
        tuple: ``(reward_over_validations, orders_over_validations)``: one
        total reward per completed episode, and one order count per episode
        (always empty when ``testing_training_env`` is true).
    """
    reward_over_validations = []
    orders_over_validations = []

    for episode in range(run_count):
        if testing_training_env:
            obs_val = env.reset()
        else:
            obs_val, _ = env.reset(seed=seed)

        total_reward = 0
        done_val = False
        while not done_val:
            action, _ = model.predict(obs_val)
            if testing_training_env:
                obs_val, reward_val, done_val, _ = env.step(action)
            else:
                obs_val, reward_val, terminated_val, truncated_val, _ = env.step(action)
                done_val = terminated_val or truncated_val
            total_reward += reward_val

        reward_over_validations.append(total_reward)
        avg_reward = np.mean(reward_over_validations)

        if testing_training_env:
            print(f'Episode: {episode}, Avg. Reward: {avg_reward:.3f}')
            if avg_reward > 0:
                print('model successfully trained!')
                break
            continue

        # Best effort: an unusable render result counts as "no orders".
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop a long evaluation run.
        try:
            order_len = len(env.render()['orders'])
        except Exception:
            order_len = 0
        orders_over_validations.append(order_len)

        avg_orders = np.mean(orders_over_validations)
        print(f'Episode: {episode}, Avg. Reward: {avg_reward:.3f}, # of orders: {avg_orders:.3f}')

    return reward_over_validations, orders_over_validations

In [None]:
# TRAINING + TEST
def train_val_model(model, model_policy, env_tr, env_val, seed, steps_str, window_size_param, lr, gamma_param, entropy, training_attempts=5, training_run_count=10, validating_run_count=10, total_learning_timesteps=10_000):
    """
    Trains a PPO model on ``env_tr`` and evaluates it on ``env_val``.

    A fresh ``PPO`` model is trained for ``total_learning_timesteps`` per
    attempt. After each attempt it is rolled out on its own (training)
    environment; training counts as successful once the mean reward of that
    rollout is positive. Only a successfully trained model is validated.

    Args:
        model (object): Ignored; a new ``PPO`` instance is always created.
        model_policy (object): The policy passed to ``PPO``.
        env_tr (object): The training environment.
        env_val (object): The validation environment.
        seed (int): Seed for torch, ``random``, NumPy and the environment resets.
        steps_str (str): A string representing the number of steps (unused).
        window_size_param (int): The window size parameter (unused).
        lr (float): The learning rate.
        gamma_param (float): The discount factor. NOTE: currently NOT passed
            to ``PPO`` (the keyword is commented out), so PPO's default applies.
        entropy (float): The entropy coefficient.
        training_attempts (int, optional): Maximum number of ``learn()`` calls
            on the same model. Defaults to 5.
        training_run_count (int, optional): Episodes used for the post-training
            check. Defaults to 10.
        validating_run_count (int, optional): Episodes run on ``env_val``.
            Defaults to 10.
        total_learning_timesteps (int, optional): Timesteps per training
            attempt. Defaults to 10,000.

    Returns:
        tuple: ``(reward_over_validations, orders_over_validations)`` as
        returned by ``env_walkthrough`` on the validation environment.

    Raises:
        ValueError: If no attempt produced a positive mean training reward.
    """
    # reproduce training and test
    print('-' * 80)
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    print(f'entropy: {entropy}, learning rate: {lr}')
    # eval_callback = EvalCallback(env_tr, log_path='./logs/', eval_freq=1000)
    model = PPO(model_policy, env_tr, verbose=0, ent_coef=entropy, learning_rate=lr)#, gamma=gamma_param, 
    env_tr.reset(seed=seed)

    for i in range(training_attempts):
        print(f'training model attempt {i}')
        model.learn(total_timesteps=total_learning_timesteps)

        # checking if the model learned
        vec_env = model.get_env()
        vec_env.reset()
        rewards, _ = env_walkthrough(vec_env, model, seed, testing_training_env=True, run_count=training_run_count)

        if np.mean(rewards) > 0:
            break
    else:
        # The loop finished without a successful attempt. Raise the error:
        # returning the exception object would hand callers a value that
        # cannot be unpacked into (rewards, orders).
        print('Model failed to learn with those parameters')
        raise ValueError('Model failed to learn with those parameters')

    return env_walkthrough(env_val, model, seed, run_count=validating_run_count)

### Train + Test Env

In [None]:
seed = 2024  # random seed
total_num_episodes = 10

print ("seed                     :", seed)

# Plot containers; objective() stores rewards and labels in them per run.
plot_settings = dict()
plot_data = {'x': list(range(1, total_num_episodes + 1))}

# Learning timesteps (in thousands) tried so far: 25, 50, 100, 250, 500, 1000, 3000, 5000

# RL Algorithms: https://stable-baselines3.readthedocs.io/en/master/guide/algos.html

timesteps_models_dict = dict()
def objective(params):
    """Hyperopt objective: train PPO with ``params`` and score it on validation.

    Reads the module-level ``env_train``, ``env_validation`` and ``seed``, and
    stores the validation rewards and a plot label in ``plot_data`` and
    ``plot_settings``.

    Args:
        params (dict): Must provide ``'ent_coef'``, ``'gamma'`` and
            ``'learning_rate'``. The window size (10) and the number of
            learning timesteps (50K) are fixed inside this function.

    Returns:
        dict: A hyperopt result. On success ``'loss'`` is the negated average
        validation reward and ``'parameters'`` is a copy of ``params`` with
        ``'avg_orders'`` added. If a parameter is above its limit or training
        fails, ``'status'`` is ``STATUS_FAIL`` and ``'loss'`` is ``None``.
    """
    window_size = 10#params['window_size']
    learning_timesteps = 50 #params['learning_timesteps']
    ent_coef = params['ent_coef']
    gamma = params['gamma'] #0.99 #
    learning_rate = params['learning_rate']#0.0003#

    def failed_trial():
        return {'loss': None, 'status': STATUS_FAIL, 'eval_time': time.time(), 'parameters': params}

    if learning_rate > 0.05:
        print(f'Learning rate too high: {learning_rate}')
        return failed_trial()
    if ent_coef > 0.1:
        print(f'Entropy too high: {ent_coef}')
        return failed_trial()

    total_learning_timesteps = learning_timesteps * 1000
    step_key = f'{learning_timesteps}K'
    policy = PPO.policy_aliases.get('MultiInputPolicy')
    # Name of the algorithm class itself; type(PPO) would be its metaclass.
    class_name = PPO.__qualname__
    plot_key = f'{class_name}_rewards_'+step_key
    try:
        print(f'length of training env time points: {len(env_train.time_points)}, \
              length of validation env time points: {len(env_validation.time_points)}')
        rewards, orders = train_val_model(PPO, policy, env_train, env_validation, seed, step_key, window_size, 
                                                    learning_rate, gamma, ent_coef, total_learning_timesteps=total_learning_timesteps, 
                                                    training_attempts=2, training_run_count=1, validating_run_count=10)
    except Exception as exc:
        # A failing parameter set must not abort the whole search, but
        # KeyboardInterrupt / SystemExit are allowed to stop it.
        print(f'''there was an error with those parameters: Window: {window_size}, timesteps: {learning_timesteps}, \n
              ent_coef: {ent_coef}, gamma: {gamma}, learning_rate: {learning_rate}''')
        print(f'error: {exc!r}')
        return failed_trial()
    # timesteps_models_dict[step_key] = models_dict
    min_rewards, avg_rewards, max_rewards, = print_stats(rewards, 'Reward')
    print_stats(orders, 'Orders')
    label = f'Avg. {avg_rewards:>7.2f} : {class_name} - {step_key}'
    plot_data[plot_key] = rewards
    plot_settings[plot_key] = {'label': label}
    # Report the order count on a copy so the caller's dict is left unchanged.
    reported_params = {**params, 'avg_orders': np.mean(orders)}

    return {'loss': -avg_rewards, 'status': STATUS_OK, 'eval_time': time.time(), 'parameters': reported_params}

In [None]:
# train_val_model(PPO, 'MultiInputPolicy', env_train, env_validation, seed, '50K', 10, 
#                                                     0.0003, 0.9, 0, total_learning_timesteps=50, 
#                                                     training_attempts=2, training_run_count=1, validating_run_count=10)

In [None]:
# Smoke test: call the objective once with a fixed parameter set.
# objective() reads only 'ent_coef', 'gamma' and 'learning_rate' from it.
parameters = dict(
    window_size=10,
    learning_timesteps=50,
    ent_coef=0.008841807731982131,
    gamma=0.9484679718228304,
    learning_rate=0.021173768344759137,
)

objective(parameters)

In [None]:
# PPO('MultiInputPolicy', env_train, verbose=0, ent_coef=parameters['ent_coef']).learn(total_timesteps=25_000) #, learning_rate=parameters['learning_rate'], gamma=parameters['gamma'], ent_coef=parameters['ent_coef']

# Hyperparameter search

In [None]:
import os

# Walk-forward search: each iteration validates on a 7-day window ending `i`
# days before the last timestamp and trains on everything before that window.

# fmin is told to save the trials under hyperopt/; create the folder up front
# so a missing directory cannot cost a finished evaluation.
os.makedirs('hyperopt', exist_ok=True)

eurusd_index = symbols[1]['EURUSD'].index
for i in range(0, 35, 7):
    validation_start = max_date - pd.DateOffset(days=i+7)
    validation_end = max_date - pd.DateOffset(days=i)
    # `.loc[:start]` and `.loc[start:end]` both include `start`, which puts
    # that bar in training AND validation. Training now stops strictly before
    # the validation window begins.
    training_index_slice = eurusd_index[eurusd_index < validation_start]
    validation_index_slice = eurusd_index[(eurusd_index >= validation_start) & (eurusd_index <= validation_end)]
    env_train.time_points = list(training_index_slice)
    env_validation.time_points = list(validation_index_slice)
    print(f'length of training env time points: {len(env_train.time_points)}, \
          length of validation env time points: {len(env_validation.time_points)}')
    trials = Trials()
    best = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=50, # Number of evaluations of the objective function
                trials=trials,
                trials_save_file=f'hyperopt/trials_4_22_iter_{i}.pkl')

    print("Best parameters:", best)

In [None]:
# Notebook display of the results held by `trials` (the last Trials object assigned).
trials.results

# Understanding hyperparameter results

In [None]:
# load in the saved trials data
# `with` closes each file handle; pickle.load(open(...)) left them open.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('hyperopt/trials_04_19.pkl', 'rb') as trials_file:
    trials = pickle.load(trials_file)
with open('hyperopt/trials.pkl', 'rb') as trials_file:
    trials_4_18 = pickle.load(trials_file)
trials_4_19_results = trials.results
trials_4_18_results = trials_4_18.results
# After this call trials_4_19_results holds the results of BOTH runs.
trials_4_19_results.extend(trials_4_18_results)
len(trials_4_19_results)

In [None]:
# Combine both runs into one list. trials_4_19_results is extended in place
# with the 4_18 results when the trials are loaded, so plain concatenation
# would list every 4_18 result twice. Keep each result object once, in
# first-seen order.
trials_all_results = []
_seen_result_ids = set()
for _result in trials_4_18_results + trials_4_19_results:
    if id(_result) not in _seen_result_ids:
        _seen_result_ids.add(id(_result))
        trials_all_results.append(_result)
len(trials_all_results)

In [None]:
# Notebook display of the first combined result.
trials_all_results[0]

In [None]:
# Flatten every trial result into one row: the top-level fields plus
# window_size / learning_rate / ent_coef lifted out of result['parameters'].
# The result dicts are read, not modified, so this cell can be re-run and copes
# with the same result object appearing more than once. The frame is built in
# one step instead of a pd.concat per row.
_result_rows = []
for result in trials_all_results:
    _trial_params = result['parameters']
    _row = {key: value for key, value in result.items() if key != 'parameters'}
    _row['window_size'] = _trial_params['window_size']
    _row['learning_rate'] = _trial_params['learning_rate']
    _row['ent_coef'] = _trial_params['ent_coef']
    _result_rows.append(_row)
results_df = pd.DataFrame(_result_rows)
results_df


In [None]:
# Keep only the trials that ran with a window size of 10.
results_df = results_df.loc[results_df['window_size'].eq(10)]

In [None]:
# visualize the parameters that cause failures in the objective function

# Learning rate on the x-axis, entropy coefficient on the y-axis; green points
# are trials with status 'ok', red points are all other trials.
# Each group is drawn with its own labelled scatter call: legend_elements()
# needs colour-mapped values and produces no handles when explicit colour
# names are passed, which left the legend empty.
fig, ax = plt.subplots()
ok_mask = results_df['status'] == 'ok'
for group_mask, group_color, group_label in ((ok_mask, 'green', 'OK'), (~ok_mask, 'red', 'Fail')):
    scatter = ax.scatter(results_df.loc[group_mask, 'learning_rate'],
                         results_df.loc[group_mask, 'ent_coef'],
                         c=group_color, label=group_label)
ax.set_xlabel('Learning Rate')
ax.set_ylabel('Entropy Coefficient')
ax.set_title('Hyperparameter Optimization')
# y lim to 0.2
# plt.ylim(0, 0.2)
# x lim to 0.05
# plt.xlim(0, 0.05)
ax.legend()
plt.show()


In [None]:
# visualize the parameters that cause failures in the objective function
# (zoomed in: learning rate up to 0.05, entropy coefficient up to 0.2)

# Learning rate on the x-axis, entropy coefficient on the y-axis; green points
# are trials with status 'ok', red points are all other trials.
# Each group is drawn with its own labelled scatter call: legend_elements()
# needs colour-mapped values and produces no handles when explicit colour
# names are passed, which left the legend empty.
fig, ax = plt.subplots()
ok_mask = results_df['status'] == 'ok'
for group_mask, group_color, group_label in ((ok_mask, 'green', 'OK'), (~ok_mask, 'red', 'Fail')):
    scatter = ax.scatter(results_df.loc[group_mask, 'learning_rate'],
                         results_df.loc[group_mask, 'ent_coef'],
                         c=group_color, label=group_label)
ax.set_xlabel('Learning Rate')
ax.set_ylabel('Entropy Coefficient')
ax.set_title('Hyperparameter Optimization')
# y lim to 0.2
plt.ylim(0, 0.2)
# x lim to 0.05
plt.xlim(0, 0.05)
ax.legend()
plt.show()


In [None]:
# this showed that window sizes above 10 failed

# # what is the count of the different window sizes grouped by status
# results_df.groupby(['window_size', 'status']).size()
# # Define a dictionary that maps window sizes to marker shapes
# marker_dict = {10: '^', 20: 'o', 50: 's'}

# # Create a new column in the DataFrame that maps window sizes to marker shapes
# results_df['marker'] = results_df['window_size'].map(marker_dict)

# fig, ax = plt.subplots()

# # Loop over each group of points with the same marker shape
# for marker in results_df['marker'].unique():
#     subset = results_df[results_df['marker'] == marker]
#     scatter = ax.scatter(subset['learning_rate'], subset['ent_coef'], 
#                          c=subset['status'].apply(lambda x: 'green' if x == 'ok' else 'red'), 
#                          marker=marker)

# ax.set_xlabel('Learning Rate')
# ax.set_ylabel('Entropy Coefficient')
# ax.set_title('Hyperparameter Optimization')
# # y lim to 0.2
# plt.ylim(0, 0.2)
# # x lim to 0.05
# plt.xlim(0, 0.05)
# # increase the figure size
# fig.set_size_inches(20, 20)
# # plt.legend(handles=scatter.legend_elements()[0], labels=['OK', 'Fail'])
# plt.show()

In [None]:
# Trials that completed with status 'ok' ...
results_df_success = results_df.loc[results_df['status'].eq('ok')]
# ... and, of those, the ones with a negative loss.
results_df_success_negative = results_df_success.loc[results_df_success['loss'].lt(0)]
# Show the 25 rows with the lowest loss.
results_df_success_negative.sort_values(by='loss').head(25)

In [None]:
# 3D scatter of the negative-loss trials: learning rate (x), loss (y) and
# entropy coefficient (z); points are coloured by entropy coefficient.
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')

x, y, z = (
    results_df_success_negative[column]
    for column in ('learning_rate', 'loss', 'ent_coef')
)

ax.scatter(x, y, z, c=z, cmap='viridis')
ax.set(xlabel='Learning Rate', ylabel='Loss', zlabel='Entropy Coefficient')

# Enlarge the figure before showing it.
fig.set_size_inches(20, 20)

plt.show()


In [None]:
# 3D scatter like the previous one, limited to negative-loss trials whose
# entropy coefficient is at most 0.2; coloured with the coolwarm colormap.
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')

low_entropy_rows = results_df_success_negative['ent_coef'] <= 0.2
results_df_success_negative_low_entropy = results_df_success_negative[low_entropy_rows]

x, y, z = (
    results_df_success_negative_low_entropy[column]
    for column in ('learning_rate', 'loss', 'ent_coef')
)

ax.scatter(x, y, z, c=z, cmap=cm.coolwarm)
ax.set(xlabel='Learning Rate', ylabel='Loss', zlabel='Entropy Coefficient')

# Enlarge the figure before showing it.
fig.set_size_inches(20, 20)

plt.show()


In [None]:
# Successful trials only: learning rate on the x-axis, entropy coefficient on
# the y-axis. Green marks a negative loss, red marks everything else.
fig, ax = plt.subplots()
loss_sign_colors = np.where(results_df_success['loss'] < 0, 'green', 'red')
scatter = ax.scatter(
    results_df_success['learning_rate'],
    results_df_success['ent_coef'],
    c=loss_sign_colors,
)
ax.set(xlabel='Learning Rate', ylabel='Entropy Coefficient', title='Hyperparameter Optimization')
# plt.legend(handles=scatter.legend_elements()[0], labels=['OK', 'Fail'])
ax.set_ylim(0, 0.1)
# Enlarge the figure before showing it.
fig.set_size_inches(20, 20)
plt.show()


In [None]:

# Scatter of the successful trials: learning rate on the x-axis, entropy
# coefficient on the y-axis, point colour encodes the loss (viridis colormap).
fig, ax = plt.subplots()
scatter = ax.scatter(
    results_df_success['learning_rate'],
    results_df_success['ent_coef'],
    c=results_df_success['loss'],
    cmap='viridis',
)
ax.set(xlabel='Learning Rate', ylabel='Entropy Coefficient', title='Hyperparameter Optimization')

plt.colorbar(scatter)
ax.set_ylim(0, 0.1)
# Enlarge the figure before showing it.
fig.set_size_inches(25, 15)
plt.show()


In [None]:
# Successful trials with an entropy coefficient of at most 0.2.
results_df_success_low_entropy = results_df_success.loc[results_df_success['ent_coef'].le(0.2)]
# Learning rate on the x-axis, loss on the y-axis; point colour encodes the
# entropy coefficient (coolwarm colormap).
fig, ax = plt.subplots()
scatter = ax.scatter(
    results_df_success_low_entropy['learning_rate'],
    results_df_success_low_entropy['loss'],
    c=results_df_success_low_entropy['ent_coef'],
    cmap=cm.coolwarm,
)
ax.set(xlabel='Learning Rate', ylabel='Loss', title='Hyperparameter Optimization')

plt.colorbar(scatter)
# plt.ylim(-50_000, 50_000)
# Enlarge the figure before showing it.
fig.set_size_inches(25, 15)
plt.show()


In [None]:
# Successful trials with an entropy coefficient of at most 0.2.
results_df_success_low_entropy = results_df_success.loc[results_df_success['ent_coef'].le(0.2)]
# Entropy coefficient on the x-axis, loss on the y-axis; point colour encodes
# the learning rate (coolwarm colormap).
fig, ax = plt.subplots()
scatter = ax.scatter(
    results_df_success_low_entropy['ent_coef'],
    results_df_success_low_entropy['loss'],
    c=results_df_success_low_entropy['learning_rate'],
    cmap=cm.coolwarm,
)
ax.set(xlabel='Entropy Coefficient', ylabel='Loss', title='Hyperparameter Optimization')

plt.colorbar(scatter)
# plt.ylim(-50_000, 50_000)
# Enlarge the figure before showing it.
fig.set_size_inches(25, 15)
plt.show()


### Plot Results

In [None]:
# # create a dataframe of the rewards
# rewards_df = pd.DataFrame({'rewards': rewards})
# # plot the rewards
# plt.figure(figsize=(10, 5))
# sns.lineplot(data=rewards_df)
# plt.title('Rewards')
# plt.xlabel('Episode')
# plt.ylabel('Reward')
# plt.legend()
# plt.show()


In [None]:
# # import the models from /models folder
# import os
# import glob
# # get the list of models
# model_list = glob.glob('models_4_17_24/*.pkl')
# # separate the strings of each model name on _ and get the last element of the string if the string of the model doesn't include 'initial' or 'updated'
# model_list_episode_nbr = [model.split('_')[-1] for model in model_list if 'initial' not in model and 'updated' not in model]
# model_list_episode_nbr = [int(model_name.split('.')[0]) for model_name in model_list_episode_nbr]
# max_episode = max(model_list_episode_nbr)
# # test the last set of 10 episodes
# init_episode = ((int(max_episode)/10) - 10)*10
# # print(max_episode, init_episode)
# models = []
# # test the last set of 10 episodes from init_episode to max_episode
# for nbr in range(int(init_episode), int(max_episode)+10, 10):
#     # set up the appropriate time_points for each of the models in the list
#     env_train.time_points = list(symbols[1]['EURUSD'].iloc[-int(training_length):-(int(testing_length)-int(nbr)), :].index)# make this -nbr not +nbr next time
#     obs_train, info_train = env_train.reset(seed=2024)
#     # find the model name that contains the nbr
#     model_name = [model for model in model_list if str(nbr) in model][0]
#     print(model_name)
#     # load the models into a list
#     models.append(PPO.load(model_name, env=env_train))

In [None]:
# sim_testing = gym_mtsim.MtSimulator(
#     unit='USD',
#     balance=200000.,
#     leverage=100.,
#     stop_out_level=0.2,
#     hedge=True,
#     symbols_filename=FOREX_DATA_PATH
# )

# env_testing = MtEnv(
#     original_simulator=sim_testing,
#     trading_symbols=['EURUSD'],
#     window_size = window_size_param,
#     time_points=list(testing_index_slice),
#     hold_threshold=0.1,
#     close_threshold=0.1,
#     fee=lambda symbol: {
#         # 'GBPCAD': max(0., np.random.normal(0.0007, 0.00005)),
#         'EURUSD': max(0., np.random.normal(0.0001, 0.00003))
#         # 'USDJPY': max(0., np.random.normal(0.02, 0.003)),
#     }[symbol],
#     symbol_max_orders=2,
#     multiprocessing_processes=2
# )

In [None]:
# model_ppo = PPO.load(f'models_4_17_24\model_25K_5.pkl', env=env_train)

# obs_test, info_test = env_testing.reset(seed=2024)
# done_test = False
# while not done_test:
#     action, _states = model_ppo.predict(obs_test)
#     obs_test, reward_test, terminated_test, truncated_test, info_test = env_testing.step(action)
#     done_test = terminated_test or truncated_test
#     # total_reward += reward_test
#     if done_test:
#         break
# try:
#     order_len = len(env_testing.render()['orders'])
# except:
#     order_len = 0
# # print(f"Episode: {episode}, Reward: {total_reward:.3f}, # orders: {order_len}")


In [None]:
# # if model_dict is still a thing
# for timestep in timesteps_models_dict.keys():
#     models_dict = timesteps_models_dict[timestep]

#     for nbr in range(0, 10):
#         msg = f"{'-'*8} Testing Model {nbr} with {timestep} training timesteps {'-'*8}"
#         print(f"""{msg}\n{'-'*len(msg)}""")
#         reward_across_episodes = []
#         rewards_dict = {}
#         model_results_dict = {}
#         for episode in range(0, 10):   
#             total_reward = 0
#             done_test = False
#             model_ppo = models_dict[f'model_{nbr}']

#             obs_test, info_test = env_train.reset(seed=2024)
#             while not done_test:
#                 action, _states = model_ppo.predict(obs_test)
#                 obs_test, reward_test, terminated_test, truncated_test, info_test = env_train.step(action)
#                 done_test = terminated_test or truncated_test
#                 total_reward += reward_test
#                 if done_test:
#                     break
#             reward_across_episodes.append(total_reward)
#             try:
#                 order_len = len(env_train.render()['orders'])
#             except:
#                 order_len = 0
#             print(f"Episode: {episode}, Reward: {total_reward:.3f}, # orders: {order_len}")
#         print_stats(reward_across_episodes)
#         model_results_dict[f'model_{nbr}_{timestep}'] = reward_across_episodes
# model_results_df = pd.DataFrame(model_results_dict)

In [None]:
# # plot the rewards for each model over episodes
# plt.figure(figsize=(10, 5))
# sns.lineplot(data=model_results_df)
# plt.title('Rewards')
# plt.xlabel('Episode')
# plt.ylabel('Reward')
# plt.legend()
# plt.show()

In [None]:
# # if model_dict is still a thing
# for timestep in ['25K']:
#     for nbr in tqdm(range(1, 10)):
#         msg = f"{'-'*8} Testing Model {nbr} with {timestep} training timesteps {'-'*8}"
#         print(f"""{msg}\n{'-'*len(msg)}""")
#         reward_across_episodes = []
#         number_of_orders_across_episodes = []
#         rewards_dict = {}
#         model_results_dict = {}
#         for episode in tqdm(range(0, 10)):   
#             total_reward = 0
#             done_test = False
#             model_ppo = PPO.load(f'models_4_17_24\model_{timestep}_{nbr}.pkl', env=env_train)

#             obs_test, info_test = env_testing.reset(seed=2024)
#             while not done_test:
#                 action, _states = model_ppo.predict(obs_test)
#                 obs_test, reward_test, terminated_test, truncated_test, info_test = env_testing.step(action)
#                 done_test = terminated_test or truncated_test
#                 total_reward += reward_test
#                 if done_test:
#                     break
#             reward_across_episodes.append(total_reward)
#             try:
#                 order_len = len(env_testing.render()['orders'])
#             except:
#                 order_len = 0
#             number_of_orders_across_episodes.append(order_len)
#             print(f"Episode: {episode}, Reward: {total_reward:.3f}, # orders: {order_len}")
#         print_stats(reward_across_episodes, 'Reward')
#         print_stats(number_of_orders_across_episodes, 'Orders')
#         model_results_dict[f'model_{nbr}_{timestep}'] = reward_across_episodes
# model_results_df = pd.DataFrame(model_results_dict)

In [None]:
# # plot the rewards for each model over episodes
# plt.figure(figsize=(10, 5))
# sns.lineplot(data=model_results_df)
# plt.title('Rewards')
# plt.xlabel('Episode')
# plt.ylabel('Reward')
# plt.legend()
# plt.show()

In [None]:
# # if the cluster has been restarted
# for nbr, time_points_idx in zip(range(0, 10), range(0, 500, 50)):
#     # model_ppo.learn(total_timesteps=25000, callback=ProgressBarCallback(100))
#     env_train = MtEnv(
#         original_simulator=sim_train,
#         trading_symbols=['EURUSD'],
#         window_size = window_size_param,
#         time_points=list(symbols[1]['EURUSD'].iloc[-int(training_length):-(int(testing_length)-int(time_points_idx)), :].index),
#         hold_threshold=0.5,
#         close_threshold=0.5,
#         fee=lambda symbol: {
#             # 'GBPCAD': max(0., np.random.normal(0.0007, 0.00005)),
#             'EURUSD': max(0., np.random.normal(0.0001, 0.00003))
#             # 'USDJPY': max(0., np.random.normal(0.02, 0.003)),
#         }[symbol],
#         symbol_max_orders=2,
#         multiprocessing_processes=2
#     )
#     # obs_train, info_train = env_train.reset(seed=2024)
#     total_reward = 0
#     done_test = False
#     model_ppo = PPO.load(f'models\model_{nbr}.pkl', env=env_train)

#     env_testing = MtEnv(
#         original_simulator=sim_testing,
#         trading_symbols=['EURUSD'],
#         window_size = window_size_param,
#         # time_points=list(testing_index_slice),
#         hold_threshold=0.5,
#         close_threshold=0.5,
#         fee=lambda symbol: {
#             # 'GBPCAD': max(0., np.random.normal(0.0007, 0.00005)),
#             'EURUSD': max(0., np.random.normal(0.0001, 0.00003))
#             # 'USDJPY': max(0., np.random.normal(0.02, 0.003)),
#         }[symbol],
#         symbol_max_orders=2,
#         multiprocessing_processes=2
#     )
#     obs_test, info_test = env_testing.reset(seed=2024)
#     while not done_test:
#         action, _states = model_ppo.predict(obs_test)
#         obs_test, reward_test, terminated_test, truncated_test, info_test = env_testing.step(action)
#         done_test = terminated_test or truncated_test
#         total_reward += reward_test
#         if done_test:
#             break
#     state = env_testing.render()

#     print(
#         f"balance: {state['balance']}, equity: {state['equity']}, margin: {state['margin']}\n"
#         f"free_margin: {state['free_margin']}, margin_level: {state['margin_level']}\n"

#     )
#     # print(state['orders'].Profit.sum())
#     if len(state['orders']) > 0:
#         print(state['orders'].Profit.sum())

In [None]:
# max_episode = 120
# for model_nbr in range(0, int(max_episode)+10, 10):
#     print(f'Model: {model_nbr}')
#     over_episodes_rewards = []
#     over_episodes_balance = []
#     obs_training, info_training = env_train.reset(seed=2024)
#     for episode in range(0, 10):
#         obs_test, info_test = env_testing.reset(seed=2024)
        
#         # model_ppo.learn(total_timesteps=25000, callback=ProgressBarCallback(100))

#         total_reward = 0
#         done_test = False
#         env_train.time_points = list(symbols[1]['EURUSD'].iloc[-int(training_length):-(int(testing_length)-int(model_nbr)), :].index)
#         obs_training, info_training = env_train.reset(seed=2024)
#         model_ppo = PPO.load(f'models\model_{model_nbr}.pkl', env=env_train)

#         while not done_test:
#             action, _states = model_ppo.predict(obs_test)
#             obs_test, reward_test, terminated_test, truncated_test, info_test = env_testing.step(action)
#             done_test = terminated_test or truncated_test

#             total_reward += reward_test
#             if done_test:
#                 break
#         over_episodes_balance.append(info_test['balance'])
#         over_episodes_rewards.append(total_reward)
#         print(f'Episode: {episode}, Reward: {total_reward:.3f}, Balance: {info_test["balance"]:.3f}')
#     print_stats(over_episodes_rewards)
#     print_stats(over_episodes_balance)

In [None]:
# data = pd.DataFrame(plot_data)

# sns.set_style('whitegrid')
# plt.figure(figsize=(8, 6))

# for key in plot_data:
#     if key == 'x':
#         continue
#     label = plot_settings[key]['label']
#     line = plt.plot('x', key, data=data, linewidth=1, label=label)

# plt.xlabel('episode')
# plt.ylabel('reward')
# plt.title('Random vs. SB3 Agents')
# plt.legend()
# plt.show()