In [14]:
#@title Install required packages if needed:
!pip install gym stable-baselines3 sb3-contrib --quiet
!pip install 'shimmy>=2.0' --quiet

from sb3_contrib.ppo_recurrent import RecurrentPPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
from typing_extensions import final
import numpy as np
import pandas as pd
import gym
from gym import spaces
import matplotlib.pyplot as plt
import random
import datetime
import os

  and should_run_async(code)


In [61]:
# --------------------------
#@title 1. Data Loading & Preprocessing
# --------------------------

def resample_to_higher_tf(df, rule, col='close'):
    """
    Resample the 1-minute DataFrame to a higher timeframe by taking the last available close price.
    Then reindex the resulting series to the original timestamps (forward-filled).
    """
    df_copy = df.copy()
    df_copy['timestamp'] = pd.to_datetime(df_copy['timestamp'])  # Ensure timestamp is datetime
    df_copy.set_index('timestamp', inplace=True)  # Set timestamp as index

    # Resample and take the last value in each period.
    resampled = df_copy[col].resample(rule).last()

    # Reindex back to the original 1-minute timestamps using forward fill.
    resampled = resampled.reindex(df_copy.index, method='ffill')

    return resampled.reset_index(drop=True)  # Reset index to match the main DataFrame


# Load the 1m data
data_row = pd.read_csv("btc_1m_2012-2025-02.01-22.00.csv")
data=data_row
df_1m = data.rename(columns=str.lower)

# Derive higher timeframe closing prices:
df_1m['close_1m'] = df_1m['close']  # original 1m close
df_1m['close_1h'] = resample_to_higher_tf(df_1m, "60min", col='close')
df_1m['close_4h'] = resample_to_higher_tf(df_1m, "240min", col='close')

# For our environment, we need a DataFrame sorted by time.
df_1m.sort_values('timestamp', inplace=True)
df_1m.reset_index(drop=True, inplace=True)


In [63]:
df_1m = df_1m.iloc[:-1]

In [None]:
df_1m

In [66]:

# --------------------------
#@title 2. Custom Trading Environment
# --------------------------
class TradingEnv(gym.Env):
    """
    A custom trading environment for BTC/USD.

    Observation:
      A 3-dimensional vector:
        [close_1m, close_1h, close_4h]

    Actions:
      Discrete actions: 0: Hold, 1: Buy, 2: Sell.

    Reward:
      When the agent sells, the reward is the profit (or loss) realized.
      (For simplicity, we assume immediate execution and no transaction costs.)
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, df):
        super(TradingEnv, self).__init__()
        self.df = df.copy()
        # Ensure there is enough data to compute higher timeframes.
        self.start_idx = 240  # 240 minutes for the 4h timeframe
        self.current_step = self.start_idx

        # Observation space: three continuous features.
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(3,), dtype=np.float32)
        # Action space: 0: Hold, 1: Buy, 2: Sell.
        self.action_space = spaces.Discrete(3)

        # Trading parameters
        self.initial_balance = 10000.0
        self.balance = self.initial_balance
        self.position = 0.0  # amount of BTC held
        self.entry_price = 0.0  # price when entering a position
        self.trades = []  # Store actions for analysis

    def reset(self):
        self.current_step = self.start_idx
        self.balance = self.initial_balance
        self.position = 0.0
        self.entry_price = 0.0
        self.trades= []
        return self._get_observation()

    def _get_observation(self):
        row = self.df.iloc[self.current_step]
        obs = np.array([row['close_1m'], row['close_1h'], row['close_4h']])
        return obs.astype(np.float32)

    def step(self, action):
        done = False
        reward = 0.0
        current_price = self.df.iloc[self.current_step]['close_1m']

        # Execute the chosen action:
        if action == 1:  # Buy
            if self.position == 0:  # only enter if no current position
                self.position = self.balance / current_price  # invest full balance
                self.entry_price = current_price
        elif action == 2:  # Sell
            if self.position > 0:  # only sell if in position
                profit = (current_price - self.entry_price) * self.position
                reward = profit
                self.balance += profit
                self.trades.append({
                    'entry_price': self.entry_price,
                    'exit_price': current_price,
                    'p/l': profit,
                    'step': self.current_step
                })
                print(f"Step {self.current_step}: SELL at {current_price:.2f} | P/L: {profit:.2f} | Balance: {self.balance:.2f}")
                self.position = 0.0
                self.entry_price = 0.0

        # Move to next timestep
        self.current_step += 1
        if self.current_step >= len(self.df) - 1:
            done = True
            # Liquidate any remaining position at the end.
            if self.position > 0:
                final_price = self.df.iloc[self.current_step]['close_1m']
                profit = (final_price - self.entry_price) * self.position
                reward += profit
                self.balance += profit
                self.trades.append({
                    'entry_price': self.entry_price,
                    'exit_price': final_price,
                    'p/l': profit,
                    'step': self.current_step
                })
                print(f"Step {self.current_step}: SELL at {exit_price:.2f} | P/L: {profit:.2f} | Balance: {self.balance:.2f}")
                self.position = 0.0

        next_obs = self._get_observation()
        info = {
            'balance': self.balance,
            'trades': self.trades,
            'current_step': self.current_step
        }
        return next_obs, reward, done, info

    def render(self, mode='human'):
        print(f"Step: {self.current_step}, Balance: {self.balance:.2f}, Position: {self.position:.4f}, Price: {self.df.iloc[self.current_step]['close_1m']:.2f}")


In [67]:
# --------------------------
#@title 3. PPO with LSTM (RecurrentPPO) and Reward Logging
# --------------------------

class RewardLoggingCallback(BaseCallback):
    """
    Custom callback for logging rewards at the end of each episode.
    """
    def __init__(self, verbose=0):
        super(RewardLoggingCallback, self).__init__(verbose)
        self.episode_rewards = []

    def _on_step(self) -> bool:
        # Check if done flag is present in infos.
        for info in self.locals.get("infos", []):
            if "episode" in info.keys():
                self.episode_rewards.append(info["episode"]["r"])
        return True

# Wrap our custom environment in a DummyVecEnv:
env = DummyVecEnv([lambda: TradingEnv(df_1m)])

# Create the RecurrentPPO agent using an LSTM-based policy.
model = RecurrentPPO("MlpLstmPolicy", env, verbose=1)

# Create our reward logging callback.
reward_callback = RewardLoggingCallback()

# Train the agent for a specified number of timesteps.
total_timesteps = 100_00
#model.learn(total_timesteps=total_timesteps, callback=reward_callback)
#model.save("ppo_trading_agent_lstm_1m")

print("Training complete. Model saved as 'ppo_trading_agent_lstm_1m'.")


Using cpu device
Training complete. Model saved as 'ppo_trading_agent_lstm_1m'.


## Different models

### Model_2

Learning_rate = 0.01

**Results:**

iterations = 10112

f_balance =  10300      

In [None]:
model_2 = RecurrentPPO("MlpLstmPolicy", env, learning_rate=0.01, verbose=1)
model_2.learn(total_timesteps=total_timesteps, callback=reward_callback)

### model_3

Learning_rate = 0.001

**Results:**

iterations = 10112

f_balance = 12043.86       

In [None]:
model_3 = RecurrentPPO("MlpLstmPolicy", env, learning_rate=0.001, verbose=1)
model_3.learn(total_timesteps=total_timesteps, callback=reward_callback)

### model_3_1

Learning_rate = 0.001

**Results:**

iterations = 20112

f_balance = 12043.86       

In [58]:
total_timesteps=30112
model_3_1 = RecurrentPPO("MlpLstmPolicy", env, learning_rate=0.001, verbose=1)
model_3_1.learn(total_timesteps=total_timesteps, callback=reward_callback)

[1;30;43mGörüntülenen çıkış son 5000 satıra kısaltıldı.[0m
|    clip_range           | 0.2           |
|    entropy_loss         | -0.785        |
|    explained_variance   | 0             |
|    learning_rate        | 0.001         |
|    loss                 | 110           |
|    n_updates            | 630           |
|    policy_gradient_loss | -3.27e-05     |
|    value_loss           | 220           |
-------------------------------------------
Step 8448: SELL at 68611.00 | P/L: -21.69 | Balance: 9922.12
Step 8497: SELL at 68719.00 | P/L: 15.47 | Balance: 9937.59
Step 8505: SELL at 68800.00 | P/L: 7.08 | Balance: 9944.68
Step 8518: SELL at 68840.00 | P/L: 3.61 | Balance: 9948.29
Step 8535: SELL at 68811.00 | P/L: -2.02 | Balance: 9946.27
Step 8547: SELL at 68805.00 | P/L: -13.57 | Balance: 9932.70
Step 8554: SELL at 68777.00 | P/L: -4.04 | Balance: 9928.65
-----------------------------------------
| time/                   |             |
|    fps                  | 79         

<sb3_contrib.ppo_recurrent.ppo_recurrent.RecurrentPPO at 0x782d79567e10>

### model_4
Learning_rate = 0.0001

**Results:**

iterations = 10112

f_balance = 9514.59   


In [None]:
model_4 = RecurrentPPO("MlpLstmPolicy", env, learning_rate=0.0001, verbose=1)
model_4.learn(total_timesteps=total_timesteps, callback=reward_callback)

### model_5
Learning_rate = 0.0002

**Results:**

iterations = 10112

f_balance = 10548.18

In [None]:
model_5 = RecurrentPPO("MlpLstmPolicy", env, learning_rate=0.0002, verbose=1)
model_5.learn(total_timesteps=total_timesteps, callback=reward_callback)

### model_6
Learning_rate = 0.002

**Results:**

iterations = 10112

f_balance = 10002.44


In [47]:
model_6 = RecurrentPPO("MlpLstmPolicy", env, learning_rate=0.002, verbose=1)
model_6.learn(total_timesteps=total_timesteps, callback=reward_callback)

  and should_run_async(code)


Using cpu device
Step 241: SELL at 68339.00 | P/L: -1.76 | Balance: 9998.24
Step 247: SELL at 68363.00 | P/L: 1.17 | Balance: 9999.41
Step 268: SELL at 68414.00 | P/L: 5.12 | Balance: 10004.53
Step 270: SELL at 68452.00 | P/L: 0.88 | Balance: 10005.41
Step 278: SELL at 68420.00 | P/L: -2.19 | Balance: 10003.22
Step 285: SELL at 68381.00 | P/L: -2.05 | Balance: 10001.17
Step 291: SELL at 68401.00 | P/L: 2.78 | Balance: 10003.95
Step 300: SELL at 68365.00 | P/L: -7.60 | Balance: 9996.34
Step 313: SELL at 68358.00 | P/L: -0.58 | Balance: 9995.76
Step 317: SELL at 68358.00 | P/L: 0.00 | Balance: 9995.76
Step 323: SELL at 68391.00 | P/L: 4.09 | Balance: 9999.85
Step 327: SELL at 68378.00 | P/L: 0.00 | Balance: 9999.85
Step 333: SELL at 68376.00 | P/L: 1.17 | Balance: 10001.02
Step 335: SELL at 68384.00 | P/L: 2.05 | Balance: 10003.07
Step 338: SELL at 68392.00 | P/L: 0.00 | Balance: 10003.07
Step 346: SELL at 68411.00 | P/L: 0.15 | Balance: 10003.22
Step 359: SELL at 68399.00 | P/L: -1.75 |

<sb3_contrib.ppo_recurrent.ppo_recurrent.RecurrentPPO at 0x782d792f8f90>

## Best Working was model_3 so full data on it with 1000 iteration

In [74]:
total_timesteps=6000000
model_3_1 = RecurrentPPO("MlpLstmPolicy", env, learning_rate=0.001, verbose=1)
model_3_1.learn(total_timesteps=total_timesteps, callback=reward_callback)

[1;30;43mGörüntülenen çıkış son 5000 satıra kısaltıldı.[0m
|    time_elapsed         | 24686     |
|    total_timesteps      | 1489152   |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -6.73e-06 |
|    explained_variance   | 0         |
|    learning_rate        | 0.001     |
|    loss                 | 8.58e-08  |
|    n_updates            | 116330    |
|    policy_gradient_loss | 2.34e-08  |
|    value_loss           | 2.85e-08  |
---------------------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 60        |
|    iterations           | 11635     |
|    time_elapsed         | 24688     |
|    total_timesteps      | 1489280   |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range    

KeyboardInterrupt: 

In [75]:
policy_weights = model_3_1.policy.state_dict()
print(policy_weights)  # Shows all learned weights

OrderedDict([('mlp_extractor.policy_net.0.weight', tensor([[-0.0522, -0.0210, -0.1242,  ...,  0.0295,  0.1829, -0.1071],
        [-0.0611, -0.0062, -0.0015,  ...,  0.0932, -0.0064, -0.0006],
        [-0.0340, -0.0194,  0.0259,  ...,  0.0182, -0.0402, -0.0313],
        ...,
        [-0.0562,  0.0524,  0.1106,  ..., -0.1356,  0.0185,  0.0288],
        [-0.1174,  0.1792,  0.0752,  ...,  0.0624, -0.0950, -0.0780],
        [-0.0207, -0.1055,  0.1071,  ..., -0.1061, -0.0473, -0.1924]])), ('mlp_extractor.policy_net.0.bias', tensor([-0.0248,  0.0411, -0.0399,  0.0098,  0.0306,  0.0315, -0.0338, -0.0236,
         0.0308, -0.0286, -0.0269, -0.0296, -0.0367,  0.0317, -0.0285,  0.0227,
        -0.0306, -0.0343, -0.0305,  0.0290,  0.0426,  0.0396, -0.0184,  0.0208,
         0.0284,  0.0227, -0.0263, -0.0241, -0.0130,  0.0287,  0.0309, -0.0311,
         0.0204,  0.0401, -0.0035,  0.0251,  0.0330, -0.0104, -0.0434,  0.0310,
         0.0233,  0.0233,  0.0338, -0.0128, -0.0320,  0.0348, -0.0103, -0.020

## Predicting

In [48]:
def predict_actions(model, env, steps=100):
    obs = env.reset()
    actions = []
    for _ in range(steps):
        action, _ = model.predict(obs, deterministic=True)
        actions.append(action)
        obs, reward, done, _ = env.step(action)
        if done:
            break
    return actions

### model_2

In [50]:
predicted_actions = predict_actions(model_2, env, steps=100)
print("Predicted actions over 100 steps:", predicted_actions)

Predicted actions over 100 steps: [array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array

  and should_run_async(code)


### model_3

In [51]:
predicted_actions = predict_actions(model_3, env, steps=100)
print("Predicted actions over 100 steps:", predicted_actions)

Predicted actions over 100 steps: [array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array

  and should_run_async(code)


### model_4

In [52]:
predicted_actions = predict_actions(model_4, env, steps=100)
print("Predicted actions over 100 steps:", predicted_actions)

  and should_run_async(code)


Predicted actions over 100 steps: [array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array([1]), array

### model_5

In [53]:
predicted_actions = predict_actions(model_5, env, steps=100)
print("Predicted actions over 100 steps:", predicted_actions)

Predicted actions over 100 steps: [array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array

  and should_run_async(code)


### model_6

In [54]:
predicted_actions = predict_actions(model_6, env, steps=100)
print("Predicted actions over 100 steps:", predicted_actions)

  and should_run_async(code)


Predicted actions over 100 steps: [array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array

## Predict with best model

In [76]:
predicted_actions = predict_actions(model_3_1, env, steps=100)
print("Predicted actions over 100 steps:", predicted_actions)

Predicted actions over 100 steps: [array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array([2]), array