# Derinlemesine Reinforcement Learning (RL)

## İmportlar

In [60]:
import gym
from gym import spaces
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Veri Okuma

In [70]:
data = pd.read_csv('/content/drive/MyDrive/btc_1m_2012-2025-02.01-22.00.csv')  # Veri dosyanızın yolu
data = data.drop(data.index[-1])  # Son elemanın indeksi: -1
data = data.rename(columns=str.lower)
data = data.rename(columns={'timestamp': 'date'})
data['date'] = pd.to_datetime(data['date'], unit='s')
data.set_index('date', inplace=True)

## Env

In [101]:
class TradingEnvironment(gym.Env):
    def __init__(self, data, initial_balance=10000):
        super(TradingEnvironment, self).__init__()
        self.data = data
        self.initial_balance = initial_balance
        self.current_step = 0
        self.balance = initial_balance
        self.position = 0  # 1: Long, -1: Short, 0: No Position
        self.total_profit = 0
        self.entry_price = 0

        # Lookback aralıkları
        self.lookback_offsets = [1440, 240, 60, 15]

        # Gözlem alanı: Açılış, Yüksek, Düşük, Kapanış, Pozisyon, Lookback low ve high değerleri
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(5 + 2 * len(self.lookback_offsets),), dtype=np.float32
        )
        # Eylem alanı: 0 -> Bekle, 1 -> Short, 2 -> Long, 3 -> Pozisyonu kapat
        self.action_space = spaces.Discrete(4)

    def reset(self):
        self.current_step = 0
        self.balance = self.initial_balance
        self.position = 0
        self.total_profit = 0
        self.entry_price = 0
        return self._next_observation()

    def _get_lookback_highs_and_lows(self, default_value=float('nan')):
        """Lookback chunk verilerinin high ve low değerlerini hesaplar."""
        lookback_lows = []
        lookback_highs = []

        for offset in self.lookback_offsets:
            if self.current_step - offset <= 0:
                lookback_low = self.data['low'].iloc[self.current_step]
                lookback_high = self.data['high'].iloc[self.current_step]
            else:
                lookback_low = self.data['low'].iloc[offset:self.current_step].min()
                lookback_high = self.data['high'].iloc[offset:self.current_step].max()

            lookback_lows.append(lookback_low)
            lookback_highs.append(lookback_high)

        return lookback_lows, lookback_highs

    def _next_observation(self):
        """Bir sonraki gözlem durumunu döner."""
        frame = self.data.iloc[self.current_step]
        lookback_lows, lookback_highs = self._get_lookback_highs_and_lows(default_value=self.data.iloc[self.current_step]['close'])

        observation = np.array([
            frame['open'], frame['high'], frame['low'], frame['close'], self.position,
            *lookback_lows, *lookback_highs
        ])

        # NaN değerlerini kontrol et ve düzelt
        observation = np.nan_to_num(observation)
        return observation

    def step(self, action):
        current_price = self.data.iloc[self.current_step]['close']
        lookback_lows, lookback_highs = self._get_lookback_highs_and_lows(default_value=self.data.iloc[self.current_step]['close'])

        # Aksiyonları işle
        if action == 1:  # Short pozisyon aç
            if self.position == 0:  # Yalnızca pozisyon açık değilse
                self.position = -1
                self.entry_price = current_price
            else:
                action = 3
                profit = (current_price - self.entry_price) * self.position
                self.total_profit += profit
                self.balance += profit
                self.position = 0  # Pozisyonu kapat
                self.position = -1 # Short pozisyon aç
                self.entry_price = current_price

        elif action == 2:  # Long pozisyon aç
            if self.position == 0:  # Yalnızca pozisyon açık değilse
                self.position = 1
                self.entry_price = current_price
            else:
                action = 3
                profit = (current_price - self.entry_price) * self.position
                self.total_profit += profit
                self.balance += profit
                self.position = 0  # Pozisyonu kapat
                self.position = 1  # Long pozisyon aç
                self.entry_price = current_price

        elif action == 3:  # Pozisyonu kapat
            if self.position != 0:  # Yalnızca pozisyon açıkken
                profit = (current_price - self.entry_price) * self.position
                self.total_profit += profit
                self.balance += profit
                self.position = 0  # Pozisyonu kapat

        # Sinyalleri kontrol et
        lookback_lows, lookback_highs = self._get_lookback_highs_and_lows()
        current_low = self.data.iloc[self.current_step]['low']
        current_high = self.data.iloc[self.current_step]['high']
        short_signal = any(current_low < low and current_high < low for low in lookback_lows)
        long_signal = any(current_low > high and current_high > high for high in lookback_highs)

        # Sinyale dayalı otomatik işlem
        if self.position == 0:  # Yalnızca pozisyon açık değilse
            if short_signal:
                self.position = -1
                self.entry_price = current_price
            elif long_signal:
                self.position = 1
                self.entry_price = current_price


        # Bir adım ileri git
        self.current_step += 1
        done = self.current_step >= len(self.data) - 1

        # Ödül: Güncel toplam kar
        reward = self.balance - self.initial_balance
        return self._next_observation(), reward, done, {}

    def render(self):
        print(f'Step: {self.current_step}, Balance: {self.balance}, Total Profit: {self.total_profit}')


## Deneme


In [100]:
# PPO Modeli Kurma ve Eğitim
def train_model():
    env = TradingEnvironment(data)

    model = PPO('MlpPolicy', env, verbose=1)

    # Modeli eğit
    model.learn(total_timesteps=10000)

    # Eğitim sonrası test et
    state = env.reset()
    done = False
    total_reward = 0
    while not done:
        action, _states = model.predict(state)
        state, reward, done, _ = env.step(action)
        total_reward += reward

    print(f"Total Reward: {total_reward}")

# Modeli çalıştır
train_model()

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




-----------------------------
| time/              |      |
|    fps             | 338  |
|    iterations      | 1    |
|    time_elapsed    | 6    |
|    total_timesteps | 2048 |
-----------------------------


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

## Optimizasyon

### Pip İnstall & İmport

In [None]:
!pip install optuna
!pip install --upgrade stable-baselines3
!pip install stable-baselines3[extra]
!pip install 'shimmy>=2.0'

In [11]:
import optuna
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3 import PPO
import gymnasium as gym

### Optuna Fonks

In [85]:
def optimize_ppo(trial):
    """
    PPO modelini belirli parametrelerle optimize eder ve ortalama ödülü döndürür.
    """

    trail_env = make_vec_env(lambda: TradingEnvironment(data), n_envs=1)

    # PPO modelini oluştur
    model = PPO(
        policy='MlpPolicy',
        env=trail_env,
        learning_rate=trial.suggest_loguniform('learning_rate', 1e-5, 1e-2),
        n_steps=trial.suggest_int('n_steps', 1024, 2048, step=128),
        batch_size=trial.suggest_categorical('batch_size', [32, 64, 128, 256, 512]),
        gamma=trial.suggest_uniform('gamma', 0.9, 0.999),
        use_sde=False,
        verbose=0
    )

    # Modeli eğit
    model.learn(total_timesteps=10000)

    # Ortalama ödülü hesapla
    rewards = []
    for _ in range(10):
        state = trail_env.reset()
        done = False
        total_reward = 0
        while not done:
            action, _ = model.predict(state)
            state, reward, done, _ = trail_env.step(action)
            total_reward += reward
        rewards.append(total_reward)

    mean_reward = np.mean(rewards)
    return mean_reward


### Optuna

In [86]:
# Optuna çalışma
study = optuna.create_study(direction='maximize')
study.optimize(optimize_ppo, n_trials=50)


[I 2025-01-16 19:54:03,912] A new study created in memory with name: no-name-87e13eec-88e4-47d2-8314-80645ad24109
  learning_rate=trial.suggest_loguniform('learning_rate', 1e-5, 1e-2),
  gamma=trial.suggest_uniform('gamma', 0.9, 0.999),
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1920 and n_envs=1)
[W 2025-01-16 19:54:10,859] Trial 0 failed with parameters: {'learning_rate': 1.4244344080005866e-05, 'n_steps': 1920, 'batch_size': 256, 'gamma': 0.9223565997820701} because of the following error: RuntimeError('element 0 of tensors does not require grad and does not have a grad_fn').
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-85-4691790d5bd9>", line 21, in optimize_ppo
    model.learn(total_timesteps=10000)
  File "/usr/local/lib/python3.11/dist-packages/stable_base

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
# En iyi parametreleri yazdır
print(f"En iyi parametreler: {study.best_params}")