In [28]:
import pandas as pd
import numpy as np
import yfinance as yf
import random

# fetch NVDA price history from Yahoo Finance; this table is the training data
nvda = yf.download(
    'NVDA',
    start='2022-01-01',
    end='2023-04-03',
)

# define the reinforcement learning model
class QLearningTrader:
    """Tabular Q-learning agent with an epsilon-greedy action policy.

    A state is any iterable; it is converted with ``tuple(state)`` and used as
    a dictionary key.  Each Q-table row is a list of ``n_actions`` values that
    starts out as zeros the first time a state is needed.

    NOTE(review): states are matched by exact equality of their values, so a
    state made of raw floating-point prices will rarely be seen twice --
    confirm whether some discretisation of the state was intended.
    """

    def __init__(self, alpha, gamma, epsilon, n_actions):
        """Store the hyper-parameters and start with an empty Q-table.

        Args:
            alpha: learning rate used to blend old and new value estimates.
            gamma: discount factor applied to the best next-state value.
            epsilon: probability of picking a uniformly random action.
            n_actions: number of possible actions (actions are 0..n_actions-1).
        """
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.epsilon = epsilon  # exploration rate
        self.n_actions = n_actions  # number of possible actions
        self.q_table = {}  # Q-table: tuple(state) -> list of action values

    def choose_action(self, state):
        """Pick an action for *state* using the epsilon-greedy rule.

        With probability ``epsilon`` a random action is returned.  Otherwise
        the action with the highest stored value is returned, with ties broken
        at random.  A state that has never been seen gets a zero row in the
        Q-table and a random action.

        Returns:
            The index of the chosen action.
        """
        if np.random.uniform() < self.epsilon:
            # explore
            return np.random.choice(self.n_actions)

        # exploit
        key = tuple(state)
        if key not in self.q_table:
            # no previous experience with this state
            self.q_table[key] = [0] * self.n_actions
            return np.random.choice(self.n_actions)

        q_values = self.q_table[key]
        max_q_value = max(q_values)
        best_actions = [i for i, q in enumerate(q_values) if q == max_q_value]
        if len(best_actions) > 1:
            # multiple actions have the same max Q-value
            return random.choice(best_actions)
        return best_actions[0]

    def learn(self, state, action, reward, next_state):
        """Apply one Q-learning update for an observed transition.

        Args:
            state: state in which *action* was taken.
            action: index of the action that was taken.
            reward: reward received for the transition.
            next_state: resulting state, or ``None`` when the episode ended.
                A ``None`` next state is treated as terminal: it contributes
                no future value and is not added to the Q-table.
        """
        q_row = self.q_table.setdefault(tuple(state), [0] * self.n_actions)
        if next_state is None:
            # terminal transition: there is no successor state to bootstrap from
            next_max = 0
        else:
            next_row = self.q_table.setdefault(
                tuple(next_state), [0] * self.n_actions
            )
            next_max = max(next_row)
        old_value = q_row[action]
        target = reward + self.gamma * next_max
        q_row[action] = (1 - self.alpha) * old_value + self.alpha * target

# define the trading environment
class TradingEnvironment:
    """Sequential environment that walks row by row through a price table.

    ``data`` must support ``len()`` and ``.iloc[i]``, and each row must expose
    a ``'Close'`` entry.  Observations are the rows themselves.  Actions are
    ``0`` (hold), ``1`` (buy) and anything else (sell).  No position or cash is
    tracked here; only the number of consecutive hold actions is remembered.
    """

    def __init__(self, data):
        """Bind the environment to *data* and start at the first row."""
        self.data = data
        self.n_steps = len(data)
        self.current_step = 0
        self.reward_range = (-1, 1)  # stored only; rewards are not clipped to it
        self.current_hold_time = 0  # consecutive hold actions so far
        self.max_hold_time = 10  # holds allowed before the extra penalty applies

    def reset(self):
        """Start a new episode and return the first row of the data.

        Both the step pointer and the hold counter are cleared, so the
        long-hold penalty cannot carry over from a previous episode.
        """
        self.current_step = 0
        self.current_hold_time = 0
        return self.data.iloc[self.current_step]

    def step(self, action):
        """Advance one row and score *action*.

        Returns:
            A ``(next_state, reward, done, info)`` tuple.  Once the pointer
            moves past the last row the result is ``(None, 0, True, {})``;
            otherwise ``done`` is ``False`` and ``next_state`` is the new row.
        """
        self.current_step += 1
        if self.current_step >= self.n_steps:
            return None, 0, True, {}
        reward = self._get_reward(action)
        next_state = self.data.iloc[self.current_step]
        done = False
        return next_state, reward, done, {}

    def _get_reward(self, action):
        """Compute the reward for *action* from the latest close-price change.

        Also updates ``current_hold_time``: it grows by one on a hold and is
        set back to zero on any other action.
        """
        # calculate reward based on the action and the price change
        prev_price = self.data.iloc[self.current_step - 1]['Close']
        curr_price = self.data.iloc[self.current_step]['Close']
        price_diff = curr_price - prev_price
        if action == 0:
            # hold
            reward = -0.001  # small penalty for holding
        elif action == 1:
            # buy (presumably a 0.1% proportional trading cost -- confirm)
            reward = -curr_price * 0.001
        else:
            # sell: last price change minus the same 0.1% of the current price
            reward = price_diff - curr_price * 0.001

        # add a penalty for holding for too long; the check uses the counter
        # value from before this step's increment
        if action == 0 and self.current_hold_time >= self.max_hold_time:
            reward -= 0.01

        # update the current hold time
        if action == 0:
            self.current_hold_time += 1
        else:
            self.current_hold_time = 0

        return reward


[*********************100%***********************]  1 of 1 completed


In [None]:
# build the environment over the downloaded prices and the Q-learning agent
env = TradingEnvironment(nvda)
trader = QLearningTrader(alpha=0.2, gamma=0.9, epsilon=0.1, n_actions=3)

# training schedule: each episode takes at most len(nvda) - 1 steps
n_episodes, max_steps = 1000, len(nvda) - 1

# summed reward of every episode, in the order the episodes were run
returns = []
for episode in range(n_episodes):
    state, total_reward = env.reset(), 0
    for step in range(max_steps):
        # act, observe the outcome, then update the Q-table from it
        action = trader.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        trader.learn(state, action, reward, next_state)
        state, total_reward = next_state, total_reward + reward
        if done:
            break
    returns.append(total_reward)

# make trading decisions using the Q-table learned from training
# NOTE(review): this date range lies inside the training window downloaded
# earlier (2022-01-01 to 2023-04-03), so this is an in-sample evaluation --
# confirm whether a later, unseen period was intended.
nvda_new = yf.download('NVDA', start='2023-01-01', end='2023-04-03')
trader.epsilon = 0  # set exploration rate to 0 for exploitation
# rows are read with .iloc[i], the same way the training environment builds
# its states, so the Q-table keys are formed identically
actions = [trader.choose_action(nvda_new.iloc[i]) for i in range(len(nvda_new))]

# simulate the trades: one share per buy, and a sell only when a share is held
prices = nvda_new['Close'].values
holdings = np.zeros(len(prices))  # shares held after each step
balances = np.zeros(len(prices))  # cumulative cash after each step
balance = 0
position = 0  # running share count, carried forward from step to step
for i, (price, action) in enumerate(zip(prices, actions)):
    if action == 1:
        # buy
        balance -= price
        position += 1
    elif action == 2 and position > 0:
        # sell
        balance += price
        position -= 1
    holdings[i] = position
    balances[i] = balance

# mark-to-market profit and loss at each step: cash plus value of shares held
returns_new = balances + holdings * prices

# plot the returns and the price on one shared date axis
import matplotlib.pyplot as plt
plt.plot(nvda_new.index, returns_new, label='Strategy P&L')
plt.plot(nvda.loc['2022-04-01':'2023-04-03','Close'], label='NVDA Close')
plt.title('Returns from Trading NVDA using Reinforcement Learning')
plt.xlabel('Date')
plt.ylabel('Return ($)')
plt.legend()
plt.show()


In [None]:
# display the returns_new array computed in the evaluation cell as this cell's output
returns_new