In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras import regularizers
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
from collections import deque
import random
import math

In [2]:
# Need to run this command as an administrator
#pip install tensorflow


In [3]:
import os
import tensorflow as tf

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow logging
tf.get_logger().setLevel('ERROR')


In [4]:
features = ['OPEN', 'HIGH', 'LOW', 'PREV. CLOSE', 'ltp', 'close', 'vwap', '52W H', '52W L', 'VOLUME', 'VALUE']

In [5]:
# Define the action space
actions = ['BUY', 'SELL', 'HOLD']
num_actions = len(actions)

In [6]:
# Define the DDQL agent class
class DDQLAgent:
    def __init__(self, state_size, action_size, look_back):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=1500)
        self.gamma = 0.95  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_decay = 0.990
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        self.look_back = look_back  # Number of previous time steps to consider
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential([
            LSTM(32, input_shape=(self.look_back, self.state_size), return_sequences=True, 
                 kernel_regularizer=regularizers.l2(0.01)),
            Dropout(0.2),
            LSTM(32, return_sequences=False, kernel_regularizer=regularizers.l2(0.01)),
            Dropout(0.2),
            Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
            Dense(self.action_size, activation='linear', kernel_regularizer=regularizers.l2(0.01))
        ])
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, env):
        state = np.reshape(state, [1, self.look_back, self.state_size])
        if env.current_step == env.look_back:
            return 0  # Buy action at the start of each episode
        else:
            if np.random.rand() <= self.epsilon:
                return random.randint(0, self.action_size - 1)
            else:
                act_values = self.model.predict(state, verbose=0)
                return np.argmax(act_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma * np.amax(self.model.predict(next_state, verbose=0)[0]))
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [7]:
# Define the Environment

class TradingEnvironment:
    def __init__(self, data, look_back):
        self.data = data
        self.n = len(data)
        self.current_step = 0
        self.hold_counter = 0
        self.hold_penalty = 1  # Initialize the hold penalty factor
        self.look_back = look_back  # Add look_back attribute
        self.buy_price = -1
        self.has_bought = False  # Track if the agent has already bought the stock
        self.is_holding = False  # New state variable to track holding position
        #self.actions_taken = []

    def is_illegal_action(self, action):
        # Example condition: cannot sell if not holding a stock
        if actions[action] == 'SELL' and self.buy_price == -1:
            return True
        return False

    def detect_bullish_engulfing(self):
        if self.current_step < 1:
            return False
    
        previous_candle = self.data.iloc[self.current_step - 1]
        current_candle = self.data.iloc[self.current_step]
    
        if (previous_candle['close'] < previous_candle['OPEN'] and
                current_candle['close'] > current_candle['OPEN'] and
                current_candle['close'] > previous_candle['OPEN'] and
                current_candle['OPEN'] < previous_candle['close']):
            return True
        else:
            return False

    def reset(self):
        self.current_step = self.look_back  # Start from look_back step
        self.buy_price = -1  # Reset buy_price to indicate no stock held
        self.is_holding = False  # Reset holding position
        self.has_bought = False  # Reset the flag when the episode resets

        return self.data.iloc[self.current_step - self.look_back:self.current_step][features].values

    def step(self, action):
        next_state = np.zeros((self.look_back, len(features)))

        if self.is_illegal_action(action):
            # Handle illegal action
            reward = -100  # Penalty for illegal action
        else:
            # Regular action handling
            reward = self.calculate_reward(action)

        # Increment the step
        self.current_step += 1
        done = self.current_step + self.look_back > self.n

        if not done:
            next_state = self.data.iloc[self.current_step:self.current_step + self.look_back][features].values
        else:
            next_state = np.zeros((self.look_back, len(features)))  # End of data handling

        return next_state, reward, done


    
    def calculate_reward(self, action):
        if actions[action] == 'SELL':
            if self.buy_price == -1:
                return -150  # Penalty for selling without holding
            else:
                profit = self.data.iloc[self.current_step]["close"] - self.buy_price
                self.buy_price = -1
                self.is_holding = False
                self.hold_counter = 0
                self.has_bought = False
                if profit > 0:
                    return max(200, profit)  # Reward at least 200 or actual profit if greater
                else:
                    return profit  # Penalize for loss
                
        elif actions[action] == 'BUY':
            if self.is_holding:
                return -200  # Increased penalty for trying to buy when already holding
            else:
                self.buy_price = self.data.iloc[self.current_step]["OPEN"]
                self.is_holding = True
                self.has_bought = True
                return 0  # No immediate reward for buying

        elif actions[action] == 'HOLD':
            if self.is_holding:
                potential_profit = self.data.iloc[self.current_step]["close"] - self.buy_price
                if potential_profit > 0:
                    return 10  # Reward for holding a potentially profitable stock
                else:
                    return -5  # Penalty for holding a non-profitable stock
            else:
                return -100  # Penalty for holding without buying

        else:
            return 0  # Default case: no reward or penalty for other actions



    


In [8]:
# Cumulative Reward Plot
def plot_cumulative_reward(cumulative_rewards):
    plt.figure(figsize=(10, 6))
    plt.plot(cumulative_rewards)
    plt.xlabel('Episode')
    plt.ylabel('Cumulative Reward')
    plt.title('Cumulative Reward over Episodes')
    plt.grid(True)
    plt.show()

# Action Distribution Plot
def plot_action_distribution(all_actions):
    actions_flat = [action for actions_episode in all_actions for action in actions_episode]
    action_counts = np.unique(actions_flat, return_counts=True)
    plt.figure(figsize=(8, 6))
    plt.bar(action_counts[0], action_counts[1])
    plt.xlabel('Action')
    plt.ylabel('Frequency')
    plt.title('Action Distribution over Episodes')
    plt.grid(True)
    plt.show()

# Episode Length Plot
def plot_episode_lengths(episode_lengths):
    plt.figure(figsize=(10, 6))
    plt.plot(episode_lengths)
    plt.xlabel('Episode')
    plt.ylabel('Episode Length')
    plt.title('Episode Length over Episodes')
    plt.grid(True)
    plt.show()

# Reward Distribution Plot
def plot_reward_distribution(rewards):
    plt.figure(figsize=(8, 6))
    plt.hist(rewards, bins=20)
    plt.xlabel('Reward')
    plt.ylabel('Frequency')
    plt.title('Reward Distribution over Episodes')
    plt.grid(True)
    plt.show()

# Action Trajectory Plot
def plot_action_trajectory(actions_episode):
    plt.figure(figsize=(10, 6))
    plt.plot(actions_episode)
    plt.xlabel('Time Step')
    plt.ylabel('Action')
    plt.title('Action Trajectory in a Single Episode')
    plt.grid(True)
    plt.show()

# State Trajectory Plot
def plot_state_trajectory(states_episode):
    num_features = states_episode[0].shape[2]  # Get the number of features from the state shape
    num_time_steps = len(states_episode)
    fig, axs = plt.subplots(num_features, 1, figsize=(10, num_features * 3))
    if num_features == 1:
        axs = [axs]  # Convert to a list if there's only one feature
    for i in range(num_features):
        feature_values = [state[0][0][i] for state in states_episode]  # Extract the feature values
        axs[i].plot(feature_values)
        axs[i].set_xlabel('Time Step')
        axs[i].set_ylabel(f'Feature {i+1}')
        axs[i].set_title(f'State Feature {i+1} Trajectory in a Single Episode')
        axs[i].grid(True)
    plt.tight_layout()
    plt.show()

# Portfolio Value Plot
def plot_portfolio_value(portfolio_values):
    plt.figure(figsize=(10, 6))
    plt.plot(portfolio_values)
    plt.xlabel('Episode')
    plt.ylabel('Portfolio Value')
    plt.title('Portfolio Value over Episodes')
    plt.grid(True)
    plt.show()

# State-Action Value Heatmap
def plot_q_value_heatmap(q_values):
    # Assuming q_values is a 2D array where rows represent states and columns represent actions
    plt.figure(figsize=(10, 6))
    plt.imshow(q_values, cmap='hot', interpolation='nearest')
    plt.colorbar()
    plt.xlabel('Action')
    plt.ylabel('State')
    plt.title('State-Action Value (Q-Value) Heatmap')
    plt.show()


In [9]:
# Load and preprocess data
data = pd.read_csv('fin_data.csv')
data = data.iloc[:, 2:] 

# Define a function to preprocess the data
def preprocess_data(data):
    # Replace commas with empty strings and convert to float
    data = data.replace(',', '', regex=True).astype(float)
    return data

# Preprocess the entire DataFrame
data = preprocess_data(data)

look_back = 15  # Assuming you want to look back 10 time steps, adjust as needed
state_size = len(features)
num_actions = len(actions)

# Initialize the environment and agent
env = TradingEnvironment(data, look_back)
agent = DDQLAgent(state_size, num_actions, look_back)


  super().__init__(**kwargs)


In [10]:
batch_size = 16
num_episodes = 250

cumulative_rewards = []
episode_lengths = []
rewards = []
all_actions = []
states_episode_all = []

for episode in range(num_episodes):
    state = env.reset()
    state = np.reshape(state, [1, agent.look_back, state_size])
    total_reward = 0
    episode_length = 0
    actions_episode = []
    states_episode = []
    holding_start_step = None
    holding_price = None
    holding_steps = 0
    episode_profit = 0  # Initialize episode profit
    
    for t in range(env.n):
        action = agent.act(state, env)
        next_state, reward, done = env.step(action)
        total_reward += reward
        next_state = np.reshape(next_state, [1, agent.look_back, state_size])  # Correct reshaping
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        actions_episode.append(actions[action])
        states_episode.append(state)
        episode_length += 1
        
        if actions[action] == 'BUY':
            holding_start_step = t
            holding_price = env.data.iloc[env.current_step]["close"]
            holding_steps = 0
            #print(f"Episode: {episode + 1}/{num_episodes}, Step: {t}, Action: BUY, Buy Price: {holding_price:.2f}")
            
        elif actions[action] == 'SELL':
            if holding_start_step is not None:
                buy_price = holding_price
                sell_price = env.data.iloc[env.current_step]["close"]
                profit = sell_price - buy_price
                episode_profit += profit  # Accumulate profit for the episode
                #print(f"Episode: {episode + 1}/{num_episodes}, Step: {t}, Action: SELL, Buy Price: {buy_price:.2f}, Sell Price: {sell_price:.2f}, Profit: {profit:.2f}")
        
        elif actions[action] == 'HOLD':
            if holding_start_step is not None:
                holding_steps += 1
                #print(f"Episode: {episode + 1}/{num_episodes}, Step: {t}, Action: HOLD, Holding Price: {holding_price:.2f}, Holding Steps: {holding_steps}")
            
        if done:
            all_actions.append(actions_episode)
            break
    
    print(f"Episode: {episode + 1}/{num_episodes}, Total Reward: {total_reward:.2f}, Total Profit: {episode_profit:.2f}")
    cumulative_rewards.append(total_reward)
    episode_lengths.append(episode_length)
    rewards.extend([reward] * episode_length)
    all_actions.append(actions_episode)
    states_episode_all.append(states_episode)
    
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

    if len(agent.memory) == agent.memory.maxlen:
        agent.memory.popleft()  # This will remove the oldest experience if memory is full

model_save_path = "C:\\Users\\varma\\OneDrive\\Desktop\\MastersAI\\_MachineLearning\\Project\\DDQL\\Saved_Models\\ddql4_model.keras"
agent.model.save(model_save_path)


Episode: 1/250, Total Reward: -257912.95, Total Profit: 4568.00
Episode: 2/250, Total Reward: -259841.45, Total Profit: 1422.80
Episode: 3/250, Total Reward: -250397.35, Total Profit: 4047.95
Episode: 4/250, Total Reward: -252483.45, Total Profit: -4554.75
Episode: 5/250, Total Reward: -257785.35, Total Profit: -7.90
Episode: 6/250, Total Reward: -260842.60, Total Profit: 7112.05
Episode: 7/250, Total Reward: -264456.85, Total Profit: 9842.40
Episode: 8/250, Total Reward: -254584.95, Total Profit: 6891.75
Episode: 9/250, Total Reward: -261289.70, Total Profit: 8625.25
Episode: 10/250, Total Reward: -263938.80, Total Profit: 9611.65
Episode: 11/250, Total Reward: -259090.55, Total Profit: 2880.15
Episode: 12/250, Total Reward: -255669.45, Total Profit: -4104.15
Episode: 13/250, Total Reward: -260589.80, Total Profit: 11820.60
Episode: 14/250, Total Reward: -260325.05, Total Profit: 1507.60
Episode: 15/250, Total Reward: -259705.50, Total Profit: -2749.10
Episode: 16/250, Total Reward: -

In [None]:
# Results Plotting

# Plotting total rewards gained in every episode
plt.figure(figsize=(12, 6))
plt.plot(cumulative_rewards[-100:])
plt.title("Learning Curve(Total Rewards per Episode)")
plt.xlabel("Episode")
plt.ylabel("Cumulative Reward")
plt.show()


# Assuming all_actions is a list of lists, where each sublist contains actions for an episode
buy_counts = [episode.count('BUY') for episode in all_actions]
sell_counts = [episode.count('SELL') for episode in all_actions]
hold_counts = [episode.count('HOLD') for episode in all_actions]

# stacked bar chart
plt.figure(figsize=(12, 6))
episodes = range(len(buy_counts))
plt.bar(episodes, buy_counts, label='Buy', color='green')
plt.bar(episodes, sell_counts, bottom=buy_counts, label='Sell', color='red')
plt.bar(episodes, hold_counts, bottom=[i+j for i,j in zip(buy_counts, sell_counts)], label='Hold', color='blue')

plt.title("Count of Buy, Sell, Hold Actions per Episode")
plt.xlabel("Episode")
plt.ylabel("Action Count")
plt.legend()
plt.show()

# TESTING ENV








..

In [None]:
# Define the environment

class TradingEnvironmentTest:
    def __init__(self, data, look_back):
        self.data = data
        self.n = len(data)
        self.current_step = 0
        self.hold_counter = 0
        self.hold_penalty = 1  # Initialize the hold penalty factor
        self.look_back = look_back  # Add look_back attribute
        self.buy_price = -1
        self.is_holding = False  # New state variable to track holding position
        #self.actions_taken = []

    def is_illegal_action(self, action):
        # Example condition: cannot sell if not holding a stock
        if actions[action] == 'SELL' and self.buy_price == -1:
            return True
        return False

    def detect_bullish_engulfing(self):
        if self.current_step < 1:
            return False
    
        previous_candle = self.data.iloc[self.current_step - 1]
        current_candle = self.data.iloc[self.current_step]
    
        if (previous_candle['close'] < previous_candle['OPEN'] and
                current_candle['close'] > current_candle['OPEN'] and
                current_candle['close'] > previous_candle['OPEN'] and
                current_candle['OPEN'] < previous_candle['close']):
            return True
        else:
            return False

    def reset(self):
        self.current_step = self.look_back  # Start from look_back step
        self.buy_price = -1  # Reset buy_price to indicate no stock held
        self.is_holding = False  # Reset holding position
        return self.data.iloc[self.current_step - self.look_back:self.current_step][features].values

    def step(self, action):
        # Update the step logic to detect bullish engulfing pattern
        if self.detect_bullish_engulfing():
            #print("Bullish engulfing pattern detected!")
            if self.is_holding:
                # Agent is already holding a stock
                if actions[action] == 'SELL':
                    # Agent decides to sell based on the bullish engulfing pattern
                    reward = self.calculate_reward(action)
                    next_state = self.data.iloc[self.current_step:self.current_step + self.look_back][features].values
                    done = False
                else:
                    # Agent decides to continue holding
                    reward = self.calculate_reward('HOLD')  # Use 'HOLD' directly here
                    next_state = self.data.iloc[self.current_step:self.current_step + self.look_back][features].values
                    done = False
            else:
                # Agent is not holding a stock, proceed with regular step logic
                reward = self.calculate_reward(action)
                self.current_step += 1
                if self.current_step + self.look_back > self.n:
                    done = True
                    next_state = np.zeros((self.look_back, len(features)))  # End of data handling
                else:
                    done = False
                    next_state = self.data.iloc[self.current_step:self.current_step + self.look_back][features].values
        else:
            if self.is_illegal_action(action):
                # Handle illegal action: could be a fixed penalty or other measures
                reward = -1000  # Example penalty for illegal action
                done = False
                next_state = self.data.iloc[self.current_step:self.current_step + self.look_back][features].values
            else:
                self.current_step += 1
                if self.current_step + self.look_back > self.n:
                    done = True
                    next_state = np.zeros((self.look_back, len(features)))  # End of data handling
                else:
                    done = False
                    next_state = self.data.iloc[self.current_step:self.current_step + self.look_back][features].values
                reward = self.calculate_reward(action)
    
        return next_state, reward, done



    def calculate_reward(self, action):
        if actions[action] == 'SELL':
            if self.buy_price == -1:
                # Penalize for selling without holding to prevent shorting
                return -1000
            else:
                profit = self.data.iloc[self.current_step]["close"] - self.buy_price
                self.buy_price = -1
                self.is_holding = False  # Reset holding position after selling
                self.hold_counter = 0  # Reset hold counter after selling
                return profit
    
        if actions[action] == 'BUY':
            if self.buy_price != -1:
                # Small penalty for trying to buy when already holding
                return -100
            else:
                self.buy_price = self.data.iloc[self.current_step]["OPEN"]
                self.is_holding = True  # Set holding position after buying
                return 0  # No immediate reward for buying
    
        if actions[action] == 'HOLD':
            self.hold_counter += 1
            # Increasing penalty for holding: the longer you hold, the larger the penalty
            penalty = -self.hold_penalty * self.hold_counter * self.hold_counter
            return penalty
    
        else:
            return 0

                        

In [None]:
env = TradingEnvironmentTest(data, look_back)

In [None]:
# Modified test training code to record only the profit made but not the total reward
model = load_model(model_save_path)

state_size = len(env.reset()[0])
actions = {0: 'SELL', 1: 'BUY', 2: 'HOLD'} 

batch_size = 10
num_episodes = 10

for episode in range(num_episodes):
    state = env.reset()
    state = np.reshape(state, [1, agent.look_back, state_size])
    total_profit = 0  # To track profit
    holding = False   # Flag to track if holding a stock
    actions_episode = []

    agent.epsilon = 0  # Ensure no random actions during testing

    done = False  # Initialize the termination condition
    
    while not done:
        action = agent.act(state, env)
        next_state, reward, done = env.step(action)  # Capture the reward value as well
    
        # Modify the following part accordingly to use the reward value if necessary
    
        if actions[action] == 'SELL' and env.buy_price != -1:
            # Calculate profit for this transaction and reset holding flag
            sell_price = env.data.iloc[env.current_step]["close"]
            profit = sell_price - env.buy_price
            total_profit += profit
            holding = False
    
        elif actions[action] == 'BUY':
            # Set holding flag to true when buying
            holding = True
    
        state = np.reshape(next_state, [1, agent.look_back, state_size])
        actions_episode.append(actions[action])
    
    holding_status = "Holding" if holding else "Not Holding"
    print(f"Episode: {episode + 1}/{num_episodes}, Total Profit: {total_profit:.2f}, {holding_status}")
    all_actions.append(actions_episode)
