In [92]:
import math
import gym
from gym import spaces, logger
from gym.utils import seeding
import numpy as np

import matplotlib.pyplot as plt
import torch
import seaborn as sns

#Now to implement q learning and variants on the above market environment
import sys
if "../../" not in sys.path:
  sys.path.append("../../") 

from TD.TD import GeneralQ #, ExperienceQ, DynaQ 
#from FA.model import TabularModel 
#from TD.Tabular import ExpTabAgent
#from lib.envs.market import Market

import collections
import ptan
import torch.nn as nn

%matplotlib inline

In [93]:
class Market(gym.Env):
    def __init__(self, kappa, episodes, time_periods, mu, r, sigma, inv_range):

        self.episodes = episodes
        self.time_periods = time_periods
        self.epi = 0
        self.t = 0

        self.inv_range = inv_range

        self.mu = mu
        self.r = r
        self.sigma = sigma

        self._start_wealth = 100.0
        self.kappa = kappa

        self.action_space = spaces.Discrete(len(inv_range))
        self.observation_space = spaces.Box(0,120, np.array([2]))
        
        self.seed()
        self.viewer = None
        self.state = None

        
    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]
    
    def reset(self):
        self.S, self.B, self.dS, self.dB = self.generate_price_series()
        self.wealth = self._start_wealth
        self.state = (int(self.S[self.t,self.epi]*100),int(self._start_wealth/10))
        return np.array(self.state)
    
    def step(self, action):
        
        assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))
        price_state, wealth_state = self.state
        
        prop = self.inv_range[action]

        discount = 0.99

        if self.t < self.time_periods:
            NB = (1-prop)*self.wealth/self.B[self.t]
            NS = prop*self.wealth/self.S[self.t, self.epi]

            dX = NB*self.dB[self.t] +NS*self.dS[self.t,self.epi]
            reward = dX - (self.kappa/2)*(dX**2)  
            self.wealth += dX

            done = False

            self.wealth_state = int(self.wealth/10)
            self.t += 1
            
            new_state = (int(self.S[self.t,self.epi]*100), self.wealth_state)
            self.state = new_state
            final_wealth = 0.0
            
        else:
            #reached the end of episode...
            self.t = 0
            self.epi = 0 #+=1

            reward = 0.0
            dX = 0.0
            final_wealth = self.wealth
            done = True

            _ = self.reset()

        return np.array(self.state), reward, done, final_wealth

    def generate_price_series(self):
        I = 1 #self.episodes
        M = self.time_periods

        S0 = 1
        B0 = 1
        T = 1.0
        dt = T/M

        mu = self.mu
        r = self.r
        sigma = self.sigma

        S = np.zeros((M+1,I))
        dS = np.zeros((M,I))
        dB = np.zeros(M)

        B = np.zeros(M+1)
        B[0] = B0
        S[0] = S0

        for t in range(1, M+1):
            z = np.random.standard_normal(I)
            #df = 10
            #z = np.random.standard_t(df,I)
            S[t] = S[t-1]*np.exp((mu-0.5*sigma**2)*dt + sigma*math.sqrt(dt)*z)
            B[t] = B[t-1]*np.exp(r*dt)

        for t in range(1,M):
            dS[t] = S[t+1] - S[t]
            dB[t] = B[t+1] - B[t]

        return S, B, dS, dB

In [94]:
kappa = 0.008
episodes = 3000000
time_periods = 20
mu =0.10
rf = 0.02
sigma = 0.20

In [95]:
episodes = episodes #500k worked well? as did 1m

utes = 15
u_star = np.linspace(0,2, utes)

Mark = Market(kappa, episodes, time_periods, mu, rf, sigma, u_star) #parameters from earlier

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [110]:
env = Mark
env = gym.make("CartPole-v0")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [111]:
net = nn.Sequential(
        nn.Linear(env.observation_space.shape[0],256),
        nn.ReLU(),
        nn.Linear(256, env.action_space.n))

In [112]:
action_selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.1)
agent = ptan.agent.DQNAgent(net, action_selector)

In [113]:
obs = np.array([env.reset()], dtype=np.float32)

In [114]:
agent(obs)

(array([0]), [None])

In [115]:
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=0.99, steps_count=1)
it = iter(exp_source)
next(it)

RuntimeError: Expected object of type torch.FloatTensor but found type torch.DoubleTensor for argument #4 'mat1'

In [54]:
from tensorboardX import SummaryWriter

GAMMA = 0.9
ALPHA = 0.2
TEST_EPISODES = 20

In [74]:
class Agent:
    def __init__(self, env):
        self.env = env #gym.make(ENV_NAME)
        self.state = self.env.reset()[1]
        self.values = collections.defaultdict(float)

    def sample_env(self):
        action = self.env.action_space.sample()
        old_state = self.state
        new_state, reward, is_done, _ = self.env.step(action)
        
        print(reward)
        self.state = self.env.reset() if is_done else new_state
        
        self.state = self.state[1]
        new_state = new_state[1]
        return (old_state, action, reward, new_state)

    def best_value_and_action(self, state):
        
        best_value, best_action = None, None
        for action in range(self.env.action_space.n):
            action_value = self.values[(state, action)]
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action      
        if np.random.random() < 0.1:
            best_action = self.env.action_space.sample()
            
        return best_value, best_action

    def value_update(self, s, a, r, next_s):
        best_v, _ = self.best_value_and_action(next_s)
        
        new_val = r + GAMMA * best_v
        old_val = self.values[(s, a)]
        
        #print(best_v,r)
        self.values[(s, a)] = old_val * (1-ALPHA) + new_val * ALPHA

    def play_episode(self, env):
        total_reward = 0.0
        state = env.reset()[1]
        while True:
            _, action = self.best_value_and_action(state)
            new_state, reward, is_done, _ = env.step(action)
            total_reward += reward
            
            #print(reward)
            if is_done:
                break
            state = new_state[1]
        
        #print(total_reward)
        return total_reward

In [75]:
num_episodes = 10000

test_env = env
agent = Agent(test_env)
writer = SummaryWriter(comment="-q-learning")

iter_no = 0
best_reward = 0.0

for i_episode in range(num_episodes-1):
    iter_no += 1
    s, a, r, next_s = agent.sample_env()
    
    agent.value_update(s, a, r, next_s)

    reward = 0.0
    for _ in range(TEST_EPISODES):
        reward += agent.play_episode(test_env)
    reward /= TEST_EPISODES
    writer.add_scalar("reward", reward, iter_no)
    if reward > best_reward:
        print("Best reward updated %.3f -> %.3f" % (best_reward, reward))
        best_reward = reward
writer.close()

0.0
Best reward updated 0.000 -> 4.325
0.0
Best reward updated 4.325 -> 4.640
0.0
0.0
0.0
0.0
0.0
0.0
0.0
Best reward updated 4.640 -> 5.106
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
Best reward updated 5.106 -> 5.404
0.0
Best reward updated 5.404 -> 6.560
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
Best reward updated 6.560 -> 7.834
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.

KeyboardInterrupt: 

In [68]:
agent.values

defaultdict(float,
            {(5, 0): 0.0,
             (5, 1): 0.0,
             (5, 2): 0.0,
             (5, 3): 0.0,
             (5, 4): 0.0,
             (5, 5): 0.0,
             (5, 6): 0.0,
             (5, 7): 0.0,
             (5, 8): 0.0,
             (5, 9): 0.0,
             (5, 10): 0.0,
             (5, 11): 0.0,
             (5, 12): 0.0,
             (5, 13): 0.0,
             (5, 14): 0.0,
             (6, 0): 0.0,
             (6, 1): 0.0,
             (6, 2): 0.0,
             (6, 3): 0.0,
             (6, 4): 0.0,
             (6, 5): 0.0,
             (6, 6): 0.0,
             (6, 7): 0.0,
             (6, 8): 0.0,
             (6, 9): 0.0,
             (6, 10): 0.0,
             (6, 11): 0.0,
             (6, 12): 0.0,
             (6, 13): 0.0,
             (6, 14): 0.0,
             (7, 0): 0.0,
             (7, 1): 0.0,
             (7, 2): 0.0,
             (7, 3): 0.0,
             (7, 4): 0.0,
             (7, 5): 0.0,
             (7, 6): 0.0,
         

In [64]:
def train_agent(kappa, episodes,):
    
    SARSA = False
    double = False
    utilities_mod = []
    rewards_mod = []
    wealth_episodes = []
    rsum = 0

    number_of_actions = 15 #again from earlier in the code  
    number_of_states = 120

    gamma = 0.95
    learning_rate = 0.10
    egreedy = 0.1
    wealth = 100.0

    start_state = int(wealth/10) 

    agent = GeneralQ(number_of_states, number_of_actions, start_state, SARSA, 
                 double, step_size=learning_rate) #here eps set to 0.1 anyway

    
    for i_episode in range(episodes-1):
        
        price_state, wealth_state = Mark.reset()
        state = wealth_state
        action = agent.behaviour_policy(agent._q[state])
        #print(state)

        while True:    

            (price_state, new_state) , reward, done, final_wealth = Mark.step(action) #(prop, wealth))
            action = agent.step(reward, gamma, new_state)
            rsum += reward
            state = new_state
            #print(state)

            if done:
                wealth_episodes.append(final_wealth)
                utilities_mod.append(np.log(final_wealth))
                rewards_mod.append(rsum)
                rsum = 0
                #wealth = 100.0
                break 

    q_name = 'gymtest'
    filename = q_name
    np.save(filename, agent.q_values)

    print(q_name + " last 50,000 rewards mean",np.mean(np.array(rewards_mod)[-50000:]))
    
    return agent.q_values, utilities_mod, rewards_mod, wealth_episodes

In [39]:
from lib.sim_prices import make_prices
from lib.execute_strat import execute_strat
from lib.graphs import make_baseline_graphs, make_agent_graphs, plot_sample_paths, plot_disc_utility, \
                        plot_mv_equiv, plot_const_step

In [None]:
episodes = 3000000
Q, utilities_mod, rewards_mod, wealth_episodes = train_agent(kappa, episodes)

In [None]:
wealth = 100.0

merton_ratio = (mu-rf)/sigma**2
best_action = np.argmin(np.abs(u_star-merton_ratio))
#temp change to execute strat
utilities_test_rand, rewards_test_rand, step_rew_rand, wealth_test_rand = execute_strat(kappa,mu, 
                                                                      rf, sigma, utes,u_star,best_action, 'Random', 
                                                                      time_periods=time_periods, wealth=wealth)
utilities_test_best, rewards_test_best, step_rew_best, wealth_test_best = execute_strat(kappa, mu, rf, sigma, 
                                                                      utes, u_star, best_action,'Merton', 
                                                                      time_periods=time_periods, wealth=wealth)

In [None]:
utilities_test, rewards_test, step_rew_test, wealth_test = execute_strat(kappa, mu, rf, sigma, utes, u_star,best_action, 
                                                            'Agent', q_values=Q, 
                                                            time_periods=time_periods, wealth=wealth)

results = make_agent_graphs(rewards_test_best, rewards_test_rand, rewards_test,
                  utilities_test_best, utilities_test_rand, utilities_test,
                         wealth_test_rand, wealth_test_best, wealth_test)

In [None]:
np.argmax(Q[0:40,:],1)

In [None]:
Q1 = np.load('../../TD/q_tables/models/Noisy2epi3000000er8kappa1.0.npy')

In [None]:
np.argmax(Q1[0:40,:],1)