In [None]:
import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt

# Constants
# Per-step discount factor: update_policy weights a reward k steps in the
# future by GAMMA**k when accumulating the return for each time step.
GAMMA = 0.9

class PolicyNetwork(nn.Module):
    """Two-layer softmax policy over a discrete action space.

    The network maps a state vector to a probability for each action and owns
    the Adam optimizer used to train it.

    Args:
        num_inputs: Number of features in one state vector.
        num_actions: Number of discrete actions.
        hidden_size: Width of the single hidden layer.
        learning_rate: Step size passed to Adam.
    """

    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super().__init__()

        self.num_actions = num_actions
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_actions)
        # Built after the layers so self.parameters() already contains them.
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, state):
        """Return action probabilities for a batch of states.

        Args:
            state: Float tensor of shape (batch, num_inputs).

        Returns:
            Tensor of shape (batch, num_actions); each row sums to 1.
        """
        hidden = F.relu(self.linear1(state))
        return F.softmax(self.linear2(hidden), dim=1)

    def get_action(self, state):
        """Sample one action for a single state.

        Args:
            state: Array-like holding exactly one observation. Any layout with
                num_inputs elements is accepted (flat vector, column vector,
                ...); it is flattened into a batch of one.

        Returns:
            (action, log_prob): the sampled action index (NumPy integer) and
            the log-probability of that action as a scalar tensor that is
            still attached to the autograd graph.
        """
        # reshape(1, -1) makes a (4,) vector and a (4, 1) column equivalent.
        state = torch.as_tensor(state, dtype=torch.float32).reshape(1, -1)
        probs = self(state)
        # squeeze(0) removes only the batch axis, so a one-action policy still
        # hands np.random.choice a 1-D probability vector.
        action_probs = probs.detach().cpu().numpy().squeeze(0)
        # The action is sampled from the distribution, not taken greedily.
        sampled_action = np.random.choice(self.num_actions, p=action_probs)
        log_prob = torch.log(probs[0, int(sampled_action)])
        return sampled_action, log_prob
    
def update_policy(policy_network, rewards, log_probs, gamma=None):
    """Run one REINFORCE update from a single finished episode.

    Args:
        policy_network: Object exposing an ``optimizer`` attribute whose
            parameters produced ``log_probs``.
        rewards: Sequence of per-step rewards, in time order.
        log_probs: Sequence of scalar tensors, the log-probability of the
            action taken at each step (same order as ``rewards``).
        gamma: Discount factor. Defaults to the module-level ``GAMMA``.

    Returns:
        None. The network parameters are updated in place. An empty episode
        is a no-op.
    """
    if gamma is None:
        gamma = GAMMA
    if len(rewards) == 0 or len(log_probs) == 0:
        # Nothing to learn from; torch.stack would raise on an empty list.
        return

    # Discounted return G_t = r_t + gamma * G_{t+1}, accumulated back to front
    # in a single pass instead of re-summing the tail for every t.
    discounted_rewards = []
    running_return = 0.0
    for reward in reversed(rewards):
        running_return = reward + gamma * running_return
        discounted_rewards.append(running_return)
    discounted_rewards.reverse()

    discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)
    normalized = discounted_rewards - discounted_rewards.mean()
    if normalized.numel() > 1:
        # The sample standard deviation of a single value is NaN, which would
        # turn every parameter into NaN after the optimizer step. A one-step
        # episode is therefore only centered (giving a zero update).
        normalized = normalized / (discounted_rewards.std() + 1e-9)

    policy_loss = torch.stack(
        [-log_prob * step_return for log_prob, step_return in zip(log_probs, normalized)]
    ).sum()

    policy_network.optimizer.zero_grad()
    policy_loss.backward()
    policy_network.optimizer.step()
    
def main(max_episode_num=5000, max_steps=10000, render=True, log_every=1):
    """Train a PolicyNetwork on CartPole-v0 with REINFORCE and plot the result.

    Args:
        max_episode_num: Number of episodes to run.
        max_steps: Upper bound on the number of steps in one episode.
        render: Call ``env.render()`` before every step when true.
        log_every: Print a progress line every this many episodes; a falsy
            value disables logging.

    The policy is updated once per episode. After training, the episode
    lengths and their trailing 10-episode mean are plotted.
    """
    env = gym.make('CartPole-v0')
    policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.n, 128)

    numsteps = []
    avg_numsteps = []
    all_rewards = []

    try:
        for episode in range(max_episode_num):
            state = env.reset()
            log_probs = []
            rewards = []

            for _ in range(max_steps):
                if render:
                    env.render()
                action, log_prob = policy_net.get_action(state)
                state, reward, done, _info = env.step(action)
                log_probs.append(log_prob)
                rewards.append(reward)
                if done:
                    break

            if not rewards:
                # max_steps <= 0: nothing was collected for this episode.
                continue

            # Runs whether the episode ended via `done` or by exhausting
            # max_steps, so a truncated trajectory still trains the policy
            # and shows up in the statistics.
            update_policy(policy_net, rewards, log_probs)

            # One reward is recorded per step taken, so this is the true
            # step count (the 0-based loop index would be one too small).
            episode_length = len(rewards)
            numsteps.append(episode_length)
            avg_numsteps.append(np.mean(numsteps[-10:]))
            all_rewards.append(np.sum(rewards))
            if log_every and episode % log_every == 0:
                sys.stdout.write("episode: {}, total reward: {}, average_reward: {}, length: {}\n".format(episode, np.round(np.sum(rewards), decimals = 3),  np.round(np.mean(all_rewards[-10:]), decimals = 3), episode_length))
    finally:
        # Release the environment (and any render window) even on error.
        env.close()

    plt.plot(numsteps, label='episode length')
    plt.plot(avg_numsteps, label='mean of last 10 episodes')
    plt.xlabel('Episode')
    plt.ylabel('Steps')
    plt.legend()
    plt.show()

In [1]:
import simple_continuous_buy_sell_spy

In [2]:
# Instantiate the environment class exported by simple_continuous_buy_sell_spy.
# NOTE(review): the meaning of the positional argument 1 is not visible in
# this notebook — confirm against the module.
env = simple_continuous_buy_sell_spy.simple_continuous_buy_sell_spy(1)

In [3]:
env

<simple_continuous_buy_sell_spy.simple_continuous_buy_sell_spy at 0x7fafa19d9970>

In [5]:
import gym
env_pole = gym.make('CartPole-v0')

In [6]:
env_pole

<TimeLimit<CartPoleEnv<CartPole-v0>>>

In [17]:
env_pole.action_space.n

2

In [15]:
len(env.index_feature_dataframe.columns)

4

In [18]:
env.reset()

array([[1.00110548],
       [1.00524768],
       [1.00828153],
       [0.        ]])

In [30]:
env_pole.reset().reshape((-1,1))

array([[-0.02496398],
       [ 0.00095806],
       [-0.02463566],
       [-0.0476596 ]])

In [31]:
env.reset()

array([[0.99692223],
       [0.99514243],
       [0.99542832],
       [0.        ]])

In [35]:
# Take one step with the continuous action 0.4.
# NOTE: as the output below shows, this env's step() returns a 2-tuple
# (observation, scalar) rather than the 4 values main() unpacks from a gym
# env; the scalar is presumably the reward — confirm.
env.step(0.4)

(array([[1.00448918],
        [1.00311354],
        [1.00244062],
        [0.44977362]]),
 0)

In [39]:
# Drop the length-1 axis so the column-vector observation becomes a flat array.
# NOTE: `np` comes from the `import numpy as np` cell that is listed after this
# one, so on a fresh kernel that import has to run first. Every step() call in
# these cells returned a different observation, so this is not the same state
# as the previous output.
np.squeeze(env.step(0.4)[0])

array([1.00423627, 1.00407877, 1.00377404, 0.40024724])

In [38]:
import numpy as np

In [40]:
env.step(0.4)[0]

array([[1.00204067],
       [1.00237704],
       [1.00227639],
       [0.34995062]])

In [1]:
from torch.distributions.kumaraswamy import Kumaraswamy

In [2]:
# Kumaraswamy distribution with both concentration parameters below 1
# (0.1 and 0.3), used here to see what its samples look like.
k = Kumaraswamy(0.1, 0.3)

In [23]:
k.sample()

tensor(1.0000)

In [2]:
from simple_continuous_buy_sell_spy_nn import SimpleContinuousBuySellSpy

In [2]:
import pandas as pd
import simple_continuous_buy_sell_spy_nn

# Rebinds `env` to the class from the *_nn module, constructed with no
# arguments; this replaces the instance created earlier in the notebook.
env = simple_continuous_buy_sell_spy_nn.SimpleContinuousBuySellSpy()

In [8]:
# Flatten the reset observation into a 1-D array of 4 values.
# `(4)` is just the integer 4, so this is the same as reshape(4).
env.reset().reshape((4))

array([1.00135594, 1.00554114, 1.00977159, 0.        ])

In [10]:


# Read the SPY price history and rename its columns in one chained expression.
# The market columns get lower-case names prefixed with "index_"; "Date" maps
# to itself and therefore keeps its name.
index_data = pd.read_csv("SPY.csv").rename(
    columns={
        "Date": "Date",
        "Open": "index_open",
        "High": 'index_high',
        'Low': 'index_low',
        'Close': 'index_close',
        'Adj Close': 'index_adj_close',
        'Volume': 'index_volume',
    }
)

In [14]:
# Build the feature table from scratch on every run. Creating the frame here
# (instead of relying on one left over in the kernel) lets the cell run on a
# fresh kernel, and re-running it no longer trims additional rows.
index_feature_dataframe = pd.DataFrame({'index_raw_price': index_data['index_adj_close']})

# One feature per span: the raw price divided by its exponentially weighted
# moving average over that span.
period_list = [5,10,15]
for period in period_list:
    ewm = index_feature_dataframe['index_raw_price'].ewm(span = period).mean()
    ratio = index_feature_dataframe['index_raw_price']/ewm
    index_feature_dataframe['ewm_'+str(period)] = ratio

# Discard the first max(period_list) rows, then renumber the rows from 0.
index_feature_dataframe = index_feature_dataframe.iloc[max(period_list):,:].reset_index(drop=True)

In [17]:
# Redundant: the previous cell already ends with the same
# reset_index(drop=True), so running this again leaves the frame unchanged.
index_feature_dataframe = index_feature_dataframe.reset_index(drop=True)

In [23]:
from random import randrange

# Pick a random start row that leaves at least 500 rows after it.
# randrange raises ValueError when the frame has 500 rows or fewer.
# NOTE(review): 500 is presumably the longest episode the env will play —
# confirm and move it to a named constant.
current_time_index = randrange(0,index_feature_dataframe.shape[0]-500)

In [25]:
# Row at the sampled index as a NumPy array, skipping the first column
# (presumably index_raw_price) so only the three ratio features remain.
observation = index_feature_dataframe.iloc[current_time_index][1:].to_numpy()

In [26]:
observation

array([1.01195113, 1.0093477 , 1.00689748])

In [28]:
observation.reshape((-1,1))

array([[1.01195113],
       [1.0093477 ],
       [1.00689748]])

In [85]:
import matplotlib.pyplot as plt
import simple_continuous_buy_sell_spy_nn
import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
from torch.distributions.kumaraswamy import Kumaraswamy

class PolicyNetwork(nn.Module):
    """Policy that samples a continuous action from a Kumaraswamy distribution.

    The network maps a state vector to ``num_actions`` strictly positive
    values. ``get_action`` uses the first two of them as the concentration
    parameters of the distribution, so ``num_actions`` must be at least 2.

    Args:
        num_inputs: Number of features in one state vector.
        num_actions: Number of network outputs.
        hidden_size: Width of the single hidden layer.
        learning_rate: Step size passed to Adam.
    """

    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super().__init__()

        self.num_actions = num_actions
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_actions)
        # Built after the layers so self.parameters() already contains them.
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, state):
        """Return strictly positive outputs for the given states.

        Args:
            state: Float tensor whose last dimension is num_inputs.

        Returns:
            Tensor with the same leading dimensions and num_actions as the
            last dimension.
        """
        hidden = F.relu(self.linear1(state))
        # ReLU can return exactly 0; the small offset keeps every output
        # strictly positive so it can serve as a concentration parameter.
        return F.relu(self.linear2(hidden)) + 0.0000001

    def get_action(self, state):
        """Sample one action for a single state.

        Args:
            state: Array-like holding exactly one observation. Any layout
                with num_inputs elements is accepted, including the (4, 1)
                column vector returned by the environment's reset().

        Returns:
            Scalar tensor drawn from
            Kumaraswamy(concentration1=out[0], concentration0=out[1]).
        """
        # Normalise every accepted layout to a batch of row vectors; the
        # original indexing only worked for a 3-D input.
        state = torch.as_tensor(state, dtype=torch.float32)
        state = state.reshape(-1, self.linear1.in_features)
        params = self(state)
        k = Kumaraswamy(concentration1=params[0, 0], concentration0=params[0, 1])
        return k.sample()

In [86]:
# 4 input features, 2 outputs (get_action uses them as the two Kumaraswamy
# concentration parameters) and 128 hidden units.
policy_net = PolicyNetwork(4, 2, 128)

In [87]:
state = env.reset()

In [88]:
state

array([[0.99898649],
       [0.99843106],
       [1.00051918],
       [0.        ]])

In [89]:
# unsqueeze(0) prepends an axis: the (4, 1) state becomes (1, 4, 1), so the
# last dimension is still 1 rather than the 4 features nn.Linear(4, ...) expects.
torch.from_numpy(state).float().unsqueeze(0)

tensor([[[0.9990],
         [0.9984],
         [1.0005],
         [0.0000]]])

In [91]:
# Without .float() the tensor keeps the array's float64 dtype (see the output
# below); get_action converts to float32 before calling the network.
torch.from_numpy(state)

tensor([[0.9990],
        [0.9984],
        [1.0005],
        [0.0000]], dtype=torch.float64)

In [99]:
# FIXME: np.zeros requires a shape argument, e.g. np.zeros((4, 1)); called
# without one it raises the TypeError shown below.
np.zeros()

TypeError: zeros() missing required argument 'shape' (pos 1)