In [3]:
import os

import gym
import numpy as np
import pandas as pd

In [50]:
class State:
    """Per-date portfolio weights plus the one-step reward computation.

    Holds the feature frame, the asset returns and a ``weight_buffer`` frame
    (same index/columns as ``features``) that stores the portfolio weights
    chosen on every date.
    """

    def __init__(self,features,number_of_assets,bars_count,commission,close_returns):
        """
        :param features: pandas.DataFrame of features indexed by date
        :param number_of_assets: number of assets in the portfolio
        :param bars_count: number of bars passed on each observation (stored, not used here)
        :param commission: proportional commission charged on the traded weight
        :param close_returns: pandas.DataFrame of returns sharing ``features``' index
        """
        assert features.index.equals(close_returns.index), \
            "features and close_returns must share the same index"

        self.features=features
        self.number_of_assets=number_of_assets
        self.bars_count=bars_count
        self.percent_commission=commission
        self.close_returns=close_returns

        # Build the buffer right away so that step() also works on an
        # instance on which reset() has not been called yet.
        self._initialize_weights_buffer()

    def reset(self):
        """
        resets the weights_buffer
        """
        self._initialize_weights_buffer()

    def _initialize_weights_buffer(self):
        #TODO: Should this be part of state or environment?
        """
        Fill the weights buffer with equal weights (1/number_of_assets) on every date.

        :return: None
        """
        # features*0 keeps the index/columns of the features frame.
        self.weight_buffer=self.features*0+1/self.number_of_assets

    @property
    def shape(self):
        """Shape of the encoded state; not implemented yet."""
        raise NotImplementedError("State.shape is not implemented yet")

    def _set_weights_on_date(self,weights,target_date):
        """Store ``weights`` in the weights buffer row labelled ``target_date``."""
        self.weight_buffer.loc[target_date]=weights

    def step(self, action, action_date):
        """
        Record the weights chosen on ``action_date`` and compute the reward.

        :param action: corresponds to portfolio weights np.array(n_assets,1)
        :param action_date: datetime.datetime
        :return: scalar reward = sum(returns * weights) - turnover * commission
        :raises KeyError: if ``action_date`` is not in the weights buffer index
        """
        # Locate the row of the action date; fail loudly instead of silently
        # falling back to row 0 when the date is unknown.
        date_mask=self.weight_buffer.index.isin([action_date])
        if not date_mask.any():
            raise KeyError(f"action_date {action_date!r} is not in the weights buffer index")
        action_date_index=int(date_mask.argmax())

        self._set_weights_on_date(weights=action,target_date=action_date)

        # Rows t-1 and t. On the very first row there is no previous
        # allocation, so the window has one row and the turnover is zero.
        window_start=max(action_date_index-1,0)
        weight_window=self.weight_buffer.iloc[window_start:action_date_index+1]

        # absolute change from the previous allocation, summed over assets
        turnover=weight_window.diff().dropna().abs().sum(axis=1).sum()

        # rebalance commission, kept as a positive cost and subtracted below
        commission_cost=float(turnover)*self.percent_commission

        # NOTE(review): this reads the returns row AT action_date although the
        # original variable was named "t_plus_one" -- confirm whether the row
        # after action_date was intended.
        period_returns=self.close_returns.iloc[action_date_index]
        one_period_mtm_reward=(period_returns*action).sum()

        return one_period_mtm_reward-commission_cost

    def encode(self,date):
        """
        convert current state to tensor

        Not implemented yet; currently returns None.
        """
        pass
        
        
        
        

In [51]:
"""
Meta-parameters of the environment.

    bars_count:
        the number of bars that we pass on each observation
    commission
    reward_function, one of:
        - cummulative_return_over_batch
        - defined_forecast_frequency
"""

meta_parameters = dict(bars_count=30)

class DeepTradingEnvironment(gym.Env):
    """Gym environment whose actions are portfolio weights over the assets.

    Reward bookkeeping is delegated to a ``State`` instance built in
    ``__init__``.
    """
    metadata={'render.modes':['human']}

    @classmethod
    def from_dirs_and_transform(cls,meta_parameters,data_dir="data_env",**kwargs):
        """
        Do transformations that shouldnt be part of the class

        Reads every file in ``data_dir`` as parquet, keeps its "close" column
        and uses the resulting frame both as features and as asset prices.

        :param meta_parameters: dict of meta parameters forwarded to the constructor
        :param data_dir: directory holding one parquet file per asset
        :param kwargs: extra keyword arguments forwarded to the constructor
        :return: a new environment instance
        """
        #optimally this should be only features
        # sorted() makes the column order independent of the file system.
        features={file:pd.read_parquet(os.path.join(data_dir,file))["close"]
                  for file in sorted(os.listdir(data_dir))}
        features=pd.DataFrame(features)

        assets_prices=features
        #transform features

        # cls (not the hard-coded class name) so subclasses build themselves.
        return cls(features,
                   assets_prices,
                   meta_parameters,**kwargs)

    def __init__(self,features,assets_prices,meta_parameters,state_class_name="State"):
        """
        features: pandas.DataFrame with features by time
        asset_prices=pandas.DataFrame with asset prices by time
        meta_parameters: dict; "bars_count" is required, "commission" is
            optional and defaults to 0.0
        state_class_name: currently unused
        """
        super().__init__()

        assert features.index.equals(assets_prices.index), \
            "features and assets_prices must share the same index"

        self.features=features
        self.number_of_features=len(self.features.columns)
        self.assets_prices=assets_prices
        self.number_of_assets=len(self.assets_prices.columns)
        # log returns between consecutive rows; the first row is NaN
        self.close_returns=np.log(self.assets_prices).diff()

        self.meta_parameters=meta_parameters

        #logic to create state
        # The commission comes from its own key; it used to be read from
        # "bars_count" by mistake.
        self._state=State(features=features,number_of_assets=self.number_of_assets,
                          bars_count=self.meta_parameters["bars_count"],
                          commission=self.meta_parameters.get("commission",0.0),
                          close_returns=self.close_returns)

        # create action and observation space members

        #action space is the portfolio weights at any time in our example it is bounded by [0,1]
        self.action_space=gym.spaces.Box(low=0,high=1,shape=(self.number_of_assets,))

        #features to be scaled normal scaler will bound them in -4,4
        self.observation_space=gym.spaces.Box(low=-4, high=4,shape=(self.number_of_features,))

    def reset(self):
        """
        resets the environment:
            -resets the buffer of weights in the environments

        No observation is returned yet.
        """
        self._state.reset()

    def step(self, action_portfolio_weights, action_date):
        """
        Apply the portfolio weights on ``action_date``.

        :param action_portfolio_weights: portfolio weights, one per asset
        :param action_date: date on which the weights are applied
        :return: tuple (obs, reward, done, info); ``done`` is True when
            ``action_date`` is the last date of the features index
        """
        action=action_portfolio_weights
        # State.step returns only the reward.
        reward=self._state.step(action,action_date)
        # The episode ends once the last available date has been used.
        done=bool(action_date==self.features.index[-1])
        obs=self._state.encode(action_date)
        info={"action":None,
             "date":action_date}

        return obs,reward,done, info

    def render(self, mode='human', close=False):
        """Rendering is not implemented; does nothing."""
        pass

    def close(self):
        """Nothing to release; does nothing."""
        pass
        
# Build the environment from the files in the default data directory.
env = DeepTradingEnvironment.from_dirs_and_transform(
    meta_parameters=meta_parameters,
)

One-dimensional continuous action

$$
\pi(a|s,\theta)=\frac{1}{\sigma(s,\theta)\sqrt{2\pi}}\exp\left(-\frac{(a-\mu(s,\theta))^2}{2\sigma(s,\theta)^2}\right)
$$

We can parametrize $\mu$ however we please. The simplest parametrization is:

$$
\mu(s,\theta)=\theta_{\mu}^Tx(s)
$$

$$
\sigma(s,\theta)=\exp(\theta_{\sigma}^Tx(s))
$$

Now 

$$
\theta=[\theta_{\mu},\theta_{\sigma}]
$$


Initialize the variance to be large in order to encourage exploration.


### Algorithm  REINFORCE


#### State

State contains historical features on the time window as well as the latest weights. 

#### Meta parameters

* in_window: number of previous observations that each state will include. For example, if the data is in 10-minute intervals and in_window=20, then each state covers a 200-minute time frame.
* forecast_window: time frame that the action waits before receiving its reward
* episode_length: the number of sampled states in each episode. On each update it doesn't make sense to run continuous time windows, especially because our states will be heavily correlated. We can also see this as the batch size if we use a neural network.




#### Pseudo-Code:

Loop forever (for each episode):

    (episode_index_sample) Take a random sample of size episode_length from all the dates in the training sample
    {We can test with non-continuous and continuous batches}
    Generate an episode S0,A0,R1,... (to generate A0 use policy sampling)
    Save actions to the weights buffer
    Loop for each step of the episode t=0,...,T-1:
        
        calculate the reward; if the batch is not continuous then G shouldn't be discounted
        
        
    
    
    

**Example**
in_window=10
forecast_window=10 minutes
episode_length=200

```
episode_index_sample=[11,45,...,100]
S0=data[11-10:11]
A0=[.1,.2,...,.7]
R=log_return from data[11:+10_minutes]
```



