# Reinforcement Q-learning simple NN as function approximation

- portfolio grid of size 10 (0 to 1)
- 1 neural layer (6 state inputs mapped directly to 10 tanh outputs, one per grid point; no separate hidden layer)
- Improved by Dropout 

In [1]:
# Initialization
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the full data set from CSV. The training cell below reads the columns
# 'Date', 'r', 'xs', 'xb', 'snom', 'spe' and 'sspr' from this frame.
# NOTE(review): an earlier comment described this file as "(R,X_s,X_b) without
# predictors"; the extra columns used below suggest that was stale -- confirm.
data = pd.read_csv('data_ext.csv')

# Initialization of the Tensorflow placeholders and the network

In [3]:
# Build the Q-network graph (TensorFlow 1.x API) and its training op.
# The network is a single dense layer: a 1x6 state row is mapped to 10 tanh
# outputs, one Q-value per point of the portfolio-weight grid `A` defined below.
tf.reset_default_graph()
NN_input = tf.placeholder(shape=[1,6],dtype=tf.float32)
NN_weights = tf.Variable(tf.random_uniform([6,10],0,0.01))
b = tf.Variable(np.zeros((1,10)), dtype=tf.float32)
Q_FA = tf.tanh(tf.matmul(NN_input,NN_weights) + b)  # tanh bounds every Q-value to (-1, 1)

# Dropout on the Q-values, used only inside the training loss.
# In TF 1.x the second positional argument of tf.nn.dropout is keep_prob, so
# only ~20% of the outputs survive (each scaled by 1/0.2).
# NOTE(review): if a 20% *drop* rate was intended, this should be 0.8 -- confirm.
Q_dropout = tf.nn.dropout(Q_FA,0.2)

# Greedy action = index of the largest Q-value.
# Fix: take the argmax over the deterministic Q_FA rather than Q_dropout.
# Dropout is always active in this graph, so an argmax over Q_dropout picked
# among whichever outputs happened to survive the random mask, making both
# the training-time greedy action and the evaluated portfolio weight noisy.
A_Max = tf.argmax(Q_FA,1)

# Squared-error loss between the target Q-values (fed through Q_Next) and the
# dropout-regularised network output.
Q_Next = tf.placeholder(shape=[1,10],dtype=tf.float32)
loss = tf.reduce_sum(tf.square(Q_Next - Q_dropout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)

# Discrete action grid: A[k] is the portfolio weight in stocks for output
# neuron k (0-based); (1 - A[k]) is the weight in bonds.
A = np.linspace(0,1,10)

# Training of the NN function approximation

In [4]:
# Walk-forward training and evaluation of the Q-network.
#
# For every start row i the network is re-initialised. Then, for each of the
# periods-1 steps k, it is trained on the expanding window mdata[0:i+k] and
# asked for the greedy portfolio weight in the current state (row i+k). The
# resulting weights are scored on the returns of rows i+1 .. i+periods-1 and
# the running results are written to an Excel file after every start row.
init = tf.global_variables_initializer()
gamma = 1            # discount factor applied to the bootstrapped max Q-value
epsilon = 0.1        # exploration probability (overwritten per start row below)
risk_aversion = 5    # exponent parameter of the power-utility score
jList = []
TWlistTrain = []
TWlist = []
Index = []
MWeights = []
Turnover = []
RU = []
# data parsing
dates = data['Date']
state_columns = ['r','xs','xb','snom','spe','sspr']
mdata = data[state_columns]
mdata.index = pd.DatetimeIndex(dates)
# Positions of the stock / bond return columns inside a state row.
xs_col = state_columns.index('xs')
xb_col = state_columns.index('xb')
n = 707
periods = 60
epochs = 1     # kept low: with more epochs the same observations are replayed several times

with tf.Session() as sess:
    for i in range(331,int(n-periods-1)):
        OptimalWeights = np.zeros(periods-1)
        currentK = 0
        sess.run(init) # initialize the Neural Network again
        print(i)
        if i > 100:
            # Decrease the share of random actions for later start rows
            # (more exploitation, less exploration). The value depends only
            # on i, so it is computed once per start row instead of after
            # every training step.
            epsilon = 1./((i/50) + 10)

        while currentK < periods - 1:
            NN_data = mdata[0:i+currentK]  # Expanding window
            indexes = np.asarray(range(len(NN_data)))
            np.random.shuffle(indexes)

            for _ in range(epochs):
                # Train the Q-network on every transition j -> j+1 of the window
                for j in indexes:
                    s = NN_data.iloc[j,:].values.reshape(1,6)
                    # Epsilon-greedy choice of the action *index*.
                    # tf.argmax is 0-based, so the index maps straight onto A
                    # (the previous A[a_int-1] wrapped index 0 round to A[-1]).
                    a_int,allQ = sess.run([A_Max,Q_FA],feed_dict={NN_input:s})
                    a_idx = int(a_int[0])
                    if np.random.rand() < epsilon:
                        # Draw a random index, not a random weight, so the
                        # Q-value updated below belongs to the action taken.
                        a_idx = random.randrange(len(A))
                    a = A[a_idx]

                    # Next state and reward. The reward is the one-step
                    # portfolio return, built from the same 'xs' / 'xb'
                    # columns that the terminal-wealth evaluation below uses
                    # (previously columns 0 and 1, i.e. 'r' and 'xs').
                    s1 = mdata.iloc[j+1,:].values.reshape(1,6)
                    r = a*s1[0][xs_col] + (1-a)*s1[0][xb_col]
                    Q = sess.run(Q_FA,feed_dict={NN_input:s1})

                    # Target: current Q-values with the chosen action's entry
                    # replaced by reward + discounted max next-state Q-value.
                    Q1 = np.max(Q)
                    targetQ = allQ
                    targetQ[0,a_idx] = r + gamma*Q1

                    # One gradient step towards the target
                    sess.run(updateModel,feed_dict={NN_input:s,Q_Next:targetQ})

            # Greedy weight for the current state, row i+currentK: the first
            # row outside the training window. Its weight is applied to the
            # return of row i+currentK+1 in the evaluation below. (Previously
            # row i+periods was used for every step, i.e. one fixed future row.)
            s = mdata.iloc[i+currentK,:].values.reshape(1,6)
            a_int = sess.run(A_Max,feed_dict={NN_input:s})
            OptimalWeights[currentK] = A[int(a_int[0])]
            currentK += 1

        firstdiff = OptimalWeights[1:] - OptimalWeights[:-1]
        # For insight purposes
        MWeights.append(np.mean(OptimalWeights))
        # exp of the summed weighted 'xs'/'xb' values over rows i+1 .. i+currentK
        TerminalWealth = np.exp(sum(OptimalWeights*mdata[i+1:i+currentK+1]['xs'] + (1-OptimalWeights)*mdata[i+1:i+currentK+1]['xb']))
        TWlist.append(TerminalWealth)
        Index.append(i)
        # Turnover
        # NOTE(review): the bond leg uses (1-firstdiff) rather than the change
        # in the bond weight (-firstdiff) -- formula kept as is, please confirm.
        Turnover.append(sum(abs(firstdiff*np.exp(mdata[i+1:i+currentK]['xs'])) + abs((1-firstdiff)*np.exp(mdata[i+1:i+currentK]['xb']))))
        # Realized Utility: power utility of terminal wealth
        RU.append((1/(1-risk_aversion))*pow(TerminalWealth,(1-risk_aversion)))

        # Rewrite the full results file after every start row so partial
        # results survive an interrupted run.
        print('Writing away results')
        df = pd.DataFrame({'index date':Index,'TW':TWlist, 'Mean Weights Xs':MWeights,'Turnover':Turnover, 'Realized Utility':RU})
        df.to_excel('Results_NN_g10_ExtraData_e1_3.xlsx', sheet_name='sheet1', index=False)
print('Done!')
        

331
Writing away results
332
Writing away results
333
Writing away results
334
Writing away results
335
Writing away results
336
Writing away results
337
Writing away results
338
Writing away results
339
Writing away results
340
Writing away results
341
Writing away results
342
Writing away results
343
Writing away results
344
Writing away results
345
Writing away results
346
Writing away results
347
Writing away results
348
Writing away results
349
Writing away results
350
Writing away results
351
Writing away results
352
Writing away results
353
Writing away results
354
Writing away results
355
Writing away results
356
Writing away results
357
Writing away results
358
Writing away results
359
Writing away results
360
Writing away results
361
Writing away results
362
Writing away results
363
Writing away results
364
Writing away results
365
Writing away results
366
Writing away results
367
Writing away results
368
Writing away results
369
Writing away results
370
Writing away results
