In [9]:
import pandas as pd, numpy as np, random

In [217]:
def build_q_dict(states_df):
    import itertools, random
    
    list_of_values = [list(states_df[col].unique()) for col in states_df.columns]
        
    state_list = list(itertools.product(*list_of_values))
    
    q_dict = {'long':{}, 'short':{}, None:{None:{None:0}}}
    
    for position in ['long','short',None]:
        for state_tuple in state_list:
            q_dict[position][state_tuple] = {'buy':random.random()/10000, 
                                           'sell':random.random()/10000, None:random.random()/10000}
    
    return q_dict
        

In [274]:
class QLearner():
    '''Creates a Q-learning agent to use on stock market data'''
    
    def __init__(self, states_df, price_series, date_col='Date'):
        '''states_df: a pandas dataframe where each column is a time series of states to be monitored 
        by the Q-learner. Each state should be an integer value. The df must be indexed by date. 
        
        price_series: A time series of prices. Must be a pandas series indexed by date and correspond to dates
        in the states_df'''
        
        assert len(states_df) == len(price_series), "Length of price_series and states_df are not equal."        
        
        self.states_df = states_df
        self.price_series = price_series
        # Instantiate with dummy states for parsimony of q_dict
        self.curr_state = (0, 0, 0, 0, 0, 0, 1, 0)
        self.prev_state = (0, 0, 0, 0, 0, 0, 1, 0)
        self.prev_action = None
        self.prev_price = None
        self.position = None
        self.prev_position = None
        self.prev_reward = 0
        self.cum_reward = 0
        self.alpha = 0.7
        self.gamma = 0.7
        self.epsilon = 0.1
        self.max_time = len(states_df)
        self.q_dict = build_q_dict(states_df)        
                    
    def print_attributes(self, print_q_dict=False):
        print "The current state is {}".format(self.curr_state)
        print "The previous state is {}".format(self.prev_state)
        print "The previous action is {}".format(self.prev_action)
        print "The previous price is {}".format(self.prev_price)
        print "The the current position is {}".format(self.position)
        print "The previous reward is {}".format(self.prev_reward)
        print "Alpha is {}".format(self.alpha)
        print "Gamma is {}".format(self.gamma)
        print "Epsilon is {}".format(self.epsilon)
        
        if print_q_dict:
            print self.q_dict
        
    def update(self, t):
        if t==0:
            self.cum_reward = 0
            
        inputs = self.states_df.iloc[t]
        
        self.curr_state = tuple(inputs.values)
        
        if random.random() > self.epsilon/(t + 1):
            action = max(self.q_dict[self.position][self.curr_state], 
                         key=self.q_dict[self.position][self.curr_state].get)
        else:               
            action = random.choice(['buy','sell',None])
        
        #print "Before action, the position is {}".format(self.position)
        
        self.position = self.update_position(action)
        #print "After action={}, the position is {}".format(action, self.position)
        
        reward = self.calc_reward(t)
        
        Qsa = self.q_dict[self.prev_position][self.prev_state][self.prev_action]
        Qs_prime_a_prime = self.q_dict[self.position][self.curr_state][max(self.q_dict[self.position][self.curr_state])]
        
        Qsa += self.alpha * (self.prev_reward + self.gamma * Qs_prime_a_prime -  Qsa)
        self.q_dict[self.position][self.prev_state][self.prev_action] = Qsa        
        
        self.cum_reward += reward
        self.prev_position = self.position
        self.prev_reward = reward
        self.prev_action = action
        self.prev_state = self.curr_state
        self.prev_price = self.price_series.iloc[t]
        
        #print "QLearner.update(): inputs = {}, position = {}, action = {}, \n \
        #        reward (from prev action) = {}, cumulative reward = {}".format(
        #        inputs.values, self.prev_position, action, reward, self.cum_reward)
        
    def calc_reward(self, t): 
        prev_price = self.prev_price
        price_today = self.price_series.iloc[t]
        
        if prev_price != None:
            diff = price_today - prev_price
            if self.position == 'long':
                reward = diff
            elif self.position == 'short':
                reward = -diff
            else:
                reward = 0
        else:
            reward = 0        
        return reward
        
    def update_position(self, action):
        curr_position = self.position
        if curr_position == 'long':
            if action == 'sell':
                self.position = None
                return self.position
            else:
                pass
        elif curr_position == 'short':
            if action == 'buy':
                self.position = None
                return self.position
            else:
                pass
        else:
            if action == 'sell':
                self.position = 'short'
                return self.position
            elif action == 'buy':
                self.position = 'long'
                return self.position
            else:
                pass
        
    def run_sim(self, n_trials=1):
        n = 0
        while n < n_trials:
            t = 0
            while t < self.max_time:
                self.update(t)        
                t += 1
            tot_hold = self.price_series[self.max_time - 1] - self.price_series[0]
            excess_return = self.cum_reward - tot_hold
            print "The excess return is {}".format(excess_return)   
            n += 1

In [175]:
df = pd.read_csv('market_direction_int.csv')
df['Date'].apply(pd.Timestamp)
df.head()

Unnamed: 0,Date,BullBearSpread,OneTenDiff,Hi-Lo,Adv-Dec,VIX,PE_Ratio,OthMktsMinusMA,SP,SP_Minus_MA
0,2006-08-02,0,0,0,0,0,0,1,1277.410034,0
1,2006-08-03,0,0,0,1,0,0,1,1280.27002,0
2,2006-08-04,0,0,0,1,0,0,1,1279.359985,0
3,2006-08-07,0,0,0,0,1,0,1,1275.77002,0
4,2006-08-08,0,0,0,0,1,0,1,1271.47998,0


In [176]:
df.set_index('Date',inplace=True)
states_df = df[['BullBearSpread','OneTenDiff','Hi-Lo','Adv-Dec','VIX','PE_Ratio','OthMktsMinusMA','SP_Minus_MA']]
price_series = df['SP']

In [275]:
agent = QLearner(states_df, price_series)
#agent.print_attributes()
agent.run_sim(n_trials = 100)

The excess return is -366.688118
The excess return is 852.5603
The excess return is 1582.258424
The excess return is 2325.120182
The excess return is 997.05011
The excess return is 3024.361032
The excess return is 2320.931951
The excess return is 2667.281068
The excess return is 2498.261419
The excess return is 2468.412298
The excess return is 2716.292428
The excess return is 2870.470768
The excess return is 3067.782481
The excess return is 2916.602618
The excess return is 2972.111577
The excess return is 3189.761605
The excess return is 2954.951602
The excess return is 2991.971504
The excess return is 3043.100655
The excess return is 3100.1009
The excess return is 3013.750672
The excess return is 3087.341133
The excess return is 3032.14081
The excess return is 3098.11127
The excess return is 3154.500679
The excess return is 3003.591253
The excess return is 3162.460999
The excess return is 2986.790828
The excess return is 3119.771301
The excess return is 3040.820797
The excess return i