In [125]:
import pandas as pd, numpy as np, random

In [126]:
def build_q_dict(states_df):
    '''Build the nested Q-value lookup table used by the Q-learner.

    The table is keyed as q_dict[position][state][action], where position is
    'long', 'short' or None (no open position), state is a tuple holding one
    value per column of states_df, and action is 'buy', 'sell' or None (hold).
    Every (position, state, action) entry starts at a small random value drawn
    from [0, 1e-4). The extra q_dict[None][None][None] entry starts at 0.

    states_df: pandas dataframe whose columns are the indicator series; every
    combination of the distinct values seen in each column becomes a state.

    Returns the populated dictionary.'''
    import itertools, random

    def small_random():
        # One fresh draw per Q entry, scaled down to be close to zero
        return random.random()/10000

    # Actions available from each position, listed in the order their
    # initial values are drawn:
    #   long  -> may sell or hold
    #   short -> may buy (to cover) or hold
    #   None  -> may buy, sell or do nothing
    actions_by_position = (
        ('long', ('sell', None)),
        ('short', ('buy', None)),
        (None, ('buy', 'sell', None)),
    )

    # Distinct values each indicator column takes, one list per column
    values_per_column = []
    for column in states_df.columns:
        values_per_column.append(list(states_df[column].unique()))

    # Cartesian product of those lists: every state the learner could see
    all_states = list(itertools.product(*values_per_column))

    # Outer layer tracks the open position; the None branch also carries a
    # (None state, None action) entry initialised to 0
    q_dict = {'long': {}, 'short': {}, None: {None: {None: 0}}}

    for position, actions in actions_by_position:
        table = q_dict[position]
        for state in all_states:
            table[state] = {action: small_random() for action in actions}

    return q_dict
        

In [127]:
class QLearner():
    '''Creates a Q-learning agent to use on stock market data.

    The agent walks through a date-indexed table of integer indicator states
    and a matching price series. Each step it collects the reward earned by
    the position it carried into the day, picks an action (epsilon-greedy
    while training, greedy while testing), updates the Q value of the
    previous (position, state, action) and moves to its new position.

    Note that Q values keep being updated while testing as well as training.'''

    # Actions the learner may explore from each position (None = no position;
    # a None action means hold / do nothing).
    _EXPLORE_ACTIONS = {
        None: ['buy', 'sell', None],
        'long': ['sell', None],
        'short': ['buy', None],
    }

    def __init__(self, states_df, price_series, date_col='Date', train_fraction=.8):
        '''states_df: a pandas dataframe where each column is a time series of states to be monitored
        by the Q-learner. Each state should be an integer value. The df must be indexed by date.

        price_series: A pandas time series of prices. Must be a pandas series indexed by date and
        correspond to dates in the states_df

        date_col: currently unused; kept so existing callers keep working.

        train_fraction: fraction of the rows (from the start) used for training; the
        remainder is held out for testing.'''

        assert len(states_df) == len(price_series), "Length of price_series and states_df are not equal."

        self.states_df = states_df
        self.price_series = price_series
        # Forward the caller's fraction (it used to be ignored in favour of
        # the default). The split also sets max_train_time / max_test_time.
        self.states_train, self.states_test, self.prices_train, self.prices_test = \
            self.train_test_split(train_fraction)
        # No state has been observed yet; update() fills these in. Starting
        # from None instead of a hard-coded tuple keeps the learner usable
        # with any number of indicator columns.
        self.curr_state = None
        self.prev_state = None
        self.prev_action = None
        self.prev_price = None
        self.position = None
        self.prev_position = None
        self.prev_reward = 0
        self.cum_reward = 0
        self.alpha = 0.7     # learning rate
        self.gamma = 0.7     # discount factor
        self.epsilon = 0.2   # initial exploration rate (decays as 1/(t+1))
        self.q_dict = build_q_dict(states_df)
        self.history = {}

    def train_test_split(self, train_fraction=.8):
        '''Splits the states and prices chronologically into a training part (the first
        train_fraction of the rows) and a test part (all remaining rows).

        Stores the four pieces on the instance, refreshes max_train_time and
        max_test_time, and returns (states_train, states_test, prices_train, prices_test).

        Raises AssertionError if the two indexes are not identical.'''
        states_df = self.states_df
        price_series = self.price_series

        # Index.equals compares the labels element by element; comparing the
        # results of .all() only compared two truthiness values.
        assert states_df.index.equals(price_series.index), \
            "All dates in the states_df must match all dates in the price_series"

        end_train = int(len(states_df)*train_fraction)

        self.states_train = states_df.iloc[:end_train].copy()
        self.prices_train = price_series.iloc[:end_train].copy()
        # The test window starts right after the training window and runs to
        # the end, so no row is dropped at either boundary.
        self.states_test = states_df.iloc[end_train:].copy()
        self.prices_test = price_series.iloc[end_train:].copy()

        self.max_train_time = len(self.states_train)
        self.max_test_time = len(self.states_test)

        return self.states_train, self.states_test, self.prices_train, self.prices_test

    def print_attributes(self, print_q_dict=False):
        '''Prints the learner's bookkeeping attributes (for debugging). Set
        print_q_dict to True to also dump the whole Q table.'''
        print("The current state is {}".format(self.curr_state))
        print("The previous state is {}".format(self.prev_state))
        print("Today's action is {}".format(self.prev_action))
        print("Today's price is {}".format(self.prev_price))
        print("End of day position is {}".format(self.position))
        print("Today's reward is {}".format(self.prev_reward))
        print("Alpha is {}".format(self.alpha))
        print("Gamma is {}".format(self.gamma))
        print("Epsilon is {}".format(self.epsilon))

        if print_q_dict:
            print(self.q_dict)

    def update(self, t, train=True, verbose=True):
        '''Updates all the relevant attributes (q_dict, reward, cum_reward, states, position,
        action, prices) of the Q-learner as it progresses through a training or testing
        time series. Set 'train' to False to test the learned policy on new data. All actions
        are assumed to take place at the market closing price. Set verbose to False to
        silence the per-step debug messages.

        t: integer row position within the training (or test) set.'''

        if t == 0:
            # Start of an episode: forget everything carried over from the
            # previous pass, including the previous state, so that no Q value
            # is credited across episode boundaries.
            self.cum_reward = 0
            self.prev_reward = 0
            self.prev_action = None
            self.prev_price = None
            self.position = None
            self.prev_position = None
            self.prev_state = None

        # Pick the data set for this pass
        states = self.states_train if train else self.states_test
        prices = self.prices_train if train else self.prices_test
        date = states.index[t]
        price = prices.iloc[t]
        # The state is the tuple of indicator values at time t
        inputs = states.iloc[t]

        # Reward comes from the position carried into today and today's price,
        # so compute it before acting or touching any attribute.
        reward = self.calc_reward(t, train)

        self.curr_state = tuple(inputs.values)
        action_values = self.q_dict[self.position][self.curr_state]

        # Epsilon-greedy with an exploration rate that decays as 1/(t+1);
        # always greedy when testing (no random number is drawn then).
        if not train or random.random() > self.epsilon/float(t + 1):
            action = max(action_values, key=action_values.get)
        else:
            action = random.choice(self._EXPLORE_ACTIONS[self.position])

        # Q-learning update for the previous (position, state, action), using
        # the best Q *value* available from the current state. Skipped on the
        # first step of an episode, when there is no previous transition.
        if self.prev_state is not None:
            prev_values = self.q_dict[self.prev_position][self.prev_state]
            q_old = prev_values[self.prev_action]
            best_next = max(action_values.values())
            prev_values[self.prev_action] = q_old + self.alpha * (
                reward + self.gamma * best_next - q_old)

        # Remember the position held going into the action, then apply it
        self.prev_position = self.position
        self.position = self.update_position(action)

        self.cum_reward += reward
        self.history[date] = {'action':action,'open_position':self.prev_position,
                              'close_position':self.position,'period_return':reward,
                              'cum_return':self.cum_reward}
        self.prev_reward = reward
        self.prev_action = action
        self.prev_state = self.curr_state
        self.prev_price = price

        if verbose:
            print("QLearner.update(): inputs = {}, open_position = {}, reward = {},\n "
                  "              action = {}, close_position = {}, cumulative reward = {}".format(
                      inputs.values, self.prev_position, reward, action, self.position,
                      self.cum_reward))
            self.print_attributes()

    def calc_reward(self, t, train):
        '''Method to calculate the reward earned by the Q-learner based on the position
        from the end of the previous day and today's market action.

        Returns the price change since the previous step when long, its negative when
        short, and 0 when flat or when there is no previous price.'''

        prices = self.prices_train if train else self.prices_test
        price_today = prices.iloc[t]

        if self.prev_price is None:
            # if there is no previous price, there can be no reward
            return 0

        diff = price_today - self.prev_price
        if self.position == 'long':
            return diff
        if self.position == 'short':
            return -diff
        return 0

    def update_position(self, action):
        '''Applies action to the current position, stores the result in self.position
        and returns it. Selling a long or buying back a short closes the position;
        from no position, buy opens a long and sell opens a short; anything else
        leaves the position unchanged.'''
        if self.position == 'long':
            if action == 'sell':
                self.position = None
        elif self.position == 'short':
            if action == 'buy':
                self.position = None
        else:
            if action == 'sell':
                self.position = 'short'
            elif action == 'buy':
                self.position = 'long'
        return self.position

    @staticmethod
    def _buy_and_hold_return(prices):
        '''Price change from the first to the last row of prices (0 if empty).'''
        if len(prices) == 0:
            return 0
        return prices.iloc[-1] - prices.iloc[0]

    @staticmethod
    def _mean_over_std(values):
        '''Mean divided by sample standard deviation, or NaN when that is undefined
        (fewer than two observations, or zero spread).'''
        values = values.dropna()
        if len(values) < 2:
            return float('nan')
        spread = values.std()
        if spread == 0:
            return float('nan')
        return values.mean() / spread

    def train(self, n_trials=1, verbose=True):
        '''Runs n_trials passes over the training set, printing after each pass the
        strategy's return in excess of buy and hold over the same window. Set verbose
        to False to silence the per-step messages.'''
        for n in range(n_trials):
            print("Training trial number {}...".format(n))
            for t in range(self.max_train_time):
                self.update(t, train=True, verbose=verbose)
            tot_hold = self._buy_and_hold_return(self.prices_train)
            excess_return = self.cum_reward - tot_hold
            print("The excess return is {}".format(excess_return))

    def test(self, n_trials=1, verbose=True):
        '''Method used to test the trained Q-learner on new data. Buy and hold is
        measured over the test window itself. With verbose set, per-step messages
        are printed and the Sharpe ratios over the test dates are reported.'''
        for n in range(n_trials):
            print("Testing . . .")
            for t in range(self.max_test_time):
                self.update(t, train=False, verbose=verbose)
            if len(self.prices_test) > 0:
                print("End value of index: {}".format(self.prices_test.iloc[-1]))
                print("Beginning value of index: {}".format(self.prices_test.iloc[0]))
            tot_hold = self._buy_and_hold_return(self.prices_test)
            print("The return from buy and hold is {}".format(tot_hold))
            print("The return from the trading strategy is {}".format(self.cum_reward))
            excess_return = self.cum_reward - tot_hold
            print("The excess return is {}".format(excess_return))
        if verbose:
            shrp_mod, shrp_mkt = self.sharpe_ratio(dates=self.states_test.index)
            print("The Sharpe ratio of the trading strategy is {}".format(shrp_mod))
            print("The Sharpe ratio of the market is {}".format(shrp_mkt))

    def sharpe_ratio(self, dates=None):
        '''Method to calculate Sharpe Ratio of both the market (aka self.price_series)
        and the trading system. The risk free rate is assumed to be 0.

        Both ratios are mean / standard deviation of per-period price changes: the
        strategy's recorded period returns and the market's day-over-day change on the
        same dates. Pass dates to restrict the calculation to those dates; by default
        every date recorded in self.history is used.

        Returns (shrp_mod, shrp_mkt); a ratio is NaN when it cannot be computed.'''
        period_returns = pd.Series(
            dict((date, record['period_return']) for date, record in self.history.items()),
            dtype=float)
        if dates is not None:
            period_returns = period_returns.reindex(dates)

        # Market moves on exactly the dates the strategy was evaluated on
        market_changes = self.price_series.diff().reindex(period_returns.index)

        shrp_mod = self._mean_over_std(period_returns)
        shrp_mkt = self._mean_over_std(market_changes)
        return shrp_mod, shrp_mkt

In [128]:
# Load the market indicator table from the CSV file.
df = pd.read_csv('market_direction_int.csv')
# NOTE(review): the converted Timestamps are not assigned back to the frame,
# so this line leaves df['Date'] unchanged.
df['Date'].apply(pd.Timestamp)
# Preview the first rows.
df.head()

Unnamed: 0,Date,BullBearSpread,OneTenDiff,Hi-Lo,Adv-Dec,VIX,PE_Ratio,OthMktsMinusMA,SP,SP_Minus_MA
0,2006-08-02,0,0,0,0,0,0,1,1277.410034,0
1,2006-08-03,0,0,0,1,0,0,1,1280.27002,0
2,2006-08-04,0,0,0,1,0,0,1,1279.359985,0
3,2006-08-07,0,0,0,0,1,0,1,1275.77002,0
4,2006-08-08,0,0,0,0,1,0,1,1271.47998,0


In [129]:
# Index by date on a new frame instead of mutating df in place: the cell can
# then be re-run without a KeyError on the already-consumed 'Date' column,
# and the frame previewed above stays as displayed.
indexed_df = df.set_index('Date')

# Indicator columns that together form the learner's state tuple.
STATE_COLUMNS = ['BullBearSpread', 'OneTenDiff', 'Hi-Lo', 'Adv-Dec', 'VIX',
                 'PE_Ratio', 'OthMktsMinusMA', 'SP_Minus_MA']
states_df = indexed_df[STATE_COLUMNS]

# Price series used to compute rewards (the 'SP' column).
price_series = indexed_df['SP']

In [130]:
# Seed the random module before the Q table is initialised and before
# epsilon-greedy exploration starts, so the run is reproducible.
SEED = 42
random.seed(SEED)

agent = QLearner(states_df, price_series)

agent.train(n_trials = 1)
agent.test()

Training trial number 0...
***The previous action was: None
QLearner.update(): inputs = [0 0 0 0 0 0 1 0], open_position = None, reward = 0,
               action = None, close_position = None, cumulative reward = 0
The current state is (0, 0, 0, 0, 0, 0, 1, 0)
The previous state is (0, 0, 0, 0, 0, 0, 1, 0)
Today's action is None
Today's price is 1277.410034
End of day position is None
Today's reward is 0
Alpha is 0.7
Gamma is 0.7
Epsilon is 0.2
***The previous action was: None
QLearner.update(): inputs = [0 0 0 1 0 0 1 0], open_position = None, reward = 0,
               action = sell, close_position = short, cumulative reward = 0
The current state is (0, 0, 0, 1, 0, 0, 1, 0)
The previous state is (0, 0, 0, 1, 0, 0, 1, 0)
Today's action is sell
Today's price is 1280.27002
End of day position is short
Today's reward is 0
Alpha is 0.7
Gamma is 0.7
Epsilon is 0.2
***The previous action was: sell
QLearner.update(): inputs = [0 0 0 1 0 0 1 0], open_position = short, reward = 0.910035,
    

In [117]:
# Debug check: look at the q_dict[None][None] entry that build_q_dict
# initialises to {None: 0}.
# NOTE(review): this cell's execution count is out of sequence with the cells
# above, so its output may not match a fresh top-to-bottom run.
agent.q_dict[None][None]#[self.prev_action]

{None: 0}

In [118]:
# Debug check: the action stored by the most recent update() call.
agent.prev_action

'sell'

In [99]:
# Debug check: the state tuple stored by the most recent update() call.
# NOTE(review): this cell's execution count is out of sequence with the cells
# above, so its output may not match a fresh top-to-bottom run.
agent.prev_state

(0, 0, 0, 1, 0, 0, 1, 0)