# QSiFDD (Q-Learning Sparse incremental Feature Dependency Discovery)
All "helper" methods below are required to run Q-Learning and Sparse iFDD. At the very bottom is the "main" method for running experimental batches (i.e. 5 trials of Q-learning vs. Q-learning w/ Sparse iFDD)

In [42]:
import matplotlib.pyplot as plt
%matplotlib inline 

import gym
import numpy as np
import time
import tensorflow as tf
import itertools
from itertools import chain
from itertools import combinations

# Setting up the Swimmer-v1 environment
env = gym.make('Swimmer-v1')

# Hyperparameters and features
STATE_LEN = 8
ACTION_LEN = 2 
OBSERVATION_SIZE = STATE_LEN + ACTION_LEN
STATE_FEATURES = [-5, -4, -3, -2, -1, 0, -1, -2, -3, -4, -5]
ACTION_FEATURES = [-0.036, 0., 0.036]
POSSIBLE_ACTIONS = [-0.036, 0, 0.036]
FEATURE_MAP = []
for i in range(OBSERVATION_SIZE):
    if i < STATE_LEN: FEATURE_MAP.append(STATE_FEATURES)
    else: FEATURE_MAP.append(ACTION_FEATURES)
print('State Features:', STATE_FEATURES, '\nAction Features:', ACTION_FEATURES, '\nFeature Map:', FEATURE_MAP)
EPSILON = 0.01 # epsilon-greedy 
THRESHOLD = 0.9
GAMMA = 0.001 # probability of discovering new feature if exceed threshold
THETA = 0.0001 # probability of adding new feature to undiscovered set
DISCOUNT_FACTOR = 0.001 # Q-learning
LEARNING_RATE = 0.001 # weight TD update 
# ADDITIONAL_FEATURE_INDEX = len([item for sublist in FEATURE_MAP for item in sublist]) # number of current features 
# print('The first new feature is added at this index:', ADDITIONAL_FEATURE_INDEX)


# Output: random state and random action
def generateObservation():
    env.reset()
    for episode in range(1):
        randomEpisode = np.random.randint(100)
        for timeSteps in range(randomEpisode):
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
    testState = observation
    testAction = randomAction()
    print('Sample Observation:', testState, '\nSample Action:', testAction)
    return testState, testAction

# Output: random action
def randomAction():
    global ACTION_LEN
    randomAction = []
    for i in range(ACTION_LEN):
        randomIndex = np.random.randint(len(POSSIBLE_ACTIONS))
        randomAction.append(POSSIBLE_ACTIONS[randomIndex])
    return randomAction

# Output: random state
def randomState():
    global STATE_LEN
    randomState = []
    for i in range(STATE_LEN):
        randomIndex = np.random.randint(min(STATE_FEATURES), max(STATE_FEATURES))
        randomState.append(STATE_FEATURES[randomIndex])
    return randomState
    

# Output: randomly generated observation 
def generateRandomStateActionPair():
    state = randomState()
    action = randomAction()
    return state, action

# Input: state, action
# Output: states + action observation rounded to nearest whole number (discretization)
def observationProcessor(state, action):
    global STATE_FEATURES
    processedState = []
    for i in range(len(state)): 
        processedState.append(int(state[i]))
        if state[i] > max(STATE_FEATURES): processedState[i] = max(STATE_FEATURES)
        elif state[i] < min(STATE_FEATURES): processedState[i] = min(STATE_FEATURES)
    return processedState + action

# Input: processed observation
# Output: flattened basis function  and indexes 
def basisFunction(observation):
    global OBSERVATION_SIZE, STATE_FEATURES, ACTION_FEATURES
    activeFeature = []
    for i in range(OBSERVATION_SIZE):
        index = FEATURE_MAP[i].index(observation[i])
        if i < STATE_LEN:
            basisRow = np.zeros(len(STATE_FEATURES))
            basisRow[index]  = 1
            activeFeature.append(list(basisRow))
        else:
            basisRow = np.zeros(len(ACTION_FEATURES))
            basisRow[index] = 1
            activeFeature.append(list(basisRow))
    # Flatten active feature  
    activeFeature = [item for sublist in activeFeature for item in sublist]
    # Get all indexes that are 1
    activeIndex = [i for i, item in enumerate(activeFeature) if item == 1]
    return activeFeature, activeIndex


# Input: list of active indexes
# Output: powerset containing all subsets of active indexes as such: feature, relevance
def getPowerSet(activeIndexes):
    powerSet = []
    # Initialize relevance values  
    initialRelevance = 0
    # +1 to include the complete set as a set
    for i in range(len(activeIndexes) + 1):
        conjunctions = [[set(combo), initialRelevance] for combo in itertools.combinations(activeIndexes, i)]
        powerSet.append(conjunctions)
    # Flatten powerset into 1-dim list
    powerSet = [item for sublist in powerSet for item in sublist] 
    return powerSet

# Input: list of initial active indexes 
# Output: powerset containing 
def getInitialDiscoveredUndiscovered(initialActiveIndexes):
    # Generate initial discovered set
    discovered = dict()
    for i in range(len(initialActiveIndexes)):
        key = str(i)
        discovered.update({key : set([initialActiveIndexes[i]])})
    # Generate initial undiscovered set
    initialPowerSet = getPowerSet(initialActiveIndexes)
    # Undiscovered set does not include the size 1 initial features or the empty set, only conjunctions. 
    undiscovered =  [item for i, item in enumerate(initialPowerSet) if len(item[0]) > 1]    
    return discovered, undiscovered

# Input: set
# Output: list of subsets
def subset(mySet):
    listSubset = []
    for i in range(len(mySet) + 1):
        listSubset.append(set(itertools.combinations(mySet, i)))
    listSubset = [item for sublist in listSubset for item in sublist] 
    return listSubset


# Input: active indexes and current basis function 
# Output: temp basis function for checking if all features active 
def getBasisFromActiveIndex(activeIndex, basisFunction):
    tempBasis = np.zeros(len(basisFunction))
    for i in range(len(activeIndex)):
        tempBasis[activeIndex[i]] = 1.0
    return tempBasis

# Input: active indexes, current basis function, discovered
# Output: temp basis function with subsets = 0 and conjunction = 1
def activateConjunction(activeIndex, basisFunction, conjunctionKey):
    for i in range(len(activeIndex)):
        basisFunction[activeIndex[i]] = 0.0
    activeConjunctionIndex = int(conjunctionKey)
    basisFunction[activeConjunctionIndex] = 1.0
    return basisFunction 
 
# Input: discovered set, basis function
# Output: modified basis function 
def conjunctionBasisFunction(discoveredFeatures, basisFunction): 
    for key, feature in discoveredFeatures.items():
        if len(feature) > 1: 
            featureSubsets = subset(feature)
            for j in range(len(featureSubsets)):
                activeIndex = list(featureSubsets[j])
                tempBasis = getBasisFromActiveIndex(activeIndex, basisFunction)
                if np.array_equal(tempBasis, basisFunction):
                    basisFunction = activateConjunction(activeIndex, basisFunction, key)              
    return basisFunction

# Input: Q-values
# Output: action, action index 
def epsilonGreedy(qValues):
    global EPSILON, POSSIBLE_ACTIONS
    randomNumber = np.random.uniform(0, 1)
    if randomNumber < EPSILON:
        actionIndex = np.argmax(qValues)
        return POSSIBLE_ACTIONS[actionIndex], actionIndex
    else: 
        actionIndex = np.random.randint(len(qValues))
        return POSSIBLE_ACTIONS[actionIndex], actionIndex

# Input: Q-values
# Output: action vector 
def jointEpsilonGreedy(qValues):
    jointAction = []
    jointActionIndex = []
    for i in range(ACTION_LEN):
        action, actionIndex = epsilonGreedy(qValues)
        jointAction.append(action)
        jointActionIndex.append(actionIndex)
    return jointAction, jointActionIndex

# Sparse iFDD Algorithm
# Input: td error, threshold, discovered features,  undiscovered features, additional feature index
# Output: updated discovered and undiscovered sets
def discover(tdError, threshold, discoveredFeatures, undiscoveredFeatures, additionalFeatureIndex, currentBasisFunction, weights): 
    global GAMMA, THETA 
    activeFeatures = [i for i, value in enumerate(currentBasisFunction) if value == 1.0]
    # Get powerset of active features 
    current_PowerSet = getPowerSet(activeFeatures)
    # Discover any features 
    activeOldFeatures = [i for i, feature in enumerate(undiscoveredFeatures) if feature in current_PowerSet and feature != set([])]
    for i in range(len(activeOldFeatures)):
        feature = undiscoveredFeatures[i]
        feature[1] += tdError 
        # Discover the feature if exceed the threshold and delete from undiscoveredFeatures
        # Discover feature with probability GAMMA  
        randomNumber = np.random.uniform(0, 1)
        if randomNumber < GAMMA and feature[1] > threshold and feature != set([]):
            additionalFeatureIndex += 1
            key = str(additionalFeatureIndex)
            discoveredFeatures[key] = feature[0]
            # Replace with undiscoverable relevance
            # Update sizes of basis and weights
            currentBasisFunction.append(0.0)
            weights.append(np.random.uniform(low = -1, high = 1, size = 3))
            # print('Feature:', feature, 'TDerror:', tdError, 'Feature Index:', additionalFeatureIndex)
            undiscoveredFeatures[i][1] = -1e20
                                                                              
    # Add new features  
    newUndiscoveredFeatures = [i for i, feature in enumerate(current_PowerSet) 
                               if feature[0] not in discoveredFeatures.values() 
                               and feature not in undiscoveredFeatures
                               and feature != set([])]
    for i in range(len(newUndiscoveredFeatures)): 
        feature = current_PowerSet[i]
        randomNumber = np.random.uniform(0, 1)
        if randomNumber < THETA:
            undiscoveredFeatures.append(feature)           
    return discoveredFeatures, undiscoveredFeatures, additionalFeatureIndex  

# Input: weights, activeIndex, actionIndex, reward, oldQValue, newQValues, tdError
# Output: weights 
def updateWeights(weights, activeIndex, actionIndex, reward, oldQValue, newQValues):
    for i in range(len(activeIndex)): 
        tdError = reward + DISCOUNT_FACTOR * (max(newQValues) - oldQValue)  
        weights[activeIndex[i]][actionIndex] += LEARNING_RATE * tdError
    return weights, tdError

# Input: initial basis function
# Output: random weights 
def initializeWeights(basisFunction):
    weightVector = []
    for i in range(len(basisFunction)):
        weightVector.append(np.random.uniform(low = -1, high = 1, size = 3))
    return weightVector

[2018-01-06 18:25:14,328] Making new env: Swimmer-v1


State Features: [-5, -4, -3, -2, -1, 0, -1, -2, -3, -4, -5] 
Action Features: [-0.036, 0.0, 0.036] 
Feature Map: [[-5, -4, -3, -2, -1, 0, -1, -2, -3, -4, -5], [-5, -4, -3, -2, -1, 0, -1, -2, -3, -4, -5], [-5, -4, -3, -2, -1, 0, -1, -2, -3, -4, -5], [-5, -4, -3, -2, -1, 0, -1, -2, -3, -4, -5], [-5, -4, -3, -2, -1, 0, -1, -2, -3, -4, -5], [-5, -4, -3, -2, -1, 0, -1, -2, -3, -4, -5], [-5, -4, -3, -2, -1, 0, -1, -2, -3, -4, -5], [-5, -4, -3, -2, -1, 0, -1, -2, -3, -4, -5], [-0.036, 0.0, 0.036], [-0.036, 0.0, 0.036]]


In [35]:
# For running Q-Learning alone 
def QLearning(epsilon, episodeLen):
    start_time = time.time()
    TDErrorList1 = []
    RewardList1 = []
    totalReward1 = 0
    EPSILON = epsilon
    for episode in range(1):
        env.reset()
        # Start episode with random initial state and action   
        initialState, initialAction = generateRandomStateActionPair()
        # Process initial state and action
        initialObservation = observationProcessor(initialState, initialAction)
        # Get initial basis function and active indices 
        initialBasisFunction, initialActiveIndex = basisFunction(initialObservation)
        print('Initial Active Features:', initialActiveIndex)
        # Initialize weights 
        WEIGHT_VECTOR = initializeWeights(initialBasisFunction)
        print('Len Initial Weight Vector:', len(WEIGHT_VECTOR))
        # Divide into initial discovered and undiscovered feature sets
        DISCOVERED, UNDISCOVERED = getInitialDiscoveredUndiscovered(initialActiveIndex)
        print('Len Initial Discovered Set:', len(DISCOVERED), '\nLen Initial Undiscovered Set:', len(UNDISCOVERED))
        # Begin learning 
        oldObservation = initialObservation 
        # Multiply basis function and weights together
        oldQValues = np.dot(initialBasisFunction, WEIGHT_VECTOR)
        for timeSteps in range(episodeLen):
            # Choose an action using epsilon-greedy
            action, actionIndex = jointEpsilonGreedy(oldQValues)
            # Get old Q value
            oldQValue = 0.5 * (oldQValues[actionIndex[0]] + oldQValues[actionIndex[1]])
            # Take a step in the environment 
            state, reward, done, info = env.step(action)
            # Process observation
            currentObservation = observationProcessor(state, action)
            # Get basis function
            currentBasisFunction, currentActiveIndex = basisFunction(currentObservation)
            # Calculate new Q-values 
            currentQValues = np.dot(currentBasisFunction, WEIGHT_VECTOR)
            # Update weights
            WEIGHT_VECTOR, currentTDError = updateWeights(WEIGHT_VECTOR, 
                                          currentActiveIndex, 
                                          actionIndex, 
                                          reward, 
                                          oldQValue, 
                                          currentQValues)
            # Logging 
            # Outputs information every 100000 timesteps
            if timeSteps % 500000 == 0:
                current_time = time.time()
                print('Timestep:', timeSteps, 
                      'Time elapsed:', round(current_time - start_time, 2),
                      'Cumulative Reward:', round(totalReward1, 4))

            # Set current observation and Q-value to old
            EPSILON += 0.00000001
            totalReward1 += reward
            TDErrorList1.append(currentTDError)
            RewardList1.append(totalReward1)
            oldObservation = currentObservation
            oldQValues = currentQValues 
    end_time = time.time()
    print('Training Time:', round(end_time - start_time, 2))
    return TDErrorList1, RewardList1

# Running Q-learning with initial state and action
def QLearning2(epsilon, episodeLen, initState, initAction):
    start_time = time.time()
    TDErrorList1 = []
    RewardList1 = []
    totalReward1 = 0
    EPSILON = epsilon
    for episode in range(1):
        env.reset()
        # Process initial state and action
        initialObservation = observationProcessor(initState, initAction)
        # Get initial basis function and active indices 
        initialBasisFunction, initialActiveIndex = basisFunction(initialObservation)
        #print('Initial Active Features:', initialActiveIndex)
        # Initialize weights 
        WEIGHT_VECTOR = initializeWeights(initialBasisFunction)
        #print('Len Initial Weight Vector:', len(WEIGHT_VECTOR))
        # Divide into initial discovered and undiscovered feature sets
        DISCOVERED, UNDISCOVERED = getInitialDiscoveredUndiscovered(initialActiveIndex)
        print('Len Initial Discovered Set:', len(DISCOVERED), '\nLen Initial Undiscovered Set:', len(UNDISCOVERED))
        # Begin learning 
        oldObservation = initialObservation 
        # Multiply basis function and weights together
        oldQValues = np.dot(initialBasisFunction, WEIGHT_VECTOR)
        for timeSteps in range(episodeLen):
            # Choose an action using epsilon-greedy
            action, actionIndex = jointEpsilonGreedy(oldQValues)
            # Get old Q value
            oldQValue = 0.5 * (oldQValues[actionIndex[0]] + oldQValues[actionIndex[1]])
            # Take a step in the environment 
            state, reward, done, info = env.step(action)
            # Process observation
            currentObservation = observationProcessor(state, action)
            # Get basis function
            currentBasisFunction, currentActiveIndex = basisFunction(currentObservation)
            # Calculate new Q-values 
            currentQValues = np.dot(currentBasisFunction, WEIGHT_VECTOR)
            # Update weights
            WEIGHT_VECTOR, currentTDError = updateWeights(WEIGHT_VECTOR, 
                                          currentActiveIndex, 
                                          actionIndex, 
                                          reward, 
                                          oldQValue, 
                                          currentQValues)
            # Logging 
            # Outputs information every 100000 timesteps
            if timeSteps % 500000 == 0:
                current_time = time.time()
                print('Timestep:', timeSteps, 
                      'Time elapsed:', round(current_time - start_time, 2),
                      'Cumulative Reward:', round(totalReward1, 4))

            # Set current observation and Q-value to old
            EPSILON += 0.00000001
            totalReward1 += reward
            TDErrorList1.append(currentTDError)
            RewardList1.append(totalReward1)
            oldObservation = currentObservation
            oldQValues = currentQValues 
    end_time = time.time()
    print('Training Time:', round(end_time - start_time, 2))
    return TDErrorList1, RewardList1

In [34]:
# For running Q-Learning w/ Sparse iFDD 
def QLearningIFDD(epsilon, episodeLen, myDiscoverFrequency):
    start_time = time.time()
    TDErrorList2 = []
    RewardList2 = []
    DiscoverTimeList = []
    totalReward2 = 0
    currentTDError = 0
    EPSILON = epsilon
    discoverFrequency = myDiscoverFrequency
    global THRESHOLD
    for episode in range(1):
        env.reset()
        # Start episode with random initial state and action   
        initialState, initialAction = generateRandomStateActionPair()
        # Process initial state and action
        initialObservation = observationProcessor(initialState, initialAction)
        # Get initial basis function and active indices 
        initialBasisFunction, initialActiveIndex = basisFunction(initialObservation)
        print('Initial Active Features:', initialActiveIndex)
        # Initialize ADDITIONAL_FEATURE_INDEX 
        ADDITIONAL_FEATURE_INDEX = len(initialBasisFunction)
        # Initialize weights 
        WEIGHT_VECTOR = initializeWeights(initialBasisFunction)
        print('Len Initial Weight Vector:', len(WEIGHT_VECTOR))
        # Divide into initial discovered and undiscovered feature sets
        DISCOVERED, UNDISCOVERED = getInitialDiscoveredUndiscovered(initialActiveIndex)
        print('Len Initial Discovered Set:', len(DISCOVERED), '\nLen Initial Undiscovered Set:', len(UNDISCOVERED))
        # Begin learning 
        oldObservation = initialObservation 
        # Multiply basis function and weights together
        oldQValues = np.dot(initialBasisFunction, WEIGHT_VECTOR)
        currentBasisFunction = []
        for timeSteps in range(episodeLen):
            # Choose an action using epsilon-greedy
            action, actionIndex = jointEpsilonGreedy(oldQValues)
            # Get old Q value, which is the average of both Q-values
            oldQValue = 0.5 * (oldQValues[actionIndex[0]] + oldQValues[actionIndex[1]])
            # Take a step in the environment 
            state, reward, done, info = env.step(action)
            # Process observation
            currentObservation = observationProcessor(state, action)
            # Get basis function - unmodified basis function for first time step only
            unmodifiedBasisFunction, currentActiveIndex = basisFunction(currentObservation)
            if timeSteps == 0:
                currentBasisFunction = unmodifiedBasisFunction
            else:
                currentBasisFunction = currentBasisFunction

            # Get modified basis function by inactivating subsets of conjunctions
            currentBasisFunction = conjunctionBasisFunction(DISCOVERED, currentBasisFunction)
            currentActiveIndex = [i for i, item in enumerate(currentBasisFunction) if item == 1]

           # Calculate new Q-values 
            currentQValues = np.dot(currentBasisFunction, WEIGHT_VECTOR)
            # Update weights using off-policy Q-learning
            WEIGHT_VECTOR, currentTDError = updateWeights(WEIGHT_VECTOR, 
                                          currentActiveIndex, 
                                          actionIndex, 
                                          reward, 
                                          oldQValue, 
                                          currentQValues)

            # Discover new features
            PREVIOUS_DISCOVERED = DISCOVERED 
            if timeSteps % discoverFrequency == 0:
                DISCOVERED, UNDISCOVERED, ADDITIONAL_FEATURE_INDEX = discover(currentTDError,
                                                                            THRESHOLD,
                                                                            DISCOVERED,
                                                                            UNDISCOVERED,
                                                                            ADDITIONAL_FEATURE_INDEX,
                                                                            currentBasisFunction,
                                                                            WEIGHT_VECTOR) 
                if len(PREVIOUS_DISCOVERED) < len(DISCOVERED):
                    current_time = time.time()
                    DiscoverTimeList.append(round(current_time - start_time, 2))
                    discoverFrequency += discoverFrequency

            # Logging
            # Outputs information every 100000 timesteps
            if timeSteps % 500000 == 0:
                current_time = time.time()
                print('Timestep:', timeSteps, 
                      'Elapsed:', round(current_time - start_time, 2),
                      'DISCOVERED:', len(DISCOVERED),
                      'UNDISCOVERED:', len(UNDISCOVERED), 
                      'Basis:', len(currentBasisFunction),
                      'Weights:', len(WEIGHT_VECTOR),
                      'Cumul. Reward:', round(totalReward2, 4))

            # Set current observation and Q-value to old
            EPSILON += 0.00000001
            totalReward2 += reward
            TDErrorList2.append(currentTDError)
            RewardList2.append(totalReward2)
            oldObservation = currentObservation
            oldQValues = currentQValues
            
    end_time = time.time()
    print('Training Time:', round(end_time - start_time, 2))
    return TDErrorList2, RewardList2

# For running Q-Learning w/ Sparse iFDD 
def QLearningIFDD2(epsilon, episodeLen, myDiscoverFrequency, initState, initAction):
    start_time = time.time()
    TDErrorList2 = []
    RewardList2 = []
    DiscoverTimeList = []
    totalReward2 = 0
    currentTDError = 0
    EPSILON = epsilon
    discoverFrequency = myDiscoverFrequency
    global THRESHOLD
    for episode in range(1):
        env.reset()
        # Process initial state and action
        initialObservation = observationProcessor(initState, initAction)
        # Get initial basis function and active indices 
        initialBasisFunction, initialActiveIndex = basisFunction(initialObservation)
        #print('Initial Active Features:', initialActiveIndex)
        # Initialize ADDITIONAL_FEATURE_INDEX 
        ADDITIONAL_FEATURE_INDEX = len(initialBasisFunction)
        # Initialize weights 
        WEIGHT_VECTOR = initializeWeights(initialBasisFunction)
        #print('Len Initial Weight Vector:', len(WEIGHT_VECTOR))
        # Divide into initial discovered and undiscovered feature sets
        DISCOVERED, UNDISCOVERED = getInitialDiscoveredUndiscovered(initialActiveIndex)
        print('Len Initial Discovered Set:', len(DISCOVERED), '\nLen Initial Undiscovered Set:', len(UNDISCOVERED))
        # Begin learning 
        oldObservation = initialObservation 
        # Multiply basis function and weights together
        oldQValues = np.dot(initialBasisFunction, WEIGHT_VECTOR)
        currentBasisFunction = []
        for timeSteps in range(episodeLen):
            # Choose an action using epsilon-greedy
            action, actionIndex = jointEpsilonGreedy(oldQValues)
            # Get old Q value, which is the average of both Q-values
            oldQValue = 0.5 * (oldQValues[actionIndex[0]] + oldQValues[actionIndex[1]])
            # Take a step in the environment 
            state, reward, done, info = env.step(action)
            # Process observation
            currentObservation = observationProcessor(state, action)
            # Get basis function - unmodified basis function for first time step only
            unmodifiedBasisFunction, currentActiveIndex = basisFunction(currentObservation)
            if timeSteps == 0:
                currentBasisFunction = unmodifiedBasisFunction
            else:
                currentBasisFunction = currentBasisFunction

            # Get modified basis function by inactivating subsets of conjunctions
            currentBasisFunction = conjunctionBasisFunction(DISCOVERED, currentBasisFunction)
            currentActiveIndex = [i for i, item in enumerate(currentBasisFunction) if item == 1]

           # Calculate new Q-values 
            currentQValues = np.dot(currentBasisFunction, WEIGHT_VECTOR)
            # Update weights using off-policy Q-learning
            WEIGHT_VECTOR, currentTDError = updateWeights(WEIGHT_VECTOR, 
                                          currentActiveIndex, 
                                          actionIndex, 
                                          reward, 
                                          oldQValue, 
                                          currentQValues)

            # Discover new features
            PREVIOUS_DISCOVERED = DISCOVERED 
            if timeSteps % discoverFrequency == 0:
                DISCOVERED, UNDISCOVERED, ADDITIONAL_FEATURE_INDEX = discover(currentTDError,
                                                                            THRESHOLD,
                                                                            DISCOVERED,
                                                                            UNDISCOVERED,
                                                                            ADDITIONAL_FEATURE_INDEX,
                                                                            currentBasisFunction,
                                                                            WEIGHT_VECTOR) 
                if len(PREVIOUS_DISCOVERED) < len(DISCOVERED):
                    current_time = time.time()
                    DiscoverTimeList.append(round(current_time - start_time, 2))
                    discoverFrequency += discoverFrequency

            # Logging
            # Outputs information every 100000 timesteps
            if timeSteps % 500000 == 0:
                current_time = time.time()
                print('Timestep:', timeSteps, 
                      'Elapsed:', round(current_time - start_time, 2),
                      'DISCOVERED:', len(DISCOVERED),
                      'UNDISCOVERED:', len(UNDISCOVERED), 
                      'Basis:', len(currentBasisFunction),
                      'Weights:', len(WEIGHT_VECTOR),
                      'Cumul. Reward:', round(totalReward2, 4))

            # Set current observation and Q-value to old
            EPSILON += 0.00000001
            totalReward2 += reward
            TDErrorList2.append(currentTDError)
            RewardList2.append(totalReward2)
            oldObservation = currentObservation
            oldQValues = currentQValues
            
    end_time = time.time()
    print('Training Time:', round(end_time - start_time, 2))
    return TDErrorList2, RewardList2

### batchQSiFDDTrain
This method runs trials of Q-Learning vs. Q-Learning w/ Sparse iFDD as well as provides real-time training logs and a plot of the results at the end.
- trials: number of trials
- epsilon: e-greedy action selection parameter
- episodeLen: number of timesteps to run
- myDiscoverFrequency: frequency to start and get doubled after every discovery algorithm iteration

In [45]:
def batchQSiFDDTrain(trials, epsilon, episodeLen, myDiscoverFrequency):
    print('-------BEGIN', trials, 'trials of Q-Learning vs. Q-Learning w/ Sparse iFDD for', episodeLen, 'episodes.-------')
    episodeLen += 1
    start_time = time.time()
    for i in range(trials):
        print('Trial:', i + 1)
        # Initialize with same initial features for all experiments
        initState, initAction = generateRandomStateActionPair()
        initialObservation = observationProcessor(initState, initAction)
        initialBasisFunction, initialActiveIndex = basisFunction(initialObservation)
        print('Initial Active Features for Current Trial:', initialActiveIndex)
        print('-----------Q-Learning-----------')
        QLearnTD, QLearnReward = QLearning2(epsilon, episodeLen, initState, initAction)
        print('-----------Q-Learning w/ Sparse iFDD-----------')
        QLearniFDDTD, QLearniFDDReward = QLearningIFDD2(epsilon, episodeLen, myDiscoverFrequency, initState, initAction)
        plt.plot(QLearnReward[1::10000], 'r--')
        plt.plot(QLearniFDDReward[1::10000], 'b--')
        current_time = time.time()
        plt.title('Q-Learning w/o (red) vs. w/ Sparse iFDD (blue) for Initial Features:' + str(initialActiveIndex))
        plt.xlabel('Time Steps (x 10^4)')
        plt.ylabel('Reward')
        plt.show() 
        print('Time Elapsed:', round(current_time - start_time, 2))
    # Show the plot
    end_time = time.time()
    print('Experiment complete!')
    print('Total Training Time:', round((end_time - start_time)/3600, 2), 'hours')         