# Simple Q-Learning
### Finding Ten Discrete Meals To Fit Calorie Requirements

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from meal_plan_env import DayPlan

Reading in the nutrition data and the nutritional requirements

In [2]:
# Load the processed nutrition table and replace every missing value with 0.
# NOTE(review): the path is an absolute location on the author's machine.
df = pd.read_csv(
    'C:/Users/J/Desktop/Businesses/Meal_Maker/Scraped_Data/combined_nutrition_small/nutrition_sm_processed_ss.csv',
    encoding='ISO-8859-1',
    dtype={'food_key': int, 'ingredients_list': object},
).fillna(0)

In [3]:
# Load the table of daily nutrient requirements (calories, protein, fat, ...).
num_reqs_df = pd.read_csv("C:/Users/J/Fitness/Meal Plans/num_reqs_df.csv")

In [26]:
# Display the requirements table to check the loaded values.
num_reqs_df

Unnamed: 0,calories,protein_g,fat_g,saturated_fat_g,carb_g,fiber_g,sugar_g,sodium_mg,cholesterol_mg,calcium_mg,iron_mg,vit_a_mcg,vit_c_mg
0,2226.178233,222.617823,86.573598,12.367657,139.13614,31.166495,27.827228,2400,300,1000,18,5000,60


Reading in partial meal plan

In [4]:
# Load the partial meal plan; its column totals are later subtracted from the requirements.
part_pl = pd.read_csv("C:/Users/J/Fitness/partial_plan.csv")

In [5]:
# Total each requirement column over the rows of the partial plan, producing a
# one-row frame with the same columns as num_reqs_df.
part_agg = pd.DataFrame(
    [[sum(part_pl[col]) for col in num_reqs_df.columns.values]],
    columns=num_reqs_df.columns.values,
)

In [6]:
# Requirements still to be met once the partial plan's totals are taken out.
new_reqs_df = num_reqs_df.sub(part_agg)

We will look for the 3 states that best complete this meal plan.

### Environment Set Up

In [7]:
# Keep only the rows for restaurant items whose brand is Sweetgreen.
sp_df = df.loc[df['food_type_grp'].eq('restaurant') & df['brand'].eq('Sweetgreen')]

In [10]:
# Build the meal-plan environment over the filtered food table.
pl = DayPlan(sp_df)

In [11]:
# Reset the environment (reset semantics are defined in meal_plan_env, not visible here).
pl.reset()

In [12]:
# Give the environment the remaining requirements (original requirements minus the partial plan).
pl.set_num_reqs(new_reqs_df)

### Model Set Up

A simple single-layer Q-network that learns which food item to pick, across 15 or fewer states, to reach an optimal calorie count, with additional reward for better nutritional value.

Neural Network: 

Input: one one-hot vector per state, encoding the state number.
Output: for each such input vector, a vector of Q values, one per action (all 450k of them in the full problem).

The loss function is the sum of squared differences between the currently predicted Q values and the 'target' Q values,
so the network is trained to predict the Q value of each state/action pair as accurately as possible.

Because 15 states x 450K actions would be too costly for an example problem, we will prove out the approach using a partial meal plan:

In [13]:
# Clear the default TensorFlow graph before building the network
tf.reset_default_graph()
# One-hot state vector, fed one state at a time (shape 1 x 10)
inputs = tf.placeholder(dtype=tf.float32, shape=[1, 10])
# Weight matrix: one row per state, one column per row of pl.df (one per action),
# initialised with uniform random values between 0 and 0.1
W = tf.Variable(tf.random_uniform([10, len(pl.df)], minval=0, maxval=0.1))
# Predicted Q values for the fed state: inputs (state) x weights (actions)
Qout = tf.matmul(a=inputs, b=W)
# Index of the largest predicted Q value along axis 1, i.e. the greedy action
predict = tf.argmax(Qout, 1)
# Target Q values, same shape as Qout (1 x number of actions)
nextQ = tf.placeholder(dtype=tf.float32, shape=[1, len(pl.df)])
# Loss: sum over actions of (Q_target - Q)^2
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(0.1)

In [14]:
# Training op: running it applies one gradient-descent step that reduces `loss`.
updateModel = trainer.minimize(loss)

In [23]:
# Scratch check: 10 x 10 identity matrix; each row is a one-hot vector for one state.
np.identity(10)

array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

In [6]:
# Scratch check: slicing a single row keeps it 2-D (1 x 10), the shape of the `inputs` placeholder.
np.identity(10)[0:0+1]

array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

### Running Model

In [None]:
# NOTE: tf.initialize_all_variables is the older spelling of
# tf.global_variables_initializer; kept as written for the TF version in use.
init = tf.initialize_all_variables()

# Setting learning parameters
y = .99  # discount factor applied to the best Q value of the next state
e = 0.1  # probability of random pick
# Number of states. Read from the one-hot input placeholder so the vectors fed
# below always have the width the graph was built with (a hard-coded value
# that disagrees with the placeholder makes sess.run reject the feed).
n = inputs.get_shape().as_list()[1]
num_episodes = 10000
# Lists to capture total rewards per episode
rList = []

with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        # Reset environment
        pl.reset()
        # Start at state 0
        s = 0
        # Total reward collected during this episode
        rAll = 0
        # Set 'done' variable
        d = False

        # Q Network Iteration (at most 99 steps per episode)
        j = 0
        while j < 99:
            j += 1
            # Feed the one-hot row for the current state.
            # 'a'    : result of evaluating `predict` - index of the largest
            #          predicted Q value, i.e. the greedy action.
            # 'allQ' : result of evaluating `Qout` - the predicted Q values of
            #          every action for this state (shape 1 x number of actions).
            a, allQ = sess.run([predict, Qout], feed_dict={inputs: np.identity(n)[s:s+1]})

            # Epsilon-greedy exploration: with probability e, replace the
            # greedy action with an index drawn uniformly from the actions.
            if np.random.rand(1) < e:
                a[0] = np.random.randint(allQ.shape[1])

            # Submit the chosen action to the environment.
            # NOTE(review): assumes DayPlan.step returns (next_state, reward,
            # done), as the original unpacking expected - confirm in meal_plan_env.
            s1, r, d = pl.step(a[0])

            if d:
                # No further picks after the final state, so nothing to bootstrap from
                maxQ1 = 0.0
            else:
                # Predicted Q values at the next state (row slice keeps shape 1 x n)
                Q1 = sess.run(Qout, feed_dict={inputs: np.identity(n)[s1:s1+1]})
                # Best expected future reward from the next state
                maxQ1 = np.max(Q1)

            # Target Q values: the current predictions, with the entry for the
            # action just taken replaced by reward + discounted future value
            targetQ = allQ
            targetQ[0, a[0]] = r + y*maxQ1

            # One gradient step moving Qout for state s towards targetQ
            sess.run(updateModel, feed_dict={inputs: np.identity(n)[s:s+1], nextQ: targetQ})

            rAll += r
            s = s1
            if d:
                break

        rList.append(rAll)

# Known limitation noted by the original author: the Q value of an action at a
# given state depends on the previous states (meals chosen), which the one-hot
# state number does not capture. Therefore a policy-based agent needs to be used.

            

In [25]:
# Scratch check: np.random.rand(1) returns a one-element array, the value compared against e.
np.random.rand(1)

array([ 0.01943258])