In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import time
import random
from collections import defaultdict
from rsalgos.agent_deep_sarsa import DeepSARSAAgent
from rsalgos.agent_dqn import DQNAgent
from rsalgos.agent_actor_critic import DDPGAgent
from rsalgos.rec_sys_environment_2 import RecSysEnvironment
from utils.utils import *

Using TensorFlow backend.


## Read the necessary data file

In [3]:
DATA_BASE_PATH = 'data/'
METADATA_BASE_PATH = 'metadata/'
MODELS_BASE_PATH = 'models/'

EPISODES = 80000
RESTAURANT_FEATURES_N_DIM = 10
K_WINDOW = 5
MAX_RECOM_LENGTH = 30
SEED = 133610
USER_SAMPLE_SIZE=5500

In [4]:
def pre_process_data():
    user_restaurant_rating_file = DATA_BASE_PATH + 'filt_rest_review_data.tar.gz'
    simulation_file = DATA_BASE_PATH + 'dqn_simulation_data.csv'
    restaurant_embeddings_file = DATA_BASE_PATH + 'restaurant_context_embeddings.csv'
    restaurant_feat_file = DATA_BASE_PATH + 'rest_context_feat.csv'

    rest_embeddings_data = pd.read_csv(restaurant_embeddings_file)
    rest_embeddings_data = rest_embeddings_data.set_index('business_id')
    print('shape of restaurant embeddings data -', rest_embeddings_data.shape)
#     print('columns of restaurant embeddings data-', rest_embeddings_data.columns.values)

    simulation_data = pd.read_csv(simulation_file)
    print('shape of simulation data -', simulation_data.shape)
#     print('columns of simulation data-', simulation_data.columns.values)
    simulation_data['state'] = simulation_data.state. \
        apply(lambda x: np.asarray([each.split('|') for each in x.split(',')], dtype='float32'))
    simulation_data['action'] = simulation_data.action.apply(lambda x: np.asarray(x.split('|'), dtype='float32'))
    simulation_data['reward'] = simulation_data.reward.astype(float)
    # simulation_data['next_state'] = simulation_data.next_state. \
    #     apply(lambda x: np.asarray([each.split('|') for each in x.split(',')], dtype='float32'))

    rest_data = pd.read_csv(restaurant_feat_file)
    print('shape of restaurant data -', rest_data.shape)
#     print('columns of restaurant data-', rest_data.columns.values)

    # selected_rest_list = list(rest_data[rest_data.state == 'AZ'].business_id.unique())
    # cache_data = rest_embeddings_data[rest_embeddings_data.index.isin(selected_rest_list)]
    print('size of restaurant embeddings data after state filter --', rest_embeddings_data.shape)
    write_rest_data(rest_embeddings_data)

    del rest_embeddings_data
    del rest_data

    # select_user_list = list(user_rating_data[user_rating_data.business_id.isin(selected_rest_list)].user_id.unique())
    # select_user_list = list(simulation_data[simulation_data.user_id.isin(select_user_list)].user_id.unique())
    select_user_list = list(simulation_data.user_id.unique())
    print('unique users size from user data --', len(select_user_list))
    select_user_list = [random.choice(select_user_list) for _ in range(USER_SAMPLE_SIZE)]
    print('selected users size from user data --', len(select_user_list))
    simulation_data = simulation_data[simulation_data.user_id.isin(select_user_list)]

    def get_train_sessions(rows, perc=0.8):
        count = int(len(rows) * perc)
        return list(rows.head(count).index)

    def get_test_sessions(rows, perc=0.2):
        count = int(len(rows) * perc)
        return list(rows.tail(count).index)

    grpd_data = simulation_data.groupby(['user_id'])

    train_idxs = grpd_data.apply(get_train_sessions).reset_index()[0].values
    train_idxs = [idx for each in train_idxs for idx in each]
    simulation_data_train = simulation_data[simulation_data.index.isin(train_idxs)]
    print('size of simulation train data', simulation_data_train.shape)

    test_idnx = grpd_data.apply(get_test_sessions).reset_index()[0].values
    test_idnx = [idx for each in test_idnx for idx in each]
    simulation_data_test = simulation_data[simulation_data.index.isin(test_idnx)]
    print('size of simulation test data', simulation_data_test.shape)

    return simulation_data, simulation_data_train, simulation_data_test

In [5]:
def run_one_episode(env, agent, episode, policy_type, is_train=True):
    curr_state = env.reset()
    # print("starting episode for user {} --
    # with validation size {}".format(env.curr_user, len(env.curr_user_val.index)))
    done = False
    user_data = {'prec_vect': list(), 'reward_vect': list(), 'action': list()}
    maxq_track_array = list()
    train_loss_array = list()
    while not done:
        # 1. Pick epsilon-greedy action from possible actions for the current state
        one_action, max_qvalue = agent.get_action(curr_state)
        user_data['action'].append(one_action)
        maxq_track_array.append(max_qvalue)

        # 2. perform state transition from current state to next state
        current_state, action_reward, next_state, done = env.step(curr_state, one_action)
        if np.array_equal(current_state, next_state):
            user_data['prec_vect'].append(0)
        else:
            user_data['prec_vect'].append(1)

        # 3. Append the experience to the memory
        agent.append_sample(curr_state, one_action, action_reward, next_state, done, episode)

        # 4. Train the model by calling function agent.train_model
        if policy_type == 'onpolicy' and is_train:
            train_loss = agent.train_model()
            if train_loss:
                train_loss_array.append(train_loss)
                Q_LEARNING_TRACKING[episode] = train_loss_array

        # reset the current state to next_state
        curr_state = next_state
        user_data['reward_vect'].append(action_reward)
        if done:
            USER_STATE_REWARDS[str(env.curr_user)].append(user_data)
            EPISODE_TOTAL_REWARDS[episode] = user_data['reward_vect']
            Q_VALUE_TRACKING[episode] = maxq_track_array
            # print("completed one episode for user --", env.curr_user)

    # 4. Train the model by calling function agent.train_model
    if policy_type == 'offpolicy' and is_train:
        train_loss = agent.train_model()
        if train_loss:
            train_loss_array.extend(train_loss)
            Q_LEARNING_TRACKING[episode] = train_loss_array


In [6]:
def evaluate_rl_agent(a_type, user_data):
    final_df = pd.DataFrame()
    for user, value_dict in user_data.items():
        episode_df = get_evaluation_dataframe(user, value_dict)
        # print('user={} -- MAP={} and nDCG={}'.format(user, episode_df.avg_prec.mean(), episode_df.ndcg.mean()))
        final_df = pd.concat([final_df, episode_df])

    final_df.to_csv((METADATA_BASE_PATH + '{}_{}_user_evaluation_raw.csv').format(a_type, agent_key), index=False)
    final_df.groupby(['user_id']).agg({'avg_prec': 'mean', 'ndcg': 'mean'}).reset_index() \
        .to_csv((METADATA_BASE_PATH + '{}_{}_user_evaluation_metrics.csv').format(a_type, agent_key), index=False)
    map_train = final_df.avg_prec.mean()
    ndcg_train = final_df.ndcg.mean()
    print('Data metrics for shape {} -- MAP={} and nDCG={}'.format(final_df.shape, map_train, ndcg_train))

## Create Environment and Agent

In [7]:
sim_all_data, sim_train_data, sim_test_data = pre_process_data()
env_train = RecSysEnvironment(sim_train_data, sim_all_data, K_WINDOW)
env_test = RecSysEnvironment(sim_test_data, sim_all_data, K_WINDOW, False)

shape of restaurant embeddings data - (19590, 10)
shape of simulation data - (242679, 5)
shape of restaurant data - (19590, 51)
size of restaurant embeddings data after state filter -- (19590, 10)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19590/19590 [00:02<00:00, 6600.41it/s]


disk cache created with size --> 19590
unique users size from user data -- 15662
selected users size from user data -- 5500
size of simulation train data (55533, 5)
size of simulation test data (12593, 5)


In [8]:
agents_list = {'dqn': ('offpolicy', DQNAgent(K_WINDOW, RESTAURANT_FEATURES_N_DIM, MAX_RECOM_LENGTH)),
               'deep_sarsa': ('onpolicy', DeepSARSAAgent(K_WINDOW, RESTAURANT_FEATURES_N_DIM, MAX_RECOM_LENGTH)),
               'ddpg-ac': ('offpolicy', None)}

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
state_input (InputLayer)        (None, 5, 10)        0                                            
__________________________________________________________________________________________________
state_gru (GRU)                 (None, 5, 32)        4128        state_input[0][0]                
__________________________________________________________________________________________________
state_dense (Dense)             (None, 5, 32)        1056        state_gru[0][0]                  
__________________________________________________________________________________________________
action_input (InputLa

Total params: 11,057
Trainable params: 11,057
Non-trainable params: 0
__________________________________________________________________________________________________
None
rest contextual matrix created with shape  (19590, 10)
model saved to --> models/nearest_neighbour.model


In [9]:
for agent_key, (policy, agent_obj) in agents_list.items():
    # Tracking for convergence update them after each episode
    USER_STATE_REWARDS = defaultdict(list)  # user_id -> [[rewards for eps1], [rewards for eps2]....[rewards for eps-n]]
    EPISODE_TOTAL_REWARDS = defaultdict()  # episode_num -> [rewards given across users]
    Q_VALUE_TRACKING = defaultdict()
    Q_LEARNING_TRACKING = defaultdict()

    if not agent_obj:
        continue

    start_time = time.time()
    trained_model = None
    for episodei in tqdm(range(EPISODES + 1)):
        run_one_episode(env_train, agent_obj, episodei, policy)

        # save the model
        if episodei % 10000 == 0:
            agent_obj.model.save_weights((MODELS_BASE_PATH+"{}_{}_{}.h5").format(agent_key, policy, episodei))
            trained_model = agent_obj.model

        if episodei % 50 == 0:
            save_obj(USER_STATE_REWARDS,
                     (METADATA_BASE_PATH+'{}_user_state_rewards.pkl').format(agent_key))
            save_obj(EPISODE_TOTAL_REWARDS,
                     (METADATA_BASE_PATH+'{}_episode_total_rewards.pkl').format(agent_key))
            save_obj(Q_LEARNING_TRACKING,
                     (METADATA_BASE_PATH+'{}_dqn_train_loss.pkl').format(agent_key))
            save_obj(Q_VALUE_TRACKING,
                     (METADATA_BASE_PATH+'{}_qvalue_tracking.pkl').format(agent_key))

        if episodei % 1000 == 0:
            end_time = time.time()
            print("episode:", episodei, "  memory length:", len(agent_obj.memory))  # , "  epsilon:", agent.epsilon
            print('time taken for 100 episode --> %.2fs' % (end_time - start_time))
            start_time = time.time()

    # perform MAP/nDCG calculation for episodes.
    evaluate_rl_agent('train', USER_STATE_REWARDS)

    # Plot for the graphs.
    plot_from_dict(EPISODE_TOTAL_REWARDS, np.sum,
                   (METADATA_BASE_PATH+'{}_rewards_tracking.png').format(agent_key))
    plot_from_dict(Q_LEARNING_TRACKING, np.mean,
                   (METADATA_BASE_PATH + '{}_qnetwork_loss.png').format(agent_key))
    plot_from_dict(Q_VALUE_TRACKING, np.mean,
                   (METADATA_BASE_PATH + '{}_qvalue_tracking.png').format(agent_key))

    # Tracking for convergence update them after each episode
    USER_STATE_REWARDS = defaultdict(list)  # user_id -> [[rewards for eps1], [rewards for eps2]....[rewards for eps-n]]
    EPISODE_TOTAL_REWARDS = defaultdict()  # episode_num -> [rewards given across users]
    Q_VALUE_TRACKING = defaultdict()
    Q_LEARNING_TRACKING = defaultdict()

    # perform testing part of the code
    test_user_len = len(sim_test_data.user_id.unique())
    print('starting testing for {} users'.format(test_user_len))
    start_time = time.time()
    for episodej in tqdm(range(test_user_len)):
        run_one_episode(env_test, agent_obj, episodej, policy)

        save_obj(USER_STATE_REWARDS,
                 (METADATA_BASE_PATH + '{}_test_user_state_rewards.pkl').format(agent_key))
        save_obj(EPISODE_TOTAL_REWARDS,
                 (METADATA_BASE_PATH + '{}_test_episode_total_rewards.pkl').format(agent_key))
        save_obj(Q_LEARNING_TRACKING,
                 (METADATA_BASE_PATH + '{}_test_dqn_train_loss.pkl').format(agent_key))
        save_obj(Q_VALUE_TRACKING,
                 (METADATA_BASE_PATH + '{}_test_qvalue_tracking.pkl').format(agent_key))

    end_time = time.time()
    print('time taken for testing --> %.2fs' % (end_time - start_time))

    # perform MAP/nDCG calculation for episodes.
    evaluate_rl_agent('test', USER_STATE_REWARDS)

print('complete the run')

  0%|                                                                                                                                            | 0/80001 [00:00<?, ?it/s]

episode: 0   memory length: 25
time taken for 100 episode --> 0.52s


  0%|                                                                                                                                  | 5/80001 [00:01<7:13:36,  3.07it/s]

shape of state [[[2.89822054 3.47892594 1.06399012 ... 0.58652413 1.10245585 5.89149857]
  [5.14138651 3.78431296 2.32210803 ... 0.         1.19027317 5.14881134]
  [4.25716305 2.81618214 0.88631278 ... 0.         2.20914054 5.5387125 ]
  [5.09753036 4.07359886 0.92043173 ... 0.10226995 1.2448802  4.97452307]
  [2.02654839 3.04835939 1.75809526 ... 0.6986264  2.07754326 5.61793327]]

 [[3.54699993 7.64970112 4.83746052 ... 1.666574   0.17373422 4.18438625]
  [1.94410431 8.5033741  4.39287233 ... 2.77670479 0.30099955 3.35215545]
  [1.1348249  7.32183    2.8241048  ... 3.3491764  5.8981175  4.2505846 ]
  [3.547      7.649701   4.8374605  ... 1.666574   0.17373422 4.1843863 ]
  [4.3297443  7.029106   4.641712   ... 0.9588646  0.         3.1037579 ]]

 [[7.15149975 1.29526079 0.67365825 ... 5.27467871 3.86672211 4.76880455]
  [4.71916199 4.58128548 0.         ... 3.48298359 4.98702192 3.54305625]
  [4.71916199 4.58128548 0.         ... 3.48298359 4.98702192 3.54305625]
  [6.62681055 3.323

 [ 2.50396454]]
Instructions for updating:
Use tf.cast instead.


  0%|                                                                                                                                 | 6/80001 [00:05<29:43:49,  1.34s/it]

shape of state [[[4.68306    8.837797   7.1171765  ... 2.8928094  1.740715   2.3261504 ]
  [3.1463861  7.682682   5.3214965  ... 3.6857934  1.2890936  3.5295353 ]
  [4.395478   6.284205   4.960505   ... 4.101352   1.5802894  2.7280862 ]
  [3.0299468  8.083819   7.5520577  ... 4.0654507  0.         2.9524407 ]
  [2.124771   8.626455   7.5416107  ... 4.1491528  1.4669076  1.7991059 ]]

 [[1.35264099 7.06843185 4.65418148 ... 4.18182564 3.00115609 3.43503952]
  [2.442563   7.995541   5.086634   ... 3.6969213  2.661691   2.06701   ]
  [1.3437781  7.611675   3.9929967  ... 3.173174   2.5467386  4.3769445 ]
  [2.6521368  8.81059    6.840335   ... 3.581142   0.73892915 2.750829  ]
  [0.83097744 6.2166624  3.837267   ... 5.9176664  4.022296   4.0407763 ]]

 [[1.3437781  7.611675   3.9929967  ... 3.173174   2.5467386  4.3769445 ]
  [2.6521368  8.81059    6.840335   ... 3.581142   0.73892915 2.750829  ]
  [0.83097744 6.2166624  3.837267   ... 5.9176664  4.022296   4.0407763 ]
  [3.591444   9.051

 [ 3.65364885e+00]]


  0%|                                                                                                                                 | 7/80001 [00:06<26:19:02,  1.18s/it]

shape of state [[[1.7096125  3.630256   1.3381704  ... 1.233098   4.700547   5.737247  ]
  [2.230536   4.7749724  1.2272868  ... 1.9799986  5.4029636  6.9132977 ]
  [1.7440851  3.8238103  1.1728518  ... 1.9976275  5.595217   6.443542  ]
  [1.7386341  3.1017213  2.2413356  ... 1.8874599  5.084311   5.833848  ]
  [1.8475978  2.993978   2.2821486  ... 0.8766092  4.2527795  4.8566766 ]]

 [[1.16824794 6.49225044 8.47637081 ... 4.75013018 3.94327927 1.29102707]
  [3.16938543 7.63273191 3.63015723 ... 1.89971268 3.4884665  3.26903653]
  [1.72837877 8.30561352 2.50257635 ... 3.03849506 6.84129381 3.41493011]
  [3.54699993 7.64970112 4.83746052 ... 1.666574   0.17373422 4.18438625]
  [1.94410431 8.5033741  4.39287233 ... 2.77670479 0.30099955 3.35215545]]

 [[7.15149975 1.29526079 0.67365825 ... 5.27467871 3.86672211 4.76880455]
  [4.71916199 4.58128548 0.         ... 3.48298359 4.98702192 3.54305625]
  [4.71916199 4.58128548 0.         ... 3.48298359 4.98702192 3.54305625]
  [6.62681055 3.323




KeyboardInterrupt: 

In [None]:
# THE END