In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import tensorflow as tf
import tflearn
import numpy as np
from sklearn.model_selection import train_test_split

import drqn
import student as st

import concept_dependency_graph as cdg
from experience_buffer import ExperienceBuffer
import dataset_utils as d_utils
import utils
import models_dict_utils

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

### Preprocessing Data for DRQN
We take the data produced by the data generator and store it as traces of (s, a, r, sp) tuples.

Each trajectory corresponds to a trace.

If a trajectory has length n, then its trace has length n-1, since each tuple also needs the next state sp.

In [15]:
# Location of the pickled synthetic dataset, relative to this notebook
data_path = "../synthetic_data/test-n10000-l3-random.pickle"
data = d_utils.load_data(filename=data_path)

In [16]:
# Convert the loaded data into traces of (s, a, r, sp) tuples (see the printed
# example in the next cell), requesting the "dense" reward model.
dqn_data = d_utils.preprocess_data_for_dqn(data, reward_model="dense")

In [4]:
# Inspect one complete trace: a list of [s, a, r, sp] entries
print(dqn_data[0])

[[array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]), array([ 1.,  0.,  0.,  0.,  0.]), 0.20000000000000001, array([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])], [array([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]), array([ 0.,  0.,  0.,  0.,  1.]), 0.20000000000000001, array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.])]]


In [5]:
# First (s, a, r, sp) tuple of the first trace, printed one component per line
s, a, r, sp = dqn_data[0][0]
for component in (s, a, r, sp):
    print(component)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
[ 1.  0.  0.  0.  0.]
0.2
[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [6]:
# Last (s, a, r, sp) tuple of the first trace, printed one component per line
s, a, r, sp = dqn_data[0][-1]
for component in (s, a, r, sp):
    print(component)

[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  0.  0.  0.  1.]
0.2
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]


In [17]:
# Hold out 20% of the traces for validation. A fixed random_state makes the
# split reproducible across kernel restarts (the original split was unseeded).
dqn_data_train, dqn_data_test = train_test_split(
    dqn_data, test_size=0.2, random_state=42
)

### Creating a DRQN model and training it

In [12]:
# Identifier handed to drqn.DRQNModel when the models below are constructed
model_id = "test_model_drqn"


In [13]:
# Create the model object for training.
# timesteps=2 presumably matches the two-step traces printed above -- TODO confirm.
model = drqn.DRQNModel(model_id, timesteps=2)

In [14]:
# Initialize the trainer object inside the model; model.train() further down
# relies on this trainer having been created.
model.init_trainer()

In [18]:
def _buffer_from_traces(traces):
    """Return an ExperienceBuffer whose contents are the given traces."""
    buf = ExperienceBuffer()
    buf.buffer = traces
    buf.buffer_sz = len(buf.buffer)
    return buf


# Training and validation buffers, filled directly from the two data splits
train_buffer = _buffer_from_traces(dqn_data_train)
val_buffer = _buffer_from_traces(dqn_data_test)

In [20]:
# Train the model with the trainer initialized earlier.
# The run id is simply the current timestamp.
date_time_string = datetime.datetime.now().strftime("%m-%d-%Y_%H-%M-%S")
run_id = date_time_string
model.train(
    train_buffer,
    val_buffer,
    n_epoch=2,
    run_id=run_id,
    load_checkpoint=True,
)

Training Step: 38859  | total loss: [1m[32m0.09421[0m[0m | time: 2.151s
[2K| Optimizer | epoch: 002 | loss: 0.09421 -- iter: 6976/8000


KeyboardInterrupt: 

In [13]:
# Initialize the model's evaluator before calling model.predict() below
# (presumably required for prediction -- confirm in drqn).
model.init_evaluator()

In [14]:
# Create inputs (states / observations so far) to use for predictions.
from drqn import stack_batch
# Take 4 traces from the training buffer.
train_batch = train_buffer.sample_in_order(4)

# Make sure that batches span multiple timesteps; the result should have
# shape (batch_sz, n_timesteps, ?).
# Index 0 along the last axis selects the state entry of each (s, a, r, sp) tuple.
s_batch_train = stack_batch(train_batch[:, :, 0])  # current states

In [15]:
# Use the model to predict the next action for each sequence in the batch.
# Returns the chosen actions together with the Q-values (both displayed below).
# last_timestep_only=True presumably restricts the output to the final timestep -- confirm in drqn.
actions, q_vals = model.predict(s_batch_train, last_timestep_only=True)

1


In [16]:
# Q-values returned by model.predict: one row per batch element
q_vals

array([[ 0.30812424,  0.30810687,  0.30813327,  0.30798167,  0.30711538],
       [ 0.43627185,  0.43623003,  0.43638146,  0.43616685,  0.43632478],
       [ 0.43779975,  0.43772992,  0.4377695 ,  0.43766207,  0.43770874],
       [ 0.39733657,  0.39734417,  0.39731672,  0.39723107,  0.39730671]])

In [17]:
# Predicted actions; in the saved output each one is the argmax of the matching q_vals row
actions

array([2, 2, 0, 1])

In [23]:
# If we want to predict on data with a different number of timesteps than we trained on,
# create a new model that uses the same checkpoint (same model_id).

eval_model = drqn.DRQNModel(model_id, timesteps=10)

In [24]:
eval_model.init_evaluator()
# Now the internal RNN will be unrolled over 10 timesteps.
# You can still pass in inputs that have fewer than 10 timesteps, in which case
# the remaining timesteps will be padded.

In [25]:
# Same inputs as for the 2-timestep model; the saved output shows identical actions and Q-values.
eval_model.predict(s_batch_train, last_timestep_only=True)

(array([2, 2, 0, 1]),
 array([[ 0.30812424,  0.30810687,  0.30813327,  0.30798167,  0.30711538],
        [ 0.43627185,  0.43623003,  0.43638146,  0.43616685,  0.43632478],
        [ 0.43779975,  0.43772992,  0.4377695 ,  0.43766207,  0.43770874],
        [ 0.39733657,  0.39734417,  0.39731672,  0.39723107,  0.39730671]]))

##  Testing the model

In [11]:
# Import only the test helpers this notebook calls, rather than a wildcard
# import, so it is clear where each name comes from and nothing else in the
# namespace is silently shadowed.
from drqn_tests import test_drqn, test_drqn_chunk, test_drqn_single

In [3]:
from simple_mdp import create_custom_dependency

# Settings shared by the test cells below
model_id = "test_model_drqn"
n_trajectories = 10  # passed to test_drqn_chunk
n_concepts = 5       # passed to st.Student as n
horizon = 6          # used as the model's timesteps and passed to the test helpers

# Concept dependency graph used by the tests
dgraph = create_custom_dependency()

In [4]:
# Build a model with `horizon` timesteps under the same model_id and
# initialize its evaluator so it can be used for prediction in the tests.
test_model = drqn.DRQNModel(model_id=model_id, timesteps=horizon)
test_model.init_evaluator()

In [5]:
# Student used by the tests; learn_prob is supplied as p_trans_satisfied
learn_prob = 0.15
student = st.Student(
    n=n_concepts,
    p_trans_satisfied=learn_prob,
    p_trans_not_satisfied=0.0,
    p_get_ex_correct_if_concepts_learned=1.0,
)

In [6]:

# Run the single-trajectory test with debug output enabled; the returned value
# is kept in k and displayed in the next cell.
k = test_drqn_single(dgraph, student, horizon, test_model, DEBUG=True)

ERROR [ 1.  0.  0.  0.  0.] executed non-optimal action 0
ERROR [ 1.  0.  0.  0.  0.] executed non-optimal action 0


In [7]:
# Value returned by test_drqn_single above
k

array([ 1.,  0.,  0.,  0.,  0.])

In [9]:
# Run the test over n_trajectories trajectories; the cell displays the returned value.
test_drqn_chunk(n_trajectories, dgraph, student, model_id, horizon)

traj i 0
traj i 1
traj i 2
traj i 3
traj i 4
traj i 5
traj i 6
traj i 7
traj i 8
traj i 9


0.29999999999999999

### Final Test Function:

In [10]:
# Run the full test for the saved model; the output below reports the average
# posttest scores ("true" vs. "drqn").
test_drqn(model_id=model_id)

Testing model: test_model_drqn
horizon: 6
traj i 0
traj i 1
traj i 2
traj i 3
traj i 4
traj i 5
traj i 6
traj i 7
traj i 8
traj i 9
Generating data for 1000 students with behavior policy expert and sequence length 6.
Average posttest true: 0.385
Average posttest drqn: 0.22
