In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [59]:
# Column means of medium.csv, printed separately for each value a = 1..7.
data = pd.read_csv('medium.csv')
for action in range(1, 8):
    rows_for_action = data[data['a'] == action]
    print(np.mean(rows_for_action, axis=0))

s     20704.245824
a         1.000000
r      -225.000000
sp    20559.436800
dtype: float64
s     22042.765088
a         2.000000
r      -100.000000
sp    21769.112335
dtype: float64
s     22437.206572
a         3.000000
r         7.169118
sp    22354.051333
dtype: float64
s     25153.318619
a         4.000000
r        53.833731
sp    25152.342690
dtype: float64
s     28807.271524
a         5.000000
r       370.743558
sp    28924.262906
dtype: float64
s     28017.474345
a         6.000000
r       -57.419630
sp    28290.251011
dtype: float64
s     31277.671073
a         7.000000
r       435.917224
sp    31536.695978
dtype: float64


In [48]:
# Cap how many rows pandas renders when a frame is displayed.
pd.options.display.max_rows = 20

In [52]:
# Column means of large.csv, printed separately for each value a = 1..9.
ldata = pd.read_csv('large.csv')
for action in range(1, 10):
    rows_for_action = ldata[ldata['a'] == action]
    print(np.mean(rows_for_action, axis=0))

s     249227.883320
a          1.000000
r          0.599205
sp    249229.700623
dtype: float64
s     247637.264498
a          2.000000
r          0.662378
sp    247634.938021
dtype: float64
s     249018.038712
a          3.000000
r          0.647132
sp    248890.903666
dtype: float64
s     247785.495623
a          4.000000
r          0.472430
sp    247920.526938
dtype: float64
s     248467.520269
a          5.000000
r          0.000000
sp    247732.669242
dtype: float64
s     248272.216759
a          6.000000
r          0.000000
sp    248207.272711
dtype: float64
s     247886.430268
a          7.000000
r          0.000000
sp    248068.842091
dtype: float64
s     247692.857440
a          8.000000
r          0.000000
sp    247867.612082
dtype: float64
s     248491.998843
a          9.000000
r          0.000000
sp    248746.238074
dtype: float64


In [39]:
# Sorted distinct values found in the 'r' column of the large dataset.
np.unique(ldata['r'].to_numpy())

array([-10,  -5,   0,   5,  10,  50, 100], dtype=int64)

In [None]:
# Rows of the large dataset whose 'r' value equals 50.
# NOTE(review): the output saved with this cell lists rows whose r is 100, not 50,
# so it looks stale relative to this filter - re-run the cell to confirm.
ldata.loc[ldata['r'] == 50]

Unnamed: 0,s,a,r,sp
1019,301211,3,100,301111
1024,301211,3,100,301111
1484,300413,4,100,301013
1538,301113,3,100,301013
1542,301014,2,100,301013
...,...,...,...,...
99444,301113,3,100,301013
99555,301014,2,100,301013
99706,301113,3,100,301013
99794,301211,3,100,301111


In [None]:
class QLearningModel:
    """Tabular Q-learning over integer-indexed states and actions.

    The table ``Q`` has shape ``(n_s, n_a)`` and every entry starts at 1.0.
    ``gamma`` scales the best next-state value in each update and ``alpha``
    scales the size of the update step.
    """

    def __init__(self, n_s, n_a, gamma, alpha):
        self.alpha = alpha
        self.gamma = gamma
        self.Q = np.ones((n_s, n_a))

    def q_update(self, s, a, r, sp):
        """Update ``Q[s, a]`` in place from one transition ``(s, a, r, sp)``."""
        best_next_value = np.max(self.Q[sp])
        td_error = r + self.gamma * best_next_value - self.Q[s, a]
        self.Q[s, a] += self.alpha * td_error

    def learn(self, data, max_iter):
        """Sweep the ``(s, a, r, sp)`` rows of ``data`` in order, ``max_iter`` times."""
        for _ in range(max_iter):
            for transition in data:
                s, a, r, sp = transition
                self.q_update(s, a, r, sp)


def write_policy(policy, filename):
    """Write ``policy`` to ``filename`` with ``np.savetxt``, formatting entries as integers."""
    np.savetxt(fname=filename, X=policy, fmt='%d')


def find_small_policy(filename, n_s, n_a, gamma, alpha, max_iter):
    """Fit a Q-table to the transitions in ``filename`` and write its greedy policy.

    Parameters
    ----------
    filename : str
        CSV of transitions. Columns 0, 1 and 3 (state, action, next state) are
        1-based in the file and are shifted to 0-based before learning;
        column 2 is handed to the update as the reward.
    n_s, n_a : int
        Number of states and actions; the Q-table has shape ``(n_s, n_a)``.
    gamma, alpha : float
        Forwarded to ``QLearningModel``.
    max_iter : int
        Number of full passes over the data.

    The policy holds the 1-based argmax action for every state and is written
    next to the input, with a trailing ``.csv`` swapped for ``.policy``.
    """
    q_model = QLearningModel(n_s, n_a, gamma, alpha)
    # Take an explicit copy: under pandas copy-on-write the array exposed by a
    # DataFrame can be read-only, which would make the in-place shift fail.
    data = pd.read_csv(filename).to_numpy(copy=True)
    # Decrement state, action and next state for zero indexing
    data[:, [0, 1, 3]] -= 1

    q_model.learn(data, max_iter)
    policy = np.argmax(q_model.Q, axis=1) + 1

    # Strip only a trailing '.csv'. str.replace would rewrite every '.csv' in
    # the path and, when there is none, return the input path unchanged so the
    # policy would overwrite the data file.
    stem = filename[:-len('.csv')] if filename.endswith('.csv') else filename
    write_policy(policy, stem + '.policy')


def find_med_policy(filename, n_s, n_a, gamma, alpha, max_iter, default_action=4):
    """Fit a Q-table to ``filename`` and write a policy with a fixed fallback action.

    Parameters
    ----------
    filename : str
        CSV of transitions. Columns 0, 1 and 3 (state, action, next state) are
        1-based in the file and are shifted to 0-based before learning;
        column 2 is handed to the update as the reward.
    n_s, n_a : int
        Number of states and actions; the Q-table has shape ``(n_s, n_a)``.
    gamma, alpha : float
        Forwarded to ``QLearningModel``.
    max_iter : int
        Number of full passes over the data.
    default_action : int, optional
        1-based action written for every state that never appears in column 0
        of the data. Defaults to 4, the value that used to be hard-coded.

    States that do appear in column 0 get their 1-based argmax action. The
    policy is written next to the input, with a trailing ``.csv`` swapped for
    ``.policy``.
    """
    q_model = QLearningModel(n_s, n_a, gamma, alpha)
    # Take an explicit copy: under pandas copy-on-write the array exposed by a
    # DataFrame can be read-only, which would make the in-place shift fail.
    data = pd.read_csv(filename).to_numpy(copy=True)
    # Decrement state, action and next state for zero indexing
    data[:, [0, 1, 3]] -= 1

    q_model.learn(data, max_iter)

    # Integer array: the file is written with '%d', so the output is unchanged.
    policy = np.full(q_model.Q.shape[0], default_action, dtype=int)
    # One vectorised assignment over the distinct observed states replaces a
    # Python loop that recomputed the same argmax for every duplicate row.
    visited = np.unique(data[:, 0]).astype(np.intp)
    policy[visited] = np.argmax(q_model.Q[visited], axis=1) + 1

    # Strip only a trailing '.csv'. str.replace would rewrite every '.csv' in
    # the path and, when there is none, return the input path unchanged so the
    # policy would overwrite the data file.
    stem = filename[:-len('.csv')] if filename.endswith('.csv') else filename
    write_policy(policy, stem + '.policy')


def find_large_policy(filename, n_s, n_a, gamma, alpha, max_iter, seed=None):
    """Fit a Q-table to ``filename`` and write a policy with random fallback actions.

    Parameters
    ----------
    filename : str
        CSV of transitions. Columns 0, 1 and 3 (state, action, next state) are
        1-based in the file and are shifted to 0-based before learning;
        column 2 is handed to the update as the reward.
    n_s, n_a : int
        Number of states and actions; the Q-table has shape ``(n_s, n_a)``.
    gamma, alpha : float
        Forwarded to ``QLearningModel``.
    max_iter : int
        Number of full passes over the data.
    seed : int, optional
        Seed for the fallback draw. When omitted the global ``np.random``
        state is used, exactly as before, so the fallback part of the policy
        differs between runs unless the caller seeded it.

    States that appear in column 0 get their 1-based argmax action; every
    other state gets an action drawn uniformly from 1-4. The policy is written
    next to the input, with a trailing ``.csv`` swapped for ``.policy``.
    """
    q_model = QLearningModel(n_s, n_a, gamma, alpha)
    # Take an explicit copy: under pandas copy-on-write the array exposed by a
    # DataFrame can be read-only, which would make the in-place shift fail.
    data = pd.read_csv(filename).to_numpy(copy=True)
    # Decrement state, action and next state for zero indexing
    data[:, [0, 1, 3]] -= 1

    q_model.learn(data, max_iter)

    # NOTE(review): the fallback only draws from actions 1-4 even when n_a is
    # larger - confirm this restriction is intended for the large problem.
    rng = np.random if seed is None else np.random.RandomState(seed)
    policy = rng.choice([1, 2, 3, 4], size=q_model.Q.shape[0])
    # One vectorised assignment over the distinct observed states replaces a
    # Python loop that recomputed the same argmax for every duplicate row.
    visited = np.unique(data[:, 0]).astype(np.intp)
    policy[visited] = np.argmax(q_model.Q[visited], axis=1) + 1

    # Strip only a trailing '.csv'. str.replace would rewrite every '.csv' in
    # the path and, when there is none, return the input path unchanged so the
    # policy would overwrite the data file.
    stem = filename[:-len('.csv')] if filename.endswith('.csv') else filename
    write_policy(policy, stem + '.policy')
    

In [99]:
# Medium problem: undiscounted (gamma = 1), 100 sweeps over the data.
find_med_policy(
    'medium.csv',
    n_s=50000,
    n_a=7,
    gamma=1,
    alpha=0.4,
    max_iter=100,
)

In [None]:
# Large problem: 100 sweeps over the data.
find_large_policy(
    'large.csv',
    n_s=302020,
    n_a=9,
    gamma=0.95,
    alpha=0.08,
    max_iter=100,
)

In [68]:
# Small problem: 100 sweeps over the data.
find_small_policy(
    'small.csv',
    n_s=100,
    n_a=4,
    gamma=0.95,
    alpha=0.12,
    max_iter=100,
)