In [1]:
%install-location $cwd/swift-install


In [2]:
import Foundation
import TensorFlow

In [3]:
import Python
let gym = Python.import("gym")

In [4]:
// Shorthand for the single-precision tensor type; used in this notebook for the Q-table.
public typealias TF = Tensor<Float32>


In [5]:
/// Wraps the tensor of action-value estimates that the agent learns.
///
/// `q_learning` indexes the tensor as `state[observation, action]`, and
/// `best_action_value` reads a whole row with `state[observation]`.
struct StateRecord {
    /// The Q-value table. In this notebook it starts as `[obs_length, n_actions]` zeros.
    public var state: TF
}


/// An environment observation, stored as an integer index.
struct Observation {
    /// Observation index; used as the row index into the Q-table.
    public let obs: Int
}


/// An action choice, stored as an integer index.
struct Action {
    /// Action index; passed to `env.step` and used as the column index into the Q-table.
    public let action: Int
}

/// A scalar value wrapper.
///
/// Besides per-step environment rewards, this type also carries the best
/// Q-value returned by `best_action_value` and the average returned by `test_game`.
struct Reward{
    /// The wrapped scalar.
    public let reward: Float
}

/// Hashable key combining an observation with an action.
///
/// NOTE: despite the name, this pairs a state with an *action*, not a reward.
/// `Observation` and `Action` do not declare `Hashable`, so equality and
/// hashing are written by hand over the wrapped integers.
struct StateRewardPair: Hashable {
    public var state: Observation
    public var act: Action

    /// Two pairs are equal when both the observation index and the action index match.
    static func == (lhs: StateRewardPair, rhs: StateRewardPair) -> Bool {
        guard lhs.state.obs == rhs.state.obs else { return false }
        return lhs.act.action == rhs.act.action
    }

    /// Feeds the observation index and then the action index into `hasher`.
    func hash(into hasher: inout Hasher) {
        let observationIndex = state.obs
        let actionIndex = act.action
        hasher.combine(observationIndex)
        hasher.combine(actionIndex)
    }
}

In [6]:
/// Returns the action with the highest Q-value for `obs`.
///
/// - Parameters:
///   - s: Learner state holding the Q-table.
///   - obs: Observation whose row of Q-values is inspected.
/// - Returns: The action component of `best_action_value`; the value itself is discarded.
func select_greedy_action(s: StateRecord,
                          obs: Observation) -> Action {
    return best_action_value(s: s, cs: obs).1
}

/// Finds the largest Q-value in the row for `cs`, and the action that attains it.
///
/// - Parameters:
///   - s: Learner state holding the Q-table.
///   - cs: Current observation; selects the row `s.state[cs.obs]`.
/// - Returns: `(max value wrapped as Reward, argmax wrapped as Action)`.
func best_action_value(s: StateRecord,
                       cs: Observation) -> (Reward, Action) {
    // Q-values for this observation (one entry per action for the 2-D table built in this notebook).
    let actionValues = s.state[cs.obs]

    // Both reductions yield single-element tensors; pull the scalar out of each.
    let greedyValue: Float = actionValues.max().scalarized()
    let greedyIndex = Int(actionValues.argmax().scalarized())

    return (Reward(reward: greedyValue), Action(action: greedyIndex))
}

In [7]:
// Scratch 10 x 4 tensor of zeros, used in the next cells to explore the Tensor API.
var a = TF(zeros: [10,4])

In [8]:
// Indexing with a single integer yields one row of the tensor.
a[0]

[0.0, 0.0, 0.0, 0.0]


In [9]:
// Check the static types produced by `argmax()` on a row and by reading a
// scalar out of a tensor; `best_action_value` depends on both.
print(type(of:a[3].argmax()))
print(type(of: a[0][0].scalars[0]))

Tensor<Int32>
Float


In [10]:
let random = Python.import("numpy.random")

In [11]:
/// Chooses an action epsilon-greedily.
///
/// With probability `eps` a uniformly random action index is drawn; otherwise
/// the greedy action for `obs` is returned.
///
/// - Parameters:
///   - s: Learner state holding the Q-table.
///   - obs: Current observation.
///   - n_actions: Number of available actions; random draws are in `0..<n_actions`.
///   - eps: Exploration probability.
/// - Returns: The selected action.
func select_eps_greedy_action(s: StateRecord,
                              obs: Observation,
                              n_actions: Int,
                              eps: Float) -> Action {
    // A failed Python-to-Swift conversion here means the numpy interop is
    // broken, which is a programmer error, so force-unwrapping is intentional.
    let r = Float(random.random())!

    if r < eps {
        // numpy's `randint(low, high)` excludes `high`, so this is always a valid action index.
        let r_act = Int(random.randint(0, n_actions))!
        return Action(action: r_act)
    }

    // Only look up the greedy action when it is actually needed; the value
    // half of the result is not used here.
    let (_, act) = best_action_value(s: s, cs: obs)
    return act
}


In [12]:
/// Applies one tabular Q-learning update and returns the updated learner state.
///
/// The update is `Q[s, a] += lr * (r + g * max_a' Q[s', a'] - Q[s, a])`.
/// `sr` itself is not modified; a copy carrying the update is returned.
///
/// - Parameters:
///   - sr: Current learner state.
///   - current_obs: Observation the action was taken from.
///   - next_obs: Observation reached after the action.
///   - r: Reward received for the transition.
///   - a: Action that was taken.
///   - g: Discount factor applied to the best next-state value.
///   - lr: Learning rate scaling the correction.
/// - Returns: A new `StateRecord` with the single table entry adjusted.
func q_learning(sr: StateRecord, current_obs: Observation,
                next_obs: Observation, r: Reward, a: Action,
                g: Float, lr: Float) -> StateRecord {
    let row = current_obs.obs
    let column = a.action

    // Best value attainable from the next observation under the current table.
    let bootstrapValue = best_action_value(s: sr, cs: next_obs).0.reward

    let target = r.reward + g * bootstrapValue
    // Difference between the target and the current estimate (a tensor, since
    // it is formed against a table entry).
    let tdError = target - sr.state[row, column]

    var updated = sr
    updated.state[row, column] += lr * tdError
    return updated
}

In [13]:
/// Advances the Python environment by one step and converts the result to Swift types.
///
/// Only the first three elements of the value returned by `env.step` are
/// read; anything after them is ignored.
///
/// - Parameters:
///   - env: Python environment object exposing `step`.
///   - act: Action to apply.
/// - Returns: `(next observation, reward, done flag)`.
func step_env(env: PythonObject, act: Action) -> (Observation, Reward, Bool) {
    let outcome = env.step(act.action)

    // Each conversion is force-unwrapped: a value of an unexpected Python type traps here.
    return (Observation(obs: Int(outcome[0])!),
            Reward(reward: Float(outcome[1])!),
            Bool(outcome[2])!)
}
/// Resets the Python environment and wraps the integer it returns as an `Observation`.
///
/// - Parameter env: Python environment object exposing `reset`.
/// - Returns: The observation produced by the reset.
func reset_env(env: PythonObject) -> Observation {
    // Force-unwrapped: traps if `reset()` does not return something convertible to `Int`.
    return Observation(obs: Int(env.reset())!)
}

In [14]:
/// Plays `no_games` games with the greedy policy and returns the mean total reward per game.
///
/// Each game starts from `reset_env` and runs until the environment reports
/// `done`. The environment is left in its finished state after the last game,
/// so callers should reset it before stepping again.
///
/// - Parameters:
///   - env: Python environment object.
///   - s: Learner state whose Q-table defines the greedy policy.
///   - no_games: Number of games to play. Values `<= 0` play nothing and
///     yield a reward of `0`.
/// - Returns: Sum of all rewards collected, divided by `no_games`.
func test_game(env: PythonObject, s: StateRecord,
               no_games: Int) -> Reward {
    // Nothing to average over. This also avoids forming an invalid range for
    // negative counts and a 0/0 division (NaN) for zero.
    guard no_games > 0 else {
        return Reward(reward: 0.0)
    }

    // Reward summed across *all* games; divided once at the end.
    var totalReward: Float = 0.0

    for _ in 0..<no_games {
        var obs = reset_env(env: env)
        var done = false

        while !done {
            let action = select_greedy_action(s: s, obs: obs)
            let (nextObs, reward, finished) = step_env(env: env, act: action)

            totalReward += reward.reward
            obs = nextObs
            done = finished
        }
    }

    return Reward(reward: totalReward / Float(no_games))
}

# Define hyperparameters

In [15]:
// Discount factor applied to the best next-state value in the Q-learning target.
let Gamma: Float = 0.95

// Exploration probability for epsilon-greedy action selection.
var epsilon: Float = 1.0
// Multiplicative decay applied to `epsilon` each time a training game ends.
let EPS_DECAY_RATE: Float = 0.99939
// Step size scaling each Q-learning correction.
let LEARNING_RATE: Float = 0.8

// Number of greedy games played per evaluation.
let TEST_EPISODES: Int = 100
// Number of training games to play before the training loop stops.
let MAX_GAMES: Int = 15001

# Set up the game

In [16]:
// Create the FrozenLake environment through the Python `gym` module and
// reset it to obtain the initial observation as a Swift `Int`.
let env = gym.make("FrozenLake-v0")
let o = Int(env.reset())!
// Bare expression so the notebook echoes the value.
o

0


In [17]:
// Read the sizes of the observation and action spaces from the environment;
// they determine the shape of the Q-table.
let obs_length: Int = Int(env.observation_space.n)!
let n_actions: Int = Int(env.action_space.n)!
print(obs_length)
print(n_actions)

16
4


In [18]:
// Zero-initialised Q-table with one row per observation and one column per action.
var state_matrix = TF(zeros: [obs_length, n_actions])

In [19]:
// Number of training games completed so far.
var games_count = 0
// Evaluation results appended by the training loop.
var test_rewards_list : [Reward] = []

// Current observation, starting from the initial reset value `o`.
var obs = Observation(obs:o)
// Learner state wrapping the zero-initialised Q-table.
var sr = StateRecord(state:state_matrix)

In [21]:
// Reset the game counter and the exploration rate. The Q-table in `sr` is
// left as it is, so re-running this cell continues from the current table.
games_count = 0
epsilon = 1.0

// Reward accumulated within the current game; cleared whenever a game ends.
var total_reward: Float = 0

while games_count < MAX_GAMES {
    // Act epsilon-greedily, observe the transition, and apply one Q-learning update.
    let act = select_eps_greedy_action(s: sr,
                                       obs: obs,
                                       n_actions: n_actions,
                                       eps: epsilon)
    let (next_obs, reward, done) = step_env(env: env, act: act)
    sr = q_learning(sr: sr, current_obs: obs, next_obs: next_obs,
                    r: reward, a: act, g: Gamma,
                    lr: LEARNING_RATE)
    obs = next_obs
    total_reward += reward.reward

    // Everything below is end-of-game bookkeeping.
    guard done else { continue }

    epsilon *= EPS_DECAY_RATE

    // Evaluate the greedy policy every 1000 games, including game 0.
    let isEvaluationGame = games_count % 1000 == 0
    if isEvaluationGame {
        let test_reward = test_game(env: env, s: sr, no_games: TEST_EPISODES)
        test_rewards_list.append(test_reward)

        let progressLine = "Games count: " + String(games_count) + " Epsilon: " + String(epsilon)
        print(progressLine)
        print(test_reward.reward)
    }

    // Start the next game. This also re-arms the environment after an evaluation run.
    obs = reset_env(env: env)
    games_count += 1
    total_reward = 0
}

Games count: 0 Epsilon: 0.99939
0.0
Games count: 1000 Epsilon: 0.54292196
0.5
Games count: 2000 Epsilon: 0.29494384
0.49
Games count: 3000 Epsilon: 0.16022928
0.16
Games count: 4000 Epsilon: 0.0870451
0.28
Games count: 5000 Epsilon: 0.04728754
0.66
Games count: 6000 Epsilon: 0.025689088
0.59
Games count: 7000 Epsilon: 0.013955676
0.61
Games count: 8000 Epsilon: 0.0075814645
0.78
Games count: 9000 Epsilon: 0.004118649
0.78
Games count: 10000 Epsilon: 0.0022374657
0.63
Games count: 11000 Epsilon: 0.0012155101
0.51
Games count: 12000 Epsilon: 0.0006603301
0.59
Games count: 13000 Epsilon: 0.0003587266
0.75
Games count: 14000 Epsilon: 0.00019487953
0.69
Games count: 15000 Epsilon: 0.000105869
0.41


In [None]:
// Echo the learned Q-table.
sr.state

In [None]:
// Index range covering the 16 observations reported by the environment above.
let myRange: Range = 0..<16

In [None]:
// Print the greedy action (argmax over the Q-values) for every observation row.
for observationIndex in myRange {
    let greedyAction = sr.state[observationIndex].argmax()
    print(greedyAction)
}

In [None]:
// Echo the Q-values learned for observation 14.
sr.state[14]

In [None]:
// Manually step the environment with action index 0 and echo the raw Python result.
env.step(0)

In [None]:
// Evaluate the final greedy policy over TEST_EPISODES games.
test_game(env: env, s: sr, no_games: TEST_EPISODES)
