In [1]:
%install-location $cwd/swift-install


In [2]:
import Foundation
import TensorFlow

In [3]:
import Python
let gym = Python.import("gym")

In [4]:
/// Shorthand for a tensor of single-precision floats.
public typealias TF = Tensor<Float>


In [5]:
/// Holds a state represented as a `TF` tensor.
struct StateRecord {
    /// The stored state tensor.
    public var state: TF
}


/// Wraps an integer observation, such as the value obtained from the environment.
struct Observation {
    /// Raw integer observation value.
    public let obs: Int
}


/// Wraps the integer index of an action passed to the environment.
struct Action {
    /// Raw integer action index.
    public let action: Int
}

/// Wraps a scalar floating-point reward (also used to carry Q-values and averages).
struct Reward{
    /// Raw reward value.
    public let reward: Float
}

/// Key type for the Q-table: an observation paired with an action.
///
/// Equality and hashing are both defined on the underlying integer values
/// (`state.obs` and `act.action`), since `Observation` and `Action` are not
/// themselves `Hashable`.
struct StateActionPair: Hashable {
    /// Observation component of the key.
    public var state: Observation
    /// Action component of the key.
    public var act: Action

    /// Two pairs are equal when both their observation and action integers match.
    static func == (lhs: StateActionPair, rhs: StateActionPair) -> Bool {
        return (lhs.state.obs, lhs.act.action) == (rhs.state.obs, rhs.act.action)
    }

    /// Feeds the same two integers used by `==` into the hasher, observation first.
    func hash(into hasher: inout Hasher) {
        hasher.combine(state.obs)
        hasher.combine(act.action)
    }
}

In [6]:
/// Q-table type: maps a state-action pair to its estimated value.
public typealias SA = [StateActionPair: Float]


In [7]:
// Start from an empty Q-table so the variable is never read uninitialised;
// lookups elsewhere in this file treat missing pairs as 0.
var stateDict: SA = [:]

In [8]:
/// Returns the greedy action for an observation.
///
/// - Parameters:
///   - s: Q-table mapping state-action pairs to values.
///   - obs: Observation to act on.
/// - Returns: The action component of `best_action_value(s:cs:)`.
func select_greedy_action(s: SA,
                          obs: Observation) -> Action {
    // Only the action is needed; the accompanying value is discarded.
    return best_action_value(s: s, cs: obs).1
}

/// Finds the highest Q-value available from an observation and the action that attains it.
///
/// Pairs absent from `s` count as 0. The search is seeded with action 0's own
/// value rather than a literal 0, so a state whose stored Q-values are all
/// negative still yields its true maximum. Ties keep the lowest action index.
///
/// - Parameters:
///   - s: Q-table mapping state-action pairs to values.
///   - cs: Observation whose actions are compared.
///   - n_actions: Number of actions to consider, indexed `0..<n_actions`.
///     Defaults to 4, the count previously hard-coded here. Action 0 is always
///     evaluated, even when `n_actions` is less than 1.
/// - Returns: The best value wrapped in a `Reward`, and the matching `Action`.
func best_action_value(s: SA,
                       cs: Observation,
                       n_actions: Int = 4) -> (Reward, Action)
{
    var best_action = Action(action: 0)
    var best_value = s[StateActionPair(state: cs, act: best_action)] ?? 0

    // `stride` yields nothing when n_actions <= 1, whereas `1..<n_actions` would trap.
    for i in stride(from: 1, to: n_actions, by: 1) {
        let candidate = Action(action: i)
        // Look the pair up once and reuse the result for both the test and the update.
        let value = s[StateActionPair(state: cs, act: candidate)] ?? 0
        if value > best_value {
            best_value = value
            best_action = candidate
        }
    }

    return (Reward(reward: best_value), best_action)
}

In [9]:
let random = Python.import("numpy.random")

In [10]:
/// Chooses an action epsilon-greedily.
///
/// A sample is drawn with `numpy.random.random()`; when it is below `eps` a
/// random action index is drawn with `numpy.random.randint(0, n_actions)`,
/// otherwise the greedy action from `best_action_value(s:cs:)` is returned.
///
/// - Parameters:
///   - s: Q-table mapping state-action pairs to values.
///   - obs: Observation to act on.
///   - n_actions: Upper bound passed to `randint` for the random action index.
///   - eps: Threshold compared against the random sample.
/// - Returns: The selected action.
func select_eps_greedy_action(s: SA,
                              obs: Observation,
                              n_actions: Int,
                              eps: Float) -> Action {
    // Fail with a message instead of a bare force-unwrap if the Python value does not convert.
    guard let r = Float(random.random()) else {
        fatalError("numpy.random.random() did not convert to Float")
    }

    if r < eps {
        guard let r_act = Int(random.randint(0, n_actions)) else {
            fatalError("numpy.random.randint() did not convert to Int")
        }
        return Action(action: r_act)
    }

    // The greedy lookup runs only on this branch; its result is unused when exploring.
    let (_, act) = best_action_value(s: s, cs: obs)
    return act
}


In [11]:
/// Applies one Q-learning update to the table entry for (`current_obs`, `a`).
///
/// The target is `r + g * max_a' Q(next_obs, a')`, and the stored value is moved
/// towards that target by a fraction `lr` of the difference. A missing entry is
/// treated as 0.
///
/// - Parameters:
///   - sr: Q-table, updated in place.
///   - current_obs: Observation the action was taken from.
///   - next_obs: Observation reached after the action.
///   - r: Reward received for the transition.
///   - a: Action that was taken.
///   - g: Discount factor applied to the best next-state value.
///   - lr: Learning rate scaling the correction.
/// - Returns: The updated table (the same contents now held by `sr`).
func q_learning(sr:inout SA, current_obs: Observation,
                next_obs: Observation, r: Reward, a: Action,
                g: Float, lr: Float) -> SA
{
    let bootstrap = best_action_value(s: sr, cs: next_obs).0.reward
    let target = r.reward + g * bootstrap

    let key = StateActionPair(state: current_obs, act: a)
    let previous = sr[key] ?? 0
    let tdError = target - previous

    sr[key] = previous + lr * tdError
    return sr
}

In [12]:
/// Advances the environment by one action and converts the result to Swift values.
///
/// Reads the first three elements of the value returned by `env.step`, in order,
/// as an `Int` observation, a `Float` reward and a `Bool` flag. Each conversion
/// is force-unwrapped and traps if the Python value does not convert.
///
/// - Parameters:
///   - env: Python environment object exposing `step`.
///   - act: Action whose integer index is passed to `step`.
/// - Returns: The new observation, the reward, and the flag from the third element.
func step_env(env: PythonObject, act: Action) -> (Observation, Reward, Bool) {
    let transition = env.step(act.action)

    let nextObservation = Observation(obs: Int(transition[0])!)
    let stepReward = Reward(reward: Float(transition[1])!)
    let finished = Bool(transition[2])!

    return (nextObservation, stepReward, finished)
}
/// Resets the environment and returns its initial observation.
///
/// - Parameter env: Python environment object exposing `reset`.
/// - Returns: The value of `env.reset()` converted to `Int` (traps if the
///   conversion fails) and wrapped in an `Observation`.
func reset_env(env: PythonObject) -> Observation {
    return Observation(obs: Int(env.reset())!)
}

In [13]:
/// Plays `no_games` games with the greedy policy and returns the mean total reward per game.
///
/// Each game resets the environment and then steps with
/// `select_greedy_action` until `step_env` reports completion.
///
/// - Parameters:
///   - env: Python environment object, reset at the start of every game.
///   - s: Q-table used for greedy action selection.
///   - no_games: Number of games to play. The sum of rewards is divided by this value.
/// - Returns: The summed reward over all games divided by `no_games`.
func test_game(env: PythonObject, s: SA,
                no_games: Int ) -> Reward
{
    // Running sum across all games; only the overall mean is reported.
    var rewards : Float = 0.0

    for _ in 0..<no_games {
        var obs = reset_env(env: env)
        var done = false

        while !done {
            let next_act = select_greedy_action(s: s, obs: obs)
            let (next_obs, reward, finished) = step_env(env: env, act: next_act)

            obs = next_obs
            done = finished
            rewards += reward.reward
        }
    }

    return Reward(reward: rewards / Float(no_games))
}

# Create Hyperparams

In [14]:
// Discount factor passed to q_learning as `g`.
let Gamma : Float = 0.95

// Exploration threshold for epsilon-greedy selection; the training loop
// multiplies it by EPS_DECAY_RATE each time a game finishes.
var epsilon: Float = 1.0
let EPS_DECAY_RATE : Float = 0.99939
// Learning rate passed to q_learning as `lr`.
let LEARNING_RATE : Float = 0.8

// Number of greedy games played in each evaluation run.
let TEST_EPISODES = 100
// The training loop runs while fewer than this many games have finished.
let MAX_GAMES = 150001

# Set up game

In [15]:
// Create the Gym environment and convert the observation from the first reset to Int.
let env = gym.make("FrozenLake-v0")
let o = Int(env.reset())!
// Bare expression so the notebook displays the value.
o

0


In [16]:
// Read the `n` attribute of the environment's observation and action spaces as Swift Ints.
let obs_length: Int = Int(env.observation_space.n)!
let n_actions: Int = Int(env.action_space.n)!
print(obs_length)
print(n_actions)

16
4


In [17]:
// Zero tensor of shape [obs_length, n_actions]. NOTE(review): not referenced by the
// visible training code, which stores values in the `SA` dictionary — confirm whether it is still needed.
var state_matrix = TF(zeros: [obs_length, n_actions])

In [18]:
// Still need to initialise the dictionary!!!

In [19]:
// Count of finished training games and the history of periodic evaluation results.
var games_count = 0
var test_rewards_list : [Reward] = []

// Current observation, seeded with the value obtained from the initial reset.
var obs = Observation(obs:o)
// Q-table seeded with a single zero entry for observation 0 / action 0.
var sr : SA = [StateActionPair(state:Observation(obs: 0), act:Action(action:0)) : 0]


In [20]:
// Training loop: act epsilon-greedily, update the Q-table after every step, and
// evaluate the greedy policy every 1000 finished games.
games_count = 0
epsilon = 1.0

var total_reward: Float = 0

while games_count < MAX_GAMES {
    let act = select_eps_greedy_action(s: sr,
                                       obs: obs,
                                       n_actions: n_actions,
                                       eps: epsilon)

    let (next_obs, reward, done) = step_env(env: env, act: act)
    sr = q_learning(sr: &sr,
                    current_obs: obs,
                    next_obs: next_obs,
                    r: reward,
                    a: act,
                    g: Gamma,
                    lr: LEARNING_RATE)
    obs = next_obs
    total_reward += reward.reward

    // Everything below is end-of-game bookkeeping.
    guard done else { continue }

    epsilon *= EPS_DECAY_RATE

    if games_count % 1000 == 0 {
        let test_reward = test_game(env: env, s: sr, no_games: TEST_EPISODES)
        test_rewards_list.append(test_reward)
        print("Games count: \(games_count) Epsilon: \(epsilon)")
        print(test_reward.reward)
    }

    // Start the next game from a fresh reset (the evaluation above also used `env`).
    obs = reset_env(env: env)
    games_count += 1
    total_reward = 0
}

Games count: 0 Epsilon: 0.99939
0.0
Games count: 1000 Epsilon: 0.54292196
0.08
Games count: 2000 Epsilon: 0.29494384
0.63
Games count: 3000 Epsilon: 0.16022928
0.58
Games count: 4000 Epsilon: 0.0870451
0.38
Games count: 5000 Epsilon: 0.04728754
0.56
Games count: 6000 Epsilon: 0.025689088
0.62
Games count: 7000 Epsilon: 0.013955676
0.73
Games count: 8000 Epsilon: 0.0075814645
0.73
Games count: 9000 Epsilon: 0.004118649
0.72
Games count: 10000 Epsilon: 0.0022374657
0.69
Games count: 11000 Epsilon: 0.0012155101
0.5
Games count: 12000 Epsilon: 0.0006603301
0.73
Games count: 13000 Epsilon: 0.0003587266
0.67
Games count: 14000 Epsilon: 0.00019487953
0.75
Games count: 15000 Epsilon: 0.000105869
0.78
Games count: 16000 Epsilon: 5.751362e-05
0.68
Games count: 17000 Epsilon: 3.1244417e-05
0.74
Games count: 18000 Epsilon: 1.6973609e-05
0.81
Games count: 19000 Epsilon: 9.220952e-06
0.66
Games count: 20000 Epsilon: 5.0093154e-06
0.71
Games count: 21000 Epsilon: 2.7213264e-06
0.72
Games count: 22000

: 

In [21]:
// Evaluate the current Q-table greedily over TEST_EPISODES games; the notebook displays the result.
test_game(env: env, s: sr, no_games: TEST_EPISODES)


▿ Reward
  - reward : 0.75
