In [None]:
%install-location $cwd/swift-install
%install '.package(path: "~/git/swiftai")' SwiftAI

In [None]:
%include "EnableIPythonDisplay.swift"
IPythonDisplay.shell.enable_matplotlib("inline")

In [None]:
//export
import Path
import SwiftAI
import TensorFlow

In [None]:
/// Shorthand for a tensor of `Float` scalars.
public typealias TF = Tensor<Float>
/// Shorthand for a tensor of `Int32` scalars.
public typealias TI = Tensor<Int32>

In [None]:
//export
import Python

let gym = Python.import("gym")
let np = Python.import("numpy")

In [None]:

/// Wraps a single environment observation as a float tensor.
struct Observation {
    /// The observation data.
    public let obs: TF
}

// All-zero placeholder observation of shape [1, 210, 160, 3], the same shape
// `np_to_obs` reshapes environment frames to.
let empty_obs = Observation(obs: TF(zeros: [1, 210, 160, 3]))

/// Wraps the integer action index that is handed to `env.step`.
struct Action {
    /// The action index.
    public let action: Int
}

/// Scalar reward produced by one environment step (or the sum of several).
///
/// All `AdditiveArithmetic` requirements (`zero`, `+`, `-`) are spelled out
/// inside the type so the conformance does not rely on compiler synthesis.
struct Reward : AdditiveArithmetic {
    /// The underlying reward value.
    public let r: Float

    /// The additive identity: a reward of 0.
    static let zero = Reward(r: 0)

    /// Adds two rewards by summing their underlying values.
    static func + (lhs: Reward, rhs: Reward) -> Reward {
        return Reward(r: lhs.r + rhs.r)
    }

    /// Subtracts two rewards by subtracting their underlying values.
    static func - (lhs: Reward, rhs: Reward) -> Reward {
        return Reward(r: lhs.r - rhs.r)
    }
}

// Free-standing zero constant, kept for code that refers to it by this name.
let zero_reward = Reward.zero

# Need to deal with the Gym environment

In [None]:
/// Converts a NumPy array holding one frame into a `[1, 210, 160, 3]` float tensor.
///
/// Traps with a descriptive message if the NumPy array cannot be converted,
/// instead of crashing on a bare force-unwrap.
///
/// - Parameter arr: NumPy array that can be reshaped to `[1, 210, 160, 3]`.
/// - Returns: The array contents as a `Tensor<Float>`.
func np_to_tf(_ arr: PythonObject) -> TF {
    // Reshape and cast on the Python side before handing the array to TensorFlow.
    let frame = np.array(arr.reshape([1, 210, 160, 3]), dtype: np.float32)
    guard let tensor = Tensor<Float>(numpy: frame) else {
        fatalError("np_to_tf: could not convert NumPy array to Tensor<Float>")
    }
    return tensor
}

In [None]:
/// Converts a NumPy array holding one frame into an `Observation`.
///
/// The array is reshaped to `[1, 210, 160, 3]` and cast to `float32` first.
/// Traps with an explanatory message if the tensor conversion fails.
///
/// - Parameter arr: NumPy array that can be reshaped to `[1, 210, 160, 3]`.
/// - Returns: The frame wrapped in an `Observation`.
func np_to_obs(_ arr: PythonObject) -> Observation {
    let frame = np.array(arr.reshape([1, 210, 160, 3]), dtype: np.float32)
    // Fail once, with the message, rather than printing and then force-unwrapping nil.
    guard let tensor = Tensor<Float>(numpy: frame) else {
        fatalError("np_to_obs: Oh no! observation did not parse!")
    }
    return Observation(obs: tensor)
}

In [None]:
/// Resets `env` and wraps the value returned by `env.reset()` as an `Observation`.
func reset_env(_ env: PythonObject) -> Observation {
    let firstFrame = env.reset()
    return np_to_obs(firstFrame)
}

/// Performs one `env.step` with the given action.
///
/// - Returns: The converted observation (element 0 of the step result), the
///   reward (element 1) and element 2 converted to `Bool`. The training loop
///   that calls this treats that flag as "episode done".
func step_env(env: PythonObject, act: Action) -> (Observation, Reward, Bool) {
    let outcome = env.step(act.action)

    let observation = np_to_obs(outcome[0])
    let reward = Reward(r: Float(outcome[1])!)
    let flag = Bool(outcome[2])!

    return (observation, reward, flag)
}

In [None]:
/// Repeats `act` for up to `n` environment steps, summing the reward over them.
///
/// Exactly `n` steps are taken unless the environment reports that the episode
/// has finished, in which case stepping stops immediately so the environment
/// is never stepped past the end of an episode.
///
/// - Parameters:
///   - n: Number of steps to take; must be at least 1.
///   - env: Environment object whose `step` method is called.
///   - act: Action repeated on every step.
/// - Returns: The observation from the last step taken, the summed reward and
///   the finished flag reported by that last step.
func step_n_times(n: Int = 4, env: PythonObject, act: Action) ->
    (Observation, Reward, Bool)
{
    precondition(n >= 1, "step_n_times: n must be at least 1")

    // Pulls (reward, finished) out of one `env.step` result.
    func parse(_ stepResult: PythonObject) -> (Float, Bool) {
        guard let reward = Float(stepResult[1]),
              let finished = Bool(stepResult[2]) else {
            fatalError("step_n_times: could not read reward/finished flag from env.step result")
        }
        return (reward, finished)
    }

    // The first step always happens; it also gives `res` its initial value.
    var res = env.step(act.action)
    var (reward, finished) = parse(res)
    var total = Reward(r: reward)

    // Remaining n - 1 steps, abandoned as soon as the episode ends.
    for _ in 1..<n {
        if finished { break }
        res = env.step(act.action)
        (reward, finished) = parse(res)
        total = total + Reward(r: reward)
    }

    return (np_to_obs(res[0]), total, finished)
}

# Now try to build a DQN

In [None]:
/// Q-network: three `ConvLayer`s, a global average pool, then two dense layers.
/// The last dense layer has one output per action.
public struct DQNModel: Layer {
    public var conv1: ConvLayer
    public var conv2: ConvLayer
    public var conv3: ConvLayer
    public var pool = GlobalAvgPool2D<Float>()
    public var linear1: Dense<Float>
    public var linear2: Dense<Float>

    /// - Parameter nActions: Output size of the final dense layer.
    public init(nActions: Int) {
        // Width of the last conv layer (fed to `linear1`) and of the hidden dense layer.
        let convFeatures = 64
        let hiddenUnits = 256

        // NOTE(review): the first two ConvLayer arguments are presumably
        // input/output channel counts — confirm against SwiftAI's ConvLayer.
        self.conv1 = ConvLayer(3, 32, ks: 8)
        self.conv2 = ConvLayer(32, convFeatures, ks: 4, stride: 2)
        self.conv3 = ConvLayer(convFeatures, convFeatures, ks: 3)
        self.linear1 = Dense(inputSize: convFeatures, outputSize: hiddenUnits, activation: relu)
        self.linear2 = Dense(inputSize: hiddenUnits, outputSize: nActions)
    }

    /// Runs `input` through every layer in declaration order.
    @differentiable
    public func callAsFunction(_ input: TF) -> TF {
        return input.compose(conv1, conv2, conv3, pool, linear1, linear2)
    }
}

In [None]:
/// Hyper-parameters handed to `Agent`.
public struct AgentHyperParams {
    /// Exploration rate before any step has been taken.
    public var epsilon_start: Float = 1
    /// Amount subtracted from the exploration rate per agent step.
    public var epsilon_decay: Float = 0.000001
    /// Lower bound of the exploration rate.
    public var epsilon_final: Float = 0.02
    /// Learning rate (stored only; no code visible here reads it yet).
    public let learning_rate: Float
    /// Presumably the discount factor — stored only; confirm where it is consumed.
    public let gamma: Float
    /// Number of actions; used as the output size of the agent's networks.
    public let num_actions: Int

    /// - Parameters:
    ///   - lr: Value for `learning_rate`.
    ///   - ga: Value for `gamma`.
    ///   - na: Value for `num_actions`.
    public init(lr: Float, ga: Float, na: Int) {
        self.learning_rate = lr
        self.gamma = ga
        self.num_actions = na
    }
}

// Hyper-parameters used to build the agent in this notebook.
var params = AgentHyperParams(lr: 0.8, ga: 0.99, na: 6)

In [None]:
let random = Python.import("numpy.random")

In [None]:
/// Epsilon-greedy agent holding two `DQNModel`s and parallel transition buffers.
///
/// Training itself is not implemented yet: `sample_and_optimize` only checks
/// whether enough transitions have been collected.
public struct Agent {
    /// Network intended to be trained.
    public var model: DQNModel
    /// The target is frozen while the model learns against it.
    public var target: DQNModel

    // Parallel arrays: index i of each buffer belongs to the same transition.
    // NOTE(review): these grow without bound; each observation is a
    // [1, 210, 160, 3] float tensor, so long runs may need a capacity limit.
    public var transition_obs_buffer: [Observation]
    public var transition_act_buffer: [Action]
    public var transition_rew_buffer: [Reward]

    /// Current exploration probability.
    public var epsilon : Float
    public var params: AgentHyperParams

    /// Number of actions requested so far (incremented in `act_eps_greedy`).
    public var steps: Int

    /// Builds both networks from `par.num_actions` and starts with empty buffers.
    public init(par: AgentHyperParams) {
        params = par

        model = DQNModel(nActions: params.num_actions)
        target = DQNModel(nActions: params.num_actions)
        // NOTE(review): meant to start the target from the model's weights;
        // confirm that assigning `variables` copies rather than aliases them.
        target.variables = model.variables

        transition_obs_buffer = []
        transition_act_buffer = []
        transition_rew_buffer = []
        epsilon = params.epsilon_start
        steps = 0
    }

    /// Picks an action for `obs`: random with probability `epsilon`, otherwise
    /// the arg-max of the target network's output.
    ///
    /// Increments `steps` on every call.
    public mutating func act_eps_greedy(obs: Observation) -> Action {
        let r = Float(random.random())!
        steps += 1

        if r < epsilon {
            // Sample from the configured number of actions rather than a
            // hard-coded 6, so the agent follows `params.num_actions`.
            let a = Int(random.randint(params.num_actions))!
            return Action(action: a)
        }
        print("Taking real action")

        // NOTE(review): the greedy action comes from `target`, not `model`;
        // DQN usually acts with the online network — confirm this is intended.
        let a = Int(target(obs.obs).argmax().scalar!)
        return Action(action: a)
    }

    /// Placeholder for the learning step; currently returns without learning.
    public mutating func sample_and_optimize(batch_size: Int) -> () {
        // probably a bad heuristic...
        if transition_obs_buffer.count < batch_size*16 {
            return
        }

        // now sample randomly and learn.
    }

    /// Appends one transition to the buffers and recomputes `epsilon` as
    /// `epsilon_start - steps * epsilon_decay`, floored at `epsilon_final`.
    public mutating func add_env_feedback(obs: Observation, act: Action, rew: Reward) -> () {
        transition_obs_buffer.append(obs)
        transition_act_buffer.append(act)
        transition_rew_buffer.append(rew)
        epsilon = max(params.epsilon_final, params.epsilon_start - Float(steps)*params.epsilon_decay)
    }
}

In [None]:
// Stand-alone network instance with 6 outputs, used by the scratch cells below.
var p1 = DQNModel(nActions: 6)

In [None]:
// Take a snapshot of the network's `variables`.
var vars = p1.variables

In [None]:
// Build the agent from the hyper-parameters in `params`.
var agent = Agent(par: params)

In [None]:
// Write the snapshot back into the network.
p1.variables = vars

In [None]:
// Create the Gym environment and keep the first frame, reshaped to [1, 210, 160, 3].
var  env = gym.make("PongNoFrameskip-v4")
var s1 = env.reset().reshape([1, 210,160,3])

In [None]:
// Inspect the action space to see how many actions the environment offers.
env.action_space

In [None]:
// Rollout state shared with the loop cell that follows.
var current_obs: Observation                    // observation the agent acts on
var next_obs = Observation(obs: np_to_tf(s1))   // starts as the first frame
var rew: Reward                                 // reward of the latest step group
var finished = false                            // flag returned by the latest step group

// Game counter and the number of games to play.
var count = 0
let total_games = 10

In [None]:
// Play `total_games` games: act, step the environment, record the transition
// and give the agent a chance to learn after every action.
while count < total_games {
    current_obs = next_obs

    let act = agent.act_eps_greedy(obs: current_obs)

    let outcome = step_n_times(env: env, act: act)
    next_obs = outcome.0
    rew = outcome.1
    finished = outcome.2

    agent.add_env_feedback(obs: current_obs, act: act, rew: rew)
    agent.sample_and_optimize(batch_size: 32)

    // Nothing more to do until the current game ends.
    guard finished else { continue }

    count += 1
    next_obs = reset_env(env)
    print("game " + String(count) + " starting ")
}

In [None]:
// Number of actions the agent has been asked for so far.
agent.steps

In [None]:
// First stored observation tensor.
var a = agent.transition_obs_buffer[0].obs

In [None]:
// Arg-max of the target network's output for that observation.
Int(agent.target(a).argmax().scalar!)

In [None]:
// Repeat action 1 via `step_n_times` with n = 10; `a` is rebound to the returned observation.
var (a,b,c) = step_n_times(n:10, env: env, act: Action(action:1))

In [None]:
// Show every stored observation.
agent.transition_obs_buffer

In [None]:
// Summed element-wise difference between the latest observation and the first frame.
(a.obs - np_to_tf(s1)).sum()

In [None]:
// Run the stand-alone network on the first frame.
var r = np_to_tf(s1)
var res = p1(r)

In [None]:
// Arg-max of the network output (`a` is rebound again).
var a = res.argmax()

In [None]:
// The arg-max as a plain Int.
Int(a.scalar!)

In [None]:
/// Returns the action that `best_action_value` picks for `obs`, ignoring its value.
func select_greedy_action(s: SA,
                          obs: Observation) -> Action {
    return best_action_value(s: s, cs: obs).1
}

/// Scans actions 0..<4 and returns the highest table value together with its action.
///
/// Entries missing from `s` count as 0. The search starts from value 0 and
/// action 0, so only strictly positive entries can replace that default.
/// NOTE(review): this means an all-negative row still yields (0, action 0) —
/// confirm that is intended. The action count 4 is hard-coded here.
///
/// - Parameters:
///   - s: Table indexed by `StateActionPair`.
///   - cs: Observation whose actions are compared.
/// - Returns: The best value wrapped in a `Reward`, and the matching action.
func best_action_value(s: SA,
                       cs: Observation) -> (Reward, Action)
{
    var best_value: Float = 0
    var best_action = Action(action: 0)

    for i in 0..<4 {
        let candidate = Action(action: i)
        // Look the entry up once and reuse it for the comparison and the update.
        let value = s[StateActionPair(state: cs, act: candidate)] ?? 0
        if value > best_value {
            best_value = value
            best_action = candidate
        }
    }

    // `Reward`'s stored property is `r`, so its memberwise initializer is `Reward(r:)`.
    return (Reward(r: best_value), best_action)
}

In [None]:
/// Epsilon-greedy choice: with probability `eps` returns a random action in
/// `0..<n_actions`, otherwise the action chosen by `best_action_value`.
func select_eps_greedy_action(s: SA,
                              obs: Observation,
                              n_actions: Int,
                              eps: Float) -> Action {
    let (_, greedy) = best_action_value(s: s, cs: obs)

    let draw = Float(random.random())!
    guard draw < eps else {
        return greedy
    }

    let explore = Int(random.randint(0, n_actions))!
    return Action(action: explore)
}

/// Applies one tabular Q-learning update to `sr` and returns the updated table.
///
/// The entry for (`current_obs`, `a`) moves towards
/// `r + g * best value of next_obs` by the fraction `lr`. Entries missing
/// from the table count as 0.
///
/// - Parameters:
///   - sr: Table to update in place.
///   - current_obs: Observation the action was taken from.
///   - next_obs: Observation reached after the action.
///   - r: Reward received for the action.
///   - a: Action that was taken.
///   - g: Factor applied to the best next value.
///   - lr: Step size of the update.
/// - Returns: The updated table (same contents as `sr` after the call).
func q_learning(sr:inout SA, current_obs: Observation,
                next_obs: Observation, r: Reward, a: Action,
                g: Float, lr: Float) -> SA
{
    let (best_next, _) = best_action_value(s: sr, cs: next_obs)

    let pair = StateActionPair(state: current_obs, act: a)
    // Read the current estimate once; it is needed for both the error and the update.
    let current = sr[pair] ?? 0

    // `Reward` stores its value in `r`.
    let Q_target = r.r + g * best_next.r
    let Q_error = Q_target - current

    sr[pair] = current + lr * Q_error

    return sr
}

In [None]:
// Passed to `q_learning` as `g`.
let Gamma : Float = 0.95

// Exploration rate; multiplied by EPS_DECAY_RATE each time a game finishes.
var epsilon: Float = 1.0
let EPS_DECAY_RATE : Float = 0.99939
// Passed to `q_learning` as `lr`.
let LEARNING_RATE : Float = 0.8

// Passed to `test_game` as `no_games`.
let TEST_EPISODES = 100
// The training loop runs while fewer than this many games have been played.
let MAX_GAMES = 150001

In [None]:
// Tabular Q-learning training loop.
// NOTE(review): `games_count`, `sr`, `obs`, `n_actions`, `test_rewards_list`
// and `test_game` are not defined in any cell visible here — they must exist
// before this cell can run.
games_count = 0
epsilon = 1.0

var total_reward : Float = 0

while (games_count < MAX_GAMES) {
    let act = select_eps_greedy_action(s: sr, obs: obs,
                                       n_actions: n_actions,
                                       eps: epsilon)

    let (next_obs, reward, done) = step_env(env: env, act: act)
    sr = q_learning(sr: &sr, current_obs: obs, next_obs: next_obs,
                    r: reward, a: act, g: Gamma,
                    lr: LEARNING_RATE)
    obs = next_obs
    // `Reward` stores its value in `r`.
    total_reward += reward.r

    if (done) {
        // Explore a little less after every finished game.
        epsilon *= EPS_DECAY_RATE

        // Evaluate every 1000 games.
        if ((games_count % 1000) == 0) {
            let test_reward = test_game(env: env, s: sr, no_games: TEST_EPISODES)
            test_rewards_list.append(test_reward)
            print("Games count: " + String(games_count) + " Epsilon: " + String(epsilon))
            // NOTE(review): left as-is because `test_game`'s return type is not
            // visible; if it returns this file's `Reward`, this must be `.r`.
            print(test_reward.reward)
        }

        // `reset_env` takes its environment argument without a label.
        obs = reset_env(env)
        games_count += 1
        total_reward = 0
    }
}

In [None]:
// NOTE(review): `DQNModel` as declared in this notebook has no `shape` member —
// confirm where this is expected to come from.
p1.shape

In [None]:
//reset_env(env)

In [None]:
//step_n_times(n: 3, env: env, act: Action(action:1))