In [None]:
%install-location $cwd/swift-install
%install '.package(path: "~/git/swiftai")' SwiftAI

In [None]:
%include "EnableIPythonDisplay.swift"
IPythonDisplay.shell.enable_matplotlib("inline")

In [None]:
//export
import Path
import SwiftAI
import TensorFlow

In [None]:
/// Shorthand for a tensor of `Float` scalars.
public typealias TF = Tensor<Float>
/// Shorthand for a tensor of `Int32` scalars.
public typealias TI = Tensor<Int32>

In [None]:
//export
import Python

// Python modules loaded through Swift's Python interop; both are used by
// the environment helpers in this notebook.
let gym = Python.import("gym")
let np = Python.import("numpy")

In [None]:

/// Wraps a single environment observation as a float tensor.
struct Observation {
    /// The observation data. `np_to_obs` in this notebook builds it with
    /// shape [1, 210, 160, 3].
    public let obs: TF
}

/// An all-zero observation of shape [1, 210, 160, 3].
let empty_obs = Observation(obs: TF(zeros: [1, 210, 160, 3]))

/// Wraps the integer action that is forwarded to the environment's `step` call.
struct Action {
    /// The raw action value passed to `env.step`.
    public let action: Int
}

/// A scalar reward that can be summed across environment steps.
///
/// The `AdditiveArithmetic` requirements (`zero`, `+` and `-`) are provided
/// explicitly as static members so the conformance is complete without
/// relying on any compiler-synthesised implementation. `Equatable`, which
/// `AdditiveArithmetic` refines, is synthesised from the single `Float` field.
struct Reward: AdditiveArithmetic {
    /// The reward value.
    public let r: Float

    /// The additive identity: a reward of 0.
    static let zero = Reward(r: 0)

    /// Adds two rewards by adding their values.
    static func + (lhs: Reward, rhs: Reward) -> Reward {
        return Reward(r: lhs.r + rhs.r)
    }

    /// Subtracts two rewards by subtracting their values.
    static func - (lhs: Reward, rhs: Reward) -> Reward {
        return Reward(r: lhs.r - rhs.r)
    }
}

/// A reward of 0; kept as a global for code that refers to it by this name.
let zero_reward = Reward.zero

# Need to deal with the Gym environment

In [None]:
/// Converts a NumPy array into a float tensor of shape [1, 210, 160, 3].
///
/// - Parameter arr: A Python object that supports `reshape`; it is reshaped
///   to [1, 210, 160, 3] and converted to `float32` before being wrapped.
/// - Returns: The resulting `Tensor<Float>`.
///
/// Traps if the tensor conversion returns `nil`.
func np_to_tf(_ arr: PythonObject) -> TF {
    let frame = arr.reshape([1, 210, 160, 3])
    let floatFrame = np.array(frame, dtype: np.float32)
    return Tensor<Float>(numpy: floatFrame)!
}

In [None]:
/// Converts a NumPy array into an `Observation`.
///
/// - Parameter arr: A Python object that supports `reshape`; it is reshaped
///   to [1, 210, 160, 3] and converted to `float32` before being wrapped.
/// - Returns: An `Observation` holding the converted tensor.
///
/// Traps with a descriptive message if the array cannot be converted to a
/// `Tensor<Float>`. There is no meaningful observation to return in that
/// case, so the failure is reported at the point where it happens.
func np_to_obs(_ arr: PythonObject) -> Observation {
    let frame = np.array(arr.reshape([1, 210, 160, 3]), dtype: np.float32)
    guard let tensor = Tensor<Float>(numpy: frame) else {
        fatalError("np_to_obs: Oh no! observation did not parse!")
    }
    return Observation(obs: tensor)
}

In [None]:
/// Resets the environment and wraps the value it returns as an `Observation`.
///
/// - Parameter env: The Python environment object; its `reset()` is called.
/// - Returns: The result of `reset()` converted with `np_to_obs`.
func reset_env(_ env: PythonObject) -> Observation {
    let firstFrame = env.reset()
    return np_to_obs(firstFrame)
}

/// Advances the environment by one step with the given action.
///
/// - Parameters:
///   - env: The Python environment object; its `step` is called once.
///   - act: The action whose raw value is passed to `step`.
/// - Returns: A tuple of the observation (element 0 of the step result),
///   the reward (element 1) and the boolean flag (element 2). The driver
///   loop in this notebook treats that flag as "episode finished".
///
/// Traps if the reward or flag cannot be converted to `Float` / `Bool`.
func step_env(env: PythonObject, act: Action) -> (Observation, Reward, Bool) {
    let outcome = env.step(act.action)

    let observation = np_to_obs(outcome[0])
    let reward = Reward(r: Float(outcome[1])!)
    let flag = Bool(outcome[2])!

    return (observation, reward, flag)
}

In [None]:
/// Repeats one action for up to `n` environment steps, summing the reward.
///
/// Exactly `n` steps are taken unless the environment reports the episode
/// as finished earlier, in which case stepping stops at that point so a
/// finished environment is never stepped again before being reset.
///
/// - Parameters:
///   - n: Number of steps to take; must be at least 1. Defaults to 4.
///   - env: The Python environment object; its `step` is called.
///   - act: The action repeated on every step.
/// - Returns: The observation from the last step taken, the summed reward
///   over all steps taken, and the flag (element 2 of the step result)
///   from the last step taken.
///
/// Traps if `n < 1`, or if a reward or flag cannot be converted to
/// `Float` / `Bool`.
func step_n_times(n: Int = 4, env: PythonObject, act: Action) ->
    (Observation, Reward, Bool)
{
    precondition(n >= 1, "step_n_times: n must be at least 1")

    // The first step is taken unconditionally so there is always a result
    // to build the returned observation from.
    var res = env.step(act.action)
    var total = Reward(r: Float(res[1])!)
    var done = Bool(res[2])!

    var taken = 1
    while taken < n && !done {
        res = env.step(act.action)
        total = total + Reward(r: Float(res[1])!)
        done = Bool(res[2])!
        taken += 1
    }

    return (np_to_obs(res[0]), total, done)
}

# Now try to build a DQN

In [None]:
/// Q-network made of three convolutional layers, a global average pool and
/// two dense layers; the final dense layer has one output per action.
public struct DQNModel: Layer {
    public var conv1: ConvLayer
    public var conv2: ConvLayer
    public var conv3: ConvLayer
    public var pool = GlobalAvgPool2D<Float>()
    public var linear1: Dense<Float>
    public var linear2: Dense<Float>

    /// Builds the network.
    ///
    /// - Parameter nActions: Output size of the last dense layer.
    public init(nActions: Int) {
        let featureChannels = 64   // channels leaving the conv stack and entering `linear1`
        let hiddenUnits = 256      // width of the hidden dense layer

        conv1 = ConvLayer(3, 32, ks: 8)
        conv2 = ConvLayer(32, featureChannels, ks: 4, stride: 2)
        conv3 = ConvLayer(featureChannels, featureChannels, ks: 3)
        linear1 = Dense<Float>(inputSize: featureChannels, outputSize: hiddenUnits, activation: relu)
        linear2 = Dense<Float>(inputSize: hiddenUnits, outputSize: nActions)
    }

    /// Applies the layers in order: conv1, conv2, conv3, pool, linear1, linear2.
    @differentiable
    public func callAsFunction(_ input: TF) -> TF {
        return input.compose(conv1, conv2, conv3, pool, linear1, linear2)
    }
}

In [None]:
/// Hyper-parameters handed to an `Agent`.
public struct AgentHyperParams {
    /// Initial exploration rate.
    public var epsilon_start: Float = 1
    /// Amount subtracted from the exploration rate per agent step.
    public var epsilon_decay: Float = 1e-6
    /// Lower bound for the exploration rate.
    public var epsilon_final: Float = 0.02
    /// Learning rate.
    public let learning_rate: Float
    /// Presumably the discount factor; nothing in this notebook reads it yet.
    public let gamma: Float
    /// Number of actions; used as the output size of the agent's networks.
    public let num_actions: Int

    /// - Parameters:
    ///   - lr: Value for `learning_rate`.
    ///   - ga: Value for `gamma`.
    ///   - na: Value for `num_actions`.
    public init(lr: Float, ga: Float, na: Int) {
        self.learning_rate = lr
        self.gamma = ga
        self.num_actions = na
    }
}

/// Hyper-parameters used by the agent created later in this notebook.
var params = AgentHyperParams(lr: 0.8, ga: 0.99, na: 6)

In [None]:
// NumPy's random module via Python interop; `Agent` draws its exploration
// numbers from it.
let random = Python.import("numpy.random")

In [None]:
// Scratch cell: take the first stored observation for the dataset experiments
// in the following cells. `transition_obs_buffer` is a plain `TF`, so indexing
// it already yields a tensor; there is no `.obs` member to read.
let vvv = agent.transition_obs_buffer[0]
vvv

In [None]:
/// A training batch with two inputs (observations and actions) and one
/// label group.
public struct PongBatch<Obs, Act, Labels>: TensorGroup
where Obs: Differentiable & TensorGroup,
      Act: Differentiable & TensorGroup,
      Labels: TensorGroup {
    /// Observation input.
    public var x_obs: Obs
    /// Action input.
    public var x_act: Act

    /// Labels for the batch.
    public var yb: Labels
}

In [None]:
// Scratch: build a batch that reuses the same tensor for all three fields,
// then wrap it in a dataset.
let bb = PongBatch(x_obs:vvv, x_act:vvv, yb:vvv)
let tds = Dataset(elements: bb)

In [None]:
// Scratch: batch the dataset by 32 and then shuffle it; the result is only
// displayed by the notebook, not stored.
tds.batched(32).shuffled(sampleCount:1, randomSeed:42)

In [None]:
// Scratch: create a [10, 10] random tensor and print its static type.
let t = Tensor(randomStandardUniform: [10,10])
print(type(of:t))

In [None]:
// Scratch: inspect the shape produced by concatenating the tensor with itself.
t.concatenated(with: t).shape

In [None]:
/// Epsilon-greedy DQN agent: owns an online network, a target network and
/// the buffers of recorded transitions.
public struct Agent {
    public var model: DQNModel
    public var target: DQNModel    // The target is frozen while the model learns against it.

    /// Recorded observations, stacked along axis 0 (one row per transition).
    public var transition_obs_buffer: TF
    /// Recorded actions, one `Int32` per transition.
    public var transition_act_buffer: TI
    /// Recorded rewards, one per transition (stored as whole numbers).
    public var transition_rew_buffer: [Int]
    /// Recorded episode-finished flags, one per transition.
    public var transition_done_buffer: [Bool]

    /// Current exploration rate.
    public var epsilon : Float
    public var params: AgentHyperParams

    public var steps: Int // Number of training steps taken

    /// Creates an agent with freshly initialised networks and empty buffers.
    ///
    /// - Parameter par: Hyper-parameters; `num_actions` sizes both networks
    ///   and `epsilon_start` seeds the exploration rate.
    public init(par: AgentHyperParams) {
        params = par

        model = DQNModel(nActions: par.num_actions)
        target = DQNModel(nActions: par.num_actions)

        // Start with zero rows so the first concatenation along axis 0
        // produces a buffer holding exactly one transition. The trailing
        // dimensions match the [1, 210, 160, 3] observations built by
        // `np_to_obs`.
        transition_obs_buffer = TF(zeros: [0, 210, 160, 3])
        transition_act_buffer = TI(zeros: [0])
        transition_rew_buffer = []
        transition_done_buffer = []

        epsilon = par.epsilon_start
        steps = 0

        // Give the target network the same parameter values as the online one.
        target.variables = model.variables
    }

    /// Picks an action with an epsilon-greedy policy and counts one step.
    ///
    /// With probability `epsilon` a uniformly random action in
    /// `0..<params.num_actions` is returned; otherwise the arg-max of the
    /// target network's output for `obs` is returned.
    ///
    /// - Parameter obs: The observation to act on.
    /// - Returns: The chosen action.
    public mutating func act_eps_greedy(obs: Observation) -> Action {
        let r = Float(random.random())!
        steps += 1

        if r < epsilon {
            // Draw from the configured number of actions rather than a
            // hard-coded count, so the agent matches its own networks.
            let a = Int(random.randint(params.num_actions))!
            return Action(action: a)
        }
        print("Taking real action")

        // NOTE(review): the greedy action is taken from `target`, not
        // `model`; kept as written, but confirm this is intended.
        // `argmax()` without an axis yields a scalar tensor, so `scalar` is non-nil.
        let a = Int(target(obs.obs).argmax().scalar!)

        return Action(action: a)
    }

    /// Placeholder for the learning step; currently only computes batch ranges.
    ///
    /// Returns immediately until at least `batch_size * 16` transitions
    /// have been recorded.
    ///
    /// - Parameter batch_size: Number of transitions per batch.
    public mutating func sample_and_optimize(batch_size: Int) {
        // probably a bad heuristic...
        let n_samples = transition_obs_buffer.shape[0]

        if n_samples < batch_size * 16 {
            return
        }

        // TODO: sample randomly from the buffers and take an optimisation
        // step (e.g. meanAbsoluteError(observed, expected)). The ranges are
        // computed but not consumed yet.
        _ = batchedRanges(start: 0, end: n_samples, bs: batch_size)
    }

    /// Records one transition and refreshes the exploration rate.
    ///
    /// - Parameters:
    ///   - obs: Observation the action was taken on; its tensor must have
    ///     shape [1, 210, 160, 3] so it can be stacked along axis 0.
    ///   - act: The action that was taken.
    ///   - rew: The reward received. It is stored as `Int`, so any
    ///     fractional part is dropped.
    ///   - done: Whether the episode finished on this transition.
    public mutating func add_env_feedback(obs: Observation, act: Action, rew: Reward, done: Bool) {
        // `concatenated(with:)` returns a new tensor, so the result has to
        // be assigned back for the buffers to grow.
        transition_obs_buffer = transition_obs_buffer.concatenated(with: obs.obs)
        transition_act_buffer = transition_act_buffer.concatenated(with: TI([Int32(act.action)]))
        transition_rew_buffer.append(Int(rew.r))
        transition_done_buffer.append(done)

        // Linear decay from `epsilon_start`, clamped at `epsilon_final`.
        epsilon = max(params.epsilon_final, params.epsilon_start - Float(steps) * params.epsilon_decay)
    }

}

In [None]:
// Scratch: a standalone network with 6 outputs.
var p1 = DQNModel(nActions: 6)

In [None]:
// The agent driven by the loop at the end of the notebook, configured from
// the global `params`.
var agent = Agent(par: params)

In [None]:
// Create the Pong environment and take its first frame, reshaped to the
// [1, 210, 160, 3] layout used throughout this notebook.
var  env = gym.make("PongNoFrameskip-v4")
var s1 = env.reset().reshape([1, 210,160,3])

In [None]:
// Inspect the environment's action space to see how many actions there are.
env.action_space

In [None]:
// State shared with the interaction loop in the next cell.
var current_obs : Observation   // observation the agent acts on this iteration
var next_obs : Observation      // observation produced by the latest step or reset
var rew : Reward                // reward summed over the latest repeated step
var finished: Bool = false      // flag returned by the latest repeated step
// Seed the loop with the first frame taken from the environment.
next_obs = Observation(obs: np_to_tf(s1))

let total_games = 10            // number of finished games after which the loop stops
var count = 0                   // number of games finished so far

In [None]:
// Interaction loop: act, step the environment, record the transition and
// call the agent's learning hook until `total_games` games have finished.
while count < total_games {
    current_obs = next_obs

    let chosen: Action = agent.act_eps_greedy(obs: current_obs)

    let outcome = step_n_times(env: env, act: chosen)
    next_obs = outcome.0
    rew = outcome.1
    finished = outcome.2

    agent.add_env_feedback(obs: current_obs, act: chosen, rew: rew, done: finished)
    agent.sample_and_optimize(batch_size: 32)

    // Nothing more to do until a game ends.
    guard finished else { continue }

    count += 1
    next_obs = reset_env(env)
    print("game \(count) starting ")
}