# Protein Folding DQN

### Imports

In [None]:
import argparse
import math
import random
from copy import deepcopy

import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn

import matplotlib.pyplot as plt
%matplotlib inline

from collections import deque


### Use CUDA

In [None]:
# Detect CUDA once and bind the matching legacy tensor types; the rest of the
# notebook casts with `.type(dtype)` / `.type(dtypelong)`.
USE_CUDA = torch.cuda.is_available()
print(
    "Using GPU: GPU requested and available."
    if USE_CUDA
    else "NOT Using GPU: GPU not requested or not available."
)
dtype = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
dtypelong = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor

### Agent

In [None]:
class Agent:
    """Epsilon-greedy DQN agent holding an online and a target Q-network.

    Attributes:
        env: Environment; only ``env.action_space.n`` is read by this class.
        q_network: Online network used to choose greedy actions.
        target_q_network: Target network; stored here but not used by ``act``.
        num_actions: Number of discrete actions (``env.action_space.n``).
    """

    def __init__(self, env, q_network, target_q_network):
        self.env = env
        self.q_network = q_network
        self.target_q_network = target_q_network
        self.num_actions = env.action_space.n

    def act(self, state, epsilon):
        """Pick an action with epsilon-greedy exploration.

        Args:
            state: A single observation (array-like); a batch axis is added
                before it is fed to ``q_network``.
            epsilon: Exploration threshold. A uniform random action is taken
                whenever ``random.random() <= epsilon``.

        Returns:
            A 0-dim integer tensor holding the action index. Greedy actions
            live on the network's device, random ones on the CPU.
        """
        if random.random() > epsilon:
            # Place the input on whatever device the network lives on, so the
            # agent does not depend on a separately maintained global dtype.
            first_param = next(self.q_network.parameters(), None)
            device = (
                first_param.device if first_param is not None else torch.device("cpu")
            )
            batch = torch.as_tensor(
                np.asarray(state, dtype=np.float32), device=device
            ).unsqueeze(0)
            # Action selection is pure inference: skip autograd bookkeeping.
            with torch.no_grad():
                q_value = self.q_network(batch)
            return q_value.max(1)[1][0]
        return torch.tensor(random.randrange(self.num_actions))

### Replay Buffer

In [None]:
from collections import deque

class ReplayBuffer(object):
    """Bounded FIFO store of ``(state, action, reward, next_state, done)``.

    Backed by a ``deque`` with ``maxlen=capacity``, so once full each new
    transition evicts the oldest one.
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition; both states gain a leading axis of size 1."""
        transition = (
            np.expand_dims(state, 0),
            action,
            reward,
            np.expand_dims(next_state, 0),
            done,
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` where the two
            state entries are arrays concatenated along axis 0 and the other
            three are tuples.
        """
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        stacked_states = np.concatenate(states)
        stacked_next_states = np.concatenate(next_states)
        return stacked_states, actions, rewards, stacked_next_states, dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)
    
# Module-level replay memory read by run_gym(); keeps at most `replay_size`
# transitions (older ones are evicted by the underlying deque).
replay_size = 5000
replay_buffer = ReplayBuffer(replay_size)

### Epsilon Greedy Exploration

In [None]:
# Exploration schedule: epsilon decays exponentially from `epsilon_start`
# towards `epsilon_final` with time constant `epsilon_decay` (in frames).
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 500


def epsilon_by_frame(frame_idx):
    """Return the exploration rate to use at frame ``frame_idx``."""
    span = epsilon_start - epsilon_final
    return epsilon_final + span * math.exp(-1. * frame_idx / epsilon_decay)

In [None]:
# Visualise the exploration schedule over the first 10000 frames.
plt.plot([epsilon_by_frame(i) for i in range(10000)])

### Computing Temporal Difference Loss

In [None]:
def compute_td_loss(agent, batch_size, replay_buffer, optimizer, gamma):
    """Sample a batch and take one Double-DQN gradient step on the online net.

    Args:
        agent: Object exposing ``q_network`` (online) and ``target_q_network``.
        batch_size: Number of transitions requested from ``replay_buffer``.
        replay_buffer: Object whose ``sample(batch_size)`` returns
            ``(states, actions, rewards, next_states, dones)``.
        optimizer: Optimizer over the online network's parameters.
        gamma: Discount factor applied to the bootstrapped next-state value.

    Returns:
        The scalar mean-squared TD loss tensor, evaluated before the step.
    """
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    # Put the batch on the same device as the online network.
    first_param = next(agent.q_network.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    def as_float(values):
        return torch.as_tensor(np.asarray(values, dtype=np.float32), device=device)

    state = as_float(state)
    next_state = as_float(next_state)
    reward = as_float(reward)
    done = as_float(done)
    # Stored actions may be Python ints or 0-dim tensors; int() handles both.
    action = torch.as_tensor(
        [int(a) for a in action], dtype=torch.long, device=device
    )

    # Q(s, a) for the actions that were actually taken.
    q_values = agent.q_network(state)
    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

    # Double Q-learning target: the online net picks the next action and the
    # target net evaluates it. The target is a constant w.r.t. the loss, so
    # none of this needs an autograd graph.
    with torch.no_grad():
        _, max_indices = torch.max(agent.q_network(next_state), dim=1)
        target_q_values = agent.target_q_network(next_state)
        next_q_value = target_q_values.gather(1, max_indices.unsqueeze(1)).squeeze(1)
        # (1 - done) removes the bootstrap term on terminal transitions.
        expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = (q_value - expected_q_value).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss

### Update the target network

In [None]:
def soft_update(q_network, target_q_network, tau):
    """Blend online weights into the target network (Polyak averaging).

    Each target parameter becomes ``tau * online + (1 - tau) * target``.
    Parameters that are the very same object in both networks are skipped.
    """
    pairs = zip(target_q_network.parameters(), q_network.parameters())
    for target, online in pairs:
        if target is not online:
            blended = tau * online.data + (1.0 - tau) * target.data
            target.data.copy_(blended)


def hard_update(q_network, target_q_network):
    """Overwrite every target parameter with the matching online parameter.

    Parameters that are the very same object in both networks are skipped.
    """
    pairs = zip(target_q_network.parameters(), q_network.parameters())
    for target, online in pairs:
        if target is not online:
            target.data.copy_(online.data)

### Training

In [None]:
# Training hyperparameters. run_gym() reads these as module globals at call
# time, so rebinding them in a later cell changes subsequent runs.
learning_rate = 0.001  # Adam step size
target_update_rate = 0.1  # tau for soft_update(); only referenced by a commented-out call
gamma = 0.99  # discount factor passed to compute_td_loss()
target_network_update_f = 100  # hard-update the target network every N timesteps
num_timesteps = 5000  # environment steps per run_gym() call
log_every = 200  # print progress every N timesteps
batch_size = 32  # transitions sampled per gradient step

def run_gym(env):
    """Train the global Q-network on ``env`` for ``num_timesteps`` steps.

    Relies on module-level state: ``q_network``, ``target_q_network``,
    ``replay_buffer``, ``epsilon_by_frame`` and the training hyperparameters
    (``learning_rate``, ``gamma``, ``batch_size``, ``num_timesteps``,
    ``target_network_update_f``, ``log_every``).

    Args:
        env: Environment exposing ``seq``, ``reset()``, ``action_space.n`` and
            ``step(action)`` returning ``(next_state, reward, done, info)``.

    Returns:
        ``(losses, all_rewards, agent)``: per-update TD losses as Python
        floats, the total reward of each finished episode, and the agent.
    """
    print(env.seq)

    # Q and target Q networks are global parameters
    agent = Agent(env, q_network, target_q_network)
    optimizer = optim.Adam(q_network.parameters(), lr=learning_rate)

    losses, all_rewards = [], []
    episode_reward = 0
    state = env.reset()

    for ts in range(1, num_timesteps + 1):
        epsilon = epsilon_by_frame(ts)
        # Convert once to a plain int: the env needs an int, and storing ints
        # keeps (possibly GPU-resident) tensors out of the replay buffer.
        action = int(agent.act(state, epsilon))

        next_state, reward, done, _ = env.step(action)

        replay_buffer.push(state, action, reward, next_state, done)

        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0

        if len(replay_buffer) > batch_size:
            # Update the q-network & the target network
            loss = compute_td_loss(agent, batch_size, replay_buffer, optimizer, gamma)
            # Keep a float rather than the tensor so the history is cheap to
            # hold and can be plotted regardless of the training device.
            losses.append(loss.item())

            if ts % target_network_update_f == 0:
                # soft_update(agent.q_network, agent.target_q_network, target_update_rate)
                hard_update(agent.q_network, agent.target_q_network)

        if ts % log_every == 0:
            out_str = "Timestep {}".format(ts)
            if all_rewards:
                out_str += ", Reward: {}".format(all_rewards[-1])
            if losses:
                out_str += ", TD Loss: {}".format(losses[-1])
            print(out_str)

    return losses, all_rewards, agent

### Plot Loss & Rewards

In [None]:
def plot(losses, rewards):
    """Show rewards (top panel) and losses (bottom panel) in one tall figure."""
    plt.figure(figsize = (20,20))
    panels = ((211, "Rewards", rewards), (212, "Loss", losses))
    for position, title, series in panels:
        plt.subplot(position)
        plt.title(title)
        plt.plot(series)
    plt.show()

### Run trained agent on environment

In [None]:
def run_agent(agent, env):
    """Roll out one greedy episode (epsilon = 0) and print what happened.

    Prints each chosen action as it is taken, then the final step's reward
    and ``info['actions']`` once the environment reports ``done``.
    """
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, 0)
        # env.render() could be called here to visualise the step.
        state, reward, done, info = env.step(int(action.cpu()))
        print(action)
    print("Reward: {} | Actions: {}".format(reward, info['actions']))

## DQN with Linear Model

In [None]:
from lattice2d_linear_env import Lattice2DLinearEnv

class DQN(nn.Module):
    """Fully connected Q-network with a single hidden ReLU layer.

    Args:
        input_shape: Observation shape; only ``input_shape[0]`` is used, as
            the number of input features.
        num_actions: Number of outputs (one Q-value per action).
        hidden_size: Width of the hidden layer. Defaults to 64, the value
            that was previously hard-coded.
    """

    def __init__(self, input_shape, num_actions, hidden_size=64):
        super().__init__()
        self.input_shape = input_shape
        self.num_actions = num_actions
        self.hidden_size = hidden_size
        self.layers = nn.Sequential(
            nn.Linear(self.input_shape[0], hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, self.num_actions),
        )

    def forward(self, x):
        """Return Q-values with ``num_actions`` entries along the last axis."""
        return self.layers(x)
    
# Build the online/target networks from a throwaway environment; only its
# observation shape and action count are read here.
env = Lattice2DLinearEnv("H")
q_network = DQN(env.observation_space.shape, env.action_space.n)
# The target network starts as an independent copy of the online network.
target_q_network = deepcopy(q_network)

# These two names are the module globals that run_gym() trains.
if USE_CUDA:
    q_network = q_network.cuda()
    target_q_network = target_q_network.cuda()

### Train on different sequences

In [None]:
import itertools

# Env params
collision_penalty = -2
# NOTE(review): named a penalty but positive; confirm the sign the env expects.
trap_penalty = 0.5

max_seq_length = 5
# Maps each H/P sequence string to the reward recorded for it below.
seq_dict = {}

# Sweep every H/P sequence of length 3..max_seq_length (inclusive).
for i in range(3, max_seq_length + 1):
    print(i)
    for j in list(itertools.product('HP', repeat = i)):
        seq = ''.join(j)
        env = Lattice2DLinearEnv(seq, collision_penalty, trap_penalty)
        if i <= 4:
            # Short sequences use env.all_combs() instead of training
            # (presumably an exhaustive search; confirm against the env).
            reward, actions = env.all_combs()
            seq_dict.update( {seq : reward})
        else:
            # Longer sequences train the shared global networks via run_gym().
            # rewards[-1] raises IndexError if no episode finished during the run.
            losses, rewards, agent = run_gym(env)
            seq_dict.update( {seq : rewards[-1]})

In [None]:
# Continue training the global networks on one 6-residue sequence, using the
# environment's default penalties. The returned (losses, rewards, agent) tuple
# is not captured here.
env = Lattice2DLinearEnv("HPPHHP")
run_gym(env)

In [None]:
# Greedy rollout of the current global networks on a different sequence.
env = Lattice2DLinearEnv("HHHHHP")
agent = Agent(env, q_network, target_q_network)
run_agent(agent, env)

## DQN with CNN Model

In [None]:
from lattice2d_cnn_env import Lattice2DCNNEnv

class CnnDQN(nn.Module):
    """Convolutional Q-network: three conv+ReLU stages and a two-layer head.

    NOTE(review): the ``input_shape`` argument is accepted but ignored; the
    network is always sized for a (1, 201, 201) input. Confirm that this
    matches what the environment actually emits.
    """

    def __init__(self, input_shape, num_actions):
        super().__init__()
        self.input_shape = (1, 201, 201)
        self.num_actions = num_actions

        in_channels = self.input_shape[0]
        conv_stack = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
        ]
        self.features = nn.Sequential(*conv_stack)

        # The head's input width is measured from the conv stack built above.
        flat_width = self.feature_size()
        self.fc = nn.Sequential(
            nn.Linear(flat_width, 512),
            nn.ReLU(),
            nn.Linear(512, self.num_actions),
        )

    def forward(self, x):
        """Map a batch of inputs to one Q-value per action."""
        feature_maps = self.features(x)
        batch = feature_maps.size(0)
        flattened = feature_maps.view(batch, -1)
        return self.fc(flattened)

    def feature_size(self):
        """Flattened conv output width, found by pushing a zero input through."""
        probe = torch.zeros(1, *self.input_shape)
        return self.features(probe).view(1, -1).size(1)

# Rebind the global networks to the CNN model; from here on run_gym() trains
# these instead of the linear ones.
env = Lattice2DCNNEnv("H")
# CnnDQN ignores the shape passed here and uses its own fixed input size.
q_network = CnnDQN(env.observation_space.shape, env.action_space.n)
# The target network starts as an independent copy of the online network.
target_q_network = deepcopy(q_network)

if USE_CUDA:
    q_network = q_network.cuda()
    target_q_network = target_q_network.cuda()

### Train on different sequences

In [None]:
import itertools

# Env params
collision_penalty = -2
# NOTE(review): named a penalty but positive; confirm the sign the env expects.
trap_penalty = 0.5

max_seq_length = 10
# Maps each H/P sequence string to the reward recorded for it below.
seq_dict = {}

# Rebind the training globals that run_gym() reads at call time.
target_network_update_f = 1000
num_timesteps = 1000000
log_every = 10000

# Sweep every H/P sequence of length 0..max_seq_length - 1.
# NOTE(review): range(max_seq_length) starts at 0 (a single empty sequence)
# and stops before max_seq_length, unlike the linear sweep's
# range(3, max_seq_length + 1). Confirm which bounds are intended.
for i in range(max_seq_length):
    for j in list(itertools.product('HP', repeat = i)):
        seq = ''.join(j)
        env = Lattice2DCNNEnv(seq, collision_penalty, trap_penalty)
        # NOTE(review): i never exceeds 9 here, so this condition is always
        # true and the run_gym() branch below is never reached. Confirm the
        # intended threshold.
        if i <= 10:
            reward, actions = env.all_combs()
            seq_dict.update( {seq : reward})
        else:
            # rewards[-1] raises IndexError if no episode finished during the run.
            losses, rewards, agent = run_gym(env)
            seq_dict.update( {seq : rewards[-1]})

In [None]:
# Greedy rollout on a 19-residue sequence.
# NOTE(review): `agent` is whatever an earlier cell last bound. The CNN sweep
# above never calls run_gym(), so this may still be an agent wrapping the
# linear networks. Confirm, or build Agent(env, q_network, target_q_network)
# here.
env = Lattice2DCNNEnv("HPHPHHPHPPHPHHPPHPH", collision_penalty, trap_penalty)
run_agent(agent, env)