In [1]:
from __future__ import division
import gym
import torch
import random
import numpy as np
import torch
from PIL import Image
import torch
import torch.nn as nn
from collections import namedtuple
from collections import deque
import torch.nn.functional as F
from tqdm import tqdm_notebook as tqdm
from matplotlib.pyplot import imshow
from PIL import Image
from wrappers import make_atari, wrap_deepmind, wrap_pytorch
import queue
from torch import optim
import matplotlib.pyplot as plt
import math
import pandas as pd 
import os 
import pickle 

In [2]:
class NoisyNet(nn.Module):
    """Linear layer whose weights and biases carry learnable, factorised noise.

    Each weight is ``mu + sigma * epsilon``. ``mu`` and ``sigma`` are trained
    parameters; ``epsilon`` is a non-trainable buffer that is resampled by
    ``resetNoise``. In eval mode the noise term is dropped and only ``mu`` is used.

    Args:
        in_channels: number of input features.
        out_channels: number of output features.
        init_std: scale used for the initial value of the sigma parameters.
    """

    def __init__(self, in_channels, out_channels, init_std = 0.5):
        super(NoisyNet, self).__init__()
        self.in_channels, self.out_channels = in_channels, out_channels

        # Trainable means and noise scales (registration order is kept stable
        # so state_dict / optimizer parameter ordering does not change).
        self.weights_mu = nn.Parameter(torch.empty(out_channels, in_channels))
        self.bias_mu = nn.Parameter(torch.empty(out_channels))
        self.weights_sigma = nn.Parameter(torch.empty(out_channels, in_channels))
        self.bias_sigma = nn.Parameter(torch.empty(out_channels))

        # Noise samples live in buffers: saved with the module, never trained.
        self.register_buffer('weight_epsilon', torch.empty(out_channels, in_channels))
        self.register_buffer('bias_epsilon', torch.empty(out_channels))

        self.init_std = init_std
        self.resetNoise()
        self.resetWeights()

    @staticmethod
    def _signed_sqrt(sample):
        """Return sign(x) * sqrt(|x|) element-wise as a new tensor."""
        return sample.sign() * sample.abs().sqrt()

    def resetNoise(self):
        """Resample the noise buffers from fresh standard-normal draws.

        One vector is drawn per input feature and one per output feature; the
        weight noise is their outer product and the bias noise is the
        output-side vector.
        """
        noise_in = self._signed_sqrt(torch.randn(self.in_channels))
        noise_out = self._signed_sqrt(torch.randn(self.out_channels))
        self.weight_epsilon.copy_(torch.ger(noise_out, noise_in))
        self.bias_epsilon.copy_(noise_out)

    def resetWeights(self):
        """Initialise mu uniformly in +-1/sqrt(in_channels) and sigma to constants."""
        bound = 1 / math.sqrt(self.in_channels)
        for mu in (self.weights_mu, self.bias_mu):
            mu.data.uniform_(-bound, bound)
        self.weights_sigma.data.fill_(self.init_std / math.sqrt(self.in_channels))
        self.bias_sigma.data.fill_(self.init_std / math.sqrt(self.out_channels))

    def forward(self, input):
        """Apply the layer: noisy parameters while training, plain mu in eval mode."""
        if self.training:
            noisy_weights = self.weights_mu + self.weights_sigma * self.weight_epsilon
            noisy_biases = self.bias_mu + self.bias_sigma * self.bias_epsilon
            return F.linear(input, noisy_weights, noisy_biases)
        return F.linear(input, self.weights_mu, self.bias_mu)

In [3]:
class QNet(torch.nn.Module):
    """Dueling distributional Q-network with noisy fully connected heads.

    A three-layer convolutional trunk feeds two NoisyNet streams: a value
    stream producing ``atoms`` logits and an advantage stream producing
    ``act_shape * atoms`` logits. The streams are combined per atom and
    soft-maxed over the atom axis.

    Args:
        obs_shape: accepted for interface compatibility; not used by the
            network (the trunk is fixed to 4 input channels).
        act_shape: number of discrete actions.
        atoms: number of atoms in each action's return distribution.

    The first fully connected layers expect 7*7*64 flattened features, which
    is what these convolutions produce for 84x84 inputs.
    """

    def __init__(self,obs_shape,act_shape,atoms):
        super(QNet, self).__init__()
        self.atoms = atoms
        self.act_shape = act_shape

        # Convolutional trunk.
        self.conv1 = nn.Conv2d(4, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.selu = nn.SELU()

        flat_features = 7*7*64
        hidden_units = 512

        # Value stream: fc1 -> fc2.
        self.fc1 = NoisyNet(flat_features, hidden_units)
        self.fc2 = NoisyNet(hidden_units, atoms)
        # Advantage stream: fc3 -> fc4.
        self.fc3 = NoisyNet(flat_features, hidden_units)
        self.fc4 = NoisyNet(hidden_units, act_shape*atoms)

        # Normalises over the atom axis of a (batch, action, atom) tensor.
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):
        """Return atom probabilities shaped (batch, act_shape, atoms)."""
        features = x/255

        for conv in (self.conv1, self.conv2, self.conv3):
            features = self.selu(conv(features))
        features = features.view(features.shape[0], -1)

        value_logits = self.fc2(self.selu(self.fc1(features)))
        adv_logits = self.fc4(self.selu(self.fc3(features)))

        # One value row broadcast across actions, one advantage row per action.
        value_logits = value_logits.view(value_logits.shape[0], 1, self.atoms)
        adv_logits = adv_logits.view(adv_logits.shape[0], self.act_shape, self.atoms)

        # Dueling combination: centre the advantages over the action axis.
        combined = value_logits + adv_logits - adv_logits.mean(1, keepdim=True)

        return self.softmax(combined)

    def reset_noise(self):
        """Resample noise in every direct child whose name contains 'fc'."""
        noisy_layers = [child for child_name, child in self.named_children() if 'fc' in child_name]
        for layer in noisy_layers:
            layer.resetNoise()

In [4]:
def eps_greedy(epsilon,state,net,atoms):
    """Choose an action epsilon-greedily from a distributional Q-network.

    With probability ``epsilon`` a uniformly random action index is returned.
    Otherwise the action with the highest expected return is chosen, where the
    expected return is sum(p_i * z_i) over the atoms.

    Args:
        epsilon: exploration probability; 0 always takes the greedy branch.
        state: input passed unchanged to ``net``. The greedy branch takes the
            argmax over all expected values at once, so it is only meaningful
            for a batch containing a single state.
        net: callable returning atom probabilities whose last dimension
            matches ``atoms``. If it exposes ``act_shape`` (as QNet does),
            that is used as the number of actions.
        atoms: tensor of atom values, multiplied against the output of ``net``.

    Returns:
        The selected action index as an int.
    """
    if np.random.random() < epsilon:
        # Take the action count from the network rather than from an
        # ACT_SHAPE global: that name is not defined in the visible cells and
        # would raise NameError as soon as epsilon > 0.
        n_actions = getattr(net, 'act_shape', None)
        if n_actions is None:
            # Fall back to the action axis of the network output.
            with torch.no_grad():
                n_actions = net(state).shape[1]
        return np.random.randint(n_actions)

    # Action selection never needs gradients; skip building the autograd graph.
    with torch.no_grad():
        atom_probs = net(state)
        expected_values = torch.matmul(atom_probs, atoms)
    return torch.argmax(expected_values).item()

In [5]:
# Pong environment passed through the three helpers from the local `wrappers`
# module, applied in the order make_atari -> wrap_deepmind -> wrap_pytorch.
env = wrap_pytorch(wrap_deepmind(make_atari('PongNoFrameskip-v4')))

# Support of the return distribution: N_ATOMS evenly spaced values in
# [VMIN, VMAX], kept on the GPU alongside the network.
VMIN, VMAX = -10, 10
N_ATOMS = 51
atoms = torch.linspace(VMIN, VMAX, N_ATOMS).cuda()

In [6]:
def test(net,evaluation_episodes):
    """Play greedy episodes on the global ``env`` and return the mean reward.

    The network is switched to eval mode and actions are chosen with
    ``eps_greedy`` at epsilon 0, using the global ``atoms`` support. One line
    is printed per finished episode.

    Args:
        net: network passed to ``eps_greedy``; it is left in eval mode.
        evaluation_episodes: number of episodes to play; must be positive.

    Returns:
        The mean of the per-episode reward sums.

    Raises:
        ValueError: if ``evaluation_episodes`` is not positive. Without this
            check the loop would never reach a negative target, and a target
            of zero would end in a division by zero.

    Note:
        The global ``env`` is closed on exit, including when an error or
        interrupt occurs. It probably has to be recreated before calling this
        function again; confirm against the gym version in use.
    """
    if evaluation_episodes <= 0:
        raise ValueError('evaluation_episodes must be positive')

    net.eval()
    rewards = []
    try:
        # Evaluation never backpropagates, so skip autograd bookkeeping.
        with torch.no_grad():
            while len(rewards) < evaluation_episodes:
                state = torch.Tensor(env.reset()).unsqueeze(0).cuda()
                episode_reward = 0
                done = False
                while not done:
                    action = eps_greedy(0,state,net,atoms)
                    next_state, reward, done, _ = env.step(action)
                    state = torch.Tensor(next_state).unsqueeze(0).cuda()
                    episode_reward += reward
                print('Episode ',len(rewards) + 1,end=' ')
                print('Reward ',episode_reward)
                rewards.append(episode_reward)
    finally:
        # Release the emulator even if evaluation is interrupted.
        env.close()

    return sum(rewards)/len(rewards)

In [7]:
# Seed NumPy first, then derive the torch seeds from NumPy's stream.
np.random.seed(123)
torch.manual_seed(np.random.randint(1, 10000))
if torch.cuda.is_available():
    # Seed the CUDA generator explicitly. A second torch.manual_seed call here
    # would only overwrite the seed set on the line above.
    torch.cuda.manual_seed(np.random.randint(1, 10000))

# N_ATOMS (rather than a literal 51) keeps the network head the same size as
# the `atoms` support that eps_greedy multiplies it with.
net = QNet(env.observation_space.shape, env.action_space.n, N_ATOMS).cuda()
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
net.load_state_dict(torch.load('./rainbow_imp-logs/rb-model4400000.pth'))
net.eval()
test(net, 50)

Episode  1 Reward  18.0
Episode  2 Reward  18.0
Episode  3 Reward  19.0
Episode  4 Reward  18.0
Episode  5 Reward  18.0
Episode  6 Reward  18.0
Episode  7 Reward  18.0
Episode  8 Reward  18.0
Episode  9 Reward  19.0
Episode  10 Reward  19.0
Episode  11 Reward  18.0
Episode  12 Reward  18.0
Episode  13 Reward  19.0
Episode  14 Reward  18.0
Episode  15 Reward  19.0
Episode  16 Reward  19.0
Episode  17 Reward  18.0
Episode  18 Reward  18.0
Episode  19 Reward  18.0
Episode  20 Reward  18.0
Episode  21 Reward  18.0
Episode  22 Reward  18.0
Episode  23 Reward  18.0
Episode  24 Reward  18.0
Episode  25 Reward  18.0
Episode  26 Reward  18.0
Episode  27 Reward  18.0
Episode  28 Reward  18.0
Episode  29 Reward  19.0
Episode  30 Reward  18.0
Episode  31 Reward  18.0
Episode  32 Reward  18.0
Episode  33 Reward  19.0
Episode  34 Reward  18.0
Episode  35 Reward  18.0
Episode  36 Reward  18.0
Episode  37 Reward  18.0
Episode  38 Reward  18.0
Episode  39 Reward  19.0
Episode  40 Reward  18.0
Episode  

18.26

In [8]:
# Load the two reward logs and the two loss logs, in that order.
rdf, rdf2, ldf, ldf2 = map(
    pd.read_csv,
    (
        './rainbow-logs/rewards.csv',
        './rainbow-logs/rewards2.csv',
        './rainbow-logs/losses.csv',
        './rainbow-logs/losses2.csv',
    ),
)

In [9]:
# Stack the two reward logs row-wise. ignore_index=True gives the result a
# fresh 0..n-1 index instead of repeating each input's own row labels.
rdf_final = pd.concat([rdf, rdf2], axis=0, ignore_index=True)

In [11]:
# Shape of the combined reward log; its row count should equal len(rdf) + len(rdf2).
rdf_final.shape

(5889, 3)

In [12]:
# Shape (rows, columns) of the first reward log.
rdf.shape

(5110, 3)

In [13]:
# Shape (rows, columns) of the second reward log.
rdf2.shape

(779, 3)

In [14]:
# Preview the first rows of the combined reward log.
rdf_final.head()

Unnamed: 0,index,episode,rewards
0,762,1,-21.0
1,1526,2,-21.0
2,2288,3,-21.0
3,3049,4,-21.0
4,3808,5,-21.0


In [15]:
# Save the combined reward log as comma-separated values without the row
# index. `index` is documented as a bool, so pass False explicitly.
rdf_final.to_csv("./rainbow-logs/rewards_comb.csv", index=False)

In [16]:
# Stack the two loss logs row-wise. ignore_index=True gives the result a
# fresh 0..n-1 index instead of repeating each input's own row labels.
ldf_final = pd.concat([ldf, ldf2], axis=0, ignore_index=True)

In [17]:
# Shape of the combined loss log; its row count should equal len(ldf) + len(ldf2).
ldf_final.shape

(9770, 2)

In [None]:
# Display the first loss log (loaded from ./rainbow-logs/losses.csv).
ldf