# Pip install all needed packages

In [1]:
!pip install gym
!pip install tensorflow==2.3.0
!pip install keras
!pip install keras-rl2

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable




# Imports

In [2]:
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import math


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

2022-02-19 19:10:08.770431: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2022-02-19 19:10:08.770464: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Global variables

In [3]:
# Number of slices (time steps of the code) stacked in one state.
depthOfCode = 10
# Size of the qubit grid; positions are numbered row-major from 0 to rows*cols - 1.
rows = 2
cols = 2

# Functions

In [4]:
def swap(state, actions):
    """Swap the contents of qubit positions in every slice of ``state``, in place.

    Positions are numbered row-major over the grid, so with ``cols`` columns
    position ``p`` sits at row ``p // cols`` and column ``p % cols``.
    For a 2 x 2 grid the numbering is:

        [[0, 1],
         [2, 3]]

    Args:
        state: Sequence of 2-D slices, indexable as ``state[i][row][col]``.
        actions: Either a single swap ``(x, y)``, e.g. ``(0, 1)``, or a
            sequence of swaps, e.g. ``((0, 1), (2, 3))``. An empty sequence
            is accepted and does nothing.

    Returns:
        None; ``state`` is modified in place.
    """
    # Nothing to do for an empty action (previously raised IndexError).
    if len(actions) == 0:
        return

    # A single swap is wrapped in a list so both forms share the same loop.
    if not isinstance(actions[0], tuple):
        actions = [actions]

    for pos0, pos1 in actions:
        row0, col0 = divmod(pos0, cols)
        row1, col1 = divmod(pos1, cols)

        # Apply the same swap to every slice so the qubit layout stays
        # consistent through the whole depth of the state.
        for layer in state:
            layer[row0][col0], layer[row1][col1] = layer[row1][col1], layer[row0][col0]


In [5]:
def getNeighbors(state, row_number, column_number):
    """Return the four orthogonal neighbours of one cell of a 2-D grid.

    The result is ordered ``[above, left, right, below]``. A neighbour that
    would fall outside the grid is reported as ``-1``.
    """
    def value_at(r, c):
        # Out-of-grid positions are marked with -1 instead of wrapping around.
        inside = 0 <= r < len(state) and 0 <= c < len(state[0])
        return state[r][c] if inside else -1

    return [
        value_at(row_number - 1, column_number),
        value_at(row_number, column_number - 1),
        value_at(row_number, column_number + 1),
        value_at(row_number + 1, column_number),
    ]

In [6]:
#                         [[1,0,0], [[1,0,0],   [[1,0,0],         [[1,0,0],  
# Takes in a state like [  [1,0,2],  [1,0,2], ,  [1,0,2], , ... ,  [1,0,2], ] and checks if all the pairs of 
#                          [2,0,0]]  [2,0,0]]    [2,0,0]]          [2,0,0]] 
# numbers in the first slice are neighbors and if so returns True else returns False

def isExecutableState(state):
    """Tell whether the first slice of ``state`` is executable.

    Every positive entry of ``state[0]`` must have an entry with the same
    value among its orthogonal neighbours (as reported by ``getNeighbors``).
    Entries that are zero or negative are ignored.

    Returns:
        True if all positive entries of the first slice have a matching
        neighbour, otherwise False.
    """
    front = state[0]
    return all(
        front[r][c] in getNeighbors(front, r, c)
        for r in range(len(front))
        for c in range(len(front[0]))
        if front[r][c] > 0
    )

In [7]:
def getPossibleActions(maxSwapsPerTimeStep=None):
    """Enumerate every action that can be taken in one time step.

    An action is either a single swap ``(x, y)`` of two neighbouring qubit
    positions, or a tuple of several such swaps that share no qubit. A final
    ``(0, 0)`` entry is appended; it swaps position 0 with itself, i.e. it is
    the "do nothing" action.

    Args:
        maxSwapsPerTimeStep: Largest number of simultaneous swaps in one
            action. Defaults to ``(rows * cols) // 2``, evaluated when the
            function is called so that it follows the current grid size
            rather than the size at the time this cell was defined.

    Returns:
        A list of actions. The order of the entries before the trailing
        ``(0, 0)`` is whatever order the intermediate set yields.
    """
    if maxSwapsPerTimeStep is None:
        maxSwapsPerTimeStep = (rows * cols) // 2

    # Grid whose entries are the qubit position numbers themselves.
    state = np.arange(rows * cols).reshape((rows, cols))

    combos = getPossibleActionsSub(state, [], maxSwapsPerTimeStep)

    # The recursion yields the same combination in several orders; sorting each
    # one and collecting them in a set keeps a single canonical copy.
    unique = set(map(lambda x: tuple(sorted(x)), combos))

    possibleActions = list(unique)
    possibleActions.append((0, 0))

    return possibleActions
    
def getPossibleActionsSub(state, used, maxSwapsPerTimeStep):
    """Recursively collect swaps (and combinations of swaps) between neighbouring cells.

    Args:
        state: 2-D grid whose entries identify the qubit positions.
        used: List of grid entries that already take part in a swap and must
            not be used again.
        maxSwapsPerTimeStep: How many more swaps may be added to a combination.

    Returns:
        A list holding ``(a, b)`` tuples for single swaps and lists of such
        tuples for combinations of swaps. When ``maxSwapsPerTimeStep`` is 0 an
        empty numpy array is returned instead. The same combination can occur
        more than once, with its swaps in a different order.
    """
    # No swaps left to place: return something iterable and empty.
    if maxSwapsPerTimeStep == 0:
        return np.asarray([])
    
    possibleActions = []
    
    for i in range(len(state)):
        for j in range(len(state[0])):
            
            # Copied once per cell, not once per neighbour.
            # NOTE(review): because of that, later neighbours of this cell recurse
            # with the earlier neighbours still marked as used - confirm this is intended.
            usedtmp = used.copy()
            
            if not state[i][j] in usedtmp:
                neighbors = getNeighbors(state, i, j)
                for neighbor in neighbors:
                    # Skip out-of-grid neighbours (-1), swaps already recorded at this
                    # level in the reverse direction, and neighbours that are already used.
                    if neighbor >= 0 and not (neighbor, state[i][j]) in possibleActions and not neighbor in usedtmp:
                        # The swap on its own is an action.
                        possibleActions.append((state[i][j], neighbor))
                        usedtmp.append(state[i][j])
                        usedtmp.append(neighbor)
 
                        # Extend this swap with every combination of further swaps
                        # that avoids the cells used so far.
                        for action in getPossibleActionsSub(state, usedtmp, maxSwapsPerTimeStep-1):
                            if type(action) == tuple:
                                # Single swap from the recursion: pair it with the current swap.
                                possibleActions.append([(state[i][j], neighbor), action])
                            elif type(action) == list:
                                # Combination from the recursion: add the current swap to it.
                                action.append((state[i][j], neighbor))
                                possibleActions.append(action)
                                
        
    return possibleActions

In [8]:
def makeStateSlice():
    """Create one randomly shuffled ``rows`` x ``cols`` slice of gate pairs.

    A random number of gates is drawn, between one and ``rows*cols // 2``.
    Gate ``k`` is written as the value ``k`` in exactly two cells, the
    remaining cells are 0, and the cells are then shuffled.

    Examples of possible results:

        [[0, 1, 0],        [[2, 1],
         [1, 2, 2],         [2, 1]]
         [3, 0, 3]]

    Returns:
        A float array of shape ``(rows, cols)``.
    """
    cell_count = rows * cols
    # Odd upper bounds 3, 5, ...: arange(1, bound) then holds an even number of ids.
    odd_bounds = list(range(3, cell_count + 2, 2))
    bound = np.random.choice(odd_bounds)

    # 1, 2, 3, 4, ... halved and rounded up gives 1, 1, 2, 2, ...
    gate_ids = np.ceil(np.arange(1, bound) / 2)
    padding = np.zeros(cell_count - bound + 1)

    flat = np.concatenate((gate_ids, padding))
    np.random.shuffle(flat)
    return flat.reshape((rows, cols))

In [9]:
def makeState():
    """Build a full state out of ``depthOfCode`` freshly generated slices.

    Returns:
        A float array of shape ``(depthOfCode, rows, cols)`` where every slice
        comes from its own call to ``makeStateSlice``.
    """
    layers = np.zeros((depthOfCode, rows, cols))
    for layer in layers:
        # Each layer is a view into `layers`, so this fills the array in place.
        layer[...] = makeStateSlice()
    return layers

# Tests some of the functions

In [10]:
# Build a state in which every slice is simply the position numbering 0..rows*cols-1,
# so the effect of a swap is easy to read off.
state = np.zeros((depthOfCode,rows,cols))
for i in range(len(state)):
    state[i] = np.arange(rows*cols).reshape((rows,cols))

# Number of available actions (including the trailing (0, 0) no-op).
possibleActions = getPossibleActions()
print(len(possibleActions))

# Take the first action in the table ...
a = possibleActions[0]

print(a)

# NOTE(review): statetmp is not used anywhere in this cell.
statetmp = np.arange(rows*cols).reshape((rows,cols))

# ... apply it to every slice in place ...
swap(state, a)

# ... and show the first slice after the swap.
print(state[0])

7
(0, 1)
[[1. 0.]
 [2. 3.]]


Test function makeState

In [11]:
# Print one randomly generated state: depthOfCode slices of shape rows x cols.
print(makeState())

[[[2. 2.]
  [1. 1.]]

 [[0. 1.]
  [1. 0.]]

 [[0. 1.]
  [1. 0.]]

 [[1. 1.]
  [2. 2.]]

 [[1. 0.]
  [0. 1.]]

 [[2. 1.]
  [1. 2.]]

 [[0. 0.]
  [1. 1.]]

 [[1. 0.]
  [1. 0.]]

 [[1. 0.]
  [1. 0.]]

 [[1. 0.]
  [1. 0.]]]


# Environment definition and sub functions

In [12]:
# Action table read as a module-level global by the Kvant environment defined below.
possibleActions = getPossibleActions()

In [13]:
class Kvant(Env):
    """Gym environment in which an agent swaps qubits to make slices executable.

    The state is an array of shape ``(depthOfCode, rows, cols)`` produced by
    ``makeState``. Each step the agent picks an index into the action table;
    the corresponding swap(s) are applied to every slice. Whenever the first
    slice is executable it is removed and a new random slice is appended at
    the end.

    Rewards per step:
        * ``-1`` for any action other than the ``(0, 0)`` no-op, ``0`` for the no-op;
        * ``+5`` on top of that when the first slice is executable after the action.

    An episode ends once ``depthOfCode`` slices have been executed.
    """

    def __init__(self):
        # Table of possible actions; the agent's action is an index into it.
        self.possibleActions = possibleActions

        # Number of actions we can take.
        self.action_space = Discrete(len(self.possibleActions))

        # Cell values range from 0 (no gate) up to the largest possible gate id.
        # `int` replaces the deprecated alias `np.int`, which was the same type.
        self.observation_space = Box(low=0, high=math.floor(rows*cols/2),
                                     shape=(depthOfCode, rows, cols), dtype=int)

        # The start state.
        self.state = makeState()

        # Number of slices that still have to be executed this episode.
        self.maxLayers = depthOfCode

    def step(self, action):
        """Apply the action with index ``action`` and return ``(state, reward, done, info)``."""
        actions = self.possibleActions[action]

        # Mutates self.state in place.
        swap(self.state, actions)

        # The no-op is free, every real swap action costs one point.
        reward = 0 if actions == (0, 0) else -1
        done = False

        if isExecutableState(self.state):
            reward += 5

            # Remove the executable slice and add a new random slice at the tail.
            self.state = np.roll(self.state, -1, axis=0)
            self.state[depthOfCode - 1] = makeStateSlice()

            self.maxLayers -= 1

            # Done only if this was the last slice to execute in this episode.
            done = self.maxLayers <= 0

        info = {}

        return self.state, reward, done, info

    def render(self, mode='human'):
        """No visualisation; accepts ``mode`` so callers passing it do not fail."""
        pass

    def reset(self):
        """Start a new episode with a fresh random state and return that state."""
        self.state = makeState()
        self.maxLayers = depthOfCode
        return self.state

# Testing the environment

In [14]:
# Single environment instance shared by the random baseline and the DQN agent below.
env = Kvant()

In [15]:
# Draw a random sample from the declared observation space. This is not a state
# built by makeState(), so values need not come in pairs.
env.observation_space.sample()

array([[[2, 1],
        [1, 2]],

       [[1, 0],
        [1, 1]],

       [[0, 2],
        [2, 2]],

       [[0, 0],
        [1, 0]],

       [[1, 2],
        [2, 1]],

       [[1, 1],
        [0, 2]],

       [[0, 1],
        [1, 1]],

       [[0, 0],
        [1, 2]],

       [[0, 2],
        [0, 0]],

       [[1, 0],
        [1, 2]]])

## test just random actions

In [16]:
# Baseline: play `episodes` episodes with uniformly random actions and record
# the total reward of each episode in `scores`.
episodes = 500
scores = []
for episode in range(1, episodes+1):
    # Note: this rebinds the global `state` used by the function-test cell earlier.
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        action = env.action_space.sample()
        # Only reward and done are used; the returned state and info are ignored.
        n_state, reward, done, info = env.step(action)
        score+=reward
    scores.append(score)

In [17]:
# Text histogram: how many episodes ended with each total score.
for value in range(min(scores), max(scores) + 1):
    occurrences = scores.count(value)
    if occurrences:
        print('Number of', value, 'is', occurrences)

Number of 26 is 1
Number of 27 is 1
Number of 28 is 1
Number of 29 is 2
Number of 30 is 8
Number of 31 is 17
Number of 32 is 28
Number of 33 is 28
Number of 34 is 40
Number of 35 is 44
Number of 36 is 66
Number of 37 is 83
Number of 38 is 64
Number of 39 is 58
Number of 40 is 40
Number of 41 is 12
Number of 42 is 5
Number of 43 is 1
Number of 45 is 1


In [18]:
# Average episode score of the random policy.
print(np.mean(scores))

36.314


# Create a Deep Learning Model with Keras

In [19]:
# Number of discrete actions; used as the size of the network's output layer.
actions = env.action_space.n

In [20]:
def build_model(actions):
    """Build the network that maps an observation to one Q-value per action.

    Args:
        actions: Number of discrete actions, i.e. the width of the output layer.

    Returns:
        An uncompiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # The leading 1 is the agent's observation window (the replay memory uses
    # window_length=1). The remaining axes must follow the environment's
    # observation shape (depthOfCode, rows, cols); they were previously given
    # as (cols, rows), which only matched because the grid is square.
    # This Dense layer acts on the last axis only, i.e. on one grid row at a time.
    model.add(Dense(20, activation='relu', input_shape=(1, depthOfCode, rows, cols)))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    # Linear output: one unbounded Q-value estimate per action.
    model.add(Dense(actions, activation='linear'))
    return model

In [21]:
# Instantiate the Q-network with one output per environment action.
model = build_model(actions)

In [22]:
# Show layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1, 10, 2, 20)      60        
_________________________________________________________________
flatten (Flatten)            (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               40100     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 7)                 707       
Total params: 50,967
Trainable params: 50,967
Non-trainable params: 0
_________________________________________________________________


# Q network

In [30]:
def build_agent(model, actions):
    """Wrap ``model`` in a DQN agent.

    Args:
        model: The Keras model that produces one value per action.
        actions: Number of discrete actions.

    Returns:
        An uncompiled ``DQNAgent`` using a Boltzmann policy and a sequential
        replay memory with a window of one observation.
    """
    replay_memory = SequentialMemory(limit=200000, window_length=1)
    exploration_policy = BoltzmannQPolicy()
    return DQNAgent(
        model=model,
        nb_actions=actions,
        memory=replay_memory,
        policy=exploration_policy,
        nb_steps_warmup=100,
        target_model_update=1e-2,
    )

In [31]:
# Build the DQN agent around the Q-network, compile it with Adam and train it
# on the environment for 50000 steps.
agent = build_agent(model, actions)
# `learning_rate` is the current argument name; `lr` is a deprecated alias.
agent.compile(Adam(learning_rate=1e-3), metrics=['mae'])
agent.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
634 episodes - episode_reward: 36.427 [26.000, 43.000] - loss: 52873625.522 - mae: 37933.362 - mean_q: 47438.556

Interval 2 (10000 steps performed)
633 episodes - episode_reward: 36.509 [20.000, 44.000] - loss: 244820560.000 - mae: 80865.453 - mean_q: 100413.609

Interval 3 (20000 steps performed)
642 episodes - episode_reward: 36.657 [27.000, 45.000] - loss: 550402304.000 - mae: 119830.477 - mean_q: 148302.406

Interval 4 (30000 steps performed)
630 episodes - episode_reward: 36.395 [26.000, 45.000] - loss: 884111232.000 - mae: 148671.750 - mean_q: 183982.109

Interval 5 (40000 steps performed)
done, took 433.278 seconds


<tensorflow.python.keras.callbacks.History at 0x7fe22bbe1580>

In [25]:
# Evaluate the trained agent. The agent built above is named `agent`; the name
# `dqn` is not defined anywhere in this notebook, so it fails on a fresh kernel.
scores = agent.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...


KeyboardInterrupt: 