## Reinforcement Learning

La classe DQNAgent permet de créer et d'entraîner un modèle.

In [12]:
# INITIALIZATION: libraries, parameters, network...

import os
import random
from collections import deque            # For storing moves

import gym                                # To train our network
import numpy as np
from keras.layers import Dense, Flatten  # Dense layers are fully connected layers, Flatten layers flatten out multidimensional inputs
from keras.models import Sequential, load_model      # One layer after the other
from keras.optimizers import Adam

# Deep Q-learning Agent
class DQNAgent:
    """Deep Q-learning agent with an epsilon-greedy policy and a replay memory.

    The agent stores transitions in a bounded deque and trains a small
    fully connected Keras network on random samples drawn from it.

    Args:
        state_size: Number of features in one observation (network input size).
        action_size: Number of discrete actions (network output size).
        model_path: Checkpoint to resume from. When the file exists it is
            loaded instead of building a new network; pass ``None`` to always
            start from a freshly initialised network.
    """

    def __init__(self, state_size, action_size, model_path='LunarLander-v22.h5'):
        self.state_size = state_size
        self.action_size = action_size
        self.model_path = model_path
        self.memory = deque(maxlen=2000)  # oldest transitions are dropped first
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Return the Q-network: a saved checkpoint if available, else a new one.

        A fresh network has two 24-unit ReLU layers followed by a linear layer
        with one output per action, compiled with MSE loss and Adam.
        """
        # Resume from the checkpoint only when it is actually there, so the
        # notebook also runs on a fresh kernel before any model was saved.
        # NOTE(review): a loaded checkpoint is not checked against
        # state_size/action_size -- confirm it matches the environment.
        if self.model_path and os.path.exists(self.model_path):
            return load_model(self.model_path)

        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # NOTE(review): newer Keras releases name this argument
        # `learning_rate`; `lr` is kept for the Keras version used here.
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action for ``state`` using the epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        """Train on ``batch_size`` transitions sampled from memory.

        Does nothing while the memory holds fewer than ``batch_size``
        transitions. After training, ``epsilon`` is decayed once (as long as
        it is still above ``epsilon_min``).
        """
        # Use this agent's own memory, not a notebook-level `agent` global.
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            # Q-learning target: r for terminal transitions,
            # r + gamma * max_a' Q(s', a') otherwise.
            target = reward
            if not done:
                target += self.gamma * np.amax(self.model.predict(next_state)[0])
            # Only the taken action's Q-value is moved towards the target;
            # the other outputs keep their current predictions.
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save_model(self, path='model.h5'):
        """Save the current network to ``path``."""
        self.model.save(path)

    def load(self, path='model.h5'):
        """Replace the current network with the one stored at ``path``."""
        self.model = load_model(path)

## Choix d'un environnement

On choisit un environnement, ici 'LunarLander-v2' (on pourrait aussi utiliser par exemple 'CartPole-v0').

In [3]:
# Create the LunarLander-v2 environment used by the training cell.
env = gym.make('LunarLander-v2')
# Display the observation and action spaces as the cell output.
env.observation_space, env.action_space

(Box(8,), Discrete(4))

## Entraînement

La cellule suivante permet de lancer l'entraînement sur un environnement.

In [14]:
episodes = 1000
# Read the dimensions from the environment rather than hard-coding them, so
# the cell keeps working when another environment is selected.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

# Play `episodes` games, training once after each one.
for e in range(episodes):
    # Reset the environment and the running score at the start of each game.
    done = False
    score = 0.
    state = env.reset()
    # The agent's methods index the first axis of predictions ([0]), so
    # states are passed as a batch of one.
    state = np.reshape(state, [1, state_size])

    while not done:
        # Choose an action (epsilon-greedy).
        action = agent.act(state)
        # Advance the environment by one step with that action.
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # Store the transition in the agent's replay memory.
        agent.remember(state, action, reward, next_state, done)
        # The next state becomes the current state for the following step.
        state = next_state
        score += reward
    print("episode: {}/{}, score: {}".format(e, episodes, score))
    # Train on a random batch of 32 transitions drawn from the whole replay
    # memory (not only from the episode that just ended).
    agent.replay(32)

# Persist the trained network once all episodes are done.
agent.model.save('LunarLander-v22.h5')

episode: 0/1000, score: -189.42808946845554
episode: 1/1000, score: -170.91264618491863
episode: 2/1000, score: -221.71707284264176
episode: 3/1000, score: -239.64673066238555
episode: 4/1000, score: -166.84489294879603
episode: 5/1000, score: -91.0439504419072
episode: 6/1000, score: -121.62861409300011
episode: 7/1000, score: -159.00651255079893
episode: 8/1000, score: -125.25373244807561
episode: 9/1000, score: -90.95735934941132
episode: 10/1000, score: -206.9662258514125
episode: 11/1000, score: -89.1272195411003
episode: 12/1000, score: -133.25377448860786
episode: 13/1000, score: -160.06513483763206
episode: 14/1000, score: -27.596424116610436
episode: 15/1000, score: -68.44160471691957
episode: 16/1000, score: -92.38133986536398
episode: 17/1000, score: -152.07705346195655
episode: 18/1000, score: -135.3522639451736
episode: 19/1000, score: -256.93596248387166
episode: 20/1000, score: -97.36147760680514
episode: 21/1000, score: -118.93018004782104
episode: 22/1000, score: -163.

episode: 183/1000, score: -141.7686986311798
episode: 184/1000, score: -26.544866627950114
episode: 185/1000, score: -51.262131241487666
episode: 186/1000, score: -117.57741555597374
episode: 187/1000, score: -181.9139697271287
episode: 188/1000, score: -26.695835385529577
episode: 189/1000, score: -206.6490593223392
episode: 190/1000, score: -57.900707859640974
episode: 191/1000, score: -119.69525799010782
episode: 192/1000, score: -65.97410278333246
episode: 193/1000, score: -23.05514937839901
episode: 194/1000, score: -33.9857641087393
episode: 195/1000, score: -248.98091245206743
episode: 196/1000, score: -112.93826179599996
episode: 197/1000, score: -120.17659783016937
episode: 198/1000, score: -141.36202831112018
episode: 199/1000, score: -59.89356990552682
episode: 200/1000, score: -131.83734321005483
episode: 201/1000, score: -126.98746917485913
episode: 202/1000, score: -125.31493010402774
episode: 203/1000, score: -138.92785354205694
episode: 204/1000, score: -105.85757641529

episode: 364/1000, score: 15.105489874664045
episode: 365/1000, score: -110.13410210795999
episode: 366/1000, score: -157.90924681939617
episode: 367/1000, score: -113.11493623709669
episode: 368/1000, score: 54.21142799701312
episode: 369/1000, score: 183.83115824186214
episode: 370/1000, score: -77.20461405696548
episode: 371/1000, score: -109.13921703266402
episode: 372/1000, score: -141.6628497182538
episode: 373/1000, score: 31.607189596562296
episode: 374/1000, score: -298.40784620883517
episode: 375/1000, score: 43.42334668491014
episode: 376/1000, score: -16.507261266791446
episode: 377/1000, score: -56.08507147958962
episode: 378/1000, score: 107.7761512727809
episode: 379/1000, score: -218.50324363501062
episode: 380/1000, score: -249.27523488959872
episode: 381/1000, score: -108.4185621926916
episode: 382/1000, score: -308.2489831016928
episode: 383/1000, score: -229.22872154385547
episode: 384/1000, score: -252.83406779265434
episode: 385/1000, score: -96.54326889897915
epi

episode: 545/1000, score: -100.61929860654497
episode: 546/1000, score: -92.37589143408906
episode: 547/1000, score: -84.73604823601265
episode: 548/1000, score: -93.28065739650464
episode: 549/1000, score: -33.53794868202369
episode: 550/1000, score: -7.235400480733976
episode: 551/1000, score: -72.58289382190853
episode: 552/1000, score: -157.24137678554368
episode: 553/1000, score: -95.24121942979318
episode: 554/1000, score: -203.17997205956374
episode: 555/1000, score: -264.50092487809565
episode: 556/1000, score: -91.34096829411372
episode: 557/1000, score: -119.51290610857959
episode: 558/1000, score: -140.79821232798818
episode: 559/1000, score: -103.75106825251194
episode: 560/1000, score: -150.51441087095725
episode: 561/1000, score: -53.12988749280032
episode: 562/1000, score: -119.28415866559976
episode: 563/1000, score: -150.93365759370434
episode: 564/1000, score: -145.84333479975962
episode: 565/1000, score: -150.14503658698828
episode: 566/1000, score: -87.4323716178987

episode: 726/1000, score: -113.40467956465284
episode: 727/1000, score: -92.68206680299907
episode: 728/1000, score: -160.2221350882577
episode: 729/1000, score: -53.40359172592836
episode: 730/1000, score: -119.63480493069198
episode: 731/1000, score: -86.03332345621709
episode: 732/1000, score: -205.2943415231357
episode: 733/1000, score: -88.64581010017136
episode: 734/1000, score: -91.67028514763203
episode: 735/1000, score: -84.04146317300561
episode: 736/1000, score: -120.59152451712357
episode: 737/1000, score: -94.03384400212522
episode: 738/1000, score: -123.14087346872016
episode: 739/1000, score: -99.02594802876844
episode: 740/1000, score: -87.91063402708892
episode: 741/1000, score: -99.1738321761028
episode: 742/1000, score: -114.84682376693961
episode: 743/1000, score: -99.59823269297686
episode: 744/1000, score: -114.17712253291616
episode: 745/1000, score: -145.754086027712
episode: 746/1000, score: -120.78926204807595
episode: 747/1000, score: -106.40808036108899
epis

episode: 907/1000, score: -83.85398804922535
episode: 908/1000, score: -55.520891071084726
episode: 909/1000, score: -105.84496730587207
episode: 910/1000, score: -103.95200251013111
episode: 911/1000, score: -24.87948062412316
episode: 912/1000, score: -27.004073979500475
episode: 913/1000, score: -95.09747082582847
episode: 914/1000, score: -9.82979485910222
episode: 915/1000, score: -112.58147502749414
episode: 916/1000, score: -66.56219630728404
episode: 917/1000, score: -136.4121528878612
episode: 918/1000, score: -107.98504734467592
episode: 919/1000, score: -61.69048212640769
episode: 920/1000, score: -71.92701544414818
episode: 921/1000, score: -67.32844247739182
episode: 922/1000, score: -63.13906953820778
episode: 923/1000, score: -62.03690639157841
episode: 924/1000, score: -141.65876131836106
episode: 925/1000, score: -83.64779595123849
episode: 926/1000, score: -99.16010068718788
episode: 927/1000, score: -29.215752375986185
episode: 928/1000, score: -44.13597701764019
epi

In [None]:
# CartPole-v0 state=4 action=2
# LunarLander-v2 state=8 action=4