In [1]:
import os
import random
from collections import Counter
from statistics import median, mean

import gym
import numpy as np
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.estimator import regression

In [2]:
class agent():
    """CartPole agent trained by imitating its own best random games.

    Workflow (see ``run``):

    1. ``generate_data`` plays games with uniformly random actions and keeps
       the (observation, action) pairs of every game whose total reward
       reaches ``score_requirement``.
    2. ``train_model`` fits a small fully-connected classifier that maps an
       observation to the action taken in those accepted games.
    3. ``play`` lets the trained classifier drive the environment and prints
       the average score.

    The environment is used through the classic gym API: ``reset()`` returns
    the initial observation and ``step()`` returns a 4-tuple
    ``(observation, reward, done, info)``.
    """

    # File written by generate_data() and read back by train_model().
    DATA_PATH = 'saved.npy'
    # Checkpoint prefix passed to the model's save().
    MODEL_PATH = 'model_save/test'

    def __init__(self, env_name="CartPole-v0", learning_rate=1e-3,
                 goal_steps=500, score_requirement=50, initial_games=100000):
        """Create the environment and the (untrained) network.

        Args:
            env_name: id handed to ``gym.make``.
            learning_rate: Adam learning rate used by the network.
            goal_steps: maximum number of steps played per game.
            score_requirement: minimum total reward for a random game to be
                kept as training data.
            initial_games: number of random games played by ``generate_data``.

        All defaults equal the values that used to be hard-coded, so
        ``agent()`` behaves as before.
        """
        self.LR = learning_rate
        self.env = gym.make(env_name)
        self.model = self.neural_network_model()
        self.env.reset()
        self.goal_steps = goal_steps
        self.score_requirement = score_requirement
        self.initial_games = initial_games

    def neural_network_model(self, input_size=4):
        """Build the tflearn classifier.

        Five fully-connected ReLU layers (128-256-512-256-128), each followed
        by dropout, feed a 2-way softmax (one unit per action). The network is
        trained with Adam at ``self.LR`` on categorical cross-entropy.

        Args:
            input_size: length of one observation vector (4 by default).

        Returns:
            A ``tflearn.DNN`` wrapping the network.
        """
        network = input_data(shape=[None, input_size, 1], name='input')
        for units in (128, 256, 512, 256, 128):
            network = fully_connected(network, units, activation='relu')
            # tflearn's dropout takes the probability of KEEPING a unit.
            network = dropout(network, .8)
        network = fully_connected(network, 2, activation='softmax')
        # Use the instance's learning rate instead of a second hard-coded copy.
        network = regression(network, optimizer='adam', learning_rate=self.LR, loss='categorical_crossentropy', name='targets')
        model = tflearn.DNN(network, tensorboard_dir='/tmp/tensorflow_logs')

        return model

    def generate_data(self):
        """Play random games and keep the moves of the good ones.

        Plays ``self.initial_games`` games of at most ``self.goal_steps``
        random actions. For every game whose total reward is at least
        ``self.score_requirement``, each (observation, action) pair is stored
        with the action one-hot encoded (``[1, 0]`` for 0, ``[0, 1]`` for 1).
        Only the score is reinforced: nothing here judges HOW it was reached.

        The data set is saved to ``DATA_PATH`` and summary statistics of the
        accepted scores are printed.

        Returns:
            list of ``[observation, one_hot_action]`` pairs (possibly empty).
        """
        # [OBS, MOVES]
        training_data = []
        # all scores:
        scores = []
        # just the scores that met our threshold:
        accepted_scores = []

        for _ in range(self.initial_games):
            score = 0
            # moves specifically from this game:
            game_memory = []
            # Keep the observation returned by reset(): it is the state the
            # first action is taken in, so the first move is recorded too.
            prev_observation = self.env.reset()
            for _ in range(self.goal_steps):
                # choose random action (0 or 1)
                action = random.randrange(0, 2)
                observation, reward, done, _info = self.env.step(action)

                # The observation returned by step() is the RESULT of the
                # action, so the action is paired with the observation that
                # preceded it.
                game_memory.append([prev_observation, action])
                prev_observation = observation
                score += reward
                if done:
                    break

            if score >= self.score_requirement:
                accepted_scores.append(score)
                for seen, chosen in game_memory:
                    # one-hot target for the 2-unit softmax output layer
                    output = [1, 0] if chosen == 0 else [0, 1]
                    training_data.append([seen, output])

            scores.append(score)

        # Leave the environment freshly reset, as before.
        if self.initial_games > 0:
            self.env.reset()

        # Each row holds a float vector next to a 2-element list, which is
        # ragged; fill an object array explicitly so NumPy never tries to
        # build a rectangular array out of it.
        training_data_save = np.empty((len(training_data), 2), dtype=object)
        for row, (seen, output) in enumerate(training_data):
            training_data_save[row, 0] = seen
            training_data_save[row, 1] = output
        np.save(self.DATA_PATH, training_data_save)

        # some stats here, to further illustrate the neural network magic!
        if accepted_scores:
            print('Average accepted score:',mean(accepted_scores))
            print('Median score for accepted scores:',median(accepted_scores))
            print(Counter(accepted_scores))
        else:
            # mean()/median() raise on empty input; report instead of crashing.
            print('No games reached the score requirement of {}; best score seen: {}'.format(
                self.score_requirement, max(scores, default=None)))

        return training_data

    def train_model(self, model=False, n_epoch=1):
        """Fit ``self.model`` on the data saved by ``generate_data``.

        Args:
            model: accepted for backward compatibility only; it is not used.
                The network that gets trained is always ``self.model``.
            n_epoch: number of passes over the training data.

        Returns:
            The trained ``self.model``.

        Raises:
            ValueError: if the saved data set contains no samples.
        """
        # The file stores Python objects, so pickle must be enabled. Only load
        # files written by generate_data(): unpickling untrusted data can
        # execute arbitrary code.
        training_data = np.load(self.DATA_PATH, allow_pickle=True)
        if len(training_data) == 0:
            raise ValueError(
                'no training data in {}: no game reached the score requirement'.format(self.DATA_PATH))

        observations = np.array([sample[0] for sample in training_data])
        X = observations.reshape(-1, len(training_data[0][0]), 1)
        y = [sample[1] for sample in training_data]

        self.model.fit({'input': X}, {'targets': y}, n_epoch=n_epoch, snapshot_step=500, show_metric=True, run_id='openai_learning')

        # Make sure the checkpoint directory exists before saving into it.
        checkpoint_dir = os.path.dirname(self.MODEL_PATH)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)
        self.model.save(self.MODEL_PATH)
        # Return the network that was actually trained (not the unused arg).
        return self.model

    def play(self, render = False, num = 100):
        """Let the trained model play and print how it did.

        Args:
            render: when true, call ``env.render()`` before every step.
            num: number of games to play.

        Prints the average score and the fraction of each action chosen.
        Returns nothing.
        """
        scores = []
        choices = []
        for _ in range(num):
            score = 0
            # Start from the observation reset() returns so that the model,
            # not a coin flip, decides the very first move.
            prev_obs = np.asarray(self.env.reset())
            for _ in range(self.goal_steps):
                if render:
                    self.env.render()

                prediction = self.model.predict(prev_obs.reshape(-1, len(prev_obs), 1))[0]
                action = int(np.argmax(prediction))
                choices.append(action)

                new_observation, reward, done, _info = self.env.step(action)
                prev_obs = np.asarray(new_observation)
                score += reward
                if done:
                    break

            scores.append(score)

        if not scores or not choices:
            # Nothing was played; avoid dividing by zero below.
            print('No games were played.')
            return

        print('Average Score:',sum(scores)/len(scores))
        print('choice 1:{}  choice 0:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices)))

    def run(self):
        """Run the whole pipeline: generate data, train, then play."""
        self.generate_data()
        self.train_model()
        self.play()

In [3]:
# Build the agent with its default settings, then run its full pipeline
# via run(): generate_data(), train_model() and play(), in that order.
more = agent()
more.run()

Training Step: 3487  | total loss: [1m[32m0.65897[0m[0m | time: 51.394s
| Adam | epoch: 001 | loss: 0.65897 - acc: 0.6136 -- iter: 223168/223196
Training Step: 3488  | total loss: [1m[32m0.65979[0m[0m | time: 51.406s
| Adam | epoch: 001 | loss: 0.65979 - acc: 0.6069 -- iter: 223196/223196
--
INFO:tensorflow:/home/hedonist/Documents/openai_cartpole_tensorflow/ipython/model_save/test is not in all_model_checkpoint_paths. Manually adding it.


[2017-06-29 15:57:24,093] /home/hedonist/Documents/openai_cartpole_tensorflow/ipython/model_save/test is not in all_model_checkpoint_paths. Manually adding it.


Average Score: 173.84
choice 1:0.50483202945237  choice 0:0.49516797054763
