In [1]:
import gym
import random
import numpy as np
from tensorflow.keras.models     import Sequential
from tensorflow.keras.layers     import Dense
from tensorflow.keras.optimizers import Adam

In [16]:
# --- Configuration ---------------------------------------------------------
# Fixed seed so the random data-collection phase can be repeated run-to-run.
# NOTE: Keras weight initialisation is not seeded here, so training itself
# may still vary between runs.
SEED = 42
random.seed(SEED)      # `random` picks the exploratory actions
np.random.seed(SEED)

env = gym.make('CartPole-v1')
# Classic gym API (the one whose step() returns 4 values, as used below).
env.seed(SEED)
env.reset()

goal_steps = 500          # maximum number of steps played per game
score_requirement = 60    # minimum total reward for a random game to be kept
# (sic) spelling kept: model_data_preparation() reads this exact name.
intial_games = 10000      # number of random games played to collect data

In [17]:
def model_data_preparation(environment=None, n_games=None, max_steps=None, min_score=None):
    """Collect supervised training samples by playing random games.

    Every game is played with uniformly random actions (0 or 1). A game is
    kept when its total reward reaches ``min_score``; each step of a kept
    game yields one sample pairing the observation the action was taken from
    with that action, one-hot encoded (``[1, 0]`` for 0, ``[0, 1]`` for 1).

    Args:
        environment: Object with ``reset()`` returning an observation and
            ``step(action)`` returning ``(observation, reward, done, info)``.
            Defaults to the notebook-level ``env``.
        n_games: Number of random games to play. Defaults to ``intial_games``.
        max_steps: Step cap per game. Defaults to ``goal_steps``.
        min_score: Minimum total reward for a game to be kept. Defaults to
            ``score_requirement``.

    Returns:
        A list of ``[observation, one_hot_action]`` pairs.

    Side effects:
        Prints the scores of the accepted games and leaves the environment
        freshly reset.
    """
    # Resolve defaults at call time so edits to the config cell are honoured.
    environment = env if environment is None else environment
    n_games = intial_games if n_games is None else n_games
    max_steps = goal_steps if max_steps is None else max_steps
    min_score = score_requirement if min_score is None else min_score

    training_data = []
    accepted_scores = []
    for game_index in range(n_games):
        score = 0
        game_memory = []
        # Keep the observation returned by reset(): without it the very first
        # (state, action) pair of every game would be silently dropped.
        previous_observation = environment.reset()
        for step_index in range(max_steps):
            action = random.randrange(0, 2)
            observation, reward, done, info = environment.step(action)

            # Pair the action with the state it was chosen in, not the result.
            game_memory.append([previous_observation, action])

            previous_observation = observation
            score += reward
            if done:
                break

        if score >= min_score:
            accepted_scores.append(score)
            for seen_observation, taken_action in game_memory:
                one_hot = [0, 1] if taken_action == 1 else [1, 0]
                training_data.append([seen_observation, one_hot])

    # Hand the environment back in a reset state so later cells never step
    # an episode that has already finished.
    environment.reset()

    print(accepted_scores)

    return training_data

In [18]:
# Play the random games and keep the [observation, one-hot action] samples
# from those that reached the score threshold (their scores are printed).
train = model_data_preparation()

[66.0, 60.0, 62.0, 82.0, 60.0, 60.0, 69.0, 102.0, 83.0, 67.0, 69.0, 66.0, 64.0, 61.0, 62.0, 72.0, 82.0, 93.0, 75.0, 71.0, 67.0, 74.0, 63.0, 64.0, 70.0, 65.0, 69.0, 61.0, 64.0, 61.0, 81.0, 61.0, 67.0, 102.0, 77.0, 72.0, 67.0, 71.0, 63.0, 88.0, 84.0, 60.0, 72.0, 65.0, 63.0, 60.0, 79.0, 76.0, 73.0, 67.0, 75.0, 67.0, 81.0, 80.0, 79.0, 65.0, 60.0, 61.0, 75.0, 91.0, 63.0, 87.0, 68.0, 61.0, 77.0, 65.0, 64.0, 82.0, 77.0, 63.0, 66.0, 63.0, 61.0, 119.0, 62.0, 62.0, 68.0, 62.0, 61.0, 90.0, 62.0, 60.0, 66.0, 60.0, 68.0, 74.0, 60.0, 72.0, 67.0, 63.0, 64.0, 63.0, 64.0, 73.0, 66.0, 67.0, 65.0, 63.0, 60.0, 65.0, 68.0, 78.0, 96.0, 65.0, 62.0, 67.0, 71.0, 62.0, 69.0, 75.0, 63.0, 78.0, 66.0, 70.0, 75.0, 101.0, 77.0, 65.0, 65.0, 63.0, 69.0, 66.0, 70.0, 63.0, 67.0, 70.0, 62.0, 92.0, 71.0, 73.0, 98.0, 81.0, 67.0, 73.0, 61.0, 77.0, 61.0, 71.0, 74.0, 62.0, 68.0, 67.0, 64.0, 76.0, 64.0, 70.0, 72.0, 89.0, 61.0, 61.0, 91.0, 64.0, 77.0]


In [19]:
def build_model(input_size, output_size):
    """Create and compile the feed-forward network used as the policy.

    Architecture: Dense(128, relu) -> Dense(52, relu) -> Dense(output_size,
    linear), compiled with mean-squared-error loss and a default Adam
    optimizer.

    Args:
        input_size: Number of input features per sample.
        output_size: Number of output units.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layer_stack = [
        Dense(128, input_dim=input_size, activation='relu'),
        Dense(52, activation='relu'),
        Dense(output_size, activation='linear'),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer=Adam(), loss='mse')
    return network

In [20]:
def train_model(training_data):
    """Fit a fresh model on the collected samples.

    Args:
        training_data: Non-empty sequence of ``[observation, label]`` pairs;
            the first pair determines the feature and label widths.

    Returns:
        The model produced by ``build_model`` after 3 epochs of fitting.
    """
    # Widths are taken from the first sample and applied to the whole set.
    n_features = len(training_data[0][0])
    n_outputs = len(training_data[0][1])

    observations = [sample[0] for sample in training_data]
    labels = [sample[1] for sample in training_data]
    X = np.array(observations).reshape(-1, n_features)
    y = np.array(labels).reshape(-1, n_outputs)

    model = build_model(input_size=len(X[0]), output_size=len(y[0]))
    model.fit(X, y, epochs=3)
    return model

In [21]:
# Build the network and fit it on the samples gathered from random play.
trained_model = train_model(train)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [25]:

# Evaluate the trained model: play whole games, always taking the action the
# network scores highest, and report per-game scores plus the action balance.
N_EVAL_GAMES = 100

scores = []
choices = []
for each_game in range(N_EVAL_GAMES):
    score = 0
    # Begin every game from a fresh reset and keep its observation so that
    # even the first action is chosen by the model instead of at random.
    prev_obs = np.asarray(env.reset())
    for step_index in range(goal_steps):
        env.render()
        # verbose=0 keeps Keras from logging once per environment step.
        action_scores = trained_model.predict(
            prev_obs.reshape(-1, len(prev_obs)), verbose=0
        )[0]
        action = int(np.argmax(action_scores))

        choices.append(action)
        new_observation, reward, done, info = env.step(action)
        prev_obs = np.asarray(new_observation)
        score += reward
        if done:
            break

    scores.append(score)

print(scores)
print('Average Score:', sum(scores)/len(scores))
print('choice 1:{}  choice 0:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices)))

[158.0, 128.0, 156.0, 177.0, 226.0, 150.0, 176.0, 131.0, 153.0, 161.0, 500.0, 146.0, 168.0, 174.0, 168.0, 149.0, 152.0, 124.0, 195.0, 148.0, 142.0, 139.0, 172.0, 164.0, 166.0, 131.0, 163.0, 188.0, 149.0, 193.0, 338.0, 162.0, 221.0, 135.0, 124.0, 187.0, 143.0, 240.0, 171.0, 144.0, 172.0, 236.0, 224.0, 181.0, 188.0, 179.0, 206.0, 160.0, 128.0, 258.0, 141.0, 135.0, 157.0, 158.0, 138.0, 181.0, 142.0, 203.0, 163.0, 285.0, 193.0, 149.0, 180.0, 176.0, 130.0, 258.0, 230.0, 344.0, 157.0, 141.0, 199.0, 173.0, 147.0, 143.0, 364.0, 232.0, 155.0, 180.0, 179.0, 172.0, 200.0, 142.0, 143.0, 136.0, 187.0, 126.0, 197.0, 132.0, 139.0, 132.0, 191.0, 221.0, 223.0, 163.0, 174.0, 138.0, 161.0, 176.0, 297.0, 121.0]
Average Score: 179.48
choice 1:0.5273568085580566  choice 0:0.4726431914419434


In [27]:
# Evaluation is finished: shut the environment down.
env.close()