In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# gpu_info = !nvidia-smi
# gpu_info = '\n'.join(gpu_info)
# if gpu_info.find('failed') >= 0:
#   print('Not connected to a GPU')
# else:
#   print(gpu_info)

In [3]:
# Install a Drive FUSE wrapper.
# https://github.com/astrada/google-drive-ocamlfuse
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

E: Package 'python-software-properties' has no installation candidate
Selecting previously unselected package google-drive-ocamlfuse.
(Reading database ... 155222 files and directories currently installed.)
Preparing to unpack .../google-drive-ocamlfuse_0.7.27-0ubuntu1~ubuntu18.04.1_amd64.deb ...
Unpacking google-drive-ocamlfuse (0.7.27-0ubuntu1~ubuntu18.04.1) ...
Setting up google-drive-ocamlfuse (0.7.27-0ubuntu1~ubuntu18.04.1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [4]:
## Generate auth tokens for Colab
# NOTE(review): presumably this is what lets GoogleCredentials.get_application_default()
# in the next cell find credentials — confirm.
from google.colab import auth
auth.authenticate_user()

In [5]:
# Generate creds for the Drive FUSE library.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First invocation: run headless with stdin closed and keep only the output line
# that contains the authorisation URL, so it can be opened in a browser.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Prompt for the verification code without echoing it into the cell output.
vcode = getpass.getpass()
# Second invocation: pipe the verification code into the tool to finish the exchange.
# NOTE(review): {vcode} and the client secret are interpolated unquoted into a
# shell command line — fine for a pasted OAuth code, but worth confirming.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}


Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
Please enter the verification code: Access token retrieved correctly.


In [6]:
# Create a directory and mount Google Drive using that directory.
# `mkdir -p` does not fail when ./drive already exists, so the first line is re-runnable.
!mkdir -p drive
!google-drive-ocamlfuse drive

In [7]:
# os.chdir('./drive')
import os
# os.listdir()

In [8]:
# suppose you make a directory called "try_colab" under your google drive
# you can change this path to the directory you want to work at 
# NOTE: the path is relative to the current working directory, so this cell is
# not re-runnable: after the first chdir, './drive/...' is resolved inside the
# new working directory. All relative paths used later (e.g. ./scores/...) are
# resolved against this directory.
path = './drive/STATSM231A/HW5' 
os.chdir(path)

# os.listdir('./')

In [9]:
# ! pip install gym

In [10]:
# install package pytorch lightning
# ! pip install pytorch-lightning
# ! pip install keras

In [11]:
from statistics import mean
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from collections import deque
import os
import csv
import numpy as np

# Files written by ScoreLogger; relative to the current working directory.
SCORES_CSV_PATH = "./scores/scores.csv"
SCORES_PNG_PATH = "./scores/scores.png"
SOLVED_CSV_PATH = "./scores/solved.csv"
SOLVED_PNG_PATH = "./scores/solved.png"
# ScoreLogger.add_score reports "solved" once the mean of the last
# CONSECUTIVE_RUNS_TO_SOLVE scores reaches AVERAGE_SCORE_TO_SOLVE.
AVERAGE_SCORE_TO_SOLVE = 195
CONSECUTIVE_RUNS_TO_SOLVE = 100


class ScoreLogger:
    """Persist per-run scores to CSV, re-plot them to PNG, and detect "solved".

    Scores are appended to ``SCORES_CSV_PATH`` and the plot at
    ``SCORES_PNG_PATH`` is regenerated from that file after every run. The
    last ``CONSECUTIVE_RUNS_TO_SOLVE`` scores are also kept in memory to
    compute the running statistics that decide when the task is solved.
    """

    def __init__(self, env_name):
        """Start a fresh score log for ``env_name``.

        Args:
            env_name: Used as the title of every plot.

        Any scores CSV/PNG already on disk is deleted; the "solved" CSV/PNG
        are left untouched.
        """
        self.scores = deque(maxlen=CONSECUTIVE_RUNS_TO_SOLVE)
        self.env_name = env_name

        if os.path.exists(SCORES_PNG_PATH):
            os.remove(SCORES_PNG_PATH)
        if os.path.exists(SCORES_CSV_PATH):
            os.remove(SCORES_CSV_PATH)

    def add_score(self, score, run):
        """Record the score of one run and stop the process once solved.

        Args:
            score: Score of the run that just finished (must be parseable by
                ``int`` once written to the CSV).
            run: 1-based index of that run; used for the "solved" report.
        """
        self._save_csv(SCORES_CSV_PATH, score)
        self._save_png(input_path=SCORES_CSV_PATH,
                       output_path=SCORES_PNG_PATH,
                       x_label="runs",
                       y_label="scores",
                       average_of_n_last=CONSECUTIVE_RUNS_TO_SOLVE,
                       show_goal=True,
                       show_trend=True,
                       show_legend=True)
        self.scores.append(score)
        mean_score = np.mean(self.scores)
        print ("Scores: (min: " + str(min(self.scores)) + ", avg: " + str(mean_score) + ", max: " + str(max(self.scores)) + ")\n")
        # Solved only when the window is full AND its mean reaches the goal.
        if mean_score >= AVERAGE_SCORE_TO_SOLVE and len(self.scores) >= CONSECUTIVE_RUNS_TO_SOLVE:
            solve_score = run-CONSECUTIVE_RUNS_TO_SOLVE
            print ("Solved in " + str(solve_score) + " runs, " + str(run) + " total runs.")
            self._save_csv(SOLVED_CSV_PATH, solve_score)
            self._save_png(input_path=SOLVED_CSV_PATH,
                           output_path=SOLVED_PNG_PATH,
                           x_label="trials",
                           y_label="steps before solve",
                           average_of_n_last=None,
                           show_goal=False,
                           show_trend=False,
                           show_legend=False)
            # Deliberately ends the process: the training loop that calls
            # add_score has no other stop condition.
            exit()

    def _save_png(self, input_path, output_path, x_label, y_label, average_of_n_last, show_goal, show_trend, show_legend):
        """Plot the single-column CSV at ``input_path`` and save it as a PNG.

        Args:
            input_path: CSV file with one integer value per row.
            output_path: Destination image file.
            x_label: Label of the x axis.
            y_label: Label of the y axis.
            average_of_n_last: Number of trailing rows covered by the dashed
                average line; ``None`` averages over every row.
            show_goal: Draw a horizontal line at ``AVERAGE_SCORE_TO_SOLVE``.
            show_trend: Draw a degree-1 polynomial fit (first row excluded).
            show_legend: Add a legend in the upper-left corner.
        """
        # newline="" is what the csv module asks for; empty rows (as left by
        # files written without it on some platforms) are skipped instead of
        # raising IndexError.
        with open(input_path, "r", newline="") as scores:
            rows = [row for row in csv.reader(scores) if row]
        x = list(range(len(rows)))
        y = [int(row[0]) for row in rows]

        plt.subplots()
        plt.plot(x, y, label="score per run")

        average_range = average_of_n_last if average_of_n_last is not None else len(x)
        recent_x = x[-average_range:]
        recent_y = y[-average_range:]
        plt.plot(recent_x, [np.mean(recent_y)] * len(recent_y), linestyle="--", label="last " + str(average_range) + " runs average")

        if show_goal:
            plt.plot(x, [AVERAGE_SCORE_TO_SOLVE] * len(x), linestyle=":", label=str(AVERAGE_SCORE_TO_SOLVE) + " score average goal")

        if show_trend and len(x) > 1:
            trend_x = x[1:]
            z = np.polyfit(np.array(trend_x), np.array(y[1:]), 1)
            p = np.poly1d(z)
            plt.plot(trend_x, p(trend_x), linestyle="-.",  label="trend")

        plt.title(self.env_name)
        plt.xlabel(x_label)
        plt.ylabel(y_label)

        if show_legend:
            plt.legend(loc="upper left")

        plt.savefig(output_path, bbox_inches="tight")
        # Close the figure so that one figure per run does not accumulate.
        plt.close()

    def _save_csv(self, path, score):
        """Append ``score`` as a one-column row to the CSV file at ``path``.

        The parent directory is created when it does not exist yet, so the
        first call no longer fails with ``FileNotFoundError`` on a fresh
        working directory.
        """
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        # Append mode creates the file when absent; newline="" lets the csv
        # writer control line endings itself.
        with open(path, "a", newline="") as scores_file:
            csv.writer(scores_file).writerow([score])


In [12]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
# from keras.optimizers import Adam
from tensorflow.keras.optimizers import Adam # - Works


In [13]:
# Environment shared by the random data collection and the evaluation loop.
env = gym.make('CartPole-v1')
env.reset()
goal_steps = 500  # maximum number of steps played per game
score_requirement = 60  # minimum score for a random game to be kept as training data
intial_games = 10000  # number of random games to play ("intial" is a typo of "initial"; the name is referenced by model_data_preparation)

In [14]:
def model_data_preparation():
    """Collect training pairs from random games that scored well enough.

    Plays ``intial_games`` games on the module-level ``env`` with uniformly
    random actions, each for at most ``goal_steps`` steps. Games whose total
    reward is at least ``score_requirement`` contribute every recorded
    ``[observation, action]`` pair, with the action one-hot encoded
    (``[1, 0]`` for action 0, ``[0, 1]`` for action 1). The accepted scores
    and their count are printed.

    Returns:
        list: ``[observation, one_hot_action]`` pairs.
    """
    samples = []
    kept_scores = []
    for _ in range(intial_games):
        episode_score = 0
        episode_pairs = []
        last_seen = []
        for _ in range(goal_steps):
            action = random.randrange(0, 2)
            observation, reward, done, _ = env.step(action)

            # Nothing is recorded on the first step of a game, because no
            # observation is held yet at that point.
            if len(last_seen) > 0:
                episode_pairs.append([last_seen, action])

            last_seen = observation
            episode_score += reward
            if done:
                break

        if episode_score >= score_requirement:
            kept_scores.append(episode_score)
            for seen, taken in episode_pairs:
                one_hot = [0, 1] if taken == 1 else [1, 0]
                samples.append([seen, one_hot])

        env.reset()

    print(kept_scores)
    print(len(kept_scores))

    return samples

In [15]:
# Play the random games; prints the accepted scores and how many games were accepted.
training_data = model_data_preparation()


[62.0, 62.0, 71.0, 60.0, 65.0, 61.0, 156.0, 69.0, 68.0, 63.0, 64.0, 68.0, 67.0, 64.0, 65.0, 103.0, 95.0, 62.0, 60.0, 68.0, 79.0, 97.0, 70.0, 81.0, 72.0, 61.0, 82.0, 72.0, 60.0, 66.0, 60.0, 106.0, 79.0, 60.0, 60.0, 78.0, 62.0, 65.0, 112.0, 72.0, 60.0, 86.0, 62.0, 79.0, 76.0, 90.0, 76.0, 62.0, 64.0, 60.0, 67.0, 88.0, 70.0, 80.0, 63.0, 87.0, 68.0, 98.0, 71.0, 62.0, 60.0, 76.0, 67.0, 100.0, 68.0, 88.0, 61.0, 80.0, 69.0, 80.0, 69.0, 77.0, 78.0, 64.0, 88.0, 60.0, 77.0, 63.0, 60.0, 64.0, 68.0, 84.0, 63.0, 63.0, 61.0, 68.0, 85.0, 84.0, 96.0, 65.0, 64.0, 60.0, 70.0, 67.0, 65.0, 60.0, 60.0, 64.0, 63.0, 66.0, 97.0, 61.0, 70.0, 112.0, 77.0, 84.0, 60.0, 61.0, 78.0, 69.0, 93.0, 68.0, 74.0, 64.0, 105.0, 63.0, 73.0, 65.0, 76.0, 76.0, 61.0, 63.0, 61.0, 72.0, 60.0, 69.0, 81.0, 71.0, 84.0, 84.0, 76.0, 67.0, 77.0, 68.0, 71.0, 99.0, 64.0, 78.0, 67.0, 75.0, 63.0, 63.0, 66.0, 87.0, 66.0, 83.0, 75.0, 63.0]
148


In [16]:
def build_model(input_size, output_size):
    """Build and compile a three-layer fully connected Keras model.

    Args:
        input_size: Number of input features.
        output_size: Number of units in the linear output layer.

    Returns:
        A ``Sequential`` model (128 relu -> 52 relu -> ``output_size`` linear)
        compiled with MSE loss and the ``'adam'`` optimizer.
    """
    network = Sequential([
        Dense(128, input_dim=input_size, activation='relu'),
        Dense(52, activation='relu'),
        Dense(output_size, activation='linear'),
    ])
    network.compile(loss='mse', optimizer='adam')
    return network

def train_model(training_data):
    """Fit a fresh model on ``[features, target]`` pairs for 50 epochs.

    Args:
        training_data: Non-empty sequence of ``[features, target]`` pairs; the
            first pair determines the feature and target widths.

    Returns:
        tuple: ``(model, history)`` where ``history`` is the value returned by
        ``model.fit``.
    """
    feature_width = len(training_data[0][0])
    target_width = len(training_data[0][1])
    features = np.array([pair[0] for pair in training_data]).reshape(-1, feature_width)
    targets = np.array([pair[1] for pair in training_data]).reshape(-1, target_width)

    network = build_model(input_size=len(features[0]), output_size=len(targets[0]))
    fit_history = network.fit(features, targets, epochs=50)
    return network, fit_history


In [17]:
# Fit the network on the collected random-play data. `history` is plotted by
# the loss-curve cell and `trained_model` is used by the evaluation loop.
trained_model, history = train_model(training_data)
print(history.history.keys())

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
dict_keys(['loss'])


In [None]:
import gym
import numpy as np 
import tensorflow
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation
# from keras.optimizers import Adam
import random as random
from collections import deque
import math
import datetime
import keras.backend as K
# from tensorflow import set_random_seed
# set_random_seed(0)
# Seed NumPy's and Python's global RNGs, which DQNSolver.act and
# DQNSolver.experience_replay draw from.
# NOTE(review): the TensorFlow seed call above is commented out, so network
# weight initialisation is presumably still unseeded — confirm.
np.random.seed(0)
random.seed(0)




ENV_NAME = "CartPole-v1"

GAMMA = 0.95  # discount applied to the best predicted next-state Q-value
LEARNING_RATE = 0.001  # passed to the Adam optimizer in DQNSolver

MEMORY_SIZE = 1000000  # maximum number of transitions kept in the replay deque
BATCH_SIZE = 20  # transitions sampled per experience_replay call

EXPLORATION_MAX = 1.0  # initial exploration rate
EXPLORATION_MIN = 0.01  # lower bound of the exploration rate
EXPLORATION_DECAY = 0.995  # multiplier applied at the end of each experience_replay call that trains


class DQNSolver:
    """Epsilon-greedy Q-learning agent backed by a small Keras network.

    Transitions are stored in a bounded deque and replayed in random
    mini-batches; the exploration rate decays after every replay that
    actually trains.
    """

    def __init__(self, observation_space, action_space):
        """Create the replay memory and the Q-network.

        Args:
            observation_space: Number of features in one observation.
            action_space: Number of discrete actions (size of the output layer).
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        # `learning_rate` is the supported keyword; the `lr` alias is deprecated
        # and triggers a warning from the optimizer's constructor.
        self.model.compile(loss="mse", optimizer=Adam(learning_rate=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state``.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the index of the largest predicted Q-value.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        # verbose=0: this runs once per environment step, so any per-call
        # progress output would flood the cell.
        q_values = self.model.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Train on ``BATCH_SIZE`` random transitions, then decay exploration.

        Does nothing until the memory holds at least ``BATCH_SIZE``
        transitions. Each sampled transition is fitted individually, in
        sampling order: the target for the taken action is the reward, plus
        the discounted best predicted next-state value when the transition is
        not terminal.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next, verbose=0)[0]))
            # Only the taken action's value is overwritten; the other outputs
            # keep the network's own prediction as their target.
            q_values = self.model.predict(state, verbose=0)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)


def cartpole():
    """Train a DQNSolver on ``ENV_NAME`` and log the length of every run.

    Loops over runs without any stop condition of its own: each run is played
    until the environment reports a terminal step, the number of steps is
    passed to ``ScoreLogger.add_score``, and the agent replays experience
    after every non-terminal step. The reward of the terminal step is negated
    before it is stored.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    n_features = env.observation_space.shape[0]
    n_actions = env.action_space.n
    agent = DQNSolver(n_features, n_actions)
    episode = 0
    while True:
        episode += 1
        state = np.reshape(env.reset(), [1, n_features])
        steps_taken = 0
        while True:
            steps_taken += 1
            #env.render()
            action = agent.act(state)
            observed, reward, terminal, info = env.step(action)
            if terminal:
                # Negate the reward of the step that ended the run.
                reward = -reward
            upcoming = np.reshape(observed, [1, n_features])
            agent.remember(state, action, reward, upcoming, terminal)
            state = upcoming
            if terminal:
                print("Run: " + str(episode) + ", exploration: " + str(agent.exploration_rate) + ", score: " + str(steps_taken))
                score_logger.add_score(steps_taken, episode)
                break
            agent.experience_replay()



# Start DQN training. cartpole() never returns on its own; it runs until
# ScoreLogger.add_score calls exit() or the cell is interrupted.
cartpole()

  super(Adam, self).__init__(name, **kwargs)


Run: 1, exploration: 1.0, score: 12
Scores: (min: 12, avg: 12.0, max: 12)

Run: 2, exploration: 0.9558895783575597, score: 17




Scores: (min: 12, avg: 14.5, max: 17)

Run: 3, exploration: 0.8475428503023453, score: 25
Scores: (min: 12, avg: 18.0, max: 25)

Run: 4, exploration: 0.7183288830986236, score: 34
Scores: (min: 12, avg: 22.0, max: 34)

Run: 5, exploration: 0.6242658676435396, score: 29
Scores: (min: 12, avg: 23.4, max: 34)

Run: 6, exploration: 0.5848838636585911, score: 14
Scores: (min: 12, avg: 21.833333333333332, max: 34)

Run: 7, exploration: 0.46444185833082485, score: 47
Scores: (min: 12, avg: 25.428571428571427, max: 47)

Run: 8, exploration: 0.3976004408064698, score: 32
Scores: (min: 12, avg: 26.25, max: 47)

Run: 9, exploration: 0.3706551064126331, score: 15
Scores: (min: 12, avg: 25.0, max: 47)

Run: 10, exploration: 0.30028896908517405, score: 43
Scores: (min: 12, avg: 26.8, max: 47)

Run: 11, exploration: 0.23965031961336, score: 46
Scores: (min: 12, avg: 28.545454545454547, max: 47)

Run: 12, exploration: 0.17475600159032884, score: 64
Scores: (min: 12, avg: 31.5, max: 64)

Run: 13, explo

In [19]:
import matplotlib.pyplot as plt
# Plot the per-epoch training loss recorded by model.fit and save it as a PNG.
# NOTE(review): `history` is the one returned by train_model (the network
# fitted on random-play data), not by the DQNSolver training, although the
# title and file name say "DQN" — confirm the intended label.
plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
plt.title('Loss Curve-DQN')
plt.ylabel('Loss')
plt.xlabel('Epoch')
#plt.legend(['Loss'], loc='upper left')
plt.savefig('loss_curve_DQN.png')

In [20]:
# Evaluate `trained_model`: play 100 games of at most `goal_steps` steps on the
# shared `env`, choosing at each step the action with the largest model output.
scores = []          # total reward of each game
choices = []         # every action taken, across all games
reward_history = []  # every per-step reward, across all games
for each_game in range(100):
    score = 0
    # No observation is held at the start of a game (the value returned by
    # env.reset() is not captured), so the first action of each game is random.
    prev_obs = []
    for step_index in range(goal_steps):
        # Uncomment below line if you want to see how our bot is playing the game.
        #env.render()
        #print('Step:', step_index)
        if len(prev_obs)==0:
            action = random.randrange(0,2)
        else:
            # Reshape the observation into a single-row batch for predict().
            action = np.argmax(trained_model.predict(prev_obs.reshape(-1, len(prev_obs)))[0])
        
        choices.append(action)
        new_observation, reward, done, info = env.step(action)
        reward_history.append(reward)
        prev_obs = new_observation
        score+=reward
        if done:
            break
    #print('Game:', each_game)
    env.reset()
    scores.append(score)

print(scores)
print('Average Score:',sum(scores)/len(scores))
# Fraction of all steps on which each action was chosen.
print('choice 1:{}  choice 0:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices)))

[208.0, 500.0, 449.0, 218.0, 255.0, 189.0, 69.0, 226.0, 178.0, 111.0, 111.0, 363.0, 214.0, 455.0, 500.0, 500.0, 189.0, 476.0, 222.0, 500.0, 223.0, 500.0, 166.0, 500.0, 217.0, 84.0, 500.0, 270.0, 204.0, 194.0, 71.0, 82.0, 97.0, 500.0, 202.0, 182.0, 87.0, 372.0, 81.0, 346.0, 374.0, 233.0, 83.0, 183.0, 135.0, 110.0, 92.0, 77.0, 121.0, 500.0, 327.0, 251.0, 213.0, 116.0, 119.0, 500.0, 500.0, 500.0, 100.0, 107.0, 377.0, 107.0, 456.0, 308.0, 500.0, 330.0, 500.0, 193.0, 131.0, 363.0, 67.0, 166.0, 393.0, 204.0, 183.0, 490.0, 103.0, 177.0, 381.0, 122.0, 372.0, 477.0, 85.0, 102.0, 324.0, 317.0, 220.0, 500.0, 117.0, 75.0, 78.0, 226.0, 500.0, 431.0, 81.0, 145.0, 119.0, 170.0, 481.0, 500.0]
Average Score: 263.23
choice 1:0.49432055616761006  choice 0:0.5056794438323899


In [21]:
# Per-game scores from the evaluation loop.
# NOTE(review): an earlier cell selects the non-interactive 'Agg' backend via
# matplotlib.use('Agg'); plt.show() may not display anything under it — confirm.
plt.plot(scores)
plt.show()