In [10]:
from __future__ import absolute_import, division, print_function,unicode_literals
import gym
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDRegressor

class SGDRegressor:
    def __init__(self, D):
        self.w = np.random.random(D) / np.sqrt(D)
        self.lr = 0.1
        
    def partial_fit(self, x, y):
        self.w += self.lr * (y - x.dot(self.w)).dot(x)
        
    def predict(self, x):
        return x.dot(self.w)
    
    
class FeatureTransformer:
    def __init__(self, env):
        observation_examples = np.random.random((20000, 4))*2-1
        scaler = StandardScaler()
        scaler.fit(observation_examples)
        
        #Using RDF kernels with different variances to cover different parts of the space
        featurizer = FeatureUnion([
            ('rbf1', RBFSampler(gamma=0.05, n_components=1000)),
            ('rbf2', RBFSampler(gamma=1.0, n_components=1000)),
            ('rbf3', RBFSampler(gamma=0.5, n_components=1000)),
            ('rbf4', RBFSampler(gamma=0.1, n_components=1000))
        ])
        
        feature_examples = featurizer.fit_transform(scaler.transform(observation_examples))
        
        self.dimensions = feature_examples.shape[1]
        self.scaler = scaler
        self.featurizer = featurizer
        
    def transform(self, observations):
        scaled = self.scaler.transform(observations)
        return self.featurizer.transform(scaled)
    

In [23]:
class Model:
    def __init__(self, env, feature_transformer):
        self.env = env
        self.models = []
        self.feature_transformer = feature_transformer
        for i in range(env.action_space.n):
            model = SGDRegressor(feature_transformer.dimensions, )
            self.models.append(model)
            
    def predict(self, s):
        x = self.feature_transformer.transformer(np.atleast_2d(s))
        return np.array([m.predict(x) [0] for m in self.models])
    
    def update(self, s, a, G):
        x = self.feature_transformer.transform(np.atleast_2d(s))
        self.models[a].partial_fit(x, [G])
        
    def sample_action(self, s, eps):
        if np.random.random() < eps:
            return self.env.action_space.sample()
        else:
            return np.argmax(self.predict(s))
        
    def play_one(env, model, eps, gamma):
        observation = env.reset()
        #env.render()
        done = False
        totalreward = 0
        iters = 0
        while not done and iters < 2000:
            action = model.sample_action(observation, eps)
            prev_observation = observation
            observation, reward, done, info = env.step(action)
            
            if done:
                reward = -200
                
            #Update the model
            next = model.predict(observation)
            assert(len(next.shape) == 1)# print(next.shape)
            G = reward  + gamma * np.max(next)
            model.update(prev_observation, action, G)
            
            if reward == 1: # if the reward was changed
                totalreward += reward
                iters += 1
                
        return totalreward 

In [31]:
def plot_running_avg(totalrewards):
    N = len(totalrewards)
    running_avg = np.empty(N)
    for t in range(N):
        running_avg[t] = totalrewards[max(0, t-100):(t+1)].mean()
    plt.plot(running_avg)
    plt.title("Running Average")
plt.show()

In [32]:
def main():
    env = gym.make('CartPole-v0')
    ft = FeatureTransformer(env)
    model = Model(env, ft)
    gamma = 0.99

    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)


    N = 500
    totalrewards = np.empty(N)
    costs = np.empty(N)
    for n in range(N):
        eps = 1.0/np.sqrt(n+1)
        totalreward = play_one(env, model, eps, gamma)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print("episode:", n, "total reward:", totalreward, "eps:", eps, "avg reward (last 100):", totalrewards[max(0, n-100):(n+1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()


    plot_running_avg(totalrewards)
    
    
#if __name__ == '__main__':
       # main()

NameError: name 'play_one' is not defined