Skip to content
This repository has been archived by the owner on Oct 14, 2020. It is now read-only.


update gradient visualization
Browse files Browse the repository at this point in the history
  • Loading branch information
huseinzol05 committed Feb 3, 2018
1 parent 847f9e7 commit dab72d8
Show file tree
Hide file tree
Showing 7 changed files with 507 additions and 12 deletions.
1 change: 1 addition & 0 deletions
Expand Up @@ -17,6 +17,7 @@ Code Machine learning models from scratch. Trying to implement some optimizers a
13. [TSNE (original TSNE, Adaptive Momentum TSNE)](tsne)
14. [PCA](pca)
15. [Naive Bayes on TF*IDF Twitter dataset (gaussian, multinomial)](bayes-tfidf)
16. [Gradient Visualization for evolution based and derivative based (MSE, RMSE, MAE)](gradient-visualization)

*Some of the results are not good because of the softmax and cross-entropy functions I coded.*

Expand Down
130 changes: 130 additions & 0 deletions gradient-visualization/gradient-MAE.ipynb

Large diffs are not rendered by default.

13 changes: 7 additions & 6 deletions gradient-visualization/gradient-MSE.ipynb

Large diffs are not rendered by default.

131 changes: 131 additions & 0 deletions gradient-visualization/gradient-RMSE.ipynb

Large diffs are not rendered by default.

116 changes: 116 additions & 0 deletions gradient-visualization/
@@ -0,0 +1,116 @@
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def subplot_evolution_strategies(step, learning_rate, sigma, population_size,
                                 x_boundary=1, y_boundary=2,
                                 step_x=20, step_y=50, midpoint=0, ax=None):
    """Draw the MAE cost curve and the path an evolution-strategies search takes on it.

    The model is ``hypothesis(x, theta) = theta * x`` and the target is the
    noiseless line ``y = midpoint * x``. The search starts at
    ``theta = -y_boundary`` and performs ``step - 1`` updates; every visited
    point is drawn on the curve, joined by arrows.

    Args:
        step: Number of points on the path, including the starting point.
        learning_rate: Scale applied to every update.
        sigma: Scale of the Gaussian perturbations added to theta.
        population_size: Number of perturbed candidates scored per update.
        x_boundary: ``x`` is sampled on ``[-x_boundary, x_boundary]``.
        y_boundary: The curve is drawn for theta in
            ``[-y_boundary, y_boundary]``; ``-y_boundary`` is the start point.
        step_x: Number of ``x`` samples.
        step_y: Number of theta values used to draw the cost curve.
        midpoint: Slope of the target line.
        ax: Matplotlib axes to draw on; ``plt.gca()`` is used when omitted.

    Returns:
        The axes that were drawn on.
    """
    if ax is None:
        ax = plt.gca()
    x = np.linspace(-x_boundary, x_boundary, step_x)
    y = midpoint * x

    def hypothesis(x, theta):
        return theta * x

    def mean_abs_error(theta):
        # Rows of ``theta`` are independent candidates; one cost per row.
        theta = np.atleast_2d(np.asarray(theta))
        return np.mean(np.abs(y - hypothesis(x, theta)), axis=1)

    theta_grid = np.linspace(-y_boundary, y_boundary, step_y)
    J_grid = mean_abs_error(theta_grid[:, np.newaxis])
    ax.plot(theta_grid, J_grid)

    theta = [-y_boundary]
    J = [mean_abs_error(theta[0])[0]]
    # Text log of the visited thetas. NOTE(review): nothing visible in this
    # function consumes it; kept in case a caller-facing use was intended.
    strings = 'X-axis steps:\n\n'
    for _ in range(step - 1):
        last_theta = theta[-1]
        # One noise row per candidate; each row perturbs theta separately for
        # every x sample, so a candidate is a vector of length step_x.
        random_weight = np.random.randn(population_size, step_x)
        # Reward is the negated cost, evaluated for all candidates at once.
        population = -mean_abs_error(last_theta + sigma * random_weight)
        spread = np.std(population)
        if spread > 0:
            A = (population - np.mean(population)) / spread
        else:
            # Identical rewards carry no search direction; avoid 0/0 -> NaN.
            A = np.zeros(population_size)
        # NOTE(review): the usual ES estimator divides by
        # (population_size * sigma); the multiplication is kept as written so
        # existing learning_rate choices keep producing the same step sizes.
        current_theta = last_theta + learning_rate * np.mean(
            (population_size * sigma) * np.dot(random_weight.T, A))
        current_theta = float(current_theta)
        strings += str(current_theta) + '\n'
        # Record the step: the arrows and scatter below index these lists.
        theta.append(current_theta)
        J.append(mean_abs_error(current_theta)[0])

    colors = sns.color_palette("husl", len(theta))
    for k in range(1, len(theta)):
        ax.annotate('', xy=(theta[k], J[k]), xytext=(theta[k - 1], J[k - 1]),
                    arrowprops={'arrowstyle': '->', 'color': 'r', 'lw': 1},
                    va='center', ha='center')
    ax.scatter(theta, J, c=colors, s=40, lw=0)
    ax.set_title('MAE function on Evolution Strategies')
    return ax

def subplot_gradient_descent(step, learning_rate, technique,
                             x_boundary=1, y_boundary=2,
                             momentum=0.9, rho=0.9, epsilon=1e-8,
                             b1=0.9, b2=0.999,
                             step_x=20, step_y=50, midpoint=0, ax=None):
    """Draw the MAE cost curve and the path a gradient-based optimizer takes on it.

    The model is ``hypothesis(x, theta) = theta * x`` and the target is the
    noiseless line ``y = midpoint * x``. Optimisation starts at
    ``theta = -y_boundary`` and performs ``step - 1`` updates; every visited
    point is drawn on the curve, joined by arrows.

    Args:
        step: Number of points on the path, including the starting point.
        learning_rate: Step size of the optimizer.
        technique: One of ``'gradient descent'``, ``'momentum'``,
            ``'nesterov'``, ``'adagrad'``, ``'rmsprop'`` or ``'adam'``.
        x_boundary: ``x`` is sampled on ``[-x_boundary, x_boundary]``.
        y_boundary: The curve is drawn for theta in
            ``[-y_boundary, y_boundary]``; ``-y_boundary`` is the start point.
        momentum: Velocity decay for ``'momentum'`` and ``'nesterov'``.
        rho: Decay of the squared-gradient average for ``'rmsprop'``.
        epsilon: Small constant added under the square root for stability.
        b1: Decay of Adam's first-moment average.
        b2: Decay of Adam's second-moment average.
        step_x: Number of ``x`` samples.
        step_y: Number of theta values used to draw the cost curve.
        midpoint: Slope of the target line.
        ax: Matplotlib axes to draw on; ``plt.gca()`` is used when omitted.

    Returns:
        The axes that were drawn on.

    Raises:
        ValueError: If ``technique`` is not a supported name and at least one
            update is attempted (``step >= 2``).
    """
    if ax is None:
        ax = plt.gca()
    x = np.linspace(-x_boundary, x_boundary, step_x)
    y = midpoint * x

    def hypothesis(x, theta):
        return theta * x

    def mean_abs_error(theta):
        # Rows of ``theta`` are independent candidates; one cost per row.
        theta = np.atleast_2d(np.asarray(theta))
        return np.mean(np.abs(y - hypothesis(x, theta)), axis=1)

    def gradient_at(point):
        # Unnormalised MAE (sub)gradient: summed over samples, not averaged.
        return np.sum(np.sign(hypothesis(x, point) - y) * x)

    theta_grid = np.linspace(-y_boundary, y_boundary, step_y)
    J_grid = mean_abs_error(theta_grid[:, np.newaxis])
    ax.plot(theta_grid, J_grid)

    theta = [-y_boundary]
    J = [mean_abs_error(theta[0])[0]]
    # Text log of the visited thetas. NOTE(review): nothing visible in this
    # function consumes it; kept in case a caller-facing use was intended.
    strings = 'X-axis steps:\n\n'
    # Scalars keep every stored theta a plain float, so the plotting calls
    # never receive a mix of ints and shape-(1,) arrays.
    velocity = 0.0
    second_velocity = 0.0

    for j in range(step - 1):
        last_theta = theta[-1]
        if technique == 'gradient descent':
            current_theta = last_theta - learning_rate * gradient_at(last_theta)
        elif technique == 'momentum':
            velocity = velocity * momentum + learning_rate * gradient_at(last_theta)
            current_theta = last_theta - velocity
        elif technique == 'nesterov':
            # Gradient is taken at the look-ahead position.
            gradient = gradient_at(last_theta - momentum * velocity)
            velocity = velocity * momentum + learning_rate * gradient
            current_theta = last_theta - velocity
        elif technique == 'adagrad':
            gradient = gradient_at(last_theta)
            velocity += np.square(gradient)
            current_theta = last_theta - learning_rate * gradient / np.sqrt(velocity + epsilon)
        elif technique == 'rmsprop':
            gradient = gradient_at(last_theta)
            # Plain assignment: ``+=`` would add the old average on top of the
            # decayed one and make the accumulator grow without bound.
            velocity = rho * velocity + (1 - rho) * np.square(gradient)
            current_theta = last_theta - learning_rate * gradient / np.sqrt(velocity + epsilon)
        elif technique == 'adam':
            gradient = gradient_at(last_theta)
            timestep = j + 1
            velocity = b1 * velocity + (1 - b1) * gradient
            second_velocity = b2 * second_velocity + (1 - b2) * np.square(gradient)
            # Bias correction depends on how many updates have been made.
            velocity_hat = velocity / (1 - b1 ** timestep)
            second_velocity_hat = second_velocity / (1 - b2 ** timestep)
            current_theta = last_theta - learning_rate * velocity_hat / np.sqrt(
                second_velocity_hat + epsilon)
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError('Invalid optimizer')
        current_theta = float(current_theta)
        strings += str(current_theta) + '\n'
        # Record the step: the arrows and scatter below index these lists.
        theta.append(current_theta)
        J.append(mean_abs_error(current_theta)[0])

    colors = sns.color_palette("husl", len(theta))
    for k in range(1, len(theta)):
        ax.annotate('', xy=(theta[k], J[k]), xytext=(theta[k - 1], J[k - 1]),
                    arrowprops={'arrowstyle': '->', 'color': 'r', 'lw': 1},
                    va='center', ha='center')
    ax.scatter(theta, J, c=colors, s=40, lw=0)
    ax.set_title('MAE function on %s Optimizer'%(technique))
    return ax
12 changes: 6 additions & 6 deletions gradient-visualization/
Expand Up @@ -76,26 +76,26 @@ def hypothesis(x, theta):
for j in range(step-1):
last_theta = theta[-1]
if technique == 'gradient descent':
gradient = np.sum((hypothesis(x, last_theta) - y) * x)
gradient = np.sum(2*(hypothesis(x, last_theta) - y) * x)
current_theta = last_theta - learning_rate * gradient
elif technique == 'momentum':
gradient = np.sum((hypothesis(x, last_theta) - y) * x)
gradient = np.sum(2*(hypothesis(x, last_theta) - y) * x)
velocity = velocity * momentum + learning_rate * gradient
current_theta = last_theta - velocity
elif technique == 'nesterov':
gradient = np.sum((hypothesis(x, last_theta - momentum * velocity) - y) * x)
gradient = np.sum(2*(hypothesis(x, last_theta - momentum * velocity) - y) * x)
velocity = velocity * momentum + learning_rate * gradient
current_theta = last_theta - velocity
elif technique == 'adagrad':
gradient = np.sum((hypothesis(x, last_theta) - y) * x)
gradient = np.sum(2*(hypothesis(x, last_theta) - y) * x)
velocity += np.square(gradient)
current_theta = last_theta - learning_rate * gradient / np.sqrt(velocity + epsilon)
elif technique == 'rmsprop':
gradient = np.sum((hypothesis(x, last_theta) - y) * x)
gradient = np.sum(2*(hypothesis(x, last_theta) - y) * x)
velocity += rho * velocity + (1 - rho) * np.square(gradient)
current_theta = last_theta - learning_rate * gradient / np.sqrt(velocity + epsilon)
elif technique == 'adam':
gradient = np.sum((hypothesis(x, last_theta) - y) * x)
gradient = np.sum(2*(hypothesis(x, last_theta) - y) * x)
velocity += b1 * velocity + (1-b1) * gradient
second_velocity += b2 * second_velocity + (1-b2) * np.square(gradient)
velocity_hat = velocity / (1-b1)
Expand Down
116 changes: 116 additions & 0 deletions gradient-visualization/
@@ -0,0 +1,116 @@
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def subplot_evolution_strategies(step, learning_rate, sigma, population_size,
                                 x_boundary=1, y_boundary=2,
                                 step_x=20, step_y=50, midpoint=0, ax=None):
    """Draw the RMSE cost curve and the path an evolution-strategies search takes on it.

    The model is ``hypothesis(x, theta) = theta * x`` and the target is the
    noiseless line ``y = midpoint * x``. The search starts at
    ``theta = -y_boundary`` and performs ``step - 1`` updates; every visited
    point is drawn on the curve, joined by arrows.

    Args:
        step: Number of points on the path, including the starting point.
        learning_rate: Scale applied to every update.
        sigma: Scale of the Gaussian perturbations added to theta.
        population_size: Number of perturbed candidates scored per update.
        x_boundary: ``x`` is sampled on ``[-x_boundary, x_boundary]``.
        y_boundary: The curve is drawn for theta in
            ``[-y_boundary, y_boundary]``; ``-y_boundary`` is the start point.
        step_x: Number of ``x`` samples.
        step_y: Number of theta values used to draw the cost curve.
        midpoint: Slope of the target line.
        ax: Matplotlib axes to draw on; ``plt.gca()`` is used when omitted.

    Returns:
        The axes that were drawn on.
    """
    if ax is None:
        ax = plt.gca()
    x = np.linspace(-x_boundary, x_boundary, step_x)
    y = midpoint * x

    def hypothesis(x, theta):
        return theta * x

    def root_mean_square_error(theta):
        # Rows of ``theta`` are independent candidates; one cost per row.
        theta = np.atleast_2d(np.asarray(theta))
        return np.sqrt(np.mean((y - hypothesis(x, theta)) ** 2, axis=1))

    theta_grid = np.linspace(-y_boundary, y_boundary, step_y)
    J_grid = root_mean_square_error(theta_grid[:, np.newaxis])
    ax.plot(theta_grid, J_grid)

    theta = [-y_boundary]
    J = [root_mean_square_error(theta[0])[0]]
    # Text log of the visited thetas. NOTE(review): nothing visible in this
    # function consumes it; kept in case a caller-facing use was intended.
    strings = 'X-axis steps:\n\n'
    for _ in range(step - 1):
        last_theta = theta[-1]
        # One noise row per candidate; each row perturbs theta separately for
        # every x sample, so a candidate is a vector of length step_x.
        random_weight = np.random.randn(population_size, step_x)
        # Reward is the negated cost, evaluated for all candidates at once.
        population = -root_mean_square_error(last_theta + sigma * random_weight)
        spread = np.std(population)
        if spread > 0:
            A = (population - np.mean(population)) / spread
        else:
            # Identical rewards carry no search direction; avoid 0/0 -> NaN.
            A = np.zeros(population_size)
        # NOTE(review): the usual ES estimator divides by
        # (population_size * sigma); the multiplication is kept as written so
        # existing learning_rate choices keep producing the same step sizes.
        current_theta = last_theta + learning_rate * np.mean(
            (population_size * sigma) * np.dot(random_weight.T, A))
        current_theta = float(current_theta)
        strings += str(current_theta) + '\n'
        # Record the step: the arrows and scatter below index these lists.
        theta.append(current_theta)
        J.append(root_mean_square_error(current_theta)[0])

    colors = sns.color_palette("husl", len(theta))
    for k in range(1, len(theta)):
        ax.annotate('', xy=(theta[k], J[k]), xytext=(theta[k - 1], J[k - 1]),
                    arrowprops={'arrowstyle': '->', 'color': 'r', 'lw': 1},
                    va='center', ha='center')
    ax.scatter(theta, J, c=colors, s=40, lw=0)
    ax.set_title('RMSE function on Evolution Strategies')
    return ax

def subplot_gradient_descent(step, learning_rate, technique,
                             x_boundary=1, y_boundary=2,
                             momentum=0.9, rho=0.9, epsilon=1e-8,
                             b1=0.9, b2=0.999,
                             step_x=20, step_y=50, midpoint=0, ax=None):
    """Draw the RMSE cost curve and the path a gradient-based optimizer takes on it.

    The model is ``hypothesis(x, theta) = theta * x`` and the target is the
    noiseless line ``y = midpoint * x``. Optimisation starts at
    ``theta = -y_boundary`` and performs ``step - 1`` updates; every visited
    point is drawn on the curve, joined by arrows.

    Args:
        step: Number of points on the path, including the starting point.
        learning_rate: Step size of the optimizer.
        technique: One of ``'gradient descent'``, ``'momentum'``,
            ``'nesterov'``, ``'adagrad'``, ``'rmsprop'`` or ``'adam'``.
        x_boundary: ``x`` is sampled on ``[-x_boundary, x_boundary]``.
        y_boundary: The curve is drawn for theta in
            ``[-y_boundary, y_boundary]``; ``-y_boundary`` is the start point.
        momentum: Velocity decay for ``'momentum'`` and ``'nesterov'``.
        rho: Decay of the squared-gradient average for ``'rmsprop'``.
        epsilon: Small constant added under the square root for stability.
        b1: Decay of Adam's first-moment average.
        b2: Decay of Adam's second-moment average.
        step_x: Number of ``x`` samples.
        step_y: Number of theta values used to draw the cost curve.
        midpoint: Slope of the target line.
        ax: Matplotlib axes to draw on; ``plt.gca()`` is used when omitted.

    Returns:
        The axes that were drawn on.

    Raises:
        ValueError: If ``technique`` is not a supported name and at least one
            update is attempted (``step >= 2``).
    """
    if ax is None:
        ax = plt.gca()
    x = np.linspace(-x_boundary, x_boundary, step_x)
    y = midpoint * x

    def hypothesis(x, theta):
        return theta * x

    def root_mean_square_error(theta):
        # Rows of ``theta`` are independent candidates; one cost per row.
        theta = np.atleast_2d(np.asarray(theta))
        return np.sqrt(np.mean((y - hypothesis(x, theta)) ** 2, axis=1))

    def gradient_at(point):
        # Same sign-based, unnormalised gradient for every optimizer (the
        # rmsprop branch previously used a different expression). Because y
        # is an exact multiple of x, the residual is (point - midpoint) * x,
        # so this has the same sign as the analytical RMSE derivative and
        # differs from it only by a constant positive factor.
        return np.sum(np.sign(hypothesis(x, point) - y) * x)

    theta_grid = np.linspace(-y_boundary, y_boundary, step_y)
    J_grid = root_mean_square_error(theta_grid[:, np.newaxis])
    ax.plot(theta_grid, J_grid)

    theta = [-y_boundary]
    J = [root_mean_square_error(theta[0])[0]]
    # Text log of the visited thetas. NOTE(review): nothing visible in this
    # function consumes it; kept in case a caller-facing use was intended.
    strings = 'X-axis steps:\n\n'
    # Scalars keep every stored theta a plain float, so the plotting calls
    # never receive a mix of ints and shape-(1,) arrays.
    velocity = 0.0
    second_velocity = 0.0

    for j in range(step - 1):
        last_theta = theta[-1]
        if technique == 'gradient descent':
            current_theta = last_theta - learning_rate * gradient_at(last_theta)
        elif technique == 'momentum':
            velocity = velocity * momentum + learning_rate * gradient_at(last_theta)
            current_theta = last_theta - velocity
        elif technique == 'nesterov':
            # Gradient is taken at the look-ahead position.
            gradient = gradient_at(last_theta - momentum * velocity)
            velocity = velocity * momentum + learning_rate * gradient
            current_theta = last_theta - velocity
        elif technique == 'adagrad':
            gradient = gradient_at(last_theta)
            velocity += np.square(gradient)
            current_theta = last_theta - learning_rate * gradient / np.sqrt(velocity + epsilon)
        elif technique == 'rmsprop':
            gradient = gradient_at(last_theta)
            # Plain assignment: ``+=`` would add the old average on top of the
            # decayed one and make the accumulator grow without bound.
            velocity = rho * velocity + (1 - rho) * np.square(gradient)
            current_theta = last_theta - learning_rate * gradient / np.sqrt(velocity + epsilon)
        elif technique == 'adam':
            gradient = gradient_at(last_theta)
            timestep = j + 1
            velocity = b1 * velocity + (1 - b1) * gradient
            second_velocity = b2 * second_velocity + (1 - b2) * np.square(gradient)
            # Bias correction depends on how many updates have been made.
            velocity_hat = velocity / (1 - b1 ** timestep)
            second_velocity_hat = second_velocity / (1 - b2 ** timestep)
            current_theta = last_theta - learning_rate * velocity_hat / np.sqrt(
                second_velocity_hat + epsilon)
        else:
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError('Invalid optimizer')
        current_theta = float(current_theta)
        strings += str(current_theta) + '\n'
        # Record the step: the arrows and scatter below index these lists.
        theta.append(current_theta)
        J.append(root_mean_square_error(current_theta)[0])

    colors = sns.color_palette("husl", len(theta))
    for k in range(1, len(theta)):
        ax.annotate('', xy=(theta[k], J[k]), xytext=(theta[k - 1], J[k - 1]),
                    arrowprops={'arrowstyle': '->', 'color': 'r', 'lw': 1},
                    va='center', ha='center')
    ax.scatter(theta, J, c=colors, s=40, lw=0)
    ax.set_title('RMSE function on %s Optimizer'%(technique))
    return ax

0 comments on commit dab72d8

Please sign in to comment.