PyTorch walkthrough generated by modifying & combining several tutorials:
  - https://morvanzhou.github.io/tutorials/
  - [Gradient descent and linear regression (stack overflow)](https://stackoverflow.com/questions/17784587/gradient-descent-using-python-and-numpy?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa)

This notebook will walk through:
  * NumPy arrays and PyTorch tensors
  * A small regression problem
  * Building a neural network using torch.nn
  * Regression and classification using NNs

Dependencies (tested on):
* torch: 0.3.0
* matplotlib

In [1]:
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

# Part 1: Numpy and PyTorch variables, tensors, etc.

# Part 2: Linear Regression without PyTorch

In [None]:
import numpy as np
import random

# m denotes the number of examples
def gradientDescent(x, y, theta, alpha, m, numIterations):
    """Fit linear-model weights by batch gradient descent on a squared-error cost.

    Args:
        x: 2-D NumPy design matrix; column 1 is used as the x-axis when plotting.
        y: targets, one per row of x.
        theta: initial weight vector; it is not modified in place.
        alpha: learning rate (step size).
        m: number of examples, used to average the cost and the gradient.
        numIterations: number of update steps to run.

    Returns:
        The weight vector after the last update.

    Side effects:
        Prints the cost every 10000 iterations and redraws a scatter plot of
        the data every 100000 iterations.
    """
    x_t = x.T

    plt.ion()

    for step in range(numIterations):
        # Forward pass: predictions of the current linear model
        predictions = x.dot(theta)

        # Signed residual per example (prediction minus target)
        residuals = predictions - y

        # Half the mean squared residual; the 1/2 matches the gradient below
        cost = (residuals ** 2).sum() / (2 * m)

        # Gradient of that cost with respect to theta, averaged over examples
        gradient = x_t.dot(residuals) / m

        # Step against the gradient. A new array is bound on purpose so the
        # caller's theta is left untouched.
        theta = theta - alpha * gradient

        if step % 10000 == 0:
            print("Iteration %d | Cost: %f" % (step, cost))

        if step % 100000 == 0:
            # Redraw the raw data points
            plt.cla()
            plt.scatter(x[:, 1], y)
            plt.show()
            plt.pause(0.2)

    plt.ioff()
    return theta

In [None]:
# Data generation function with some noise
def genData(numPoints, slope, bias, variance):
    """Generate noisy points along the line ``y = slope * i + bias``.

    Args:
        numPoints: number of samples; sample ``i`` has x-value ``i``.
        slope: slope of the underlying line.
        bias: intercept of the underlying line.
        variance: scale of the noise. Each target gets
            ``random.uniform(0, 1) * variance`` added, so despite the name
            this is a multiplier on uniform noise, not a statistical variance.

    Returns:
        Tuple ``(x, y)``: ``x`` has shape (numPoints, 2) with a constant 1 in
        column 0 (the bias feature) and the sample index in column 1; ``y``
        has shape (numPoints,).
    """
    # Column 0 is the bias feature, column 1 the sample index 0..numPoints-1
    x = np.column_stack((np.ones(numPoints), np.arange(numPoints)))
    y = np.zeros(shape=numPoints)

    # One noise draw per sample, in index order
    for idx in range(numPoints):
        noise = random.uniform(0, 1) * variance
        y[idx] = (idx * slope + bias) + noise

    return x, y



In [None]:

# Run data generation and optimization

# genData draws its noise from the stdlib `random` module, so that generator
# has to be seeded too; np.random.seed below only fixes the initial theta.
random.seed(0)

# Generate 100 points on the line y = 0.5*x + 50, plus uniform noise scaled by 5
x, y = genData(100, 0.5, 50, 5)
m, n = np.shape(x)  # m = number of examples, n = number of columns in x
# print('Dimensionality of data: X: %d,%d  Y: %d,1' % (m, n, y.shape[0]))

numIterations = 100000
alpha = 0.0001
# Test: Change alpha (learning rate) to 0.001 and see what happens!

# Reproducible random starting point for the two weights (bias, slope)
np.random.seed(0)
theta = np.random.rand(2)
theta = gradientDescent(x, y, theta, alpha, m, numIterations)
print(theta)



# Part 3: Variables, small functions, and linear regression in PyTorch 

In [None]:
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt

In [None]:
# Generate some data
# 200 evenly spaced sample points on [-5, 5]; a 1-D tensor of shape (200,)
x_act = Variable(torch.linspace(-5, 5, 200))

# The same points as a NumPy array, used as the x-axis when plotting
x_act_np = x_act.data.numpy()

In [None]:
# Create activation functions to visualize them
# Evaluate each activation function on the sample points and keep the
# results as NumPy arrays for plotting (same order as the names on the left).
y_act_relu, y_act_sigmoid, y_act_tanh, y_act_softplus = (
    activation(x_act).data.numpy()
    for activation in (F.relu, F.sigmoid, F.tanh, F.softplus)
)


In [None]:
# Show them
# One panel per activation: (subplot position, curve, legend label, y-limits)
activation_panels = (
    (221, y_act_relu, 'relu', (-1, 5)),
    (222, y_act_sigmoid, 'sigmoid', (-0.2, 1.2)),
    (223, y_act_tanh, 'tanh', (-1.2, 1.2)),
    (224, y_act_softplus, 'softplus', (-0.2, 6)),
)

plt.figure(1, figsize=(8, 6))
for position, curve, label, y_limits in activation_panels:
    plt.subplot(position)
    plt.plot(x_act_np, curve, c='red', label=label)
    plt.ylim(y_limits)
    plt.legend(loc='best')

plt.show()

In [None]:

# Perform linear regression via PyTorch

from torch import optim

# Convert our data from numpy to PyTorch
# .float() is required because by default double is returned
# unsqueeze is required to add an empty dimension so x_torch is [100,1] not just [100]
x_torch = torch.from_numpy(x[:,1]).float().unsqueeze(1)
y_torch = torch.from_numpy(y).float().unsqueeze(1)

# Create model: a single linear layer, y = weight * x + bias
model = torch.nn.Sequential()
model_linear = torch.nn.Linear(1, 1, bias=True)
model.add_module("linear", model_linear)

# Mean squared error loss (averaging over the batch is the default)
loss = torch.nn.MSELoss()

# Create optimizer
optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.0)

# Determine batch size (how many samples will be used per iteration)
batch_size = 100

num_iterations = 100000

# Only whole batches are used; any remainder of the data is skipped
num_batches = len(x_torch) // batch_size

# Report ten times over the run; an integer interval avoids float modulo
report_every = max(1, num_iterations // 10)

# Run training loop
for i in range(num_iterations):
    cost = 0.
    for k in range(num_batches):

        # Create batch (already shaped [batch, 1], so no reshape is needed)
        start, end = k * batch_size, (k + 1) * batch_size
        x_var = Variable(x_torch[start:end], requires_grad=False)
        y_var = Variable(y_torch[start:end], requires_grad=False)

        # Reset gradient
        optimizer.zero_grad()

        # Forward: call the modules rather than .forward() so that any
        # registered hooks run as well
        fx = model(x_var)
        output = loss(fx, y_var)

        # Backward
        output.backward()

        # Update parameters
        optimizer.step()

        # view(-1)[0] reads the scalar loss whether .data is a one-element
        # tensor or a 0-dim tensor
        cost += float(output.data.view(-1)[0])

    if i % report_every == 0:
        print("Epoch = %d, cost = %s" % (i + 1, cost / num_batches))

print('\nLearned parameters:')
# The model holds two parameters: the linear layer's weight and its bias
w = model_linear.weight.data
weight_value = float(w.view(-1)[0])
bias_value = float(model_linear.bias.data.view(-1)[0])

# The weight should approach the slope used to generate the data (0.5)
print("Weight = %.2f" % weight_value)

# Printed with decimals: '%d' would truncate the learned bias to an integer
print('Bias = %.2f' % bias_value)


# Part 4: Building a Neural Network for Regression

In [None]:
# Neural-network code draws random numbers in several places (for example the
# noise added to the data below); seeding PyTorch's generator makes repeated
# runs produce the same numbers.
torch.manual_seed(seed=1)

In [None]:
# Inputs: 100 evenly spaced values in [-1, 1] as a column, shape (100, 1)
x_torch = torch.linspace(-1, 1, 100).unsqueeze(1)

# Targets: x^2 plus uniform noise in [0, 0.2), same shape as the inputs
y_torch = x_torch ** 2 + torch.rand(x_torch.size()) * 0.2

# Alternative: reuse the NumPy line data from Part 2 instead
#x_torch = torch.from_numpy(x[:,1]).float().unsqueeze(1)
#y_torch = torch.from_numpy(y).float().unsqueeze(1)

for tensor in (x_torch, y_torch):
    print(tensor.size())

# Wrap both tensors in Variable before feeding them to the network
x_torch = Variable(x_torch)
y_torch = Variable(y_torch)

# Quick look at the data
plt.scatter(x_torch.data.numpy(), y_torch.data.numpy())
plt.show()

In [None]:
class Net(torch.nn.Module):
    """Two-layer fully connected network used for regression.

    A linear hidden layer followed by ReLU, then a linear output layer with
    no activation.
    """

    def __init__(self, n_feature, n_hidden, n_output):
        """Create the layers.

        Args:
            n_feature: size of each input sample.
            n_hidden: number of units in the hidden layer.
            n_output: size of each output sample.
        """
        super(Net, self).__init__()
        # Layers are created in this order: hidden first, then output
        self.hidden = torch.nn.Linear(n_feature, n_hidden)
        self.predict = torch.nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Return the network output for input batch ``x``."""
        hidden_activation = F.relu(self.hidden(x))
        # No activation on the output: the raw linear value is returned
        return self.predict(hidden_activation)

In [None]:
# Instantiate the network: 1 input feature -> 10 hidden units -> 1 output
net = Net(
    n_feature=1,
    n_hidden=10,
    n_output=1,
)
print(net)  # show the layer structure

In [None]:
# Mean squared error: the usual loss for regression
loss_func = torch.nn.MSELoss()

# Plain stochastic gradient descent over every parameter of the network
optimizer = torch.optim.SGD(
    net.parameters(),
    lr=0.01,
)

In [None]:
# Turn on matplotlib's interactive mode so the figure can be cleared and
# redrawn from inside the training loop.
plt.ion()

In [None]:
num_iterations = 1000

# Report and plot ten times over the run; an integer interval avoids the
# float modulo of num_iterations/10
report_every = max(1, num_iterations // 10)

for t in range(num_iterations):
    prediction = net(x_torch)     # input x and predict based on x

    loss = loss_func(prediction, y_torch)     # must be (1. nn output, 2. target)

    optimizer.zero_grad()   # clear gradients left over from the previous step
    loss.backward()         # backpropagation, compute gradients
    optimizer.step()        # apply gradients

    if t % report_every == 0:
        # Read the scalar loss once and use it for both the log line and the
        # plot label. view(-1)[0] works whether .data is a one-element tensor
        # or a 0-dim tensor, unlike loss.data[0].
        loss_value = float(loss.data.view(-1)[0])
        print("Epoch = %d, loss = %s" % (t + 1, loss_value))

        # plot and show learning process
        plt.cla()
        plt.scatter(x_torch.data.numpy(), y_torch.data.numpy())
        plt.plot(x_torch.data.numpy(), prediction.data.numpy(), 'r-', lw=5)
        plt.text(0.5, 0, 'Loss=%.4f' % loss_value, fontdict={'size': 20, 'color':  'red'})
        plt.show()
        plt.pause(0.2)

plt.ioff()


# Part 5: Building a Neural Network for Classification

In [None]:
# Build a toy two-class dataset: two clouds of 2-D points
n_data = torch.ones(100, 2)

# Class 0: 100 points drawn around (2, 2) with std 1, shape (100, 2);
# its labels are a length-100 vector of zeros
x0 = torch.normal(2*n_data, 1)
y0 = torch.zeros(100)

# Class 1: 100 points drawn around (-2, -2) with std 1, shape (100, 2);
# its labels are a length-100 vector of ones
x1 = torch.normal(-2*n_data, 1)
y1 = torch.ones(100)

# Stack both classes: features as 32-bit floats with shape (200, 2),
# labels as 64-bit integers with shape (200,)
x = torch.cat((x0, x1), 0).float()
y = torch.cat((y0, y1), 0).long()

# Wrap both tensors in Variable before feeding them to the network
x = Variable(x)
y = Variable(y)

# Scatter the points, coloured by class label
plt.scatter(
    x.data.numpy()[:, 0],
    x.data.numpy()[:, 1],
    c=y.data.numpy(),
    s=100,
    lw=0,
    cmap='spring',
)
plt.show()

In [None]:
class Net(torch.nn.Module):
    """Two-layer fully connected network used for classification.

    A linear hidden layer followed by ReLU, then a linear output layer that
    returns one raw score per class (no softmax is applied here).
    """

    def __init__(self, n_feature, n_hidden, n_output):
        """Create the layers.

        Args:
            n_feature: size of each input sample.
            n_hidden: number of units in the hidden layer.
            n_output: number of output scores per sample.
        """
        super(Net, self).__init__()
        # Layers are created in this order: hidden first, then output
        self.hidden = torch.nn.Linear(n_feature, n_hidden)
        self.out = torch.nn.Linear(n_hidden, n_output)

    def forward(self, x):
        """Return the raw output scores for input batch ``x``."""
        hidden_activation = F.relu(self.hidden(x))
        return self.out(hidden_activation)

In [None]:
# Classification network: 2 input features -> 10 hidden units -> 2 class scores
net = Net(
    n_feature=2,
    n_hidden=10,
    n_output=2,
)
print(net)  # show the layer structure

# CrossEntropyLoss applies softmax internally, so the network outputs raw
# scores and the targets are class indices rather than one-hot vectors.
loss_func = torch.nn.CrossEntropyLoss()

# Plain SGD; net.parameters() selects the parameters to be updated.
optimizer = torch.optim.SGD(
    net.parameters(),
    lr=0.02,
)

In [None]:
# Turn on matplotlib's interactive mode so the figure can be cleared and
# redrawn from inside the training loop.
plt.ion()

In [None]:
for t in range(100):
    out = net(x)                 # raw class scores for every sample
    loss = loss_func(out, y)     # must be (1. nn output, 2. target), the target label is NOT one-hotted

    optimizer.zero_grad()   # clear gradients left over from the previous step
    loss.backward()         # backpropagation, compute gradients
    optimizer.step()        # apply gradients

    if t % 10 == 0 or t in [3, 6]:
        # plot and show learning process
        plt.cla()

        # The predicted class is the index of the largest score. Softmax is
        # monotonic, so taking the argmax of the raw scores gives the same
        # result and avoids calling F.softmax without an explicit dim.
        _, prediction = torch.max(out, 1)
        pred_y = prediction.data.numpy().squeeze()
        target_y = y.data.numpy()
        plt.scatter(x.data.numpy()[:, 0], x.data.numpy()[:, 1], c=pred_y, s=100, lw=0, cmap='spring')

        # Fraction of correct predictions, computed from the data itself
        # instead of a hard-coded sample count
        accuracy = (pred_y == target_y).mean()
        plt.text(1.5, -4, 'Accuracy=%.2f' % accuracy, fontdict={'size': 20, 'color':  'red'})
        plt.show()
        plt.pause(0.1)

plt.ioff()

In [None]:
# 
#out = net(x_all)
#_, prediction = torch.max(F.softmax(out), 1)
#pred_y = prediction.data.numpy().squeeze()