In [10]:
# %load /Users/hotbaby/github/firstcell.py
import warnings
warnings.filterwarnings('ignore')

import os
import sys
import copy
import tqdm
import math
import time
import heapq
import datetime
import itertools
import functools
import collections
import multiprocessing

import sklearn
import scipy as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

import lightgbm as lgb
import xgboost as xgb
import catboost as cbt

pd.set_option('display.max_columns', None)

%matplotlib inline


# Learning PyTorch with Examples

At its core, PyTorch provides two main features:

* An n-dimensional Tensor, similar to numpy but can run on GPUs
* Automatic differentiation for building and training neural networks

## Tensors

Numpy provides an n-dimensional array object, and many functions for manipulating these arrays. Numpy is a generic framework for scientific computing; it does not know anything about computation graphs, or deep learning, or gradients. However, we can easily use numpy to fit a two-layer network to random data by manually implementing the forward and backward passes through the network using numpy operations:

In [8]:
import numpy as np

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random initial weights for the two layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

# Per-step history: step index and the loss observed at that step.
iterations = []
losses = []

for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = np.maximum(0, h)
    y_pred = h_relu @ w2

    # Loss is the sum of squared errors over every output element.
    residual = y_pred - y
    loss = np.sum(np.square(residual))
    print(t, loss)
    iterations.append(t)
    losses.append(loss)

    # Backward pass: apply the chain rule by hand, from the loss back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Gradient-descent step, written back into the existing weight arrays.
    np.subtract(w1, learning_rate * grad_w1, out=w1)
    np.subtract(w2, learning_rate * grad_w2, out=w2)

0 33447692.586147226
1 34401835.8170425
2 38428616.47655048
3 38227918.955900975
4 29672959.254181065
5 17291590.956716552
6 8175109.097472004
7 3760339.252569851
8 1988641.218177071
9 1273825.6597851687
10 940588.8007317379
11 750712.2540707194
12 622505.2865585957
13 526387.0934218788
14 450404.42960885365
15 388542.98185050755
16 337374.24275714747
17 294699.47753952583
18 258778.5898167928
19 228270.625767621
20 202212.76136464305
21 179823.79186596852
22 160481.1422227469
23 143703.9444104074
24 129133.17304258805
25 116395.56418243691
26 105200.44745841174
27 95318.59316503195
28 86566.68319539854
29 78796.58486337948
30 71870.08204446886
31 65683.47066779397
32 60145.64937301133
33 55169.47287126069
34 50687.88510976778
35 46641.57081860729
36 42982.759276475364
37 39666.685886719635
38 36656.148052121745
39 33917.24932860592
40 31420.6503024494
41 29143.027723317537
42 27060.445702065786
43 25152.970788030158
44 23403.772594415906
45 21797.13384930251
46 20319.841729301887
47 1

370 0.03276293295316717
371 0.03167564347954832
372 0.03062519918664968
373 0.029610250744839858
374 0.02862946290449472
375 0.0276817428956946
376 0.02676608705170468
377 0.025881243019153965
378 0.02502623594414074
379 0.024200072326679534
380 0.023401377442895586
381 0.022629562916423117
382 0.021883755812634166
383 0.021162932484849205
384 0.02046635503956351
385 0.019793025932585334
386 0.019142383048182436
387 0.018513279496632985
388 0.017905289484600267
389 0.017317792880355936
390 0.01674977008639568
391 0.016200659634127444
392 0.015669849142816224
393 0.015156799204983836
394 0.014660952449955932
395 0.014181460614555656
396 0.013717953938381554
397 0.013269808829601546
398 0.012836552717094956
399 0.012417828409884056
400 0.01201292207066571
401 0.011621427808808978
402 0.011242980400073006
403 0.010877014840857822
404 0.010523148507387364
405 0.010180951425897828
406 0.00985008979522941
407 0.009530155686259378
408 0.009220823084472952
409 0.008921723655109622
410 0.008632

In [None]:
# Plot a histogram of 100,000 standard-normal samples. `hist` has to be
# called: the bare attribute only evaluates to the bound method and draws
# nothing. The trailing semicolon hides the Axes repr in the cell output.
pd.Series(np.random.randn(100000)).hist();

### Pytorch: Tensors

Here we introduce the most fundamental PyTorch concept: the **Tensor**. A PyTorch Tensor is conceptually identical to a numpy array: a Tensor is an n-dimensional array, and PyTorch provides many functions for operating on these Tensors. Behind the scenes, Tensors can keep track of a computational graph and gradients, but they're also useful as a generic tool for scientific computing.

In [55]:
import torch


dtype = torch.float
device = torch.device('cpu')

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random initial weights for the two layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, pulled out as a Python float; report every 100th step.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to w1.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

99 456.66546630859375
199 1.7432337999343872
299 0.01134544238448143
399 0.000260656961472705
499 4.369526504888199e-05


## Autograd

### PyTorch: Tensors and autograd

We can use `automatic differentiation` to automate the computation of backward passes in neural networks. The `autograd` package in PyTorch provides exactly this functionality. When using autograd, the forward pass of your network will define a `computation graph`; nodes in the graph will be Tensors, and edges will be functions that produce output Tensors from input Tensors. Backpropagating through this graph then allows you to easily compute gradients.

In [78]:
import torch


dtype = torch.float
device = torch.device('cpu')

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets. They are created without requires_grad=True,
# so the backward pass does not compute gradients with respect to them.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random initial weights. requires_grad=True asks autograd to compute
# gradients with respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)


learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear. Autograd records these
    # operations, so nothing has to be kept around for a manual backward pass.
    hidden = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(hidden, w2)

    # Sum-of-squares loss, kept as a Tensor so it can be differentiated;
    # loss.item() extracts the value it holds for printing.
    loss = (y_pred - y).pow(2).sum()
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backward pass: fills w1.grad and w2.grad with the gradient of the
    # loss with respect to w1 and w2.
    loss.backward()

    # The update itself must not be recorded by autograd, hence no_grad().
    with torch.no_grad():
        for weight in (w1, w2):
            # Gradient-descent step, in place.
            weight.sub_(learning_rate * weight.grad)
            # Reset the gradient so the next backward() starts from zero.
            weight.grad.zero_()


99 257.0760192871094
199 0.8567467331886292
299 0.005119595676660538
399 0.0001518635544925928
499 3.0231831260607578e-05


### PyTorch: Defining new autograd functions

Under the hood, each primitive operator is really two functions that operate on Tensors. The **forward** function computes output Tensors from input Tensors. The **backward** function receives the gradient of the output Tensors with respect to some scalar value, and computes the gradient of the input Tensors with respect to that same scalar value.

In PyTorch we can easily define our own autograd operator by defining a subclass of `torch.autograd.Function` and implementing the `forward` and `backward` functions. We can then use our new autograd operator by calling its `apply` method like a function, passing Tensors containing input data.

In this example we define our own custom autograd function for performing the ReLU nonlinearity, and use it to implement our two-layer network:

In [1]:
import torch


class MyReLU(torch.autograd.Function):
    """Custom autograd Function implementing the ReLU nonlinearity."""

    @staticmethod
    def forward(ctx, input):
        """
        Compute the output Tensor from the input Tensor.

        ctx is a context object used to stash information for the backward
        computation; the input Tensor is stored on it through
        ctx.save_for_backward so that backward can read it back.
        Returns the input with every negative entry replaced by zero.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Compute the gradient of the loss with respect to the input.

        grad_output is the gradient of the loss with respect to the output
        of forward. The result is a new Tensor equal to grad_output, except
        that it is zero wherever the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)

In [5]:
dtype = torch.float
device = torch.device('cpu')

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, dtype=dtype, device=device)
y = torch.randn(N, D_out, dtype=dtype, device=device)

# Create random Tensors for weights; requires_grad=True so that
# loss.backward() populates w1.grad and w2.grad.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply our Function, we use the Function.apply method. The alias does
# not change between iterations, so it is bound once, outside the loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; the
    # ReLU step goes through our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute the sum-of-squares loss and print it every 100th step.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent; no_grad() keeps the update
    # itself out of the autograd graph.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights.
        w1.grad.zero_()
        w2.grad.zero_()

99 239.82608032226562
199 0.7443724870681763
299 0.004589669406414032
399 0.00015712433378212154
499 3.4888969821622595e-05


# nn module

## PyTorch: nn

Computational graphs and autograd are a very powerful paradigm for defining complex operations and automatically taking derivatives; however, for large neural networks raw autograd can be a bit too low-level.

When building neural networks we frequently think of arranging the computation into **layers**, some of which have **learnable parameters** which will be optimized during learning.

In PyTorch, the `nn` package defines a set of **Modules**, which are roughly equivalent to neural network layers. A Module receives input Tensors and computes output Tensors, but may also hold internal state such as Tensors containing learnable parameters. The `nn` package also defines a set of useful loss functions that are commonly used when training neural networks.

In this example, we use the `nn` package to implement our two-layer network:

In [7]:
import torch

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# The model is a stack of layers. nn.Sequential is a Module that contains
# other Modules and applies them in order; each Linear Module holds its own
# weight and bias Tensors.
layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)

# Loss function from the nn package: squared error, with reduction='sum'
# so the per-element errors are summed rather than averaged.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: calling the model on x produces the predictions.
    y_pred = model(x)

    # Compute the loss and report it every 100th step.
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Clear gradients left over from the previous step, then backpropagate.
    model.zero_grad()
    loss.backward()

    # Gradient-descent step on every parameter, kept out of the autograd graph.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

99 2.6743648052215576
199 0.050030674785375595
299 0.0022794355172663927
399 0.00015096722927410156
499 1.1786457434936892e-05
