In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np
import sys

In [2]:
class FCN(nn.Module):
    def __init__(self,num_layers,num_nodes):
        super(FCN,self).__init__()
        if num_layers!=len(num_nodes)-1:
            sys.exit("Miss Match on number of layers")
        self.num_lay=num_layers
        self.num_nodes=num_nodes
        self.fc=nn.ModuleList()
        for i in range(num_layers):
            self.fc.append(nn.Linear(num_nodes[i],num_nodes[i+1]))
    
    def forward(self,x):
        for i in range(self.num_lay-1):
            x=F.relu(self.fc[i](x))
        x=self.fc[self.num_lay-1](x)
        return F.log_softmax(x)
    
    def print_param(self):
        net_W=[]
        net_b=[]
        for i in range(self.num_lay):
            net_W.append(self.fc[i].weight.data)
            net_b.append(self.fc[i].bias.data)
            #print(self.fc[i].weight.data)
            #print(self.fc[i].bias.data)
            
        return [net_W,net_b]
        
    

fcn_i1=FCN(3,[784,200,200,10])
print(fcn_i1)

FCN(
  (fc): ModuleList(
    (0): Linear(in_features=784, out_features=200, bias=True)
    (1): Linear(in_features=200, out_features=200, bias=True)
    (2): Linear(in_features=200, out_features=10, bias=True)
  )
)


In [3]:
learning_rate=0.01
epochs=10
batch_size=200
log_interval=10
optimizer = optim.SGD(fcn_i1.parameters(), lr=learning_rate, momentum=0.9)

criterion = nn.NLLLoss()
train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])),
        batch_size=batch_size, shuffle=True)

for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = Variable(data), Variable(target)
            # resize data from (batch_size, 1, 28, 28) to (batch_size, 28*28)
            data = data.view(-1, 28*28)
            optimizer.zero_grad()
            net_out = fcn_i1(data)
            loss = criterion(net_out, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                           100. * batch_idx / len(train_loader), loss.data[0]))


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Processing...
Done!


  app.launch_new_instance()






In [4]:
test_loss = 0
correct = 0
for data, target in test_loader:
    data, target = Variable(data, volatile=True), Variable(target)
    data = data.view(-1, 28 * 28)
    net_out = fcn_i1(data)
    #print(net_out)
    # sum up batch loss
    test_loss += criterion(net_out, target).data[0]
    pred = net_out.data.max(1)[1]  # get the index of the max log-probability
    correct += pred.eq(target.data).sum()

test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

  app.launch_new_instance()



Test set: Average loss: 0.0003, Accuracy: 9789/10000 (98%)



In [5]:
import tensor_comprehensions as tc

In [6]:
[n_W,n_b]=fcn_i1.print_param()

In [7]:
#lang = """
#def fcrelu(float(B,M) I, float(N,M) W1, float(N) B1) -> (O1) {
#    O1(b, n) +=! I(b, m) * W1(n, m)
#    O1(b, n) = O1(b, n) + B1(n)
#    O1(b, n) = fmax(O1(b, n), 0)
#}
#"""

#lang1 = """
#def fc(float(B,M) I, float(N,M) W1,float(N) B1) -> (O1){
#     O1(b, n)+=! I(b,m)* W1(n, m)
#     O1(b, n) = O1(b, n) + B1(n)
#}
#"""

#lang2="""
#def softmax(float(N, D) I) -> (O, maxVal, expDistance, expSum) {
#    maxVal(n) max=! I(n, d)
#    expDistance(n, d) = exp(I(n, d) - maxVal(n))
#    expSum(n) +=! expDistance(n, d)
#    O(n, d) = expDistance(n, d) / expSum(n)
#}
#"""

#fcrelu = tc.define(lang, name="fcrelu")
#fc=tc.define(lang1,name="fc")
#softmax=tc.define(lang2,name="softmax")

lang = """
def fcrelunet(float(B,M) I, float(N,M) W1, float(N) B1,float(P,N) W2, float(P) B2,float(Q,P) W3, float(Q) B3) -> (O1,O2,O3,O4,O5,maxVal, expDistance, expSum) {
    O1(b, n) +=! I(b, m) * W1(n, m)
    O1(b, n) = O1(b, n) + B1(n)
    O1(b, n) = fmax(O1(b, n), 0)
    O2(b, p) +=! O1(b, n) * W2(p, n)
    O2(b, p) = O2(b, p) + B2(p)
    O2(b, p) = fmax(O2(b, p), 0)
    O3(b, q) +=! O2(b, p) * W3(q, p)
    O3(b, q) = O3(b, q) + B3(q)
    maxVal(b) max=! O3(b , q)
    expDistance(b ,q) = exp(O3(b , q) - maxVal(b))
    expSum(b) +=! expDistance(b , q)
    O4(b , q) = expDistance(b , q) / expSum(b)
    O5(b, q) = log(O4(b , q))
}
"""
#O5(b , q) = log(O4(b , q))
fcrelunet = tc.define(lang, name="fcrelunet")

In [8]:
#B_1,B_2, M, N = 28*28,1, 200, 200
# I= torch.ones(B_2, B_1).cuda() 
#W1, B1=torch.ones(M, B_1).cuda(), torch.ones(N).cuda()
#W1=n_W[0].cuda()
#B1=n_b[0].cuda()
#out1=torch.ones(B_2,N)
#W2=n_W[1].cuda()
#B2=n_b[1].cuda()
#W3=n_W[2].cuda()
#B3=n_b[2].cuda()

#fcrelu.autotune(I, W1, B1, cache="fcrelu_784_200_200.tc")
#out = fcrelu(I, W1, B1)

ID,B,M,N,P,Q=1,28*28,200,200,200,10
I= torch.ones(ID, B).cuda()
#W1=torch.ones(M, B).cuda()
#B1=torch.ones(N).cuda()
#W2=torch.ones(P, N).cuda()
#B2=torch.ones(P).cuda() 
#W3=torch.ones(Q, P).cuda()
#B3=torch.ones(Q).cuda()
W1=n_W[0].cuda()
B1=n_b[0].cuda()
W2=n_W[1].cuda()
B2=n_b[1].cuda()
W3=n_W[2].cuda()
B3=n_b[2].cuda()
#print(I.size())
#print(W1.size())
#print(B1.size())
#print(W2.size())
#print(B2.size())
#print(W3.size())
#print(B3.size())
#print("Actual Sizes")
#print(n_W[0].size())
#print(n_b[0].size())
#print(n_W[1].size())
#print(n_b[1].size())
#print(n_W[2].size())
#print(n_b[2].size())
fcrelunet.autotune(I, W1, B1, W2 , B2, W3, B3, cache="fcrelu_784_200_200_10.tc")
out=fcrelunet(I,W1,B1,W2,B2,W3,B3)

[INFO]: Autotuning cache will be saved to: fcrelu_784_200_200_10.tc.cuda/options
[INFO]: Tuned kernel options found, using those options


In [9]:
print(out[4])

Variable containing:
-4.0681 -3.0137 -3.4088 -1.5117 -7.9687 -0.4889 -5.0192 -2.9962 -6.7298 -4.7391
[torch.cuda.FloatTensor of size 1x10 (GPU 0)]



In [10]:
d=Variable(torch.ones(ID,B),requires_grad=False)
time=[]
for i in range(10):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    n_out=fcn_i1(d)
    end.record()
    torch.cuda.synchronize()
    time.append(start.elapsed_time(end))

time=np.array(time)
print('Mean Time on 10 iterations for cpu:')
print(np.mean(time))
print('Max Time on 10 iterations for cpu:')
print(np.max(time))
print('Min Time on 10 iterations for cpu:')
print(np.min(time))
print(n_out)

Mean Time on 10 iterations for cpu:
10.2730656147
Max Time on 10 iterations for cpu:
97.6877441406
Min Time on 10 iterations for cpu:
0.514496028423
Variable containing:
-4.0681 -3.0137 -3.4088 -1.5117 -7.9687 -0.4889 -5.0192 -2.9962 -6.7298 -4.7391
[torch.FloatTensor of size 1x10]



  app.launch_new_instance()
