In [355]:
import numpy as np
import matplotlib.pyplot as plt
import time

In [358]:
class Module(object):
    def __init__ (self):
        self.output = None
        self.gradInput = None
        self.training = True
        self.n_y = 0
    """
    Basically, you can think of a module as of a something (black box) 
    which can process `input` data and produce `ouput` data.
    This is like applying a function which is called `forward`: 
        
        output = module.forward(input)
    
    The module should be able to perform a backward pass: to differentiate the `forward` function. 
    More, it should be able to differentiate it if is a part of chain (chain rule).
    The latter implies there is a gradient from previous step of a chain rule. 
    
        gradInput = module.backward(input, gradOutput)
    """
    
    def forward(self, input):
        """
        Takes an input object, and computes the corresponding output of the module.
        """
        return self.updateOutput(input)

    def backward(self,input, gradOutput):
        """
        Performs a backpropagation step through the module, with respect to the given input.
        
        This includes 
         - computing a gradient w.r.t. `input` (is needed for further backprop),
         - computing a gradient w.r.t. parameters (to update parameters while optimizing).
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput
    

    def updateOutput(self, input):
        """
        Computes the output using the current parameter set of the class and input.
        This function returns the result which is stored in the `output` field.
        
        Make sure to both store the data in `output` field and return it. 
        """
        
        # The easiest case:
            
        # self.output = input 
        # return self.output
        
        pass

    def updateGradInput(self, input, gradOutput):
        """
        Computing the gradient of the module with respect to its own input. 
        This is returned in `gradInput`. Also, the `gradInput` state variable is updated accordingly.
        
        The shape of `gradInput` is always the same as the shape of `input`.
        
        Make sure to both store the gradients in `gradInput` field and return it.
        """
        
        # The easiest case:
        
        # self.gradInput = gradOutput 
        # return self.gradInput
        
        pass   
    
    def accGradParameters(self, input, gradOutput):
        """
        Computing the gradient of the module with respect to its own parameters.
        No need to override if module has no parameters (e.g. ReLU).
        """
        pass
    
    def zeroGradParameters(self): 
        """
        Zeroes `gradParams` variable if the module has params.
        """
        pass
        
    def getParameters(self):
        """
        Returns a list with its parameters. 
        If the module does not have parameters return empty list. 
        """
        return []
        
    def getGradParameters(self):
        """
        Returns a list with gradients with respect to its parameters. 
        If the module does not have parameters return empty list. 
        """
        return []
    
    def updateEMA(self):
        
        pass
    
    def gradEMA(self):
        return []
    
    def training(self):
        """
        Sets training mode for the module.
        Training and testing behaviour differs for Dropout, BatchNorm.
        """
        self.training = True
    
    def evaluate(self):
        """
        Sets evaluation mode for the module.
        Training and testing behaviour differs for Dropout, BatchNorm.
        """
        self.training = False
    
    def __repr__(self):
        """
        Pretty printing. Should be overrided in every module if you want 
        to have readable description. 
        """
        return "Module"

In [359]:
class Criterion(object):
    def __init__ (self):
        self.output = None
        self.gradInput = None
        
    def forward(self, input, target):
        """
            Given an input and a target, compute the loss function 
            associated to the criterion and return the result.
            
            For consistency this function should not be overrided,
            all the code goes in `updateOutput`.
        """
        return self.updateOutput(input, target)

    def backward(self, input, target):
        """
            Given an input and a target, compute the gradients of the loss function
            associated to the criterion and return the result. 

            For consistency this function should not be overrided,
            all the code goes in `updateGradInput`.
        """
        return self.updateGradInput(input, target)
    
    def updateOutput(self, input, target):
        """
        Function to override.
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Function to override.
        """
        return self.gradInput   

    def __repr__(self):
        """
        Pretty printing. Should be overrided in every module if you want 
        to have readable description. 
        """
        return "Criterion"

In [360]:
class ClassNLLCriterion(Criterion):
    def __init__(self):
        super().__init__()
        
    def updateOutput(self, input, target): 
        
        # Use this trick to avoid numerical errors
        eps = 1e-15 
        input_clamp = np.clip(input, eps, 1 - eps)
        
        # Your code goes here. ################################################
        return self.output

    def updateGradInput(self, input, target):
        
        # Use this trick to avoid numerical errors
        input_clamp = np.maximum(1e-15, np.minimum(input, 1 - 1e-15) )
                
        # Your code goes here. ################################################
        return self.gradInput
    
    def __repr__(self):
        return "ClassNLLCriterion"

In [361]:
class ReLU(Module):
    
    def __init__(self):
        super().__init__()
        
    def forward(self, input):
        self.output = (input if input >= 0 else 0)
        return self.output
    
    def backward(self, input, gradOutput):
        self.gradInput = np.multiply(gradOutput, input > 0)
        return self.gradInput

In [362]:
class Sigmoid(Module):
    
    def __init__(self):
        super().__init__()
        
    def forward(self, input):
        npe = np.exp(input)
        self.output = npe / (1 + npe)
        return self.output
    
    def backward(self, input, gradOutput):
        npe = np.exp(input)
        self.gradInput = gradOutput * npe / ((npe + 1) * (npe + 1))

In [363]:
class LeakyReLU(Module):
    
    def __init__(self, slope=0.03):
        super().__init__()
        self.slope = slope
    
    def forward(self, input):
        self.output = input
        if self.output < 0:
            self.output *= self.slope
        return self.output
    
    def backward(self, input, gradOutput):
        input[input < 0] *= self.slope
        self.gradInput = gradOutput * input
        return self.gradInput

In [364]:
class MSECriterion(Criterion):
    def __init__(self):
        super().__init__()
        
    def updateOutput(self, input, target):   
        self.output = np.sum((input - target) * (input - target)) / (input.shape[0] + 1)
        return self.output
 
    def updateGradInput(self, input, target):
        self.gradInput = (input - target)
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

In [365]:
class Identity(Module):
    
    def __init__(self):
        super().__init__()
        
    def forward(self, input):
        self.output = input
        return self.output
    
    def backward(self, input, gradOutput):
        self.gradInput = np.ones_like(input)
        return self.gradInput

In [398]:
class LayerModule(Module):
    
    def __init__(self):
        super().__init__()
        self.sum = 0.
        
    def updateSum(self, input):
        self.sum = np.matmul(self.W, input)
        return self.sum
    
    def updateOutput(self, input):
        self.updateSum(input)
        self.output = self.actFunction.forward(self.sum)
        return self.output
    
    def updateGradInput(self, input, gradOutput):
        print(input.shape)
        print(gradOutput.shape)
        print(self.sum.shape)
        print(self.actFunction.backward(self.sum, gradOutput).shape)
        self.gradInput = np.matmul(gradOutput.reshape((gradOutput.shape[0], 1)), self.actFunction.backward(self.sum, gradOutput).reshape((1, self.sum.shape[0])))
        self.gradInput = np.matmul(self.gradInput, self.W)
        return self.gradInput
    
    def accGradParameters(self, input, gradOutput):
        self.gradW += gradOutput * self.actFunction.backward(self.sum, gradOutput) * input

In [399]:
class Linear(LayerModule):
    """
    A module which applies a linear transformation 
    A common name is fully-connected layer, InnerProductLayer in caffe. 
    
    The module should work with 2D input of shape (n_samples, n_feature).
    """
    def __init__(self, n_in, n_out):
        super().__init__()

        # This is a nice initialization
        stdv = 1. / np.sqrt(n_in)
        self.W = np.random.uniform(-stdv, stdv, size = (n_out, n_in))
        self.b = np.random.uniform(-stdv, stdv, size = n_out)
        
        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)
        self.gradEMA = np.zeros_like(self.W)

        self.actFunction = Identity()
        
    def updateSum(self, input):
        super().updateSum(input)
        
    def updateOutput(self, input):
        super().updateOutput(input)
        
    def updateGradInput(self, input, gradOutput):
        super().updateGradInput(input, gradOutput)
        
    def accGradParameters(self, input, gradOutput):
        super().accGradParameters(input, gradOutput)
        
    def updateEMA(self, gamma):
        self.gradEMA = gamma * self.gradEMA + (1 - gamma) * (self.gradW * self.gradW)
        
    def zeroGradParameters(self):
        self.gradW.fill(0)
        self.gradb.fill(0)
        
    def getParameters(self):
        return self.W
    
    def getGradParameters(self):
        return self.gradW
    
    def getGradEMA(self):
        return self.gradEMA
    
    def __repr__(self):
        s = self.W.shape
        q = 'Linear %d -> %d' %(s[1],s[0])
        return q

In [400]:
class Sequential(Module):
    
    def __init__ (self):
        super().__init__()
        np.random.seed()
        self.modules = []
   
    def add(self, module):
        self.modules.append(module)
        self.n_y = module.n_y

    def updateOutput(self, input):
        last_m = input
        for m in self.modules:
            m.forward(last_m)
            last_m = m.output
        self.output = last_m
        return self.output

    def backward(self, input, gradOutput):
        
        last_m = gradOutput
        n_out = len(self.modules) - 1
        
        self.modules[n_out].backward(self.modules[n_out - 1].output, gradOutput)
        
        for i in range(1, n_out):
            self.modules[n_out - i].backward(self.moudles[n_out - i - 1].output, self.modules[n_out + 1].gradInput)
            
        self.gradInput = self.modules[0].backward(input, gradOutput if n_out == 0 else self.modules[1].gradInput)
        
        return self.gradInput

    def get_batches(self, dataset, batch_size):
    
        X, Y = dataset
        n_samples = X.shape[0]
    
    # в начале каждой эпохи будем всё перемешивать
        indices = np.arange(n_samples)
        np.random.shuffle(indices)
        
        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            
            batch_idx = indices[start:end]
    
            yield np.array(X[batch_idx]), np.array(Y[batch_idx])
    
    def rmsprop(self, learning_rate):
        for md in self.modules:
            for w_layers, dw_layers, ema_layers in zip(md.W, md.gradW, md.gradEMA):
                updated_grad = dw_layers
                updated_grad *= (1.0 / np.sqrt(ema_layers + 1e-8))
                w_layers -= learning_rate * updated_grad
    
    def fit(self, X_tr, y_tr, n_epoch = 5, loss='mse', learning_rate = 0.1, batch_size = 1, gamma=0.1, makePlot = False):
        loss_history = []
        ldc = pow(6e-4 / learning_rate, 1. / n_epoch) 
        if loss == 'log':
            self.lossCrit = ClassNLLCriterion()
        else:
            self.lossCrit = MSECriterion()
        for itr in range(n_epoch):
            begin_time = time.time()
            for x_batch, y_batch in self.get_batches((X_tr, y_tr), batch_size):
                
                self.zeroGradParameters()
                
                pred = np.array([self.forward(x) for x in x_batch]).reshape((len(x_batch),))
                loss = self.lossCrit.forward(pred, y_batch)
                
                # backward — считаем все градиенты в обратном порядке
                dp = self.lossCrit.backward(pred, y_batch)
            
                k = np.zeros_like(self.getGradParameters)
                
                for x, y in zip(x_batch, dp):
                    self.backward(np.array(x), np.array([y]))
                
                for wl in self.getGradParameters():
                    wl /= len(x_batch)
                
                for x in self.modules:
                    x.updateEMA(gamma)
               
                # обновляем веса
                self.rmsprop(learning_rate)
                learning_rate *= 0.95
            print("Epoch " + str(itr + 1) + " of " + str(n_epoch) + ": " + str(time.time() - begin_time))
                
            loss_history.append(loss)
        
        if makePlot:
            plt.title("Training loss")
            plt.xlabel("epoch")
            plt.ylabel("loss")
            plt.plot(loss_history, 'b')
            plt.show()
            
    def zeroGradParameters(self): 
        for module in self.modules:
            module.zeroGradParameters()
    
    def getParameters(self):
        return [x.getParameters() for x in self.modules]
    
    def getGradParameters(self):
        return [x.getGradParameters() for x in self.modules]
    
    def getGradEMA(self):
        return [x.getGradEMA() for x in self.modules]
                                                   
    def __repr__(self):
        string = "".join([str(x) + '\n' for x in self.modules])
        return string
    
    def __getitem__(self,x):
        return self.modules.__getitem__(x)

In [401]:
class SoftMax(LayerModule):
    
    def __init_(self):
        super().__init__()
        
    def updateEMA(self, gamma):
        self.gradEMA = gamma * self.gradEMA + (1 - gamma) * (self.gradW * self.gradW)
        
    def zeroGradParameters(self):
        self.gradW.fill(0)
        self.gradb.fill(0)
        
    def getParameters(self):
        return self.W
    
    def getGradParameters(self):
        return self.gradW
    
    def getGradEMA(self):
        return self.gradEMA
    
    def __repr__(self):
        s = self.W.shape
        q = 'SoftMax %d -> %d' %(s[1],s[0])
        return q

In [402]:
class Logistic(Module):
    
    def __init__(self):
        super().__init__()
    
   # def forward(self, input):
        #self.output = 

In [403]:
md = Sequential()
md.add(Linear(2, 200))
md.add(Linear(200, 1))
X_train = np.array([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [100, 155], [300, 100], [20, 14], [-40, 20], [3, 3]])
y_train = np.array([1, 3, 5, 7, 9, 255, 400, 34, -20, 6])
md.fit(X_train, y_train, n_epoch=100, loss='mse', learning_rate = 1e-1, batch_size=7, makePlot = True, gamma = 0.67)

(200,)
(1,)
(1,)
(1,)
(2,)
(1, 200)
(200,)
(200,)


ValueError: cannot reshape array of size 200 into shape (1,1)

In [294]:
md.forward([-100000, 100000])

array([0.12745548])