In [544]:
import linguamind.linalg as la
import linguamind.nn as nn

In [545]:
# NOTE(review): in the visible cells this seed is never consumed; the layer-setup
# cell rebinds `seed` to la.Seed(1) before any weights are drawn.
seed = la.Seed(0)

In [873]:
class SparseLinearInput():
    """Input layer that sums the weight rows selected by a set of indices.

    The weight matrix has one row per input index (``input_dim`` rows of
    length ``output_dim``). A bias vector is allocated and zeroed but is not
    read by any method of this class.
    """

    def __init__(self, input_dim, output_dim):
        """Allocate the weight matrix, the bias and the output buffer.

        input_dim  -- number of weight rows (vocab size)
        output_dim -- length of each row and of the output (embedding dim)
        """
        self.input_dim = input_dim    # vocab size
        self.output_dim = output_dim  # embedding dim
        self.sparse_output = False

        self.weights = la.Matrix(input_dim, output_dim)

        bias = la.Vector(output_dim)
        bias.zero()
        self.bias = bias

        # Until updateOutput() is called, every row counts as selected.
        self.input_indices = list(range(input_dim))

        self.output = la.Vector(output_dim).zero()

    def updateOutput(self, input_indices):
        """Store ``input_indices`` and rebuild ``self.output`` as the sum of those rows."""
        self.input_indices = input_indices

        self.output.zero()
        for row in input_indices:
            self.output += self.weights[row]

    def accGradParameters(self, input, output_grad, alpha):
        """Apply ``addi(output_grad, -alpha)`` to every row used in the last forward pass.

        ``input`` is accepted for signature parity with the other layers but is
        ignored: the implicit input is all ones.
        """
        step = -alpha
        for row in self.input_indices:
            self.weights[row].addi(output_grad, step)

In [874]:
class SparseLinearOutput():
    """Output layer that evaluates only a chosen subset of its output rows.

    The weight matrix has one row per output index (``output_dim`` rows of
    length ``input_dim``). Entries of ``self.output`` that are not in the
    current index set are left untouched by a forward pass.
    """

    def __init__(self, input_dim, output_dim):
        """Allocate weights and buffers.

        input_dim  -- length of the incoming vector (embedding dim)
        output_dim -- number of weight rows (output vocab size)
        """
        self.input_dim = input_dim    # embedding dim
        self.output_dim = output_dim  # output vocab size
        self.sparse_output = True

        self.weights = la.Matrix(output_dim, input_dim)

        self.output = la.Vector(output_dim).zero()
        self.input_grad = la.Vector(input_dim).zero()
        self.output_indices = list(range(output_dim))

    def updateOutput(self, input, output_indices=None):
        """Write dot(input, weights[i]) into output[i] for each selected i.

        When ``output_indices`` is None every row is evaluated. The selection
        is remembered for the following backward/update calls.
        """
        self.output_indices = (
            list(range(self.output_dim)) if output_indices is None else output_indices
        )

        for row in self.output_indices:
            self.output.doti(row, input, self.weights[row])

    def updateInputGrad(self, output_grad):
        """Rebuild ``self.input_grad`` from the selected rows and their entries of ``output_grad``.

        The first selected row initialises the buffer through ``set``; the
        remaining rows are accumulated with ``addi``.
        """
        selected = self.output_indices
        first = selected[0]
        self.input_grad.set(self.weights[first], output_grad[first])
        for row in selected[1:]:
            self.input_grad.addi(self.weights[row], output_grad[row])

    def accGradParameters(self, input, output_grad, alpha):
        """Apply ``addi(input, output_grad[i] * -alpha)`` to each selected weight row i."""
        for row in self.output_indices:
            self.weights[row].addi(input, output_grad[row] * -alpha)


In [875]:
class Linear():
    """Dense layer: output[i] = dot(input, weights[i]) for every row i.

    The weight matrix has ``output_dim`` rows of length ``input_dim``; there
    is no bias term.
    """

    def __init__(self, input_dim, output_dim):
        """Allocate the weight matrix plus zeroed output and input-gradient buffers."""
        self.input_dim = input_dim    # length of the incoming vector
        self.output_dim = output_dim  # number of weight rows / outputs
        self.sparse_output = False

        self.weights = la.Matrix(output_dim, input_dim)

        self.output = la.Vector(output_dim).zero()
        self.input_grad = la.Vector(input_dim).zero()

    def updateOutput(self, input):
        """Recompute every entry of ``self.output`` from ``input``."""
        for row in range(self.output_dim):
            self.output.doti(row, input, self.weights[row])

    def updateInputGrad(self, output_grad):
        """Rebuild ``self.input_grad`` from all weight rows and ``output_grad``.

        Row 0 initialises the buffer through ``set``; rows 1.. are accumulated
        with ``addi``.
        """
        self.input_grad.set(self.weights[0], output_grad[0])
        for row in range(1, self.output_dim):
            self.input_grad.addi(self.weights[row], output_grad[row])

    def accGradParameters(self, input, output_grad, alpha):
        """Apply ``addi(input, output_grad[i] * -alpha)`` to every weight row i."""
        for row in range(self.output_dim):
            self.weights[row].addi(input, output_grad[row] * -alpha)

class Relu():
    """Element-wise activation layer with no trainable weights.

    ``self.input_grad`` does double duty: updateOutput() fills it from the
    input, and updateInputGrad() later multiplies it by the incoming gradient.
    """
    def __init__(self, dim):
        """Create the layer for vectors of length ``dim``."""
        self.resize(dim)
        self.sparse_output = False
    
    def resize(self,dim):
        """(Re)allocate the output and input-gradient buffers for length ``dim``."""
        self.input_dim = dim
        self.output_dim = dim
        
        # No parameters; the attribute exists so the layer looks like the others.
        self.weights = None
        self.output_indices = list(range(dim))
        
        self.output = la.Vector(self.output_dim).zero()
        self.input_grad = la.Vector(self.input_dim).zero()
    
    def updateOutput(self, input, output_indices = None):
        """Compute the activation of ``input`` into ``self.output``.

        ``output_indices`` is only recorded; the vector operations below act on
        the whole vector regardless of the selection.
        """

        if(output_indices is None):
            output_indices = list(range(self.output_dim))
        self.output_indices = output_indices
        
        # Copy the input into input_grad (clear, then add).
        self.input_grad *= 0
        self.input_grad += input
        # NOTE(review): bare comparison whose result is discarded. It only has an
        # effect if la.Vector overloads `>=` to threshold in place (presumably
        # turning input_grad into a 0/1 mask of the non-negative entries) - confirm.
        self.input_grad >= 0
        
        # output = input * input_grad, element-wise (assuming `*=` between two
        # vectors is element-wise - TODO confirm).
        self.output *= 0
        self.output += input
        self.output *= self.input_grad
        
    def updateInputGrad(self, output_grad):
        """Multiply the buffer left by updateOutput() by ``output_grad`` in place.

        Calling this twice without a forward pass in between multiplies twice.
        """
        self.input_grad *= output_grad
        
    def accGradParameters(self,alpha, input, output_grad):
        """Do nothing: this layer has no parameters.

        NOTE(review): the parameter order here is (alpha, input, output_grad),
        whereas the other layers in this notebook use (input, output_grad, alpha).
        Harmless while the body is empty, but positional callers bind the
        arguments to different names.
        """

In [876]:
class MSECriterion():
    """Mean-squared-error criterion evaluated only at selected output positions.

    ``target[i]`` is compared with ``prediction[output_indices[i]]``; positions
    that are not listed contribute neither to the error nor to the gradient.
    """

    def __init__(self):
        """Create a placeholder gradient buffer.

        backward() re-allocates the buffer whenever its size differs from the
        prediction it is given, so the initial length of 32 is only a default.
        """
        self.grad = la.Vector(32).zero()

    def forward(self, input, target, output_indices):
        """Return the mean of the squared differences over ``output_indices``.

        Also stores the prediction in ``self.output``, the per-position
        differences in ``self.tmp_error`` and the result in ``self.error``.
        An empty ``output_indices`` raises ZeroDivisionError.
        """
        self.output = input
        count = len(output_indices)

        self.tmp_error = la.Vector(count).zero()
        for i, index in enumerate(output_indices):
            self.tmp_error[i] = self.output[index] - target[i]

        self.error = 0
        for i in range(count):
            self.error += self.tmp_error[i] * self.tmp_error[i]
        self.error /= count

        return self.error

    def backward(self, output, target, output_indices):
        """Return the gradient buffer: ``output[index] - target[i]`` at each selected index.

        The buffer is cleared first, so entries written for a previous (and
        different) set of ``output_indices`` do not leak into this gradient.
        If an index is listed more than once, the last occurrence wins.
        """
        if self.grad.size != output.size:
            self.grad = la.Vector(output.size).zero()
        else:
            # Same buffer object is reused; wipe whatever the last call left behind.
            self.grad.zero()

        for i, index in enumerate(output_indices):
            self.grad[index] = output[index] - target[i]
        return self.grad

In [877]:
class Sequential():
    """Ordered stack of layers with a sparse-input first layer.

    The first layer receives index sets; every later layer receives the
    previous layer's ``output``. At least two layers are required before
    forward() or backward() can be called.
    """

    def __init__(self):
        """Start with an empty layer list."""
        self.layers = list()

    def add(self, layer):
        """Append ``layer`` and expose its output buffer as the network output."""
        self.layers.append(layer)
        self.output = layer.output

    def forward(self, input_indices=(1,3,2), output_indices=(1,2,4)):
        """Run all layers in order and return the last layer's output buffer.

        From the first layer whose ``sparse_output`` is true onwards, each
        layer is also handed ``output_indices``. The final layer always
        receives ``output_indices``, so its updateOutput() must accept them.
        """
        self.layers[0].updateOutput(input_indices)

        sparse_output_until_end = False

        # Interior layers (1 .. len-2), each paired with its predecessor.
        for previous, layer in zip(self.layers, self.layers[1:-1]):
            if layer.sparse_output:
                sparse_output_until_end = True

            if sparse_output_until_end:
                layer.updateOutput(previous.output, output_indices)
            else:
                layer.updateOutput(previous.output)

        self.layers[-1].updateOutput(self.layers[-2].output, output_indices)

        return self.output

    def backward(self, grad, output_indices):
        """Propagate ``grad`` from the last layer back to layer 1.

        Layer 0 is skipped: it has no input gradient to compute.
        ``output_indices`` is accepted for interface compatibility but is not
        needed here, because each layer remembers the indices from its forward
        pass.
        """
        self.layers[-1].updateInputGrad(grad)
        for i in reversed(range(len(self.layers) - 2)):
            self.layers[i + 1].updateInputGrad(self.layers[i + 2].input_grad)
                
class StochasticGradient():
    """One-example-at-a-time gradient-descent trainer for a layer stack.

    ``mlp`` must expose ``layers``, ``forward(input_indices, output_indices)``
    and ``backward(grad, output_indices)``; ``criterion`` must expose
    ``forward``, ``backward`` and the ``grad`` buffer that backward() fills.
    """

    def __init__(self, mlp, criterion, alpha = 0.01):
        """Remember the network, the criterion and the step size ``alpha``."""
        self.mlp = mlp
        self.criterion = criterion
        self.alpha = alpha

    def train(self, input_indices, output_indices, target_values):
        """Run one forward/backward/update step and return the criterion's error.

        All work is done on ``self.mlp`` and ``self.criterion``; the method no
        longer depends on notebook-level variables of the same names.
        """
        layers = self.mlp.layers

        pred = self.mlp.forward(input_indices, output_indices)
        error = self.criterion.forward(pred, target_values, output_indices)
        self.mlp.backward(
            self.criterion.backward(pred, target_values, output_indices),
            output_indices,
        )

        # update weights for sparse layer (its input is implicit, hence None)
        layers[0].accGradParameters(None, layers[1].input_grad, self.alpha)

        # update dense middle layers: each sees the previous layer's output and
        # the gradient arriving from the layer after it
        for i in range(len(layers) - 2):
            layers[i + 1].accGradParameters(layers[i].output, layers[i + 2].input_grad, self.alpha)

        # update weights for output with criterion gradient
        layers[-1].accGradParameters(layers[-2].output, self.criterion.grad, self.alpha)

        return error

In [878]:
seed = la.Seed(1)


def _scaled_uniform_init(layer, rng):
    """Fill ``layer.weights`` with uniform(rng), subtract 0.5, divide by 50; return the layer."""
    layer.weights.uniform(rng)
    layer.weights -= 0.5
    layer.weights /= 50
    return layer


# Layers are built and initialised one after another so the seed is consumed
# in the order syn0, syn1, syn2, syn3.
syn0 = _scaled_uniform_init(SparseLinearInput(100, 64), seed)
syn1 = _scaled_uniform_init(Linear(64, 32), seed)
syn2 = _scaled_uniform_init(Linear(32, 16), seed)

# The sparse output layer keeps the raw uniform draw (no shift, no scaling).
syn3 = SparseLinearOutput(16, 100)
syn3.weights.uniform(seed)

<linguamind.linalg.Matrix; proxy of <Swig Object of type 'Matrix *' at 0x1049a1f60> >

In [879]:
# Stack: sparse input -> Linear -> Relu -> Linear -> Relu -> sparse output -> Relu.
mlp = Sequential()
for layer in (syn0, syn1, Relu(32), syn2, Relu(16), syn3, Relu(100)):
    mlp.add(layer)

criterion = MSECriterion()

In [880]:
# Rows of the input layer that are active for this single training example.
input_indices = (1,2,3,4)
# NOTE(review): index 4 is listed twice (positions 3 and 7) - confirm whether the
# repeat is intentional; the criterion's backward pass keeps only the last write per index.
output_indices = (1,2,3,4,5,6,20,4)

# One target entry per entry of output_indices: zeroed, then set(0,1)
# (presumably sets position 0 to 1 - confirm against the la.Vector API).
target = la.Vector(len(output_indices))
target.zero()
target.set(0,1)

In [883]:
# Trainer with the default step size (alpha = 0.01).
optim = StochasticGradient(mlp,criterion)

In [884]:
# Train on the same example 1000 times, printing the criterion's error each step.
# The loop variable is `step` rather than `iter` so the builtin iter() is not
# shadowed for the rest of the notebook session.
for step in range(1000):
    print(optim.train(input_indices,output_indices,target))

0.12499696034424791
0.12499694544474545
0.12499693054525286
0.12499691564577009
0.1249969007462974
0.12499688584683458
0.1249968709473818
0.12499684114696578
0.12499682624753408
0.12499681134811254
0.12499679644870126
0.12499678154930016
0.12499676664990929
0.12499675175052874
0.1249967368511587
0.12499672195179899
0.12499670705244975
0.12499669215311102
0.12499667725378279
0.12499666235446516
0.1249966474551583
0.12499663255586202
0.12499660275561736
0.12499658785634356
0.12499657295708051
0.12499655805782837
0.12499654315858721
0.12499652825935705
0.12499651336013776
0.1249964984609296
0.1249964835617326
0.12499646866254684
0.12499645376337223
0.12499642396326044
0.1249964090641093
0.1249963941649695
0.12499637926584113
0.12499636436672437
0.12499634946761892
0.1249963345685253
0.12499631966944322
0.1249963047703727
0.1249962749703745
0.12499626007132844
0.12499624517229432
0.12499623027327217
0.12499621537426182
0.12499620047526354
0.12499618557627733
0.12499617067730326
0.124996140

In [39]:
# Passed to nn.SparseLinearOutput and nn.MSECriterion below; equals the number of
# entries in target_indices (1 predicted index + 9 negative samples).
n_sample = 10

In [40]:
# Library (nn) versions of the layers prototyped above.
# Input layer: weights drawn with uniform(), then shifted by -0.5 and divided by 50.
syn0 = nn.SparseLinearInput(1000,50)
syn0.weights.uniform()
syn0.weights -= 0.5
syn0.weights /= 50

# Output layer: takes n_sample as a third argument; weights keep the raw uniform() draw.
syn1 = nn.SparseLinearOutput(50,1000,n_sample)
syn1.weights.uniform()

<linguamind.linalg.Tensor; proxy of <Swig Object of type 'Tensor *' at 0x1078b4e70> >

In [41]:
# Two-layer network: sparse input directly followed by sparse output.
mlp = nn.Sequential()
mlp.add(syn0)
mlp.add(syn1)

# NOTE(review): the meaning of the (1, n_sample) arguments is not visible here;
# presumably the shape of the criterion's input - confirm.
criterion = nn.MSECriterion(1,n_sample)

In [42]:
# Target tensor sized from n_sample (instead of a hard-coded 10) so it stays in
# step with the criterion and output layer built from the same constant.
target = la.Tensor((1,n_sample))
target.zero()
# presumably sets position 0 (the predicted index) to 1 - confirm against the Tensor API
target.set(0,1)

In [43]:
# Context rows fed to the input layer.
context_indices = (1, 2, 4, 5)
# Index to be predicted, followed by nine negative samples (6 .. 14).
pred = 3
neg_samples = list(range(6, 15))
# The predicted index comes first, then the negatives.
target_indices = [pred] + neg_samples

In [44]:
# Forward pass: run the network on the context/target indices and hand the result to the criterion.
criterion.forwards(mlp.forward(context_indices,target_indices))

In [45]:
# Backward pass of the criterion against the target tensor; the cell displays the returned Tensor.
criterion.backwards(target)

<linguamind.linalg.Tensor; proxy of <Swig Object of type 'Tensor *' at 0x1078b4630> >

In [None]:
# NOTE(review): the name `input` shadows the builtin input() for the rest of the session;
# the next cell reads this variable, so rename both together if it is changed.
input = la.Tensor((1,50)).uniform()

In [None]:
# Recompute the output layer from `input`; (1,3) is presumably the set of output indices to evaluate.
syn1.updateOutput(input,(1,3))

In [None]:
# Inspect the layer output via get(1) (presumably the entry at index 1).
syn1.output.get(1)

In [None]:
# NOTE(review): `nlp` is not imported in any visible cell - confirm the intended module alias.
# The second argument has one character per character of the text (presumably B/I/O segment tags).
text = nlp.Text("hello world","BIIIIOBIIII")

In [None]:
# Fetch the vocabulary registered under the "tokens" key.
vocab = text.getVocab("tokens")

In [None]:
# Display the sequence stored under the "tokens" key.
text.getSequence('tokens')

In [None]:
# text = lm.Text("hello world")

In [None]:
# NOTE(review): `lm` is not imported in any visible cell - confirm the intended module alias.
# The second argument has one character per character of the text (presumably B/I/O segment tags).
text = lm.Text("hello world","BIIIIOBIIII")

In [None]:
# Fetch the vocabulary registered under the "tokens" key.
vocab = text.getVocab("tokens")

In [None]:
# Print the letters of every vocabulary term. range() and print() work on both
# Python 2 and 3, matching the print(...) calls used by the executed cells above.
# NOTE(review): `vocab.size` is read as an attribute here, while a later sketch
# calls getVocab(...).size() - confirm which form the API provides.
for i in range(vocab.size):
    print(vocab.getTermAtIndex(i).letters)

In [None]:
# Display the sequence stored under the "tokens" key.
text.getSequence("tokens")

# NLP Components

Three types of component:
- word level
- sentence level
- document level

### Data Model

In [None]:
# Ways to construct lm.Text. Each assignment to `c` overwrites the previous one.
# NOTE(review): `lm` is not imported in any visible cell - confirm the module alias.
c = lm.Text("/my/folder/of/files") # attempts to autodetect file type
c = lm.Text(path="myfile.txt",type="raw")
c = lm.Text(path="myfile.txt",type="csv-token") # one token per line in CSV
c = lm.Text(path="myfile.txt",type="csv-segment") # text segment per line in CSV
c = lm.Text(path="myfile.txt",type="prefix__label__") # one sentence per line with special labels
c = lm.Text(path="myfile.txt",type="json")

# document level raw text
d1 = lm.Text("This is my sentence.")
d2 = lm.Text("myfile.txt")

c = lm.Text([d1,d2]) # form documents into a corpus

# document level text with labels
contents = {}
contents["text"] = "This is my sentence. This is another one."
contents["date"] = 1997
d = lm.Text(contents)

# sentence level text with labels (rebinds `contents` and `d` from above)
contents = {}
contents["text"] = "This is my sentence."
contents["sentiment"] = "Positive"
contents["speaker"] = "John Locke"
d = lm.Text(contents)

### Applying Analytics

In [None]:
# Segmenters looked up by name and language.
tok = lm.Segmenter(name="tokenizer",lang="en")
eos = lm.Segmenter(name="eos",lang="en")

# document level text with labels
contents = {}
contents["text"] = "This is my sentence. This is another one."
contents["date"] = 1997
d = lm.Text(contents)

# Segmenting adds new keys to the document ("tokens", "sentences").
tok.segment(d)
eos.segment(d)

# print() calls (instead of Python 2 print statements) so the cell also runs on
# Python 3, like the executed cells earlier in this notebook.
print(d["text"]) # "This is my sentence. This is another one."
print(d["tokens"]) # ["This", "is"...
print(d["sentences"]) # [["This", "is", "my", "sentence","."],["This","is","another","one","."]]

# Classifiers annotate the document in place: first per token, then per sentence.
pos = lm.Classifier(name="pos")
pos.predict(d)

print(d["sentences"]) # [[{"token":"This","POS":"NNP"},...

sentiment = lm.Classifier(name="sentiment")
sentiment.predict(d)
print(d["sentences"]) # [{"sentiment":"Positive","tokens":[{"token":"This","POS":"NNP"},...

### Creating Analytics

In [None]:
# Creating a Segmenter (tokenizer)
# "char_segs" carries one tag per character of "text" (both strings are 11 characters long).
contents = {}
contents["text"] = "I like pie."
contents["char_segs"] = "BOBIIIOBIIB"
d = lm.Text(contents)

# Feature templates: "text_<offset>" names a position relative to the current one;
# a nested list groups two positions into one entry (presumably a conjoined feature - confirm).
feats=[["text_-2","text_-1"],"text_-1","text_0","text_1",["text_1","text_2"]]

ambiguity_hash = lm.AmbiguityHash(d,feats,"char_segs",threshold=0.95,min_count=5)

percept_model = lm.ml.Sequential()
percept_model.add(lm.ml.Linear(size=[d.getVocab("text").size() * 10],encoding="hashtable"))
percept_model.add(lm.ml.LogSoftmax())

percept_loss = lm.ml.PerceptronLoss()
# NOTE(review): this optimizer is not passed to the trainer below, and the name is
# rebound to lm.ml.optim.SGD() further down.
percept_optimizer = lm.ml.optim.PerceptronUpdate()
percept_searcher = lm.ml.search.BeamSearch(beam=5)

# where available, use a fast trainer in lieu of calling all the forward/backprop logic from Python, which slows things down
tokenizer = lm.ml.trainer.FastSegmentationTrainer(d,"char_segs",feats,percept_model,percept_loss,percept_searcher,inherits=[ambiguity_hash],threads=50)

tokenizer.segment(d,"tokens") #generates new vocab for what was segmented

eos = lm.pretrained.Segmenter(name="eos",lang="en")
eos.segment(d,"sentences",ignore_vocab=True) # does not generate a new vocab for what was segmented

# https://github.com/oxford-cs-ml-2015/practical6/blob/master/train.lua
lstm_hidden = 50
seq_length = 16

# One dict of components per time step.
layers = list()
for i in range(seq_length):
    layer = {}
    
    # NOTE(review): `inputs` is never used.
    inputs = {}
    # NOTE(review): the vocab key is "token" here but "tokens" where the tokenizer writes it above - confirm.
    layer['embed'] = lm.ml.Embedding(d.getVocab("token").size(),lstm_hidden)
    # NOTE(review): on the first iteration (i == 0) `layers` is still empty, so
    # layers[i-1] raises IndexError; the first step needs an initial hidden state.
    layer['lstm'] = lm.ml.LSTM(lstm_hidden,input_x=layer['embed'],input_h=layers[i-1]['lstm']['h'])
    layer['softmax'] = lm.ml.LogElasticHierarchicalSoftmax(input=layer['lstm']['h'],size=(lstm_hidden,d.getVocab("token").size()),sample_rate=0.001)
    layer['criterion'] = lm.ml.LogElasticHierarchicalSoftmaxLoss()
    layers.append(layer)

word_language_model_searcher = lm.ml.search.BeamSearch(beam=5)
percept_optimizer = lm.ml.optim.SGD()
word_language_model = lm.ml.trainer.FastLanguageModelTrainer(d,layers,word_language_model_searcher,percept_optimizer,predict_on="token",bound_on="sentence",threads=50)

# ML Components

In [None]:
#Ambiguity Hashing

In [None]:
#Perceptron 

In [None]:
#LSTM 

In [None]:
#Neural Index