In [5]:
# load data set

samples = []
for i in range(1,511):
    if i < 10:
        i = "00" + str(i)
    elif i < 100:
        i = "0" + str(i)
    
    with open("data/bbc/business/{}.txt".format(str(i))) as fh:
        lines = fh.read().strip().split('\n')
        samples.append(lines[0])
        #samples.append(lines[0] + " " + lines[2] + " " + lines[4])

print("Loaded {} lines.".format(len(samples)))
        

Loaded 510 lines.


In [35]:
print(samples[9])

Court rejects $280bn tobacco case


In [7]:
# load GloVe encodings

import torch

print("Loading word vectors...")
embeddings_index = {}
word_index = {}

with open("glove300.txt", 'r') as fh:
    for line in fh:
        values = line.split()
        word = values[0]
        coefs = list(map(lambda x: float(x), values[1:])) # torch.FloatTensor(list(map(lambda x: float(x), values[1:])))
        embeddings_index[word] = coefs
        #key = list(map(lambda x: float("{0:.1f}".format(x)), coefs))
       
        word_index[torch.FloatTensor(coefs)] = word
        

print('Found %s word vectors.' % len(embeddings_index))

Loading word vectors...
Found 400000 word vectors.


In [9]:
# encode each sample as array of embedded words

import torch

seq_len = 6 #1025   # number of words in sentence
embedded_sentences = []

dimensions = 300

for sample in samples:
    
    sentence = [[0.0]*dimensions]*(seq_len+1) #torch.FloatTensor(1025, dimensions).zero_()
    #sentence[0] = torch.ones_like(torch.FloatTensor(1, dimensions)) # first element in sentence is all ones => start token
    ind = 0 #1 
    
    for word in sample.split(' '):
        word = word.lower().strip('(,.;!?:"').strip("'").strip(")")
        if word.startswith('"'):
            word = word[1:]
        if word.endswith("'s"):
            word = word[0:-2]
        elif word.endswith("bn"):
            words = [word[0:-2], "billion"]
        elif word.endswith("%"):
            words = [word[0:-1], "percent"]
        else:
            words = [word]
        
        try:
            for word in words:
                coefs = embeddings_index[word]
                sentence[ind] = coefs
                ind+=1
                
            
        except Exception as e:
            print("could not lookup embedding for word: {}".format(e))
    
        
    embedded_sentences.append(sentence)

print("done embedding sentences")

could not lookup embedding for word: '$280'
could not lookup embedding for word: 'illva'
could not lookup embedding for word: '$53'
could not lookup embedding for word: '$4'
could not lookup embedding for word: 'ex-boss'
could not lookup embedding for word: "won't"
could not lookup embedding for word: 'turkey-iran'
could not lookup embedding for word: 'fiat-gm'
could not lookup embedding for word: '$16'
could not lookup embedding for word: '$102.6'
could not lookup embedding for word: '$280'
could not lookup embedding for word: 'ex-aide'
could not lookup embedding for word: '$515m'
could not lookup embedding for word: '$28'
could not lookup embedding for word: 'sell-offs'
could not lookup embedding for word: '$50'
could not lookup embedding for word: '$50'
could not lookup embedding for word: '$11'
could not lookup embedding for word: '$6'
could not lookup embedding for word: '$2'
could not lookup embedding for word: 'ex-boeing'
could not lookup embedding for word: '$20'
could not look

In [10]:
# Helper functions

import random
import time
import math



# Random item from a list
def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

# Get a random line
def randomTrainingPair():
    line = randomChoice(embedded_sentences) # has shape (1025, 300)
    l = line[0:-1]
    v = torch.FloatTensor(l)
    input = Variable(v.view(seq_len, 1, dimensions), requires_grad=True)
    
    t = torch.FloatTensor(line[1:])
    target = Variable(t.view(seq_len, 1, dimensions))
                  
    return (input, target)

# Timing runtime of network operations
def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [20]:
# build model

import torch
import torch.nn as nn
from torch.autograd import Variable

batch_size = 1
input_size = dimensions # length of glove vectors
num_layers = 2
hidden_size = input_size # must be same?

gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=.2)

# train network
criterion = nn.MSELoss()
learning_rate = 0.0005


# input : start token to last letter
# target: first letter to EOS
def train(input, target, hidden):
    
    total_loss = 0
    for i in range(input.size()[0]): # 1024 words in sentence
        #print(i)
        output, hidden = gru(input.detach(), hidden)
        loss_var = criterion(output, target)
        total_loss += loss_var.data[0]
        
    
    loss_var.backward()
   
    return total_loss
  


start = time.time()
        
n_iters = 10000
print_every = 1000
#plot_every = 100


hidden = Variable(torch.zeros(num_layers, 1, hidden_size)) # for init only
for iter in range(1, n_iters+1):
    # input shape: (seq_len, batch, input_size)
    input_tensor, target_tensor = randomTrainingPair() # random embedded sentence
    
    loss = train(input_tensor, target_tensor, hidden)
    
    if iter % print_every == 0:
        print('%s (%d %d%%) %.4f' % (timeSince(start), iter, iter / n_iters * 100, loss))

    #if iter % plot_every == 0:
     #   all_losses.append(total_loss / plot_every)
     #   total_loss = 0
        


1m 43s (1000 10%) 0.4773
3m 25s (2000 20%) 0.4335
5m 6s (3000 30%) 0.7313
6m 51s (4000 40%) 0.7664
8m 33s (5000 50%) 0.5775
10m 17s (6000 60%) 0.9244
11m 59s (7000 70%) 0.8302
13m 41s (8000 80%) 0.4965
15m 28s (9000 90%) 0.2976
17m 11s (10000 100%) 0.6287


In [None]:
max_length = 6

eos = torch.FloatTensor(1,dimensions).zero_()

def round_floats(float_num, dummy):
    return float("{0:.2f}".format(float_num))

start_word  = "court"

def generate():
    # TODO pick random start word
    input = Variable(torch.FloatTensor(embeddings_index[start_word]).view(1,1,dimensions)) # start word 
    
    generated_sentence = [input.data]
    hidden = Variable(torch.zeros(num_layers, 1, hidden_size)) # pick random vector from embeddings
    
    for i in range(max_length):
        output, hidden = gru(input, hidden)
        #print(type(output.data))
        #topv, topi = output.data.topk(1)
        #topi = topi[0][0]
        #print(topi)
        if torch.equal(output.data.view(1,dimensions), eos):
            print("found eos after {} words".format(i))
            break
        else:
            #embedded_word = all_letters[topi]
            #next_input = Variable(torch.FloatTensor(output.data))
            
            generated_sentence.append(output.data)  # storing embedded words for now
            
            
        input = output

    return generated_sentence
    

print_every = 100000
def translate(embedded_sentence):
    iter = 0
    start = time.time()
    closest_matches = [[0.0,0,"???"]] * len(embedded_sentence)  # TODO 
        
    # go through list of words in order of occurance probability
    for vector, word in word_index.items():
        iter += 1
        if iter % print_every == 0:
            print('%s (%d %d%%)' % (timeSince(start), iter, iter / 400000 * 100))
            
        #cursor_tensor_key = emb_word_tensor.map_(torch.FloatTensor([0]), round_floats)
        
        # check distance for each word in sentence
        ind = 0
        for emb_word_tensor in embedded_sentence:
            try:
                # calculate distance
                
                dist = 1.0 / torch.sum(torch.pow(vector.sub(emb_word_tensor), 2))
                
                if dist > closest_matches[ind][0]:
                    
                    closest_matches[ind] = [dist, vector, word]
        
            except Exception as e:
                print("could not look up embedded word: {}".format(e))
          
            ind += 1
    
    sentence = ""
    for [_, _, word] in closest_matches:
        sentence = sentence + str(word) + " "
                       
    return sentence

print("Output: ", start_word, translate(generate()))

could not look up embedded word: float division by zero
0m 5s (100000 25%)
0m 11s (200000 50%)
0m 16s (300000 75%)
