In [1]:
# load data set

samples = []
for i in range(1,511):
    if i < 10:
        i = "00" + str(i)
    elif i < 100:
        i = "0" + str(i)
    
    with open("data/bbc/business/{}.txt".format(str(i))) as fh:
        lines = fh.read().strip().split('\n')
        samples.append(lines[0])
        #samples.append(lines[0] + " " + lines[2] + " " + lines[4])

print("Loaded {} lines.".format(len(samples)))
        

Loaded 510 lines.


In [33]:
print(samples[3])

High fuel prices hit BA's profits


In [7]:
# load GloVe encodings

import torch

print("Loading word vectors...")
embeddings_index = {}
word_index = {}

with open("glove300.txt", 'r') as fh:
    for line in fh:
        values = line.split()
        word = values[0]
        coefs = list(map(lambda x: float(x), values[1:])) # torch.FloatTensor(list(map(lambda x: float(x), values[1:])))
        embeddings_index[word] = coefs
        #key = list(map(lambda x: float("{0:.1f}".format(x)), coefs))
       
        word_index[torch.FloatTensor(coefs)] = word
        

print('Found %s word vectors.' % len(embeddings_index))

Loading word vectors...
Found 400000 word vectors.


In [3]:
# encode each sample as array of embedded words

import torch

seq_len = 8 #1025   # number of words in sentence
embedded_sentences = []

for sample in samples:
    
    sentence = [[0.0]*300]*(seq_len+1) #torch.FloatTensor(1025, 300).zero_()
    #sentence[0] = torch.ones_like(torch.FloatTensor(1, 300)) # first element in sentence is all ones => start token
    ind = 0 #1 
    
    for word in sample.split(' '):
        word = word.lower().strip('(,.;!?:"').strip("'").strip(")")
        if word.startswith('"'):
            word = word[1:]
        if word.endswith("'s"):
            word = word[0:-2]
        elif word.endswith("bn"):
            words = [word[0:-2], "billion"]
        elif word.endswith("%"):
            words = [word[0:-1], "percent"]
        else:
            words = [word]
        
        try:
            for word in words:
                coefs = embeddings_index[word]
                sentence[ind] = coefs
                ind+=1
                
            
        except Exception as e:
            print("could not lookup embedding for word: {}".format(e))
    
        
    embedded_sentences.append(sentence)

print("done embedding sentences")

could not lookup embedding for word: '$280'
could not lookup embedding for word: 'illva'
could not lookup embedding for word: '$53'
could not lookup embedding for word: '$4'
could not lookup embedding for word: 'ex-boss'
could not lookup embedding for word: "won't"
could not lookup embedding for word: 'turkey-iran'
could not lookup embedding for word: 'fiat-gm'
could not lookup embedding for word: '$16'
could not lookup embedding for word: '$102.6'
could not lookup embedding for word: '$280'
could not lookup embedding for word: 'ex-aide'
could not lookup embedding for word: '$515m'
could not lookup embedding for word: '$28'
could not lookup embedding for word: 'sell-offs'
could not lookup embedding for word: '$50'
could not lookup embedding for word: '$50'
could not lookup embedding for word: '$11'
could not lookup embedding for word: '$6'
could not lookup embedding for word: '$2'
could not lookup embedding for word: 'ex-boeing'
could not lookup embedding for word: '$20'
could not look

In [4]:
# Helper functions

import random
import time
import math

# Random item from a list
def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

# Get a random line
def randomTrainingPair():
    line = randomChoice(embedded_sentences) # has shape (1025, 300)
    l = line[0:-1]
    v = torch.FloatTensor(l)
    input = Variable(v.view(seq_len, 1, 300), requires_grad=True)
    
    t = torch.FloatTensor(line[1:])
    target = Variable(t.view(seq_len, 1, 300))
                  
    return (input, target)

# Timing runtime of network operations
def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [5]:
# build model

import torch
import torch.nn as nn
from torch.autograd import Variable

batch_size = 1
input_size = 300 # length of glove vectors
num_layers = 2
hidden_size = input_size # must be same?

gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=.2)

# train network
criterion = nn.MSELoss()
learning_rate = 0.0005


# input : start token to last letter
# target: first letter to EOS
def train(input, target, hidden):
    
    total_loss = 0
    for i in range(input.size()[0]): # 1024 words in sentence
        #print(i)
        output, hidden = gru(input.detach(), hidden.detach())
        loss_var = criterion(output, target)
        total_loss += loss_var.data[0]
        
    
    loss_var.backward()
   
    return total_loss
  


start = time.time()
        
n_iters = 10000
print_every = 1000
#plot_every = 100

for iter in range(1, n_iters+1):
    # input shape: (seq_len, batch, input_size)
    input_tensor, target_tensor = randomTrainingPair() # random embedded sentence
    hidden = Variable(torch.zeros(num_layers, 1, hidden_size)) # for init only
    loss = train(input_tensor, target_tensor, hidden)
    
    if iter % print_every == 0:
        print('%s (%d %d%%) %.4f' % (timeSince(start), iter, iter / n_iters * 100, loss))

    #if iter % plot_every == 0:
     #   all_losses.append(total_loss / plot_every)
     #   total_loss = 0
        


0m 53s (1000 10%) 0.4213
1m 46s (2000 20%) 0.6092
2m 54s (3000 30%) 0.8868
4m 4s (4000 40%) 0.5635
4m 56s (5000 50%) 0.8506
5m 49s (6000 60%) 0.5414
6m 45s (7000 70%) 0.5985
7m 38s (8000 80%) 0.7610
8m 32s (9000 90%) 0.6529
9m 24s (10000 100%) 0.5678


In [34]:
max_length = 20

eos = torch.FloatTensor(1,300).zero_()

def round_floats(float_num, dummy):
    return float("{0:.2f}".format(float_num))


def generate():
    # TODO pick random start word
    input = Variable(torch.FloatTensor(embeddings_index["high"]).view(1,1,300)) # start word 
    
    generated_sentence = [input.data]
    hidden = Variable(torch.zeros(num_layers, 1, hidden_size))
    
    for i in range(max_length):
        output, hidden = gru(input, hidden)
        #print(type(output.data))
        #topv, topi = output.data.topk(1)
        #topi = topi[0][0]
        #print(topi)
        if torch.equal(output.data.view(1,300), eos):
            print("found eos after {} words".format(i))
            break
        else:
            #embedded_word = all_letters[topi]
            #next_input = Variable(torch.FloatTensor(output.data))
            
            generated_sentence.append(output.data)  # storing embedded words for now
            
            
        input = output

    return generated_sentence
    

print_every = 1000
def translate(embedded_sentence):
    iter = 0
    start = time.time()
    closest_matches = [[0.0,0,"???"]] * len(embedded_sentence)  # TODO 
        
    # go through list of words in order of occurance probability
    for vector, word in word_index.items():
        iter += 1
        if iter % print_every == 0:
            print('%s (%d %d%%) %s' % (timeSince(start), iter, iter / 400000 * 100, word))
            
        #cursor_tensor_key = emb_word_tensor.map_(torch.FloatTensor([0]), round_floats)
        
        # check distance for each word in sentence
        ind = 0
        for emb_word_tensor in embedded_sentence:
            try:
                # calculate distance
                
                dist = 1.0 / torch.sum(torch.pow(vector.sub(emb_word_tensor), 2))
                
                if dist > closest_matches[ind][0]:
                    
                    closest_matches[ind] = [dist, vector, word]
        
            except Exception as e:
                print("could not look up embedded word: {}".format(e))
          
            ind += 1
    
    sentence = ""
    for [_, _, word] in closest_matches:
        sentence = sentence + str(word) + " "
                       
    return sentence

print("Output: ", translate(generate()))

0m 0s (1000 0%) ouranos
0m 0s (2000 0%) minsky
0m 0s (3000 0%) euro88
0m 0s (4000 1%) giht
0m 0s (5000 1%) tofte
0m 0s (6000 1%) nyckelharpa
0m 1s (7000 1%) angoras
0m 1s (8000 2%) birlas
0m 1s (9000 2%) haehnel
0m 1s (10000 2%) algodones
0m 1s (11000 2%) adr
0m 1s (12000 3%) 1430gmt
0m 2s (13000 3%) royal
0m 2s (14000 3%) socio-historical
0m 2s (15000 3%) ōsaka
0m 2s (16000 4%) khandwa
0m 2s (17000 4%) nakajima
0m 2s (18000 4%) montanari
0m 2s (19000 4%) geiseric
0m 3s (20000 5%) kumārajīva
0m 3s (21000 5%) chthonic
0m 3s (22000 5%) sokol
0m 3s (23000 5%) ruethling
0m 3s (24000 6%) 1572
0m 3s (25000 6%) overwinter
0m 4s (26000 6%) who2004.svg
0m 4s (27000 6%) lvovich
0m 4s (28000 7%) liepājas
0m 4s (29000 7%) towage
0m 4s (30000 7%) skotnikov
0m 4s (31000 7%) appleby-in-westmorland
0m 4s (32000 8%) vadas
0m 5s (33000 8%) tusker
0m 5s (34000 8%) ond
0m 5s (35000 8%) 25-50
0m 5s (36000 9%) procyon
0m 5s (37000 9%) trilateral
0m 5s (38000 9%) corporately
0m 6s (39000 9%) ridenhour
0m 6s 

0m 46s (296000 74%) slighter
0m 46s (297000 74%) chronister
0m 46s (298000 74%) 5:39
0m 46s (299000 74%) ascochyta
0m 46s (300000 75%) dvd-r
0m 46s (301000 75%) funes
0m 47s (302000 75%) chubbuck
0m 47s (303000 75%) kluttz
0m 47s (304000 76%) supermarkets
0m 47s (305000 76%) rihl
0m 47s (306000 76%) 20,444
0m 47s (307000 76%) siddhis
0m 48s (308000 77%) 700w
0m 48s (309000 77%) anatoma
0m 48s (310000 77%) pr-10
0m 48s (311000 77%) khokhlova
0m 48s (312000 78%) prizefighters
0m 48s (313000 78%) 1.154
0m 49s (314000 78%) magid
0m 49s (315000 78%) soest
0m 49s (316000 79%) ryoo
0m 49s (317000 79%) 185.5
0m 49s (318000 79%) fna
0m 49s (319000 79%) gholston
0m 50s (320000 80%) vorsah
0m 50s (321000 80%) coriaceous
0m 50s (322000 80%) govern
0m 50s (323000 80%) bugey
0m 50s (324000 81%) ramo
0m 50s (325000 81%) 450-meter
0m 50s (326000 81%) 21.83
0m 51s (327000 81%) sariyev
0m 51s (328000 82%) 048
0m 51s (329000 82%) skouris
0m 51s (330000 82%) amplifier
0m 51s (331000 82%) instar
0m 51s (33

In [14]:
print(generated_sentence)


NameError: name 'generated_sentence' is not defined