In [1]:
# load data set

samples = []
for i in range(1,510):
    if i < 10:
        i = "00" + str(i)
    elif i < 100:
        i = "0" + str(i)
    
    with open("data/bbc/business/{}.txt".format(str(i))) as fh:
        lines = fh.read().strip().split('\n')
        samples.append(lines[0])
        
        samples.append(" ".join(lines[2].split(' ')[0:6]))
        samples.append(" ".join(lines[4].split(' ')[0:6]))
        try:
            samples.append(" ".join(lines[6].split(' ')[0:6]))
            samples.append(" ".join(lines[8].split(' ')[0:6]))
            samples.append(" ".join(lines[10].split(' ')[0:6]))
        except:
            pass
 
print("Loaded {} lines.".format(len(samples)))

Loaded 2490 lines.


In [2]:
print(samples[29])

The US created fewer jobs than


In [3]:
# load GloVe encodings

import torch

dimensions = 50

print("Loading word vectors...")
embeddings_index = {}
word_index = {}

with open("data/glove{}.txt".format(dimensions), 'r') as fh:
    for line in fh:
        values = line.split()
        word = values[0]
        coefs = list(map(lambda x: float(x), values[1:])) # torch.FloatTensor(list(map(lambda x: float(x), values[1:])))
        embeddings_index[word] = coefs
        #key = list(map(lambda x: float("{0:.1f}".format(x)), coefs))
       
        word_index[torch.FloatTensor(coefs)] = word
        

print('Found %s word vectors.' % len(embeddings_index))

Loading word vectors...
Found 400000 word vectors.


In [4]:
# encode each sample as array of embedded words

import torch

seq_len = 6 #1025   # number of words in sentence
embedded_sentences = []



for sample in samples:
    
    sentence = [[0.0]*dimensions]*(seq_len+1) #torch.FloatTensor(1025, dimensions).zero_()
    #sentence[0] = torch.ones_like(torch.FloatTensor(1, dimensions)) # first element in sentence is all ones => start token
    ind = 0 #1 
    
    for word in sample.split(' '):
        word = word.lower().strip('(,.;!?:"').strip("'").strip(")")
        if word.startswith('"'):
            word = word[1:]
        if word.endswith("'s"):
            word = word[0:-2]
        elif word.endswith("bn"):
            words = [word[0:-2], "billion"]
        elif word.endswith("%"):
            words = [word[0:-1], "percent"]
        else:
            words = [word]
        
        try:
            for word in words:
                coefs = embeddings_index[word]
                sentence[ind] = coefs
                ind+=1
                
            
        except Exception as e:
            print("could not lookup embedding for word: {}".format(e))
    
        
    embedded_sentences.append(sentence)

print("done embedding sentences")

could not lookup embedding for word: '$280'
could not lookup embedding for word: '$280'
could not lookup embedding for word: '£155'
could not lookup embedding for word: '43.305'
could not lookup embedding for word: 'agroflora'
could not lookup embedding for word: 'agroflora'
could not lookup embedding for word: 'illva'
could not lookup embedding for word: 'illva'
could not lookup embedding for word: 'illva'
could not lookup embedding for word: '£15.4m'
could not lookup embedding for word: '$53'
could not lookup embedding for word: '$4'
could not lookup embedding for word: 're-listing'
could not lookup embedding for word: list assignment index out of range
could not lookup embedding for word: "didn't"
could not lookup embedding for word: '418bn-rand'
could not lookup embedding for word: 'bmw-brand'
could not lookup embedding for word: '80,600'
could not lookup embedding for word: 'ex-boss'
could not lookup embedding for word: 'ex-head'
could not lookup embedding for word: "won't"
could 

In [5]:
# Helper functions

import random
import time
import math



# Random item from a list
def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

# Get a random line
def randomTrainingPair():
    line = randomChoice(embedded_sentences) # has shape (1025, 300)
    l = line[0:-1]
    v = torch.FloatTensor(l)
    input = Variable(v.view(seq_len, 1, dimensions))#, requires_grad=True)    
    t = torch.FloatTensor(line[1:])
    target = Variable(t.view(seq_len, 1, dimensions))               
    return (input, target)

# Timing runtime of network operations
def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [6]:
# build model

import torch
import torch.nn as nn
from torch.autograd import Variable

batch_size = 1
input_size = dimensions # length of glove vectors
num_layers = 2
hidden_size = input_size # must be same?

gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=.2)

# train network
criterion = nn.MSELoss()
learning_rate = 0.001


def train(input, target, hidden):
    
    total_loss = 0
    for i in range(input.size()[0]): # 1024 words in sentence
        #print(i)
        output, hidden = gru(input, hidden)
        loss_var = criterion(output, target)
        total_loss += loss_var.data[0]
        
    
    loss_var.backward()
    
    for p in gru.parameters():
        p.data.add_(-learning_rate, p.grad.data)
   
    return total_loss / input.size()[0] # average loss
  


start = time.time()
        
n_iters = 50000
print_every = 1000
#plot_every = 100


hidden = Variable(torch.zeros(num_layers, 1, hidden_size)) # for init only
for iter in range(1, n_iters+1):
    # input shape: (seq_len, batch, input_size)
    input_tensor, target_tensor = randomTrainingPair() # random embedded sentence
    
    loss = train(input_tensor, target_tensor, hidden)
    
    if iter % print_every == 0:
        print('%s (%d %d%%) %.4f' % (timeSince(start), iter, iter / n_iters * 100, loss))

    #if iter % plot_every == 0:
     #   all_losses.append(total_loss / plot_every)
     #   total_loss = 0
        


0m 24s (1000 2%) 0.3006
0m 47s (2000 4%) 0.3512
1m 11s (3000 6%) 0.2737
1m 34s (4000 8%) 0.2871
1m 58s (5000 10%) 0.3995
2m 21s (6000 12%) 0.3229
2m 44s (7000 14%) 0.3322
3m 6s (8000 16%) 0.4146
3m 29s (9000 18%) 0.2780
3m 53s (10000 20%) 0.2948
4m 17s (11000 22%) 0.3313
4m 40s (12000 24%) 0.3239
5m 4s (13000 26%) 0.5752
5m 30s (14000 28%) 0.2998
5m 53s (15000 30%) 0.2348
6m 15s (16000 32%) 0.4798
6m 40s (17000 34%) 0.3276
7m 3s (18000 36%) 0.4067
7m 27s (19000 38%) 0.3719
7m 51s (20000 40%) 0.3797
8m 14s (21000 42%) 0.2740
8m 38s (22000 44%) 0.3213
9m 1s (23000 46%) 0.2772
9m 25s (24000 48%) 0.3194
9m 48s (25000 50%) 0.3029
10m 12s (26000 52%) 0.4203
10m 36s (27000 54%) 0.3239
10m 59s (28000 56%) 0.2713
11m 23s (29000 57%) 0.3491
11m 52s (30000 60%) 0.3167
12m 21s (31000 62%) 0.3149
12m 50s (32000 64%) 0.3824
13m 19s (33000 66%) 0.2977
13m 50s (34000 68%) 0.2849
14m 19s (35000 70%) 0.3896
14m 49s (36000 72%) 0.3109
15m 19s (37000 74%) 0.3229
15m 49s (38000 76%) 0.3146
16m 18s (39000 7

In [9]:
max_length = 6

#eos = torch.FloatTensor(1,dimensions).zero_()

def round_floats(float_num, dummy):
    return float("{0:.2f}".format(float_num))

start_word  = "ironically"

def generate():
    # TODO pick random start word
    input = Variable(torch.FloatTensor(embeddings_index[start_word]).view(1,1,dimensions)) # start word 
    
    generated_sentence = [input.data]
    hidden = Variable(torch.zeros(num_layers, 1, hidden_size)) # pick random vector from embeddings
    
    for i in range(max_length):
        output, hidden = gru(input, hidden)
     
        #if torch.equal(output.data.view(1,dimensions), eos):
            #print("found eos after {} words".format(i))
            #break
        
        generated_sentence.append(output.data)  # storing embedded words for now
        input = output

    return generated_sentence
    

print_every = 100000
def translate(embedded_sentence):
    iter = 0
    start = time.time()
    closest_matches = [[0.0,0,"???"]] * len(embedded_sentence)  # TODO 
        
    # go through list of words in order of occurance probability
    for vector, word in word_index.items():
        iter += 1
        if iter % print_every == 0:
            print('%s (%d %d%%)' % (timeSince(start), iter, iter / 400000 * 100))
            
        #cursor_tensor_key = emb_word_tensor.map_(torch.FloatTensor([0]), round_floats)
        
        # check distance for each word in sentence
        ind = 0
        for emb_word_tensor in embedded_sentence:
            try:
                # calculate distance
                
                dist = 1.0 / torch.sum(torch.pow(vector.sub(emb_word_tensor), 2))
                
                if dist > closest_matches[ind][0]:
                    
                    closest_matches[ind] = [dist, vector, word]
        
            except Exception as e:
                print("exception during embedded word lookup: {}".format(e))
          
            ind += 1
    
    sentence = ""
    for [_, _, word] in closest_matches:
        sentence = sentence + str(word) + " "
                       
    return sentence

print("Output: ", start_word, translate(generate()))

0m 5s (100000 25%)
0m 11s (200000 50%)
exception during embedded word lookup: float division by zero
0m 16s (300000 75%)
0m 22s (400000 100%)
Output:  ironically coincidentally bdb94 ironically str95bb str95bb ironically mo95 
