In [57]:
# load data set

samples = []
n = 6 # length of string to generate

for i in range(1,510):
    if i < 10:
        i = "00" + str(i)
    elif i < 100:
        i = "0" + str(i)
    
    with open("data/business/{}.txt".format(str(i))) as fh:
        lines = fh.read().strip().split('\n')
        # split entire file into 6-word sentences

        for line in lines:
            if line != '':
                line = line.split(' ')
                substrings = [line[i * n:(i + 1) * n] for i in range((len(line) + n - 1) // n )]   
                for s in substrings:
                    if len(s) == n:
                        res = []
                        for word in s:
                            word = filter(lambda ch: ch not in " -,.;!?:()][]=+_*", word)
                            if word.startswith("£"):
                                res.append("euros")
                            elif word.startswith("$"):
                                res.append("dollars")
                            elif word.endswith("%"):
                                res.append("percents")
                            elif word.endswith("bn"):
                                res.append("billions")
                            elif word.endswith("\\'s"):
                                res.append(word[:-3])
                            elif word.endswith("'s"):
                                res.append(word[:-2])
                            elif word.isdigit():
                                res.append("number{}".format(len(word)))
                            else:
                                res.append(word.strip('.",').lower())
                        r = " ".join(res)
                        if not "\\x" in repr(r):
                            samples.append(r)
      
 
print("Loaded {} lines.".format(len(samples)))
print(samples)

Loaded 26812 lines.


In [22]:
# load GloVe encodings

import torch

dimensions = 300

print("Loading word vectors...")
embeddings_index = {}
word_index = {}

with open("data/glove{}.txt".format(dimensions), 'r') as fh:
    for line in fh:
        values = line.split()
        word = values[0]
        coefs = list(map(lambda x: float(x), values[1:])) # torch.FloatTensor(list(map(lambda x: float(x), values[1:])))
        embeddings_index[word] = coefs
        #key = list(map(lambda x: float("{0:.1f}".format(x)), coefs))
       
        word_index[torch.FloatTensor(coefs)] = word
        

print('Found %s word vectors.' % len(embeddings_index))

Loading word vectors...
Found 2196016 word vectors.


In [58]:
# encode each sample as array of embedded words

import torch

seq_len = n #1025   # number of words in sentence
embedded_sentences = []

for sample in samples:
    
    sentence = [[0.0]*dimensions]*(seq_len+1) #torch.FloatTensor(1025, dimensions).zero_()
    #sentence[0] = torch.ones_like(torch.FloatTensor(1, dimensions)) # first element in sentence is all ones => start token
    ind = 0 #1 
    
    for word in sample.split(' '):
        
        try:
            for word in words:
                coefs = embeddings_index[word]
                sentence[ind] = coefs
                ind+=1
                
        except Exception as e:
            print("could not lookup embedding for word: {}".format(e))
    
    embedded_sentences.append(sentence)

print("done embedding sentences")

done embedding sentences


In [59]:
# Helper functions

import random
import time
import math

# Random item from a list
def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

# Get a random line
def randomTrainingPair():
    line = randomChoice(embedded_sentences) # has shape (1025, 300)
    l = line[0:-1]
    v = torch.FloatTensor(l)
    input = Variable(v.view(seq_len, 1, dimensions))#, requires_grad=True)    
    t = torch.FloatTensor(line[1:])
    target = Variable(t.view(seq_len, 1, dimensions))               
    return (input, target)

# Timing runtime of network operations
def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [60]:
# build model

import torch
import torch.nn as nn
from torch.autograd import Variable

batch_size = 1
input_size = dimensions # length of glove vectors
num_layers = 2
hidden_size = input_size # must be same?

gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=.2)

# train network
criterion = nn.MSELoss()
learning_rate = 0.001

def train(input, target, hidden):
    total_loss = 0
    for i in range(input.size()[0]): # 1024 words in sentence
        #print(i)
        output, hidden = gru(input, hidden)
        loss_var = criterion(output, target)
        total_loss += loss_var.data.item()
        
    loss_var.backward()
    
    for p in gru.parameters():
        p.data.add_(-learning_rate, p.grad.data)
   
    return total_loss / input.size()[0] # average loss


start = time.time()
        
n_iters = 50000
print_every = 1000
#plot_every = 100

hidden = Variable(torch.zeros(num_layers, 1, hidden_size)) # for init only
for iter in range(1, n_iters+1):
    # input shape: (seq_len, batch, input_size)
    input_tensor, target_tensor = randomTrainingPair() # random embedded sentence
    
    loss = train(input_tensor, target_tensor, hidden)
    
    if iter % print_every == 0:
        print('%s (%d %d%%) %.4f' % (timeSince(start), iter, iter / n_iters * 100, loss))

    #if iter % plot_every == 0:
     #   all_losses.append(total_loss / plot_every)
     #   total_loss = 0
        
print("done training model")

0m 44s (1000 0%) 0.0398
1m 25s (2000 0%) 0.0425
2m 6s (3000 0%) 0.0373
2m 48s (4000 0%) 0.0344
3m 30s (5000 0%) 0.0300
4m 12s (6000 0%) 0.0282
4m 53s (7000 0%) 0.0264
5m 35s (8000 0%) 0.0262
6m 17s (9000 0%) 0.0258
6m 58s (10000 0%) 0.0259
7m 39s (11000 0%) 0.0257
8m 19s (12000 0%) 0.0253
8m 59s (13000 0%) 0.0254
9m 41s (14000 0%) 0.0257
10m 22s (15000 0%) 0.0257
11m 2s (16000 0%) 0.0259
11m 45s (17000 0%) 0.0262
12m 27s (18000 0%) 0.0263
13m 8s (19000 0%) 0.0262
13m 49s (20000 0%) 0.0264
14m 31s (21000 0%) 0.0261
15m 12s (22000 0%) 0.0257
15m 54s (23000 0%) 0.0256
16m 36s (24000 0%) 0.0258
17m 17s (25000 0%) 0.0257
17m 59s (26000 0%) 0.0255
18m 40s (27000 0%) 0.0259
19m 21s (28000 0%) 0.0259
20m 4s (29000 0%) 0.0258
20m 47s (30000 0%) 0.0261
21m 30s (31000 0%) 0.0259
22m 12s (32000 0%) 0.0258
22m 53s (33000 0%) 0.0257
23m 34s (34000 0%) 0.0257
24m 15s (35000 0%) 0.0257
24m 55s (36000 0%) 0.0255
25m 37s (37000 0%) 0.0256
26m 18s (38000 0%) 0.0257
26m 58s (39000 0%) 0.0256
27m 39s (4000

In [61]:
max_length = 6

#eos = torch.FloatTensor(1,dimensions).zero_()

def round_floats(float_num, dummy):
    return float("{0:.2f}".format(float_num))

start_word  = "trade"

def generate():
    # TODO pick random start word
    input = Variable(torch.FloatTensor(embeddings_index[start_word]).view(1,1,dimensions)) # start word 
    
    generated_sentence = [input.data]
    hidden = Variable(torch.zeros(num_layers, 1, hidden_size)) # pick random vector from embeddings
    
    for i in range(max_length):
        output, hidden = gru(input, hidden)
     
        #if torch.equal(output.data.view(1,dimensions), eos):
            #print("found eos after {} words".format(i))
            #break
        
        generated_sentence.append(output.data)  # storing embedded words for now
        input = output

    return generated_sentence
    

print_every = 100000
def translate(embedded_sentence):
    iter = 0
    start = time.time()
    closest_matches = [[0.0,0,"???"]] * len(embedded_sentence)  # TODO 
        
    # go through list of words in order of occurance probability
    for vector, word in word_index.items():
        iter += 1
        if iter % print_every == 0:
            print('%s (%d %d%%)' % (timeSince(start), iter, iter / 2000000 * 100))
            
        #cursor_tensor_key = emb_word_tensor.map_(torch.FloatTensor([0]), round_floats)
        
        # check distance for each word in sentence
        ind = 0
        for emb_word_tensor in embedded_sentence:
            try:
                # calculate distance              
                dist = 1.0 / torch.sum(torch.pow(vector.sub(emb_word_tensor), 2))
                
                if dist > closest_matches[ind][0]:
                    
                    closest_matches[ind] = [dist, vector, word]
        
            except Exception as e:
                print("exception during embedded word lookup: {}".format(e))
          
            ind += 1
    
    sentence = ""
    for [_, _, word] in closest_matches:
        sentence = sentence + str(word) + " "
                       
    return sentence

print("Output: ", start_word, translate(generate()))

23m 55s (100000 0%)
24m 48s (200000 0%)
25m 40s (300000 0%)
26m 33s (400000 0%)
27m 26s (500000 0%)
28m 20s (600000 0%)
29m 17s (700000 0%)
30m 17s (800000 0%)
31m 21s (900000 0%)
32m 24s (1000000 0%)
33m 28s (1100000 0%)
34m 33s (1200000 0%)
35m 38s (1300000 0%)
36m 43s (1400000 0%)
37m 54s (1500000 0%)
39m 2s (1600000 0%)
40m 15s (1700000 0%)
41m 27s (1800000 0%)
42m 38s (1900000 0%)
43m 51s (2000000 100%)
45m 3s (2100000 100%)
('Output: ', 'trade', 'trade optimism optimism optimism optimism optimism optimism ')
