In [1]:
from __future__ import print_function
import argparse
import torch
import torch.utils.data
from torch import nn, optim
from torch.nn import functional as F
from torchvision import datasets, transforms
from torchvision.utils import save_image

In [2]:
ls glove

glove.6B.100d.txt    glove.6B.300d.txt    glove.840B.300d.txt
glove.6B.200d.txt    glove.6B.50d.txt


In [3]:
# prepare glove vectors
words = set()
word_to_vec = {}
word_to_id = {}
id_to_word = {}
glove_dim = 300
glove_filename = "./glove/glove.840B.300d.txt" # "./glove/glove.6B.300d.txt"
with open(glove_filename) as file:
    count = 0
    for line in file:
        tokens = line.split()
        word, vals = tokens[0], tokens[-glove_dim:]
        if word not in words:
            words.add(word)
            word_to_vec[word] = torch.Tensor([float(val) for val in vals])
            word_to_id[word] = count
            id_to_word[count] = word
            count += 1
print(len(words))
print(id_to_word[1])

2195884
.


In [4]:
def make_embedding(words, id_to_word, word_to_vec, dim):
    weights = torch.stack([word_to_vec[id_to_word[i]] for i in range(len(words))])
    embed = nn.Embedding.from_pretrained(weights)
    return embed

In [5]:
embed = make_embedding(['cat', 'dog'], {0:'cat', 1:'dog'}, 
                       {'cat':torch.FloatTensor([1,1]), 'dog':torch.FloatTensor([2,2])}, 2)
glove_embed = make_embedding(words, id_to_word, word_to_vec, glove_dim)
print(glove_embed(torch.LongTensor([0])))

tensor([[-0.0828,  0.6720, -0.1499, -0.0650,  0.0565,  0.4023,  0.0028, -0.3311,
         -0.3069,  2.0817,  0.0318,  0.0136,  0.3027,  0.0071, -0.5819, -0.2774,
         -0.0623,  1.1451, -0.2423,  0.1235, -0.1224,  0.3315, -0.0062, -0.3054,
         -0.1306, -0.0546,  0.0371, -0.0706,  0.5893, -0.3038,  0.2898, -0.1465,
         -0.2705,  0.3716,  0.3203, -0.2912,  0.0052, -0.1321, -0.0527,  0.0873,
         -0.2667, -0.1690,  0.0152, -0.0084, -0.1487,  0.2341, -0.2072, -0.0914,
          0.4008, -0.1722,  0.1814,  0.3759, -0.2868,  0.3729, -0.1619,  0.1801,
          0.3032, -0.1322,  0.1835,  0.0958,  0.0949,  0.0083,  0.1176,  0.3405,
          0.0368, -0.2908,  0.0583, -0.0278,  0.0829,  0.1862, -0.0315,  0.2799,
         -0.0744, -0.1376, -0.2187,  0.1814,  0.0409, -0.1130,  0.2411,  0.3657,
         -0.2752, -0.0568,  0.3487,  0.0119,  0.1452, -0.7139,  0.4850,  0.1481,
          0.6229,  0.2060,  0.5838, -0.1344,  0.4021,  0.1831,  0.2802, -0.4235,
         -0.2563,  0.1771, -

In [6]:
class Poem_LSTM(nn.Module):
    def __init__(self, embed, hidden_dim):
        super(Poem_LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embed = embed
        self.embed_dim = embed.embedding_dim
        self.vocab_size = embed.num_embeddings
        self.lstm = nn.LSTM(self.embed_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, self.vocab_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.embed(sentence)
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
    
def maxword(id_to_word, tag_scores):
    return id_to_word[torch.argmax(tag_scores)]

In [7]:
hidden_dim = 100
lstm = Poem_LSTM(glove_embed, hidden_dim)
sentence = "This is a test"
sentence_input = torch.LongTensor([word_to_id[word.lower()] for word in sentence.split()])
scores = lstm.forward(sentence_input)
word_id = torch.argmax(scores[-1])
print(len(scores[0]))
print(word_id.item())
print(id_to_word[word_id.item()])

2195884
1631776
credit-linked


In [8]:
import pandas as pd
import spacy

def load_data(vocab, poems, limit):
    nlp = spacy.load('en')
    line_data = []
    scraps = {}
    count = 0
    for poem in poems:
        try:
            for line in poem.lower().split('\n'):
                if len(line) == 0:
                    continue
                tokens = nlp(line)
                include = True
                bad_words = []
                for token in tokens:
                    if token.text not in vocab:
                        bad_words.append(token.text)
                        include = False
                if include: line_data.append(line)
                else: scraps[line] = bad_words
        except:
            continue
        if count == limit:
            break
        count += 1
    return line_data, scraps

In [9]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [13]:
model = Poem_LSTM(glove_embed, hidden_dim)
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-2)

def prepare_training_data(limit=-1):
    filename = "./clean_poems.csv"
    data = pd.read_csv(filename)
    #poems = data.iloc[:,3]
    poems = load_data(words, data, limit)
    return poems
    
def sentence_to_vector(s, word_to_vec):
    ids = [word_to_vec[w] for w in s]
    return torch.tensor(ids, dtype=torch.long)

def sentence_one_hot(s, word_to_vec):
    one_hot = torch.zeros([len(s), len(word_to_vec)], dtype=torch.long)
    for i, w in enumerate(s):
        one_hot[i, word_to_vec[w]] = 1
    return one_hot

train_data, scraps = prepare_training_data(limit=-1)
print(len(train_data), len(scraps))
print(scraps)

2 0
{}


In [22]:
nlp = spacy.load('en')
filename = "clean_poems.csv"
poems = pd.read_csv(filename)

In [34]:
good_lines = []
bad_lines  = []

def valid_line(line, words):
    tokens = nlp(line)
    count = 0
    for token in tokens:
        w = token.text
        if w.isalnum():
            if w not in words:
                return False
            count += 1
    if count < 2:
        return False
    return True
            
for poem in poems.iloc[:10,1]:
    lines = poem.split('\n')
    for line in lines:
        if valid_line(line, words): good_lines.append(line)
        else: bad_lines.append(line)
print(len(good_lines))
print(len(bad_lines))

339
85


In [35]:
print(bad_lines)

['Philosophic', '', 'and discrete—a mirror come unsilvered,', '', '', '', 'WINTER', '', 'SUMMER', '', 'STORES', '', 'WRITING', '', 'TODAY', '', '', '', 'RECIPE', 'flour.', '', '', '', '', 'HAPPINESS', '', 'MONEY', '', '', 'EDWARD', '', 'LAKE', '', '', '', 'POTATOES', '', 'MOTHER', '', '', 'sand.', '', 'TODAY', '', '', 'ALASKA', '', '', 'LOYALTY', '', '', 'touch?', '', 'COMPANY', 'but', '', 'lace', 'She', 'betting quarters, sidewheelers and straight thoroughs,', 'look at these whores these onehundreddollar whores', '', '', '', '', '', 'hard-wired giammai to dare say so. So what moved him to not-say', 'have to start to agree. The verbness of it impropriety (eyes glob up the', "three little words aren't meant as saying. An icy drink in stormlight. A", 'the centuried moon rose above dinnermint stone; many men contin-', 'ued  talking; a woman lifted her sarsenet skirt, peed on green lilies and,', '', 'Soul, self; come, poor Jackself, I do advise', 'Betweenpie mountains — lights a lovely mil

In [36]:
print(good_lines[:3])

['in its complex, ovoid emptiness,', 'a skillful pundit coined it as a sort', 'of stopgap doorstop for those']
