In [1]:
%%javascript

window.load_remote_theme = true
var theme_js = "https://odhk.github.io/hyrule_theme/custom.js";

window.load_local_theme = function(){
    var hostname = document.location.hostname
    return ((hostname == "localhost" || hostname == '127.0.0.1') && !load_remote_theme)
}

var url = load_local_theme() ? document.location.origin + "/files/theme/custom.js" : theme_js

$.getScript(url)

<IPython.core.display.Javascript object>

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from functools import reduce
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS",2:"<unk>"}
        self.n_words = 3  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [5]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [6]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [7]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'chn', True)
print(random.choice(pairs))

Reading lines...
Read 20052 sentence pairs
Trimmed to 1603 sentence pairs
Counting words...
Counted words:
chn 9
eng 1378
[' ', 'you re going to be a mommy .']


In [8]:
output_lang.word2count['i']
eng = output_lang
eng_words = output_lang.index2word.values()
#print(list(eng_words))
big_sentence_array = [p[1].split() for p in pairs]
big_sentence_array = [p.split('.') for arr in big_sentence_array for p in arr]

In [9]:
test_sentence = reduce(lambda x,y:x+y,big_sentence_array)
test_sentence = [w for w in test_sentence if w!='']
for index in range(len(test_sentence)):
    if test_sentence[index] == 'm':
        test_sentence[index]='am'
    elif test_sentence[index] == 're':
        test_sentence[index]='are'
    elif test_sentence[index] == 's':
        test_sentence[index]='is'
#vocab = set(test_sentence)
#word_to_ix = {word: i for i, word in enumerate(vocab)}

In [10]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
# we should tokenize the input, but we will ignore that for now
# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}
word_to_ix['<unk>'] = len(word_to_ix)

class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]

model = NGramLanguageModeler(len(word_to_ix), EMBEDDING_DIM,CONTEXT_SIZE)

[(['i', 'am'], 'ok'), (['am', 'ok'], 'i'), (['ok', 'i'], 'am')]


In [11]:
losses = []
loss_function = nn.NLLLoss()

optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(50):
    total_loss = torch.Tensor([0])
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in variables)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a variable)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()
        #print(1)
        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
    print(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

tensor([ 62547.0312])
tensor([ 52094.1094])
tensor([ 49216.1797])
tensor([ 47509.8047])
tensor([ 46312.9883])
tensor([ 45372.3516])
tensor([ 44578.9375])
tensor([ 43877.5469])
tensor([ 43238.2891])
tensor([ 42643.6367])
tensor([ 42083.3125])
tensor([ 41550.6211])
tensor([ 41039.3984])
tensor([ 40545.8242])
tensor([ 40065.9062])
tensor([ 39598.1602])
tensor([ 39140.7891])
tensor([ 38692.7344])
tensor([ 38252.2812])
tensor([ 37818.5508])
tensor([ 37391.3984])
tensor([ 36970.4023])
tensor([ 36555.2891])
tensor([ 36145.2578])
tensor([ 35740.6602])
tensor([ 35341.5078])
tensor([ 34947.3125])
tensor([ 34558.5547])
tensor([ 34175.3711])
tensor([ 33797.9531])
tensor([ 33426.3789])
tensor([ 33060.6875])
tensor([ 32701.2422])
tensor([ 32347.4434])
tensor([ 32000.4238])
tensor([ 31659.5195])
tensor([ 31325.6465])
tensor([ 30998.1484])
tensor([ 30677.5781])
tensor([ 30364.2441])
tensor([ 30057.5078])
tensor([ 29758.0195])
tensor([ 29465.3691])
tensor([ 29179.4844])
tensor([ 28900.6348])
tensor([ 2

In [12]:
# 仅保存和加载模型参数(推荐使用)
torch.save(model.state_dict(), 'lm-model.pkl')
#model.load_state_dict(torch.load('lm-model.pkl'))

In [13]:
embed = model.embeddings

SyntaxError: invalid syntax (<ipython-input-14-3060cf88f854>, line 1)