In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

torch.manual_seed(1)

<torch._C.Generator at 0x10751a0f0>

In [8]:
word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519]])


In [9]:
hello_idx = torch.LongTensor([word_to_ix['hello']])
hello_idx = Variable(hello_idx)

In [10]:
hello_idx

tensor([ 0])

In [12]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """今天 天氣 如何 你 好 嗎""".split()
# we should tokenize the input, but we will ignore that for now
# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}


class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

[(['今天', '天氣'], '如何'), (['天氣', '如何'], '你'), (['如何', '你'], '好')]
[7.557050466537476, 7.493309736251831, 7.430240154266357, 7.367831707000732, 7.306158423423767, 7.245250225067139, 7.184972047805786, 7.1253156661987305, 7.06627345085144, 7.007836818695068]


In [15]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """民進黨 備戰 年底 縣市長 選舉 加緊腳步 改革 拚 政績
不過 有時 官員 為 了 辯護 政策 發言 卻 被 認為 脫離現實 經常 遭 在野黨
或 網友 調侃 最近 行政院長 賴清德 對外 解釋 非核 家園 政策 並非 躁進
也 盼 外界 再 備載 容量 剩得 不夠 多 渲染 成 缺電 此番 言論 就 被 名嘴 黃 創夏 反諷 院長 沒有 智商 問
題 只是 腦子 太小""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])


class CBOW(nn.Module):

    def __init__(self):
        pass

    def forward(self, inputs):
        pass


def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


make_context_vector(data[0][0], word_to_ix)  # example

[(['民進黨', '備戰', '縣市長', '選舉'], '年底'), (['備戰', '年底', '選舉', '加緊腳步'], '縣市長'), (['年底', '縣市長', '加緊腳步', '改革'], '選舉'), (['縣市長', '選舉', '改革', '拚'], '加緊腳步'), (['選舉', '加緊腳步', '拚', '政績'], '改革')]


tensor([  7,  50,  28,   8])

In [20]:
data[0]

(['民進黨', '備戰', '縣市長', '選舉'], '年底')

In [49]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
 
torch.manual_seed(1)
 
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right




raw_text = """民進黨 備戰 年底 縣市長 選舉 加緊腳步 改革 拚 政績
不過 有時 官員 為 了 辯護 政策 發言 卻 被 認為 脫離現實 經常 遭 在野黨
或 網友 調侃 最近 行政院長 賴清德 對外 解釋 非核 家園 政策 並非 躁進
也 盼 外界 再 備載 容量 剩得 不夠 多 渲染 成 缺電 此番 言論 就 被 名嘴 黃 創夏 反諷 院長 沒有 智商 問
題 只是 腦子 太小""".split()
 
# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)
 
word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
#print(data[:5])
 
 
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW,self).__init__() 
        self.embeddings = nn.Embedding(vocab_size, embedding_dim) # embeddings， 待训练参数为embedding词表
        self.linear1 = nn.Linear(embedding_dim, vocab_size) # 待训练参数为 A b
 
 
    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        add_embeds = torch.sum(embeds, dim=0).view(1,-1) # 相加后reshape
        out = self.linear1(add_embeds)
        log_probs = F.log_softmax(out)
        return log_probs
 
# create your model and train.  here are some functions to help you make
# the data ready for use by your module
 
 
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    tensor = torch.LongTensor(idxs)
    return Variable(tensor)
 
 
#make_context_vector(data[0][0], word_to_ix)  # example
 
# 建立優化器
losses = []
loss_function = nn.NLLLoss()
model = CBOW(vocab_size, embedding_dim=20)
optimizer = optim.SGD(model.parameters(), lr=0.001)
 
# 開始訓練
for epoch in range(10):
    #print('epoch:', epoch, total_loss)
    total_loss = torch.FloatTensor([0])
    for context, target in data:
        context_idxs = [word_to_ix[w] for w in context]
        target_idx = word_to_ix[target]
        context_var = Variable(torch.LongTensor(context_idxs))
        target_var = Variable(torch.LongTensor([target_idx]))
        model.zero_grad()
        log_probs = model(context_var)
 
        loss = loss_function(log_probs,target_var)
        loss.backward()
        optimizer.step()
 
        total_loss += loss.data
    losses.append(total_loss)
print(losses)

[tensor([ 286.4037]), tensor([ 281.1267]), tensor([ 275.9252]), tensor([ 270.7987]), tensor([ 265.7467]), tensor([ 260.7687]), tensor([ 255.8643]), tensor([ 251.0333]), tensor([ 246.2751]), tensor([ 241.5897])]




In [44]:
log_probs

tensor([[-4.3736, -6.0672, -6.4029, -3.2983, -3.2056, -4.4463, -3.6566,
         -5.3512, -5.0240, -6.8106, -3.9341, -3.3772, -4.2399, -4.6874,
         -5.4792, -3.6308, -3.5693, -6.9198, -3.8954, -4.3974, -6.0739,
         -5.8943, -2.0312, -7.0127, -4.3555, -3.5623, -5.0418, -5.1572,
         -5.3538, -3.6363, -5.8433, -6.3246, -4.8121, -6.2440, -3.4695,
         -5.2942, -4.4541, -3.9078, -2.7883, -5.6608, -6.8956, -6.2957,
         -6.9875, -4.5539, -4.4182, -4.0486, -3.4340, -3.8613, -4.9785,
         -6.0099, -6.2466, -3.7921, -5.0173, -3.9109, -5.5299, -3.4842,
         -2.7776, -6.0387, -5.4375, -5.6201, -6.5588, -5.1013, -3.1098]])

In [45]:
word_to_ix

{'不夠': 24,
 '不過': 31,
 '並非': 25,
 '也': 23,
 '了': 44,
 '備戰': 50,
 '備載': 13,
 '再': 6,
 '剩得': 43,
 '創夏': 16,
 '加緊腳步': 58,
 '卻': 46,
 '反諷': 26,
 '只是': 36,
 '名嘴': 53,
 '問': 56,
 '在野黨': 40,
 '外界': 9,
 '多': 12,
 '太小': 4,
 '官員': 1,
 '家園': 45,
 '容量': 37,
 '對外': 5,
 '就': 21,
 '年底': 54,
 '成': 33,
 '或': 49,
 '拚': 52,
 '改革': 39,
 '政策': 30,
 '政績': 57,
 '智商': 14,
 '最近': 15,
 '有時': 60,
 '此番': 10,
 '民進黨': 7,
 '沒有': 18,
 '渲染': 47,
 '為': 20,
 '發言': 42,
 '盼': 61,
 '經常': 2,
 '網友': 17,
 '縣市長': 28,
 '缺電': 48,
 '脫離現實': 0,
 '腦子': 11,
 '行政院長': 32,
 '被': 41,
 '解釋': 22,
 '言論': 35,
 '認為': 29,
 '調侃': 55,
 '賴清德': 59,
 '躁進': 51,
 '辯護': 38,
 '遭': 3,
 '選舉': 8,
 '院長': 27,
 '非核': 19,
 '題': 62,
 '黃': 34}

In [None]:
#https://blog.csdn.net/CrazyBull2012/article/details/79380669