In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import numpy as np
torch.manual_seed(1)

<torch._C.Generator at 0x1051b9e90>

In [56]:
word_to_ix = {"hello": 0, "world": 1} #Hello表示0, Wordld表示1
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings, 2x5矩陣
#如果1000個詞 希望100維度就是 1000,100
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long) #dtype=torch.long 64 int
hello_embed = embeds(lookup_tensor) #Embeddin放到embeds
print(hello_embed) #Hello的word embedding
#轉換tensor to numpy
hello_embed=hello_embed.detach().numpy().tolist()
print("list",hello_embed)

tensor([[-2.5667, -1.4303,  0.5009,  0.5438, -0.4057]])
list [[-2.5667366981506348, -1.4303274154663086, 0.5009211301803589, 0.5437674522399902, -0.4057423770427704]]


In [57]:
hello_embed

[[-2.5667366981506348,
  -1.4303274154663086,
  0.5009211301803589,
  0.5437674522399902,
  -0.4057423770427704]]

In [58]:
lookup_tensor = torch.tensor([word_to_ix["world"]], dtype=torch.long) #dtype=torch.long 64 int
world_embed = embeds(lookup_tensor) #Embeddin放到embeds
print(world_embed) #Hello的word embedding
world_embed=world_embed.detach().numpy().tolist()
print("list",world_embed)

tensor([[ 1.1341, -1.1115,  0.3501, -0.7703, -0.1473]])
list [[1.1340515613555908, -1.1115385293960571, 0.3500675857067108, -0.7702727913856506, -0.14726622402668]]


In [None]:
#計算夾角

In [85]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(hello_embed,world_embed)

array([[-0.27264674]])

In [12]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """今天 天氣 如何 你 好 嗎""".split()
# we should tokenize the input, but we will ignore that for now
# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}


class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in trigrams:

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

[(['今天', '天氣'], '如何'), (['天氣', '如何'], '你'), (['如何', '你'], '好')]
[7.557050466537476, 7.493309736251831, 7.430240154266357, 7.367831707000732, 7.306158423423767, 7.245250225067139, 7.184972047805786, 7.1253156661987305, 7.06627345085144, 7.007836818695068]


In [15]:
CONTEXT_SIZE = 2  # 左邊2個字，右邊2個字
raw_text = """民進黨 備戰 年底 縣市長 選舉 加緊腳步 改革 拚 政績
不過 有時 官員 為 了 辯護 政策 發言 卻 被 認為 脫離現實 經常 遭 在野黨
或 網友 調侃 最近 行政院長 賴清德 對外 解釋 非核 家園 政策 並非 躁進
也 盼 外界 再 備載 容量 剩得 不夠 多 渲染 成 缺電 此番 言論 就 被 名嘴 黃 創夏 反諷 院長 沒有 智商 問
題 只是 腦子 太小""".split()

# 通過從`raw_text`派生一個集合，我們對數組進行重複數據刪除
vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])


class CBOW(nn.Module):

    def __init__(self):
        pass

    def forward(self, inputs):
        pass


def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


make_context_vector(data[0][0], word_to_ix)  # example

[(['民進黨', '備戰', '縣市長', '選舉'], '年底'), (['備戰', '年底', '選舉', '加緊腳步'], '縣市長'), (['年底', '縣市長', '加緊腳步', '改革'], '選舉'), (['縣市長', '選舉', '改革', '拚'], '加緊腳步'), (['選舉', '加緊腳步', '拚', '政績'], '改革')]


tensor([  7,  50,  28,   8])

In [20]:
data[0]

(['民進黨', '備戰', '縣市長', '選舉'], '年底')

In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
 
torch.manual_seed(1)
 
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right




raw_text = """民進黨 備戰 年底 縣市長 選舉 加緊腳步 改革 拚 政績
不過 有時 官員 為 了 辯護 政策 發言 卻 被 認為 脫離現實 經常 遭 在野黨
或 網友 調侃 最近 行政院長 賴清德 對外 解釋 非核 家園 政策 並非 躁進
也 盼 外界 再 備載 容量 剩得 不夠 多 渲染 成 缺電 此番 言論 就 被 名嘴 黃 創夏 反諷 院長 沒有 智商 問
題 只是 腦子 太小""".split()
 
# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)
 
word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
#print(data[:5])
 
 
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW,self).__init__() 
        self.embeddings = nn.Embedding(vocab_size, embedding_dim) # embeddings， 待训练参数为embedding词表
        self.linear1 = nn.Linear(embedding_dim, vocab_size) # 待训练参数为 A b
 
 
    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        add_embeds = torch.sum(embeds, dim=0).view(1,-1) # 相加后reshape
        out = self.linear1(add_embeds)
        log_probs = F.log_softmax(out)
        return log_probs
 
# create your model and train.  here are some functions to help you make
# the data ready for use by your module
 
 
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    tensor = torch.LongTensor(idxs)
    return Variable(tensor)
 
 
#make_context_vector(data[0][0], word_to_ix)  # example
 
# 建立優化器
losses = []
loss_function = nn.NLLLoss()
model = CBOW(vocab_size, embedding_dim=20)
optimizer = optim.SGD(model.parameters(), lr=0.001)
 
# 開始訓練
for epoch in range(10):
    #print('epoch:', epoch, total_loss)
    total_loss = torch.FloatTensor([0])
    for context, target in data:
        context_idxs = [word_to_ix[w] for w in context]
        target_idx = word_to_ix[target]
        context_var = Variable(torch.LongTensor(context_idxs))
        target_var = Variable(torch.LongTensor([target_idx]))
        model.zero_grad()
        log_probs = model(context_var)
 
        loss = loss_function(log_probs,target_var)
        loss.backward()
        optimizer.step()
 
        total_loss += loss.data
    losses.append(total_loss)
print(losses)

[tensor([ 298.9560]), tensor([ 293.2370]), tensor([ 287.6259]), tensor([ 282.1188]), tensor([ 276.7123]), tensor([ 271.4033]), tensor([ 266.1891]), tensor([ 261.0671]), tensor([ 256.0354]), tensor([ 251.0919])]




In [9]:
tensor

NameError: name 'tensor' is not defined

In [3]:
word_to_ix

{'不夠': 16,
 '不過': 54,
 '並非': 27,
 '也': 9,
 '了': 37,
 '備戰': 7,
 '備載': 38,
 '再': 35,
 '剩得': 25,
 '創夏': 41,
 '加緊腳步': 15,
 '卻': 30,
 '反諷': 58,
 '只是': 13,
 '名嘴': 47,
 '問': 60,
 '在野黨': 57,
 '外界': 53,
 '多': 28,
 '太小': 21,
 '官員': 12,
 '家園': 11,
 '容量': 45,
 '對外': 33,
 '就': 5,
 '年底': 24,
 '成': 3,
 '或': 49,
 '拚': 4,
 '改革': 59,
 '政策': 50,
 '政績': 29,
 '智商': 56,
 '最近': 17,
 '有時': 0,
 '此番': 42,
 '民進黨': 55,
 '沒有': 10,
 '渲染': 36,
 '為': 34,
 '發言': 48,
 '盼': 52,
 '經常': 31,
 '網友': 19,
 '縣市長': 18,
 '缺電': 22,
 '脫離現實': 6,
 '腦子': 20,
 '行政院長': 14,
 '被': 2,
 '解釋': 51,
 '言論': 40,
 '認為': 61,
 '調侃': 32,
 '賴清德': 39,
 '躁進': 26,
 '辯護': 44,
 '遭': 46,
 '選舉': 23,
 '院長': 43,
 '非核': 1,
 '題': 62,
 '黃': 8}

In [4]:
#https://blog.csdn.net/CrazyBull2012/article/details/79380669