In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud
from collections import Counter
import numpy as np
import random
import math

import pandas as pd
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

USE_CUDA = torch.cuda.is_available()

random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
if USE_CUDA:
    torch.cuda.manual_seed(1)
    
#设定一些hyper parameters
C = 3
K = 100
NUM_EPOCHS = 2
MAX_VOCAB_SIZE = 10000
BATCH_SIZE = 192
LEARNING_RATE = 0.2
EMBEDDING_SIZE = 100

def word_tokenize(text):
    return text.split()
    

In [16]:
with open("C:\\Users\\Lyb-PC\\Desktop\\text8.train.txt","r") as fin:
    text = fin.read()

text = text.split()
vocab = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))
vocab["<unk>"] = len(text) - np.sum(list(vocab.values()))

idx_to_word = [word for word in vocab.keys()]
word_to_idx = {word:i for i,word in enumerate(idx_to_word)}

In [17]:
word_counts = np.array([count for count in vocab.values()],dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
word_freqs = word_freqs ** (3./4.)
word_freqs = word_counts / np.sum(word_counts)
VOCAB_SIZE = len(idx_to_word)
VOCAB_SIZE

10000

In [18]:
class WordEmbeddingDataset(tud.Dataset):
    def __init__(self,text,word_to_idx,idx_to_word,word_freqs,word_counts):
        super(WordEmbeddingDataset,self).__init__()
        self.text_encoded = [word_to_idx.get(word,word_to_idx["<unk>"]) for word in text]
        self.text_encoded = torch.LongTensor(self.text_encoded)
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)
        
    def __len__(self):
        return len(self.text_encoded)
    
    def __getitem__(self, idx):
        center_word = self.text_encoded[idx]
        pos_indices = list(range(idx-C)) + list(range(idx+1, idx+C+1))
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]
        pos_words = self.text_encoded[pos_indices]
        neg_words = torch.multinomial(self.word_freqs,K * pos_words.shape[0],True)
        
        return center_word,pos_words, neg_words

In [19]:
dataset = WordEmbeddingDataset(text,word_to_idx,idx_to_word,word_freqs,word_counts)
dataloader = tud.DataLoader(dataset,batch_size=BATCH_SIZE,shuffle=True,num_workers=0)

In [20]:
class EmbeddingModel(nn.Module):
    def __init__(self,vocab_size,embed_size):
        super(EmbeddingModel,self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        
        self.in_embed = nn.Embedding(self.vocab_size,self.embed_size)
        self.out_embed = nn.Embedding(self.vocab_size,self.embed_size)
        
    def forward(self,input_labels,pos_labels,neg_labels):
        input_embedding = self.in_embed(input_labels)
        pos_embedding = self.in_embed(pos_labels)
        neg_embedding = self.in_embed(neg_labels)
        
        input_embedding = input_emedding.unsqueze(2)
        pos_dot = torch.bmm(pos_embedding,input_embedding).squeeze(2)
        neg_dog = torch.bmm(neg_embedding,-input_embedding).squeeze(2)
        
        log_pos = F.logsigmoid(pos_dot).sum(1)
        log_neg = F.logsigmoid(log_neg).sum(1)
        loss = log_pos + log_neg
        
        return -loss
    
    def input_embeddings(self):
        return self.in_embed.weight.data.cpu().numpy()


In [21]:
model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()

In [22]:
def evaluate(filename,embedding_weights):
    if filename.endwith(".csv"):
        data = pd.read_csv(filename,sep=",")
    else:
        data = pd.read_csv(filename,sep=",")
    human_similarity = []
    model_similarity = []
    for i in data.iloc[:, 0:2].index:
        word1,word2 = data.iloc[i,0],data.iloc[i,1]
        if word1 not in word_to_idx or word2 not in word_to_idx:
            continue
        else:
            word1_idx,word2_idx = word_to_idx[word1],word_to_idx[word2]
            word1_embed,word2_embed = emedding_weights[[word1_idx]],embedding_weights[[word2_idx]]
            model_similarity.append(float(sklearn.metrics.pairwise.cosine_similarity(word1_embed,word2_embed)))
            human_similarity.append(float(data.iloc[i,2]))
            
        return scipy.stats.spearmanr(human_similarity,model_similarity)
    
    def find_nearest(word):
        index = word_to_idx[word]
        embedding = embedding_weights[index]
        cos_dis = np.array([scipy.spatial.diatance.cosine(e,embedding)for e in embedding_weights])
        return [idx_to_word[i] for i in cos_dis.argsort()[:10]]
    
    
    

NameError: name 'dataloader' is not defined

In [23]:
optimizer = torch.optim.SGD(model.parameters(),lr=LEARNING_RATE)

for e in range(NUM_EPOCHS):
    for i, (input_labels,pos_labels,neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long()
        pos_labels = pos_labels.long()
        neg_labels = neg_labels.long()
        if USE_CUDA:
            input_labels = input_labels.cuda()
            pos_labels = pos_labels.cuda()
            neg_labels = pos_labels.cude()
            
        optimizer.zero_grad()
        loss = model(input_labels,pos_labels,neg_labels).mean()
        
        loss.backward()
        optimizer.step()
        
        if i % 100 == 0:
            with open(LOG_FILE,"a") as fout:
                fout.write("epoch: {},iter: {},loss: {}\n".forrmat(e,i,loss.item()))
                print("epoch: {},iter: {},loss: {}".format(e,i,loss.item()))
                
        if  i % 2000 == 0:
            embedding_weights = model.input_embeddings()
            sim_simlex = evaluate("simlex-999.text",embedding_weights)
            sim_men = evaluate("men.text",embedding_weights)
            sim_353 = evaluate("wordsim353.csv",embedding_weights)
            with open(LOG_FILE,"a") as fout:
                print("epoch: {},iteration: {},simlex-999: {},men: {},sim353: {},nearest to monster: {}\n".format(
                    e, i, sim_simlex, sim_men, sim_353, find_nearest("monster")))
                fout.write("epoch: {}, iteration: {}, simlex-999: {},men: {},sim353: {},nearest to monster: {}\n".format(
                    e, i, sim_simlex, sim_men, sim_353, find_nearest("monster")))
    embedding_weights = model.input_embeddings()
    np.save("embedding-{}".format(EMBEDDING_SIZE),embedding_weights)
    torch.save(model.state_dict(),"embedding_{}.th".format(EMBEDDING_SIZE))

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 5779220800 bytes. Buy new RAM!


In [24]:
import os
print(os.path.abspath('.'))

C:\Users\Lyb-PC
