## This notebook trains the WR models and saves the processed embeddings

In [1]:
import os
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader,random_split
from torch.optim import Adam
from torch.autograd import Variable

import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import *

In [2]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
import tools.models as models
import tools.dataloaders as dataloaders
import tools.all_test_forBERT as all_test_forBERT
import tools.loaddatasets as loaddatasets

In [3]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [4]:
# Choose the BERT checkpoint. emb_type is the short tag used in the names of
# the files this notebook saves.
model_name = 'bert-base-uncased'
EMB_TYPE_BY_MODEL = {
    'bert-base-uncased': 'base',
    'bert-large-uncased': 'large',
}
if model_name not in EMB_TYPE_BY_MODEL:
    # Fail here with a clear message instead of a NameError on emb_type
    # several cells later.
    raise ValueError(
        'Unsupported model_name %r; expected one of %s'
        % (model_name, sorted(EMB_TYPE_BY_MODEL))
    )
emb_type = EMB_TYPE_BY_MODEL[model_name]

# Random state for PCA
random_state = 42

# Numbers of leading principal components (d) to train a model for:
# every value from 1 to 20, then 25..40 in steps of 5, then 50..200 in steps of 10.
D = list(range(1, 21)) + list(range(25, 41, 5)) + list(range(50, 201, 10))

# Training batch size
batch_size = 32
# Learning rate
lr = 2e-4
# Number of training epochs per value of d
EPOCHS = 200

In [5]:
# CSV files consumed by loaddatasets.load_datasets below.
# Paths are relative to the directory the notebook is run from.

# Word-similarity pairs: training split and held-out split.
word_simi_train_file = 'datasets//word_simi_train.csv'
word_simi_test_file = 'datasets//word_simi_test.csv'

# Additional evaluation sets: word analogy and text similarity.
analogy_test_file = 'datasets//word_analogy.csv'
text_simi_test_file = 'datasets//text_simi.csv'

In [6]:
# Load the pretrained BERT checkpoint and its tokenizer, and grab the
# input (word-piece) embedding layer.
bert_model = BertModel.from_pretrained(model_name)
bert_model.eval()
bert_tokenizer = BertTokenizer.from_pretrained(model_name)
embedding = bert_model.get_input_embeddings()

# Materialise the whole embedding table as a NumPy matrix, one row per token id.
# The vocabulary size is read from the layer instead of being hard-coded, so the
# cell also works for checkpoints whose vocabulary is not 30522 entries long.
with torch.no_grad():  # pure lookup; no autograd graph is needed
    ids = torch.arange(embedding.num_embeddings)
    E = embedding(ids).detach().numpy()
print('BERT Embedding shape check:', E.shape)

emb_dimension = E.shape[1]
vocab_len = E.shape[0]

BERT Embedding shape check: (30522, 768)


In [7]:
# Fit a PCA (no component limit given) on the embedding matrix E.
# fit() returns the estimator itself, so building and fitting in two steps
# leaves `pca` bound to the same fitted object.
pca = PCA(random_state=random_state)
pca.fit(E)

In [8]:
# U: the principal directions found by PCA (one component per row).
# Make sure the output folder exists before writing into it.
os.makedirs('trained-embedding', exist_ok=True)

# as_tensor accepts both the NumPy matrix from the embedding cell and an
# already-converted tensor, so re-running this cell does not emit a
# copy-construct warning.
E = torch.as_tensor(E)
U = pca.components_
np.save('trained-embedding//U_%s.npy' % emb_type , U)
U = torch.tensor(U)
print(E.shape)
print(U.shape)

torch.Size([30522, 768])
torch.Size([768, 768])


In [9]:
# Load the datasets: the loader receives the tokenizer, the BERT input-embedding
# layer and the four CSV paths, and hands back four objects in the order below.
(
    word_simi_train,
    word_simi_test,
    analogy_test,
    text_simi_test,
) = loaddatasets.load_datasets(
    bert_tokenizer,
    embedding,
    word_simi_train_file,
    word_simi_test_file,
    analogy_test_file,
    text_simi_test_file,
)

word similarity training dataset shape: (7703, 7)
   index   word1     word2  simi dataset_name  \
0    611   misty   weather  0.48      men3000   
1    530  parrot      wing  0.52      men3000   
2   2787     bay  chipmunk -0.72      men3000   

                                                  e1  \
0  [tensor(-0.0009), tensor(0.0425), tensor(0.018...   
1  [tensor(-0.0032), tensor(0.0592), tensor(0.026...   
2  [tensor(-0.0462), tensor(-0.0383), tensor(0.00...   

                                                  e2  
0  [tensor(-0.0026), tensor(-0.0180), tensor(-0.0...  
1  [tensor(0.0337), tensor(-0.0694), tensor(-0.01...  
2  [tensor(-0.0786), tensor(-0.2977), tensor(0.01...  
(2447, 10)
   index     word1     word2  simi dataset_name      id1      id2  id_num  \
0   1801      jean   washing -0.16      men3000   [3744]  [12699]       2   
1   1190   blurred      lens  0.20      men3000  [18449]  [10014]       2   
2   1817     lunch   morning -0.16      men3000   [6265]   [2851] 

In [10]:
# Build batch loaders over the word-similarity splits.
# NOTE(review): the third positional argument is True for the training split and
# False for the test split - presumably a shuffle flag; confirm in tools/dataloaders.
train_loader = dataloaders.create_data_loader_forBERT(
    word_simi_train,
    batch_size,
    True,
    dataloaders.Dataset_direct2emb,
)
test_loader = dataloaders.create_data_loader_forBERT(
    word_simi_test,
    batch_size,
    False,
    dataloaders.Dataset_direct2emb,
)

In [11]:
def train_epoch(model, data_loader, loss_fn, optimizer,device):
    """Run one optimisation pass over ``data_loader`` and return the batch losses.

    Args:
        model: module invoked as ``model(x=emb)``; it is switched to training mode.
        data_loader: iterable of batches, each a mapping with the keys ``'emb'``
            (model input) and ``'simi_label'`` (target passed to ``loss_fn``).
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar tensor.
        optimizer: optimizer that owns ``model``'s parameters.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        list[float]: the loss value of every batch, in iteration order.
    """
    model = model.train()
    losses = []

    # Start from clean gradients: anything the caller left on the parameters
    # would otherwise be added into the first update of this epoch.
    optimizer.zero_grad()

    for d in data_loader:
        emb = d['emb'].to(device)
        simi_label = d['simi_label'].to(device)

        simi_predict = model(x = emb)

        loss = loss_fn(simi_predict, simi_label)
        losses.append(loss.item())

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

    return losses

In [12]:
# training
# For every d in D: train a model on the top-d principal directions, print its
# parameters, then save the resulting embedding matrix and the model.

# Create the output folders up front so a long training run cannot fail at save time.
os.makedirs('trained-embedding', exist_ok=True)
os.makedirs('trained-model', exist_ok=True)

for d in D:
    print(f'D: {d}')
    print('~' * 10)
    # First d rows of U, transposed so the directions are columns.
    # clone().detach() gives an independent, non-trainable copy; it replaces the
    # deprecated Variable(torch.tensor(<tensor>)) construction, which warns.
    u = U[:d].T.clone().detach().to(device)
    print('this time u\'shape is: ', u.shape)

    model = models.Percoefficient_Model(emb_dimension = emb_dimension, component_num = d, U = u).to(device)
    optimizer = Adam(model.parameters(), lr = lr)
    loss_fn = nn.MSELoss().to(device)

    for epoch in range(EPOCHS):
        # Mark the start of each epoch in the log
        print('-' * 10)
        print(f'Epoch {epoch + 1}/{EPOCHS}')

        train_loss = train_epoch(
            model,
            train_loader,
            loss_fn,
            optimizer,
            device
        )
        epoch_loss = np.mean(train_loss)
        print(f'Train loss {epoch_loss} ')

    # Print every parameter; the first one is used below as the per-direction weights.
    x = []
    for parameters in model.parameters():
        print(parameters)
        x.append(parameters)
    # Sum the first parameter over axis 0 to get one weight per direction.
    para = x[0].sum(axis = 0).cpu().detach()

    # Project E onto the d directions, scale each coefficient by its weight,
    # map back to embedding space and subtract the result from E.
    u_cpu = u.cpu().detach()
    coe = torch.matmul(E,u_cpu)
    weighted_coe = torch.mul(para,coe)
    weighted_u = torch.matmul(weighted_coe,u_cpu.T)

    Emb = (E-weighted_u).numpy()
    np.save('trained-embedding/%sEmb_%s.npy' %(emb_type, d),Emb)
    torch.save(model,'trained-model/%s_%s_%s.pth' %(emb_type, d, EPOCHS))
    print('%s_%s_%s model saved' %(emb_type, d, EPOCHS) )

D: 1
~~~~~~~~~~


  


this time u'shape is:  torch.Size([768, 1])
----------
Epoch 1/200


  e1 = torch.tensor(self.emb1[index])
  e2 = torch.tensor(self.emb2[index])


Train loss 0.4542343306181997 
----------
Epoch 2/200
Train loss 0.4502802076694123 
----------
Epoch 3/200
Train loss 0.44681152058260637 
----------
Epoch 4/200
Train loss 0.4431554086716503 
----------
Epoch 5/200
Train loss 0.43987420143056627 
----------
Epoch 6/200
Train loss 0.4367636022832821 
----------
Epoch 7/200
Train loss 0.43347772382431743 
----------
Epoch 8/200
Train loss 0.43046133701635353 
----------
Epoch 9/200
Train loss 0.42787628900481395 
----------
Epoch 10/200
Train loss 0.4248610768294413 
----------
Epoch 11/200
Train loss 0.42182211438319633 
----------
Epoch 12/200
Train loss 0.41956925031652076 
----------
Epoch 13/200
Train loss 0.4167904350199604 
----------
Epoch 14/200
Train loss 0.41462637186703943 
----------
Epoch 15/200
Train loss 0.4123127662074683 
----------
Epoch 16/200
Train loss 0.4103440793675126 
----------
Epoch 17/200
Train loss 0.40874562324729247 
----------
Epoch 18/200
Train loss 0.4064404626619837 
----------
Epoch 19/200
Train los

Train loss 0.38939289257542087 
----------
Epoch 150/200
Train loss 0.38935434476213315 
----------
Epoch 151/200
Train loss 0.3890383558752803 
----------
Epoch 152/200
Train loss 0.3893056823832473 
----------
Epoch 153/200
Train loss 0.38912753463550603 
----------
Epoch 154/200
Train loss 0.3890020782746009 
----------
Epoch 155/200
Train loss 0.389364244037435 
----------
Epoch 156/200
Train loss 0.3893070104343621 
----------
Epoch 157/200
Train loss 0.3890570174834092 
----------
Epoch 158/200
Train loss 0.38911821451409595 
----------
Epoch 159/200
Train loss 0.3893116154288136 
----------
Epoch 160/200
Train loss 0.389134781463035 
----------
Epoch 161/200
Train loss 0.3892636006083909 
----------
Epoch 162/200
Train loss 0.3891553054178327 
----------
Epoch 163/200
Train loss 0.38935094925097263 
----------
Epoch 164/200
Train loss 0.38914157503787933 
----------
Epoch 165/200
Train loss 0.38913274721621516 
----------
Epoch 166/200
Train loss 0.389108912102893 
----------
Ep

Train loss 0.28163780389611154 
----------
Epoch 95/200
Train loss 0.28161039294319307 
----------
Epoch 96/200
Train loss 0.2815599915059073 
----------
Epoch 97/200
Train loss 0.28154829926170094 
----------
Epoch 98/200
Train loss 0.28166620159936967 
----------
Epoch 99/200
Train loss 0.28146550972459883 
----------
Epoch 100/200
Train loss 0.2815329680522715 
----------
Epoch 101/200
Train loss 0.2815340563116101 
----------
Epoch 102/200
Train loss 0.28145392457850266 
----------
Epoch 103/200
Train loss 0.2815498942021556 
----------
Epoch 104/200
Train loss 0.28165172050008 
----------
Epoch 105/200
Train loss 0.28147869599341674 
----------
Epoch 106/200
Train loss 0.28164387005711466 
----------
Epoch 107/200
Train loss 0.28159119993375886 
----------
Epoch 108/200
Train loss 0.2815780165918115 
----------
Epoch 109/200
Train loss 0.2815979914998764 
----------
Epoch 110/200
Train loss 0.2815324813870139 
----------
Epoch 111/200
Train loss 0.2815859931095296 
----------
Epoc

Train loss 0.2712820513791806 
----------
Epoch 39/200
Train loss 0.2714567512522196 
----------
Epoch 40/200
Train loss 0.2714072301635259 
----------
Epoch 41/200
Train loss 0.2713844835063916 
----------
Epoch 42/200
Train loss 0.2714408598809126 
----------
Epoch 43/200
Train loss 0.2714010403645146 
----------
Epoch 44/200
Train loss 0.27138509513440656 
----------
Epoch 45/200
Train loss 0.2712906825408464 
----------
Epoch 46/200
Train loss 0.2713982375248385 
----------
Epoch 47/200
Train loss 0.27143216509336965 
----------
Epoch 48/200
Train loss 0.2715596715838759 
----------
Epoch 49/200
Train loss 0.27136952676000226 
----------
Epoch 50/200
Train loss 0.271295262685935 
----------
Epoch 51/200
Train loss 0.2713045808829224 
----------
Epoch 52/200
Train loss 0.2714223340187215 
----------
Epoch 53/200
Train loss 0.2712883496837389 
----------
Epoch 54/200
Train loss 0.27149503115536844 
----------
Epoch 55/200
Train loss 0.2714053594687536 
----------
Epoch 56/200
Train l

Train loss 0.2713213749331297 
----------
Epoch 186/200
Train loss 0.2714886429798343 
----------
Epoch 187/200
Train loss 0.27140742441225846 
----------
Epoch 188/200
Train loss 0.2713177964896987 
----------
Epoch 189/200
Train loss 0.27136400854979675 
----------
Epoch 190/200
Train loss 0.2713502099698091 
----------
Epoch 191/200
Train loss 0.2713101568156937 
----------
Epoch 192/200
Train loss 0.27137780833203823 
----------
Epoch 193/200
Train loss 0.27144460583959923 
----------
Epoch 194/200
Train loss 0.2714426160947493 
----------
Epoch 195/200
Train loss 0.2713841682476418 
----------
Epoch 196/200
Train loss 0.2713590233107086 
----------
Epoch 197/200
Train loss 0.27143588747852443 
----------
Epoch 198/200
Train loss 0.2714100449785621 
----------
Epoch 199/200
Train loss 0.2714460226768602 
----------
Epoch 200/200
Train loss 0.2714438604706666 
Parameter containing:
tensor([[1.0000, 1.0000, 1.0000]], device='cuda:0', requires_grad=True)
base_3_200 model saved


## The WR-d embeddings and models are now trained and saved: the embeddings are in the folder `trained-embedding` and the models in `trained-model`