In [1]:
# ! pip install numpy
# ! pip install zipfile36
# ! pip install pandas
# ! pip install tqdm
# ! pip install ipywidgets
# ! pip install scrapbook
# ! pip install tempfile
# ! pip install tensorflow
# ! pip install retrying
# ! pip install transformers
# ! conda install numpy-base
# ! pip install recommenders[npa,gpu]


In [2]:
import sys
import os
import numpy as np
import zipfile
import pickle
import pandas as pd
from tqdm import tqdm
import scrapbook as sb
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.npa import NPAModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set

print("System version: {}".format(sys.version))
print("Tensorflow version: {}".format(tf.__version__))

System version: 3.6.11 (default, Nov 27 2020, 18:37:51) [MSC v.1916 64 bit (AMD64)]
Tensorflow version: 1.15.4


In [3]:
epochs = 5
seed = 42
batch_size = 32

# Options: demo, small, large
MIND_type = 'demo'

In [4]:
tmpdir = TemporaryDirectory()
# data_path = tmpdir.name
data_path = os.path.join('datasets', MIND_type)

train_news_file = os.path.join(data_path, 'train', r'news.tsv')
train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')
valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')
valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')
wordEmb_file = os.path.join(data_path, "utils", "embedding.npy")
userDict_file = os.path.join(data_path, "utils", "uid2index.pkl")
wordDict_file = os.path.join(data_path, "utils", "word_dict.pkl")
wordDict_file_bert = os.path.join(data_path, "utils", "word_dict_bert.pkl")
wordEmb_file_bert = os.path.join(data_path, "utils", "embedding_bert.npy")

yaml_file = os.path.join(data_path, "utils", r'npa.yaml')

mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

if not os.path.exists(train_news_file):
    download_deeprec_resources(mind_url, os.path.join(
        data_path, 'train'), mind_train_dataset)

if not os.path.exists(valid_news_file):
    download_deeprec_resources(mind_url,
                               os.path.join(data_path, 'valid'), mind_dev_dataset)
if not os.path.exists(yaml_file):
    download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/newsrec/',
                               os.path.join(data_path, 'utils'), mind_utils)


In [5]:
columnNames = ['news_ID', 'category', 'SubCategory', 'title',
               'abstract', 'URL', 'titleEntities', 'abstractEntities']
news = pd.read_csv(train_news_file , sep='\t',
                 header=None, names=columnNames)


In [6]:
from transformers import BertTokenizer, BertModel, BertForPreTraining
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
# model = BertForPreTraining.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
wordDictTokens = []
for i, title in enumerate(tqdm(news['title'])):
    wordDictTokens.append(tokenizer(str(title), truncation=True, padding=True, return_tensors='pt' ))

decodedtokens = []
for j in tqdm(range(len(wordDictTokens))):
    decodedtokens.append(tokenizer.decode(wordDictTokens[j]['input_ids'][0].detach().numpy()))
    
tokens = []
for k, title in enumerate(tqdm(decodedtokens)):
    tokens.append(tokenizer.tokenize(str(title)))

tokenDF = []
for l in tqdm(range(len(tokens))):
    tokenDF.append(pd.DataFrame(tokens[l], columns=['words']))

uniqTokens = pd.concat(tokenDF).drop_duplicates().reset_index(drop=True)

tList= uniqTokens['words'].values.tolist()

wordDict = []
for m in tqdm(range(len(tList))):
    wordDict.append({tList[m]: m+1})

token_ids = []
for n, word in enumerate(tqdm(wordDict)):
    token_ids.append(tokenizer.convert_tokens_to_ids(word))

with open(wordDict_file_bert, 'wb') as handle:
    pickle.dump(wordDict, handle, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 26740/26740 [00:34<00:00, 768.99it/s] 
100%|██████████| 26740/26740 [00:01<00:00, 14006.69it/s]
100%|██████████| 26740/26740 [00:24<00:00, 1097.40it/s]
100%|██████████| 26740/26740 [00:13<00:00, 1916.15it/s]
100%|██████████| 17203/17203 [00:00<00:00, 1076597.06it/s]
100%|██████████| 17203/17203 [00:00<00:00, 238934.42it/s]


In [8]:
encoded_input  = []

for o, word in enumerate(tqdm(tList)):
    # for p, word in enumerate(title):
    tokenInputes = tokenizer(
        str(word),
        truncation=True,
        padding=True,
        return_tensors='pt',
        max_length=768) 
    # print(word)
    out = model(**tokenInputes)
    encoded_input.append(out['last_hidden_state'][0][1].detach().numpy())

100%|██████████| 17203/17203 [2:32:35<00:00,  1.88it/s]  


In [28]:
# wordEmb_file_bert = os.path.join(data_path, "utils", "embedding_bert.npy")
with open(wordEmb_file_bert, 'wb') as f: 
    np.save(f, encoded_input, allow_pickle=True)

In [30]:
np.load(wordEmb_file_bert)


array([[-0.71301967, -0.1801134 , -0.73847884, ..., -0.09353121,
         0.31587684,  0.01532312],
       [-0.5383994 , -0.78006506, -0.27009398, ...,  0.19941267,
         0.31292754, -0.36979085],
       [-0.28484663, -0.5982345 ,  0.0042835 , ...,  0.2716275 ,
         0.37806424, -0.1773692 ],
       ...,
       [ 0.07916661, -0.22728246, -0.00243599, ...,  0.51640457,
         0.2333358 , -0.14949359],
       [-0.0062239 , -0.29653686,  0.12724929, ...,  0.53706366,
         0.03804766, -0.09236237],
       [-0.10701666, -0.32589817, -0.02490966, ...,  0.11730062,
         0.40794182,  0.2757084 ]], dtype=float32)

In [9]:
# news['title']
    # 0        The Brands Queen Elizabeth, Prince Charles, an...
    # 1        The Cost of Trump's Aid Freeze in the Trenches...

# wordDictTokens
    # [{'input_ids': tensor([[ 101, 1996, 9639, 3035, 3870, 1010, 3159, 2798, 1010, 1998, 3159, 5170,
    #   8415, 2011,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},

# decodedtokens
    # ['[CLS] the brands queen elizabeth, prince charles, and prince philip swear by [SEP]',
    #  "[CLS] the cost of trump's aid freeze in the trenches of ukraine's war [SEP]", ...]

# tokens
    # [['[CLS]', 'the', 'brands', 'queen', 'elizabeth', ',', 'prince', 'charles', ',', 'and', 'prince', 'philip', 'swear', 'by','[SEP]'],...]

# tokenDF
    # [        words
    #  0       [CLS]
    #  1         the
    # ...       ...]

# uniqTokens
    #     words
    # 0	[CLS]
    # 1	the
    # 2	brands
    # 3	queen
    # 4	elizabeth
    # ...    ...

# tList
    # ['[CLS]', 'the', 'brands', 'queen', 'elizabeth',',','prince','charles',',','and','prince','philip','swear','by','[SEP]', ...]

# wordDict
    # [{'[CLS]': 1}, {'the': 2}, ...]

# token_ids
    # [[101],[1996],[9639],[3035],[3870],[1010],[3159],[2798],[1010],[1998],[3159],[5170],[8415],[2011],[102],...]

In [10]:
# # encoded_input  = []

# for o, title in enumerate(tqdm(news['title'])):
#     tokenInputes = tokenizer(
#         str(title),
#         truncation=True,
#         padding=True,
#         return_tensors='pt',
#         max_length=768) 
#     encoded_input = model(**tokenInputes)
#     for p, word in enumerate(title):
#         print( encoded_input.last_hidden_state[0][p].shape)
#         np.savetxt(wordEmb_file_bert, [{word: encoded_input['last_hidden_state'][0][p].detach().numpy()}])
#     if i == 0:
#         # print(title, encoded_input.last_hidden_state[0].shape)
#         break

# # embTokens = {}
# # for o, token in enumerate(tqdm(tokens)):
# #     for p, word in enumerate(token):
# #         embTokens[word] = encoded_input[o].last_hidden_state[0][p]

In [11]:
# # tokens = []
# # for i, title in enumerate(tqdm(len(news['title']))):
# #     tokens.append(tokenizer.tokenize(str(news['title'][i])))

# tokenDF = []
# for i in tqdm(range(len(news['title']))):
#     tokenDF.append(pd.DataFrame(tokens[i], columns=['words']))

# uniqTokens = pd.concat(tokenDF).drop_duplicates().reset_index(drop=True)

# tList= uniqTokens['words'].values.tolist()
# wordDict = {}
# for i, word in enumerate(tqdm(tList)):
#     wordDict[word] = i+1

# token_ids = []
# for i, word in enumerate(tqdm(wordDict)):
#     token_ids.append(tokenizer.convert_tokens_to_ids(word))

# with open(wordDict_file, 'wb') as handle:
#     pickle.dump(wordDict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# decodedtokens = []
# for i in tqdm(range(len(wordDictTokens))):
#     decodedtokens.append(tokenizer.decode(wordDictTokens[i]['input_ids'][0].detach().numpy()))

In [13]:
# from transformers import BertTokenizer, BertForPreTraining
# import torch

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertForPreTraining.from_pretrained('bert-base-uncased')
# # inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# outputs = model(**wordDictTokens[0])

# prediction_logits = outputs.prediction_logits
# seq_relationship_logits = outputs.seq_relationship_logits

In [14]:
# encoded_input  = np.zeros((len(token_ids)), dtype=object)
# for i, title in enumerate(tqdm(token_ids)):
#     print(i, title)
#     if i == 5:
#         break
#     encoded_input[i] = model([])
# # #     BERT_tokens[i] = tokenizer.tokenize(title)
# # len(wordDict)