In [3]:
# ! pip install numpy
# ! pip install zipfile36
# ! pip install pandas
# ! pip install tqdm
# ! pip install ipywidgets
# ! pip install scrapbook
# ! pip install tempfile
# ! pip install tensorflow
# ! pip install retrying
# ! pip install transformers
# ! conda install numpy-base
# ! pip install recommenders[npa,gpu]


In [2]:
import sys
import os
import numpy as np
import zipfile
import pandas as pd
from tqdm import tqdm
import scrapbook as sb
from tempfile import TemporaryDirectory
import tensorflow as tf

from reco_utils.recommender.deeprec.deeprec_utils import download_deeprec_resources
from reco_utils.recommender.newsrec.newsrec_utils import get_mind_data_set
from reco_utils.recommender.newsrec.io.mind_iterator import MINDIterator

tf.get_logger().setLevel('ERROR')  # only show error messages


print("System version: {}".format(sys.version))
print("Tensorflow version: {}".format(tf.__version__))


  from pyarrow import HadoopFileSystem


System version: 3.7.10 (default, Jun  4 2021, 14:48:32) 
[GCC 7.5.0]
Tensorflow version: 2.7.0


In [3]:
epochs = 5
seed = 42
batch_size = 32

# Options: demo, small, large
MIND_type = 'demo'

In [4]:
tmpdir = TemporaryDirectory()
# data_path = tmpdir.name
data_path = os.path.join('datasets', MIND_type)

train_news_file = os.path.join(data_path, 'train', r'news.tsv')
train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')
valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')
valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')
wordEmb_file = os.path.join(data_path, "utils", "embedding.npy")
userDict_file = os.path.join(data_path, "utils", "uid2index.pkl")
wordDict_file = os.path.join(data_path, "utils", "word_dict.pkl")

yaml_file = os.path.join(data_path, "utils", r'npa.yaml')

mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

if not os.path.exists(train_news_file):
    download_deeprec_resources(mind_url, os.path.join(
        data_path, 'train'), mind_train_dataset)

if not os.path.exists(valid_news_file):
    download_deeprec_resources(mind_url,
                               os.path.join(data_path, 'valid'), mind_dev_dataset)
if not os.path.exists(yaml_file):
    download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/newsrec/',
                               os.path.join(data_path, 'utils'), mind_utils)


In [13]:
columnNames = ['news_ID', 'category', 'SubCategory', 'title',
               'abstract', 'URL', 'titleEntities', 'abstractEntities']
df = pd.read_csv(train_news_file , sep='\t',
                 header=None, names=columnNames)

In [14]:
seq_length = 300
num_samples = df.shape[0]

In [30]:
df['title'][1]

"The Cost of Trump's Aid Freeze in the Trenches of Ukraine's War"

In [35]:
# .pkl
# list word
# np.load(wordDict_file, allow_pickle=True)


# .npy
# list vector
a = np.load(wordEmb_file, allow_pickle=True)
a.shape


# (31028, 768)

(31028, 300)

In [15]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
BERT_tokens = np.zeros((num_samples, seq_length))
x = 0
encoded_input  = np.zeros((df['title'].count()), dtype=object)
for i, title in enumerate(tqdm(df['title'])):
    encoded_input[i] = torch.tensor([tokenizer.encode(title)])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 26740/26740 [00:14<00:00, 1820.38it/s]


In [17]:
outShape = np.zeros((df['title'].count()), dtype=object)
with open('bert.npy', 'wb') as f:
    with torch.no_grad():
        for i in tqdm(range(df['title'].count())):
            output = model(encoded_input[i])
            np.save(f, np.array(output['last_hidden_state'][0].detach().numpy()), allow_pickle=True)
            if i == 10:
                break
    


  0%|          | 0/26740 [00:00<?, ?it/s][A
  0%|          | 1/26740 [00:00<1:06:54,  6.66it/s][A
  0%|          | 2/26740 [00:00<1:03:47,  6.99it/s][A
  0%|          | 3/26740 [00:00<1:01:35,  7.23it/s][A
  0%|          | 4/26740 [00:00<59:27,  7.49it/s]  [A
  0%|          | 5/26740 [00:00<58:46,  7.58it/s][A
  0%|          | 6/26740 [00:00<1:01:26,  7.25it/s][A
  0%|          | 7/26740 [00:00<1:01:17,  7.27it/s][A
  0%|          | 8/26740 [00:01<1:02:23,  7.14it/s][A
  0%|          | 9/26740 [00:01<1:00:41,  7.34it/s][A
  0%|          | 10/26740 [00:01<1:06:51,  6.66it/s][A


In [25]:
output['pooler_output']

tensor([[-0.8991, -0.5201, -0.9452,  0.7746,  0.7234, -0.3744,  0.8507,  0.3547,
         -0.8930, -1.0000, -0.5521,  0.9291,  0.9784,  0.7049,  0.9283, -0.7519,
         -0.5475, -0.5965,  0.3389, -0.4499,  0.7782,  1.0000, -0.2826,  0.4853,
          0.6410,  0.9842, -0.7867,  0.9369,  0.9399,  0.7433, -0.6914,  0.4468,
         -0.9900, -0.2625, -0.9363, -0.9865,  0.4843, -0.6320,  0.0523,  0.0051,
         -0.8996,  0.3441,  1.0000, -0.3589,  0.3788, -0.3925, -1.0000,  0.3703,
         -0.8588,  0.9369,  0.9079,  0.9614,  0.2904,  0.5589,  0.5659, -0.6404,
         -0.0192,  0.2399, -0.3038, -0.6871, -0.7484,  0.3960, -0.8917, -0.8742,
          0.9562,  0.8571, -0.2774, -0.4376, -0.3216,  0.1390,  0.8761,  0.3578,
         -0.2952, -0.8451,  0.8222,  0.4438, -0.7001,  1.0000, -0.5985, -0.9794,
          0.9103,  0.8466,  0.6952, -0.6277,  0.7089, -1.0000,  0.6830, -0.1831,
         -0.9875,  0.4217,  0.7032, -0.3897,  0.7486,  0.6686, -0.6870, -0.6460,
         -0.4157, -0.8516, -

In [9]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
BERT_tokens = np.zeros((len(df), 768),dtype=object)
# encoded_input  = np.zeros((df['title'].count()), dtype=object)
for i, word in enumerate(tqdm(df)):
#     [CLS] word [SEP]
    BERT_tokens[i] = tokenizer(word, return_tensors='pt')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/31027 [00:00<?, ?it/s]


ValueError: cannot copy sequence with size 3 to array axis with dimension 768

In [63]:
BERT_tokens

['the']

In [29]:
outShape = np.zeros((len(df)), dtype=object)
bert_embedding_file = os.path.join(data_path, "utils", "bert_embedding.npy")
with open(bert_embedding_file, 'wb') as f:
    with torch.no_grad():
#         for i in tqdm(range(len(df))):
        output = model(**BERT_tokens)
        np.save(f, np.array(output['last_hidden_state'][0].detach().numpy()), allow_pickle=True)
#         if i == 10:
#             break
    

TypeError: BertModel object argument after ** must be a mapping, not numpy.ndarray

In [2]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
encoded_input

{'input_ids': tensor([[ 101, 5672, 2033, 2011, 2151, 3793, 2017, 1005, 1040, 2066, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [4]:
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.1386,  0.1583, -0.2967,  ..., -0.2709, -0.2844,  0.4581],
         [ 0.5364, -0.2327,  0.1754,  ...,  0.5540,  0.4981, -0.0024],
         [ 0.3002, -0.3475,  0.1208,  ..., -0.4562,  0.3288,  0.8773],
         ...,
         [ 0.3799,  0.1203,  0.8283,  ..., -0.8624, -0.5957,  0.0471],
         [-0.0252, -0.7177, -0.6950,  ...,  0.0757, -0.6668, -0.3401],
         [ 0.7535,  0.2391,  0.0717,  ...,  0.2467, -0.6458, -0.3213]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[-0.9377, -0.5043, -0.9799,  0.9030,  0.9329, -0.2438,  0.8926,  0.2288,
         -0.9531, -1.0000, -0.8862,  0.9906,  0.9855,  0.7155,  0.9455, -0.8645,
         -0.6035, -0.6666,  0.3020, -0.1587,  0.7455,  1.0000, -0.4022,  0.4261,
          0.6151,  0.9996, -0.8773,  0.9594,  0.9585,  0.6950, -0.6718,  0.3325,
         -0.9954, -0.2268, -0.9658, -0.9951,  0.6127, -0.7670,  0.0873,  0.0824,
         -0.9518,  0.4713,  1.000