In [1]:
# ! pip install numpy
# ! pip install zipfile36
# ! pip install pandas
# ! pip install tqdm
# ! pip install ipywidgets
# ! pip install scrapbook
# ! pip install tempfile
# ! pip install tensorflow
# ! pip install retrying
# ! pip install transformers


In [2]:
import sys
import os
import numpy as np
import zipfile
import pandas as pd
from tqdm import tqdm
import scrapbook as sb
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR')  # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.npa import NPAModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set

print("System version: {}".format(sys.version))
print("Tensorflow version: {}".format(tf.__version__))

System version: 3.6.11 (default, Nov 27 2020, 18:37:51) [MSC v.1916 64 bit (AMD64)]
Tensorflow version: 1.15.4


In [3]:
epochs = 5
seed = 42
batch_size = 32

# Options: demo, small, large
MIND_type = 'demo'

In [4]:
tmpdir = TemporaryDirectory()
# data_path = tmpdir.name
data_path = os.path.join('datasets', MIND_type)

train_news_file = os.path.join(data_path, 'train', r'news.tsv')
train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')
valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')
valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')
wordEmb_file = os.path.join(data_path, "utils", "embedding.npy")
userDict_file = os.path.join(data_path, "utils", "uid2index.pkl")
wordDict_file = os.path.join(data_path, "utils", "word_dict.pkl")

yaml_file = os.path.join(data_path, "utils", r'npa.yaml')

mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

if not os.path.exists(train_news_file):
    download_deeprec_resources(mind_url, os.path.join(
        data_path, 'train'), mind_train_dataset)

if not os.path.exists(valid_news_file):
    download_deeprec_resources(mind_url,
                               os.path.join(data_path, 'valid'), mind_dev_dataset)
if not os.path.exists(yaml_file):
    download_deeprec_resources(r'https://recodatasets.z20.web.core.windows.net/newsrec/',
                               os.path.join(data_path, 'utils'), mind_utils)


In [5]:
df = np.load(wordDict_file, allow_pickle=True)
df

{'the': 1,
 'brands': 2,
 'queen': 3,
 'elizabeth': 4,
 ',': 5,
 'prince': 6,
 'charles': 7,
 'and': 8,
 'philip': 9,
 'swear': 10,
 'by': 11,
 '50': 12,
 'worst': 13,
 'habits': 14,
 'for': 15,
 'belly': 16,
 'fat': 17,
 'cost': 18,
 'of': 19,
 'trump': 20,
 's': 21,
 'aid': 22,
 'freeze': 23,
 'in': 24,
 'trenches': 25,
 'ukraine': 26,
 'war': 27,
 'i': 28,
 'was': 29,
 'an': 30,
 'nba': 31,
 'wife': 32,
 '.': 33,
 'here': 34,
 'how': 35,
 'it': 36,
 'affected': 37,
 'my': 38,
 'mental': 39,
 'health': 40,
 'to': 41,
 'get': 42,
 'rid': 43,
 'skin': 44,
 'tags': 45,
 'according': 46,
 'a': 47,
 'dermatologist': 48,
 'should': 49,
 'nfl': 50,
 'be': 51,
 'able': 52,
 'fine': 53,
 'players': 54,
 'criticizing': 55,
 'officiating': 56,
 '?': 57,
 'been': 58,
 'orlando': 59,
 'hottest': 60,
 'october': 61,
 'ever': 62,
 'so': 63,
 'far': 64,
 'but': 65,
 'cooler': 66,
 'temperatures': 67,
 'on': 68,
 'way': 69,
 'chile': 70,
 'three': 71,
 'die': 72,
 'supermarket': 73,
 'fire': 74,
 'am

In [6]:
seq_length = 300
num_samples = df.shape[0]

In [7]:
df['title'].count()

26740

In [8]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
BERT_tokens = np.zeros((num_samples, seq_length))
x = 0
encoded_input  = np.zeros((df['title'].count()), dtype=object)
for i, title in enumerate(tqdm(df['title'])):
    encoded_input[i] = torch.tensor([tokenizer.encode(title)])


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 26740/26740 [00:18<00:00, 1453.92it/s]


In [16]:
outShape = np.zeros((df['title'].count()), dtype=object)
with open('bert.npy', 'wb') as f:
    with torch.no_grad():
        for i in tqdm(range(df['title'].count())):
            output = model(encoded_input[i])
            np.save(f, np.array(output['last_hidden_state'][0].detach().numpy()), allow_pickle=True)
#             if i == 10:
#                 break
    

  0%|          | 10/26740 [00:23<17:25:15,  2.35s/it]


In [28]:
bert_embedding_file = os.path.join(data_path, "utils", "bert_embedding.npy")
%timeit np.save(bert_embedding_file, outShape)


The slowest run took 8.33 times longer than the fastest. This could mean that an intermediate result is being cached.
5.49 ms ± 5.64 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
def convert_bytes(num):
    """
    this function will convert bytes to MB.... GB... etc
    """
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0


def file_size(file_path):
    """
    this function will return the file size
    """
    if os.path.isfile(file_path):
        file_info = os.stat(file_path)
        return convert_bytes(file_info.st_size)


# Lets check the file size of MS Paint exe 
# or you can use any file path
file_path = r"bert.npy"
print(file_size(bert_embedding_file))

575.0 KB


In [19]:
wordDict_file_bert = os.path.join(data_path, "utils", "word_dict_bert.pkl")

a = np.load(wordDict_file_bert, allow_pickle=True)
a

{'[CLS]': 1,
 'the': 2,
 'brands': 3,
 'queen': 4,
 'elizabeth': 5,
 ',': 6,
 'prince': 7,
 'charles': 8,
 'and': 9,
 'philip': 10,
 'swear': 11,
 'by': 12,
 '[SEP]': 13,
 'cost': 14,
 'of': 15,
 'trump': 16,
 "'": 17,
 's': 18,
 'aid': 19,
 'freeze': 20,
 'in': 21,
 'trenches': 22,
 'ukraine': 23,
 'war': 24,
 'i': 25,
 'was': 26,
 'an': 27,
 'nba': 28,
 'wife': 29,
 '.': 30,
 'here': 31,
 'how': 32,
 'it': 33,
 'affected': 34,
 'my': 35,
 'mental': 36,
 'health': 37,
 'to': 38,
 'get': 39,
 'rid': 40,
 'skin': 41,
 'tags': 42,
 'according': 43,
 'a': 44,
 'der': 45,
 '##mat': 46,
 '##ologist': 47,
 'been': 48,
 'orlando': 49,
 'hottest': 50,
 'october': 51,
 'ever': 52,
 'so': 53,
 'far': 54,
 'but': 55,
 'cooler': 56,
 'temperatures': 57,
 'on': 58,
 'way': 59,
 'best': 60,
 'ps': 61,
 '##5': 62,
 'games': 63,
 ':': 64,
 'top': 65,
 'playstation': 66,
 '5': 67,
 'titles': 68,
 'look': 69,
 'forward': 70,
 'report': 71,
 'weather': 72,
 '-': 73,
 'related': 74,
 'closing': 75,
 '##s'

In [21]:
b = np.load(wordEmb_file, allow_pickle=True)
b[1]

array([ 2.7204e-01, -6.2030e-02, -1.8840e-01,  2.3225e-02, -1.8158e-02,
        6.7192e-03, -1.3877e-01,  1.7708e-01,  1.7709e-01,  2.5882e+00,
       -3.5179e-01, -1.7312e-01,  4.3285e-01, -1.0708e-01,  1.5006e-01,
       -1.9982e-01, -1.9093e-01,  1.1871e+00, -1.6207e-01, -2.3538e-01,
        3.6640e-03, -1.9156e-01, -8.5662e-02,  3.9199e-02, -6.6449e-02,
       -4.2090e-02, -1.9122e-01,  1.1679e-02, -3.7138e-01,  2.1886e-01,
        1.1423e-03,  4.3190e-01, -1.4205e-01,  3.8059e-01,  3.0654e-01,
        2.0167e-02, -1.8316e-01, -6.5186e-03, -8.0549e-03, -1.2063e-01,
        2.7507e-02,  2.9839e-01, -2.2896e-01, -2.2882e-01,  1.4671e-01,
       -7.6301e-02, -1.2680e-01, -6.6651e-03, -5.2795e-02,  1.4258e-01,
        1.5610e-01,  5.5510e-02, -1.6149e-01,  9.6290e-02, -7.6533e-02,
       -4.9971e-02, -1.0195e-02, -4.7641e-02, -1.6679e-01, -2.3940e-01,
        5.0141e-03, -4.9175e-02,  1.3338e-02,  4.1923e-01, -1.0104e-01,
        1.5111e-02, -7.7706e-02, -1.3471e-01,  1.1900e-01,  1.08