# Multi-Head Self-Attention

https://aclanthology.org/D19-1671.pdf

In [None]:
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

np.random.seed(0)

### Microsoft news dataset  
https://msnews.github.io/

https://docs.microsoft.com/en-us/azure/open-datasets/dataset-microsoft-news?tabs=azureml-opendatasets

https://github.com/wuch15/EMNLP2019-NRMS


In [None]:
!pip install gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Download the MIND-small training archive (saved as MINDsmall_train.zip, per the log below).
!gdown https://drive.google.com/uc?id=1Wro_pVngRi0OVig_i0Edfh5xcCyJQTpD
# Download the MIND-small validation archive (saved as MINDsmall_dev.zip, per the log below).
!gdown https://drive.google.com/uc?id=1bXSeI0FIJCSUlRUxOJrGgg-hTdqdpv-F


Downloading...
From: https://drive.google.com/uc?id=1Wro_pVngRi0OVig_i0Edfh5xcCyJQTpD
To: /content/MINDsmall_train.zip
100% 53.0M/53.0M [00:00<00:00, 276MB/s]
Downloading...
From: https://drive.google.com/uc?id=1bXSeI0FIJCSUlRUxOJrGgg-hTdqdpv-F
To: /content/MINDsmall_dev.zip
100% 30.9M/30.9M [00:00<00:00, 215MB/s]


In [None]:
!unzip "MINDsmall_dev.zip"  -d  "/content/val"
!unzip "MINDsmall_train.zip"  -d  "/content/train"

Archive:  MINDsmall_dev.zip
replace /content/val/behaviors.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/val/behaviors.tsv  
replace /content/val/entity_embedding.vec? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/val/entity_embedding.vec  
replace /content/val/news.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/val/news.tsv   
replace /content/val/relation_embedding.vec? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/val/relation_embedding.vec  
Archive:  MINDsmall_train.zip
replace /content/train/behaviors.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/train/behaviors.tsv  
replace /content/train/entity_embedding.vec? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/train/entity_embedding.vec  
replace /content/train/news.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/train/news.tsv  
replace /content/train/relation_embedding.vec? [y]es, [n]o, [A]ll, [N]one, [r

### behaviors data
- [Impression ID] [User ID] [Impression Time] [User Click History] [Impression News]

### news data
- [News ID] [Category] [Subcategory] [News Title] [News Abstract] [News Url] [Entities in News Title] [Entities in News Abstract] ...

Generate embeddings:

- word embeddings
- news vertical embeddings
- news subvertical embeddings
- user ID embeddings

In [None]:
!pip install recommenders

In [None]:
import sys
import os
import pandas as pd
from collections import Counter
from tqdm import tqdm
import pickle
import numpy as np


from tempfile import TemporaryDirectory
from recommenders.datasets.mind import (download_mind,
                                     extract_mind,
                                     download_and_extract_glove,
                                     load_glove_matrix,
                                     word_tokenize
                                    )
from recommenders.datasets.download_utils import unzip_file

In [None]:
# Dimensionality of the GloVe word vectors requested from load_glove_matrix below.
word_embedding_dim = 300

In [None]:
# Directories holding the extracted MIND training / validation files.
# NOTE(review): the archives are unzipped to /content/train and /content/val above, so
# these relative paths presumably rely on the working directory being /content — confirm.
data_path="train/"
val_data_path="val/"

In [None]:
# Load the training news table. Column names are supplied explicitly and only the
# category and text columns needed for vocabulary building are kept.
news = pd.read_csv(
    os.path.join(data_path, 'news.tsv'),
    sep='\t',
    names=[
        'newid', 'vertical', 'subvertical', 'title',
        'abstract', 'url', 'entities in title', 'entities in abstract',
    ],
    usecols=['vertical', 'subvertical', 'title', 'abstract'],
)
news.head(3)

Unnamed: 0,vertical,subvertical,title,abstract
0,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...
2,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...


In [None]:
# Map every distinct vertical (category) to a 1-based integer id, in order of
# first appearance in the news table.
news_vertical = news.vertical.drop_duplicates().reset_index(drop=True)
vert_dict_inv = news_vertical.to_dict()
vert_dict = {v: k+1 for k, v in vert_dict_inv.items()}

# Same mapping for subverticals. This must be built from subvert_dict_inv;
# using vert_dict_inv here would just produce a second copy of vert_dict.
news_subvertical = news.subvertical.drop_duplicates().reset_index(drop=True)
subvert_dict_inv = news_subvertical.to_dict()
subvert_dict = {v: k+1 for k, v in subvert_dict_inv.items()}

In [None]:
# Tokenize titles and abstracts in place. The columns are overwritten, so the
# isinstance guard keeps values that are already token lists untouched; this makes
# the cell safe to re-run instead of tokenizing the token lists a second time.
news.title = news.title.apply(
    lambda text: text if isinstance(text, list) else word_tokenize(text)
)
news.abstract = news.abstract.apply(
    lambda text: text if isinstance(text, list) else word_tokenize(text)
)

In [None]:
# word_cnt counts tokens appearing in titles only; word_cnt_all counts tokens
# appearing in titles and abstracts.
word_cnt = Counter()
word_cnt_all = Counter()

# Iterate the two columns directly instead of doing a per-row news.loc[i] lookup:
# this is much faster and does not depend on the index being exactly 0..n-1.
for title_tokens, abstract_tokens in tqdm(zip(news['title'], news['abstract']),
                                          total=len(news)):
    word_cnt.update(title_tokens)
    word_cnt_all.update(title_tokens)
    word_cnt_all.update(abstract_tokens)

100%|██████████| 51282/51282 [00:11<00:00, 4483.17it/s]


In [None]:
# Assign each word a 1-based index following the counters' insertion order.
word_dict = {word: idx for idx, word in enumerate(word_cnt, start=1)}
word_dict_all = {word: idx for idx, word in enumerate(word_cnt_all, start=1)}

In [None]:
!mkdir utils

mkdir: cannot create directory ‘utils’: File exists


In [None]:
# Directory where the generated dictionaries, embedding matrices and user index are written.
output_path= 'utils'

In [None]:
# Persist the category and vocabulary lookups so the model and iterator can load them later.
with open(os.path.join(output_path, 'vert_dict.pkl'), 'wb') as f:
    pickle.dump(vert_dict, f)

with open(os.path.join(output_path, 'subvert_dict.pkl'), 'wb') as f:
    pickle.dump(subvert_dict, f)

with open(os.path.join(output_path, 'word_dict.pkl'), 'wb') as f:
    pickle.dump(word_dict, f)

# word_dict_all.pkl must hold the title+abstract vocabulary; dumping word_dict
# here would store the title-only vocabulary under the wrong name.
with open(os.path.join(output_path, 'word_dict_all.pkl'), 'wb') as f:
    pickle.dump(word_dict_all, f)

### Prepare embedding matrices


In [None]:
# Download and unpack the GloVe vectors under data_path. The embedding matrices
# themselves are built in the following cell, so they are not also loaded here
# (doing both would parse the GloVe file twice for each vocabulary).
glove_path = download_and_extract_glove(data_path)

100%|██████████| 842k/842k [02:39<00:00, 5.27kKB/s]


In [None]:
# Build one embedding matrix per vocabulary: word_dict (title words only) and
# word_dict_all (title + abstract words), both with word_embedding_dim columns requested.
# NOTE(review): the second return value is presumably the collection of vocabulary
# words that were found in GloVe — confirm against load_glove_matrix.
embedding_matrix, exist_word = load_glove_matrix(glove_path, word_dict, word_embedding_dim)
embedding_all_matrix, exist_all_word = load_glove_matrix(glove_path, word_dict_all, word_embedding_dim)

400000it [00:08, 47587.68it/s]
400000it [00:09, 40745.94it/s]


In [None]:
# Write both embedding matrices to the output directory as .npy files.
embedding_path = os.path.join(output_path, 'embedding.npy')
embedding_all_path = os.path.join(output_path, 'embedding_all.npy')

np.save(embedding_path, embedding_matrix)
np.save(embedding_all_path, embedding_all_matrix)

#### uid2index.pkl

In [None]:
# Assign every user id found in the training behaviors log a 1-based index,
# in order of first appearance (the user id is the second tab-separated field).
uid2index = {}

behaviors_path = os.path.join(data_path, 'behaviors.tsv')
with open(behaviors_path, 'r') as behaviors:
    for line in tqdm(behaviors):
        fields = line.strip('\n').split('\t')
        user_id = fields[1]
        # setdefault only inserts when the user has not been seen before.
        uid2index.setdefault(user_id, len(uid2index) + 1)

156965it [00:00, 559725.26it/s]


In [None]:
# Persist the user-id -> index mapping next to the other generated artifacts.
uid2index_path = os.path.join(output_path, 'uid2index.pkl')
with open(uid2index_path, 'wb') as uid_file:
    pickle.dump(uid2index, uid_file)

In [None]:
# Summary of the sizes of every artifact generated above.
utils_state = dict(
    vert_num=len(vert_dict),
    subvert_num=len(subvert_dict),
    word_num=len(word_dict),
    word_num_all=len(word_dict_all),
    embedding_exist_num=len(exist_word),
    embedding_exist_num_all=len(exist_all_word),
    uid2index=len(uid2index),
)

### NRMS

In [None]:
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.nrms import NRMSModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set

In [None]:
# Training configuration: epochs and batch_size are passed to prepare_hparams below,
# seed is passed to the NRMSModel constructor.
epochs = 5
seed = 42
batch_size = 32

# Options: demo, small, large
# NOTE(review): MIND_type is not referenced by any later cell in this notebook, and the
# archives downloaded above are MINDsmall_* — confirm whether this value is still needed.
MIND_type = 'demo'

In [None]:
# Input data files for training and validation.
train_news_file = os.path.join(data_path, 'news.tsv')
train_behaviors_file = os.path.join(data_path, 'behaviors.tsv')
valid_news_file = os.path.join(val_data_path, 'news.tsv')
valid_behaviors_file = os.path.join(val_data_path, 'behaviors.tsv')

# Artifacts generated earlier in this notebook.
wordEmb_file = os.path.join(output_path, 'embedding.npy')
userDict_file = os.path.join(output_path, 'uid2index.pkl')
wordDict_file = os.path.join(output_path, 'word_dict.pkl')

# NOTE: from here on subvert_dict / vert_dict hold the *paths* of the pickles,
# replacing the in-memory dictionaries of the same names built earlier.
subvert_dict = os.path.join(output_path, 'subvert_dict.pkl')
vert_dict = os.path.join(output_path, 'vert_dict.pkl')

In [None]:
# Reference copy (kept as comments) of the YAML hyper-parameter settings; the values
# match the ones shown in the printed hparams further below, except where the
# prepare_hparams call overrides them (batch_size, epochs, show_step).
# NOTE(review): presumably this is the content expected in utils/param.yaml — confirm.
# data:
#   title_size: 30
#   body_size: 50
#   his_size: 50
#   vert_num: 17
#   subvert_num: 249
#   data_format: naml
#   npratio: 4
  
# info:
#   metrics:
#   - group_auc
#   - mean_mrr
#   - ndcg@5;10
#   show_step: 100000
  
# model:
#   attention_hidden_dim: 200
#   word_emb_dim: 300
#   vert_emb_dim: 100
#   subvert_emb_dim: 100

#   dropout: 0.2
#   filter_num: 400
#   window_size: 3
#   cnn_activation: relu
#   model_type: naml
#   dense_activation: relu

# train:
#   batch_size: 64
#   epochs: 10
#   learning_rate: 0.0001
#   loss: cross_entropy_loss
#   optimizer: adam
#   support_quick_scoring: true


In [None]:
# Path of the YAML hyper-parameter file passed to prepare_hparams below.
# NOTE(review): no cell in this notebook writes this file, so it presumably has to
# exist in the output directory beforehand — confirm.
yaml_file = os.path.join(output_path, r'param.yaml')


In [None]:
# Disabled: optional download of the MINDdemo_utils.zip resource bundle into
# <output_path>/utils when the YAML file is missing. Kept for reference only.
#  from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources
#  if not os.path.exists(yaml_file):
#         download_deeprec_resources(
#             r"https://recodatasets.z20.web.core.windows.net/newsrec/",
#             os.path.join(output_path, "utils"),
#             "MINDdemo_utils.zip",
#         )

100%|██████████| 95.0k/95.0k [00:12<00:00, 7.41kKB/s]


In [None]:
# Build the hyper-parameter object from the YAML file plus notebook-level
# overrides: artifact paths, batch size, epoch count and logging interval.
hparams = prepare_hparams(
    yaml_file,
    wordEmb_file=wordEmb_file,
    wordDict_file=wordDict_file,
    userDict_file=userDict_file,
    batch_size=batch_size,
    subvertDict_file=subvert_dict,  # path to subvert_dict.pkl
    vertDict_file=vert_dict,        # path to vert_dict.pkl
    epochs=epochs,
    show_step=10,
)
print(hparams)

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 4, 'head_dim': 100, 'filter_num': 400, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 5, 'batch_size': 32, 'show_step': 10, 'title_size': 30, 'body_size': 50, 'his_size': 50, 'vert_num': 17, 'subvert_num': 249, 'data_format': 'naml', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'cnn_activation': 'relu', 'model_type': 'naml', 'dense_activation': 'relu', 'loss': 'cross_entropy_loss', 'wordEmb_file': 'utils/embedding.npy', 'wordDict_file': 'utils/word_dict.pkl', 'userDict_file': 'utils/uid2index.pkl', 'subvertDict_file': 'utils/subvert_dict.pkl', 'vertDict_file': 'utils/vert_dict.pkl'}


### Train the NRMS model


In [None]:
# The iterator class itself (not an instance) is handed to the model constructor below.
iterator = MINDIterator

In [None]:
# Instantiate the NRMS model from the hyper-parameters, the iterator class and the seed.
model = NRMSModel(hparams, iterator, seed=seed)

  super(Adam, self).__init__(name, **kwargs)


In [None]:
# Baseline: evaluate the model on the validation set before any training has run.
print(model.run_eval(valid_news_file, valid_behaviors_file))

  updates=self.state_updates,
1326it [00:04, 287.36it/s]
2286it [00:44, 51.29it/s]
73152it [00:08, 8959.96it/s]


{'group_auc': 0.4659, 'mean_mrr': 0.201, 'ndcg@5': 0.2032, 'ndcg@10': 0.265}


In [None]:
# Train on the training files; per the log below, validation metrics are reported
# after every epoch.
model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)


step 7380 , total_loss: 1.4067, data_loss: 1.4272: : 7386it [14:02,  8.77it/s]
1326it [00:01, 814.53it/s]
2286it [00:40, 56.64it/s]
73152it [00:07, 9380.47it/s]


at epoch 1
train info: logloss loss:1.406682246234273
eval info: group_auc:0.6111, mean_mrr:0.2712, ndcg@10:0.3632, ndcg@5:0.2934
at epoch 1 , train time: 842.3 eval time: 108.7


step 7380 , total_loss: 1.3218, data_loss: 1.2796: : 7386it [13:52,  8.87it/s]
1326it [00:01, 811.84it/s]
2286it [00:40, 56.53it/s]
73152it [00:07, 9195.13it/s]


at epoch 2
train info: logloss loss:1.3217672884399705
eval info: group_auc:0.6324, mean_mrr:0.2836, ndcg@10:0.3773, ndcg@5:0.3099
at epoch 2 , train time: 832.3 eval time: 108.4


step 7380 , total_loss: 1.2846, data_loss: 1.1310: : 7386it [13:52,  8.88it/s]
1326it [00:01, 816.80it/s]
2286it [00:40, 56.61it/s]
73152it [00:07, 9521.24it/s]


at epoch 3
train info: logloss loss:1.2844912215619433
eval info: group_auc:0.6388, mean_mrr:0.2893, ndcg@10:0.3847, ndcg@5:0.3172
at epoch 3 , train time: 832.1 eval time: 107.7


step 7380 , total_loss: 1.2599, data_loss: 1.2753: : 7386it [13:52,  8.87it/s]
1326it [00:01, 816.61it/s]
2286it [00:40, 56.58it/s]
73152it [00:07, 9382.43it/s]


at epoch 4
train info: logloss loss:1.259794737374011
eval info: group_auc:0.6455, mean_mrr:0.2954, ndcg@10:0.3907, ndcg@5:0.325
at epoch 4 , train time: 832.9 eval time: 108.0


step 7380 , total_loss: 1.2402, data_loss: 1.0269: : 7386it [13:52,  8.87it/s]
1326it [00:01, 806.32it/s]
2286it [00:40, 56.44it/s]
73152it [00:07, 9535.13it/s]


at epoch 5
train info: logloss loss:1.2402255413834395
eval info: group_auc:0.6457, mean_mrr:0.2981, ndcg@10:0.3925, ndcg@5:0.3253
at epoch 5 , train time: 832.5 eval time: 108.0


<recommenders.models.newsrec.models.nrms.NRMSModel at 0x7f6664c48a50>

In [None]:
# Evaluate the trained model on the validation set and keep the metrics dict.
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)

1326it [00:01, 742.37it/s]
2286it [00:40, 56.42it/s]
73152it [00:08, 8373.77it/s]


{'group_auc': 0.6457, 'mean_mrr': 0.2981, 'ndcg@5': 0.3253, 'ndcg@10': 0.3925}


In [None]:
# Save the trained weights under <data_path>/model, creating the directory if needed.
model_path = os.path.join(data_path, "model")
os.makedirs(model_path, exist_ok=True)

checkpoint_prefix = os.path.join(model_path, "nrms_ckpt")
model.model.save_weights(checkpoint_prefix)

In [None]:
# Score the validation set and collect the grouped results.
# NOTE(review): presumably one entry per impression (index, labels, predicted scores),
# going by the variable names — confirm against run_fast_eval.
group_impr_indexes, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)


1326it [00:01, 808.22it/s]
2286it [00:40, 56.51it/s]
73152it [00:09, 7869.39it/s]


In [None]:
# Write one line per impression: "<impression index + 1> [r1,r2,...]", where r_k is
# the 1-based rank of the k-th candidate when sorted by descending predicted score.
prediction_path = os.path.join(data_path, 'prediction.txt')
with open(prediction_path, 'w') as out:
    for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):
        # argsort of the descending argsort gives each candidate's 0-based rank.
        ranks = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()
        ranks_text = ','.join(map(str, ranks))
        out.write(f"{impr_index + 1} [{ranks_text}]\n")

73152it [00:01, 56693.38it/s]
