In [23]:
# from newsrec.config import model_name
import pandas as pd
import swifter
import json
import math
from tqdm import tqdm
from os import path
from pathlib import Path
import random
from nltk.tokenize import word_tokenize
import numpy as np
import csv
import importlib
from transformers import RobertaTokenizer, RobertaModel
import torch

import nltk
# Fetch NLTK's 'punkt' resource up front; presumably this is what the
# `word_tokenize` calls later in the notebook rely on — TODO confirm for the
# installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Root directory of the training split; every input/output path below lives in it.
train_dir = '../../data/train'
mode = 'train'


def _in_train_dir(name):
    """Return `name` joined onto `train_dir`."""
    return path.join(train_dir, name)


# Raw input table and the parsed table this notebook writes.
source = _in_train_dir('news.tsv')
target = _in_train_dir('news_parsed.tsv')

# Directory receiving the RoBERTa embedding arrays.
roberta_output_dir = _in_train_dir('roberta')

# Vocabulary files written by the final cell.
category2int_path = _in_train_dir('category2int.tsv')
word2int_path = _in_train_dir('word2int.tsv')
entity2int_path = _in_train_dir('entity2int.tsv')

In [4]:
# Load the raw news table. Column 5 of the file is skipped; the remaining
# columns are given explicit names because the file has no header row.
news = pd.read_table(source,
                     header=None,
                     usecols=[0, 1, 2, 3, 4, 6, 7],
                     quoting=csv.QUOTE_NONE,
                     names=[
                         'id', 'category', 'subcategory', 'title',
                         'abstract', 'title_entities', 'abstract_entities'
                     ])  # TODO try to avoid csv.QUOTE_NONE

# Fill missing values on the frame itself rather than with `inplace=True` on a
# column accessed through the frame: that chained in-place form is not
# guaranteed to write back to `news` (it does not under pandas Copy-on-Write).
# Missing entity cells become an empty JSON list so they stay parseable by
# `json.loads`; every other missing cell becomes a single space.
news = news.fillna({
    'title_entities': '[]',
    'abstract_entities': '[]'
}).fillna(' ')

In [5]:
# Preview the first rows of the loaded news table.
news.head()

Unnamed: 0,id,category,subcategory,title,abstract,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...","[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [7]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def _encode_fixed_length(texts, max_length):
    """Tokenize `texts`, padding and truncating each one to `max_length` ids."""
    return tokenizer(texts,
                     padding='max_length',
                     truncation=True,
                     max_length=max_length)


# Titles are encoded to 20 token ids, abstracts to 50.
title_roberta = _encode_fixed_length(news.title.tolist(), 20)
abstract_roberta = _encode_fixed_length(news.abstract.tolist(), 50)

In [8]:
# One row per news item; each cell holds that item's token-id list or
# attention-mask list as produced by the tokenizer.
roberta_df = pd.DataFrame({
    'title_roberta': title_roberta['input_ids'],
    'title_mask_roberta': title_roberta['attention_mask'],
    'abstract_roberta': abstract_roberta['input_ids'],
    'abstract_mask_roberta': abstract_roberta['attention_mask'],
})

In [9]:
# Preview the first rows of the tokenizer output table.
roberta_df.head()

Unnamed: 0,title_roberta,title_mask_roberta,abstract_roberta,abstract_mask_roberta
0,"[0, 133, 11885, 3929, 4690, 6, 2804, 3163, 6, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 39948, 5, 36459, 6, 20200, 6, 8, 55, 14, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[0, 30888, 15964, 4424, 14829, 14614, 15, 1426...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[0, 20770, 18, 92, 9481, 8255, 836, 380, 2656,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[0, 1096, 31846, 16128, 2629, 286, 163, 10917,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...","[0, 4528, 6590, 27243, 10095, 32, 1826, 47, 12...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[0, 26402, 21519, 9, 15067, 9243, 2196, 148, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 133, 7860, 9, 140, 18, 11572, 36996, 11, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 45105, 4, 12110, 10347, 14717, 2580, 20003...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [14]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Convert every tokenizer field to a tensor on `device`, in place.
# `torch.as_tensor` accepts both the original nested lists and values that are
# already tensors, so re-running this cell does not copy-construct a tensor
# from a tensor (which `torch.tensor` warns about).
for encoded in (title_roberta, abstract_roberta):
    for key in list(encoded.keys()):
        encoded[key] = torch.as_tensor(encoded[key], device=device)

# Make sure the embedding output directory exists before anything is saved.
Path(roberta_output_dir).mkdir(parents=True, exist_ok=True)
roberta = RobertaModel.from_pretrained('roberta-base',
                                       return_dict=True).to(device)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

In [18]:
# Debug view: print each field of the encoded titles next to its name.
for field in title_roberta.keys():
    print(field, title_roberta[field])

input_ids tensor([[    0,   133, 11885,  ...,     1,     1,     1],
        [    0, 30888, 15964,  ...,     1,     1,     1],
        [    0,  1096, 31846,  ...,     1,     1,     1],
        ...,
        [    0,  6179,     5,  ...,     1,     1,     1],
        [    0, 19183,  1847,  ...,     1,     1,     1],
        [    0,  3609,  1250,  ...,     1,     1,     1]])
attention_mask tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


In [20]:
batch_size = 128
num_batches = math.ceil(len(news) / batch_size)

# Per-batch model outputs, moved to the CPU and stored as NumPy arrays.
title_last_hidden_state, title_pooler_output = [], []
abstract_last_hidden_state, abstract_pooler_output = [], []

with torch.no_grad():  # forward passes only, no gradient tracking
    for count in tqdm(range(num_batches),
                      desc="Calculating news embeddings with RoBERTa"):
        # Rows of the current mini-batch (the last one may be shorter).
        rows = slice(count * batch_size, (1 + count) * batch_size)

        title_roberta_minibatch = {
            field: tensor[rows]
            for field, tensor in title_roberta.items()
        }
        title_outputs = roberta(**title_roberta_minibatch)
        title_last_hidden_state.append(
            title_outputs['last_hidden_state'].cpu().numpy())
        title_pooler_output.append(
            title_outputs['pooler_output'].cpu().numpy())

        abstract_roberta_minibatch = {
            field: tensor[rows]
            for field, tensor in abstract_roberta.items()
        }
        abstract_outputs = roberta(**abstract_roberta_minibatch)
        abstract_last_hidden_state.append(
            abstract_outputs['last_hidden_state'].cpu().numpy())
        abstract_pooler_output.append(
            abstract_outputs['pooler_output'].cpu().numpy())

# Stitch the batches back together along the row axis and write one array per
# output, in the same order as before.
for file_name, batches in (
        ('title_last_hidden_state.npy', title_last_hidden_state),
        ('title_pooler_output.npy', title_pooler_output),
        ('abstract_last_hidden_state.npy', abstract_last_hidden_state),
        ('abstract_pooler_output.npy', abstract_pooler_output),
):
    np.save(path.join(roberta_output_dir, file_name),
            np.concatenate(batches, axis=0))

Calculating news embeddings with RoBERTa:   1%|          | 9/794 [03:05<4:30:15, 20.66s/it]


KeyboardInterrupt: 

In [25]:
# Vocabulary ids start at 1; 0 is what `parse_row` uses for unknown/padding.
category2int = {}
word2freq = {}
entity2freq = {}

# Minimum accumulated frequency an item needs to receive an id.
word_freq_threshold = 1
entity_freq_threshold = 2

for record in news.itertuples(index=False):
    # Categories and subcategories share one id space, numbered in order of
    # first appearance.
    for label in (record.category, record.subcategory):
        if label not in category2int:
            category2int[label] = len(category2int) + 1

    # Count lowercased tokens of the title, then of the abstract.
    for text in (record.title, record.abstract):
        for token in word_tokenize(text.lower()):
            word2freq[token] = word2freq.get(token, 0) + 1

    # Each entity contributes (number of occurrences) * confidence; entries
    # with a non-positive weight are skipped.
    for entities_json in (record.title_entities, record.abstract_entities):
        for entity in json.loads(entities_json):
            weight = len(entity['OccurrenceOffsets']) * entity['Confidence']
            if weight > 0:
                wikidata_id = entity['WikidataId']
                entity2freq[wikidata_id] = (
                    entity2freq.get(wikidata_id, 0) + weight)

# Assign consecutive ids, in first-seen order, to everything frequent enough.
word2int = {}
for token, freq in word2freq.items():
    if freq >= word_freq_threshold:
        word2int[token] = len(word2int) + 1

entity2int = {}
for wikidata_id, freq in entity2freq.items():
    if freq >= entity_freq_threshold:
        entity2int[wikidata_id] = len(entity2int) + 1


In [27]:
def _build_local_entity_map(entities_json_list, entity_confidence_threshold):
    """Map each lowercased surface-form word to the id of its entity.

    Only entities whose confidence exceeds the threshold and that have an id
    in `entity2int` are used. Later entries overwrite earlier ones for the
    same word.
    """
    local_entity_map = {}
    for entities_json in entities_json_list:
        for e in json.loads(entities_json):
            if (e['Confidence'] > entity_confidence_threshold
                    and e['WikidataId'] in entity2int):
                entity_id = entity2int[e['WikidataId']]
                for x in ' '.join(e['SurfaceForms']).lower().split():
                    local_entity_map[x] = entity_id
    return local_entity_map


def _encode_text(text, length, local_entity_map):
    """Return fixed-length (word ids, entity ids) lists for `text`."""
    word_ids = [0] * length
    entity_ids = [0] * length
    # zip with range(length) stops at `length` tokens, so longer texts are
    # truncated explicitly instead of by catching an IndexError.
    for i, w in zip(range(length), word_tokenize(text.lower())):
        if w in word2int:
            word_ids[i] = word2int[w]
            # An entity id is only recorded for in-vocabulary words.
            if w in local_entity_map:
                entity_ids[i] = local_entity_map[w]
    return word_ids, entity_ids


def parse_row(row,
              num_words_title=20,
              num_words_abstract=50,
              entity_confidence_threshold=0.5):
    """Convert one raw news row into integer-encoded features.

    Args:
        row: Object exposing `id`, `category`, `subcategory`, `title`,
            `abstract`, `title_entities` and `abstract_entities` attributes;
            the two entity fields are JSON-encoded lists.
        num_words_title: Length of the title word/entity id lists.
        num_words_abstract: Length of the abstract word/entity id lists.
        entity_confidence_threshold: Entities must have a confidence strictly
            above this value to be used.

    Returns:
        A `pd.Series` indexed by `id`, `category`, `subcategory`, `title`,
        `abstract`, `title_entities` and `abstract_entities`. Categories and
        words missing from the lookup tables are encoded as 0, and 0 is also
        the padding value.
    """
    # Entities from the title and the abstract share a single word -> id map.
    local_entity_map = _build_local_entity_map(
        (row.title_entities, row.abstract_entities),
        entity_confidence_threshold)

    title_words, title_entity_ids = _encode_text(row.title, num_words_title,
                                                 local_entity_map)
    abstract_words, abstract_entity_ids = _encode_text(
        row.abstract, num_words_abstract, local_entity_map)

    new_row = [
        row.id,
        category2int.get(row.category, 0),
        category2int.get(row.subcategory, 0),
        title_words, abstract_words,
        title_entity_ids, abstract_entity_ids
    ]

    return pd.Series(new_row,
                     index=[
                         'id', 'category', 'subcategory', 'title',
                         'abstract', 'title_entities', 'abstract_entities'
                     ])

In [28]:
# Run `parse_row` on every row of `news` (axis=1) through the swifter accessor.
parsed_news = news.swifter.apply(parse_row, axis=1)

In [29]:
# Preview the first rows of the integer-encoded news table.
parsed_news.head()

Unnamed: 0,id,category,subcategory,title,abstract,title_entities,abstract_entities
0,N88753,1,2,"[1, 2, 3, 4, 5, 6, 7, 5, 8, 6, 9, 10, 11, 0, 0...","[12, 1, 13, 5, 14, 5, 8, 15, 16, 1, 17, 18, 19...","[0, 0, 2, 2, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,N45436,3,4,"[23, 24, 25, 26, 27, 28, 0, 0, 0, 0, 0, 0, 0, ...","[29, 30, 31, 32, 33, 34, 35, 36, 26, 37, 38, 3...","[4, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,N23144,5,6,"[40, 41, 42, 43, 44, 45, 0, 0, 0, 0, 0, 0, 0, ...","[46, 47, 48, 42, 49, 50, 51, 52, 8, 53, 51, 54...","[0, 0, 0, 0, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,N86255,5,7,"[58, 59, 56, 60, 61, 62, 1, 63, 30, 64, 52, 65...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,N93187,3,8,"[1, 66, 59, 67, 30, 68, 69, 70, 1, 71, 59, 72,...","[74, 75, 76, 77, 78, 79, 80, 59, 81, 82, 83, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [30]:
# Append the RoBERTa token-id/mask columns to the parsed rows. Any RoBERTa
# columns already present are dropped first, so re-running this cell does not
# produce duplicated columns in the frame or in the written file.
parsed_news = pd.concat([
    parsed_news.drop(columns=roberta_df.columns, errors='ignore'), roberta_df
],
                        axis=1)
parsed_news.to_csv(target, sep='\t', index=False)

# Write each lookup table as a two-column TSV and print the matching size to
# put in the config (the extra 1 accounts for id 0, which is never assigned).
pd.DataFrame(category2int.items(),
             columns=['category', 'int']).to_csv(category2int_path,
                                                 sep='\t',
                                                 index=False)
print(
    f'Please modify `num_categories` in `src/config.py` into 1 + {len(category2int)}'
)

pd.DataFrame(word2int.items(), columns=['word',
                                        'int']).to_csv(word2int_path,
                                                       sep='\t',
                                                       index=False)
print(
    f'Please modify `num_words` in `src/config.py` into 1 + {len(word2int)}'
)

pd.DataFrame(entity2int.items(),
             columns=['entity', 'int']).to_csv(entity2int_path,
                                               sep='\t',
                                               index=False)
print(
    f'Please modify `num_entities` in `src/config.py` into 1 + {len(entity2int)}'
)

Please modify `num_categories` in `src/config.py` into 1 + 295
Please modify `num_words` in `src/config.py` into 1 + 101220
Please modify `num_entities` in `src/config.py` into 1 + 21842
