In [8]:
import torch
import torch.nn as nn
from Transformer_Models import ContextDistilBert
from datasets import load_dataset, concatenate_datasets

In [15]:
# Load the 'online_news_popularity_data' dataset through Hugging Face `datasets`.
# NOTE(review): later cells index the result with 'train' and 'validation',
# so it is presumably a DatasetDict exposing those two splits — confirm.
articles = load_dataset('online_news_popularity_data')

Found cached dataset online_news_popularity_data (/home/leepark/.cache/huggingface/datasets/online_news_popularity_data/online_news_popularity_data/1.0.0/f3e03630a13ebe013884d6a83c7ec52cb4eec2c0f6012f710c9dba58aa719fcd)


  0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
from transformers import AutoTokenizer

# Tokenizer for the 'distilbert-base-uncased' checkpoint; `tokenize` below
# reads it as a global.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    """Tokenize the 'title' and 'content' columns of a batch separately.

    Args:
        batch: mapping that provides 'title' and 'content' entries, each
            passed straight to the global `tokenizer`.

    Returns:
        A single dict holding every key the tokenizer produced, suffixed
        with '_title' or '_content' according to the column it came from
        (title keys are inserted first, then content keys).
    """
    encoded = {}
    for column in ('title', 'content'):
        # padding=True / truncation=True are forwarded unchanged for both columns.
        tokens = tokenizer(batch[column], padding = True, truncation = True)
        for key, value in tokens.items():
            encoded[f"{key}_{column}"] = value
    return encoded

# Tokenize every split in one pass (batch_size=None hands `tokenize` the whole
# split at once), drop the raw text columns and 'shares', then expose the
# 'shares_class' column under the name 'labels'.
articles_encoded = articles.map(
    tokenize,
    batched = True,
    batch_size = None,
    remove_columns = ['title', 'content', 'shares'],
).rename_column('shares_class', 'labels')

# Indexing the encoded dataset now yields PyTorch tensors.
articles_encoded.set_format('pt')


Loading cached processed dataset at /home/leepark/.cache/huggingface/datasets/online_news_popularity_data/online_news_popularity_data/1.0.0/f3e03630a13ebe013884d6a83c7ec52cb4eec2c0f6012f710c9dba58aa719fcd/cache-19a1c81d15117239.arrow


Map:   0%|          | 0/7922 [00:00<?, ? examples/s]

In [17]:
# All inference in this notebook runs on the CPU.
device = torch.device('cpu')
# Identifier handed to ContextDistilBert.
# NOTE(review): presumably a local checkpoint directory — confirm.
model_ckpt = 'domain_adaptation_final_body'
# Build the model from the checkpoint and move it to `device`.
model = ContextDistilBert(model_ckpt).to(device)

In [18]:
# Merge the encoded train and validation splits into a single dataset,
# with the train rows listed first.
all_dataset = concatenate_datasets([articles_encoded['train'], articles_encoded['validation']])

In [21]:
# Keep only the non-token columns (the tokenizer outputs for title and content
# are removed); what remains, including 'labels', is written next to the
# hidden states in the export cell.
features = all_dataset.remove_columns(['input_ids_title', 'attention_mask_title', 'input_ids_content', 'attention_mask_content'])

In [30]:
# Return numpy arrays when `features` is indexed; the export cell calls
# `.reshape` on each column slice and stacks them with numpy.
features.set_format('numpy')

In [54]:
import csv
from tqdm import tqdm
import numpy as np

# Export one CSV row per example: the model's hidden-state vector followed by
# the example's remaining feature columns.

batch_size = 20
# Number of complete batches (kept under its original name and meaning).
training_steps = len(all_dataset)//batch_size
# Ceiling division: includes a final partial batch, but never schedules an
# empty trailing batch when len(all_dataset) is a multiple of batch_size.
num_batches = (len(all_dataset) + batch_size - 1)//batch_size

features_name = features.column_names
# NOTE(review): 1536 is presumably the width of the vector ContextDistilBert
# returns per example — confirm; a mismatch is reported below instead of being
# silently truncated by zip().
hidden_state_name = [f"h_{i}" for i in range(1536)]

# Inference only: put the model in eval mode so dropout (if any) is disabled
# and the exported vectors are deterministic.
model.eval()

# newline='' is what the csv module requires of files it writes to; without it
# extra blank lines can appear on platforms that translate '\n'.
with open("hidden_state_vector.csv", 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames= hidden_state_name + features_name)
    writer.writeheader()
    for batch_index in tqdm(range(num_batches)):
        # Only meaningful when running on a GPU.
        if device.type == 'cuda':
            torch.cuda.empty_cache()

        start = batch_index*batch_size
        end = start + batch_size

        with torch.no_grad():
            # Slice the SAME dataset that `features` was derived from, so the
            # hidden states and feature columns of a row describe the same
            # example (train rows first, then validation rows).
            # NOTE(review): assumes all_dataset still yields torch tensors
            # (format inherited from articles_encoded) — confirm.
            batch = all_dataset[start:end]
            h = model(**{k:t.to(device) for k,t in batch.items()}).cpu().numpy()

        # One column per feature, rows aligned with the batch.
        v = np.hstack([vec.reshape(-1,1) for vec in features[start:end].values()])

        if h.shape[-1] != len(hidden_state_name):
            raise ValueError(
                f"model returned vectors of width {h.shape[-1]}, "
                f"expected {len(hidden_state_name)}"
            )
        if len(h) != len(v):
            raise ValueError(
                f"batch {batch_index}: {len(h)} hidden-state rows "
                f"but {len(v)} feature rows"
            )

        for row_h, row_v in zip(h,v):
            # Keep the individual scalars (no dtype-coercing concatenation) so
            # each value is written with its own string representation.
            writer.writerow(dict(zip(writer.fieldnames, [*row_h, *row_v])))



  0%|                                        | 5/1981 [01:01<6:43:04, 12.24s/it]


KeyboardInterrupt: 