In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Tokenizer for the 'distilbert-base-uncased' checkpoint; the tokenize() helper in a later
# cell reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
# Dataset addressed by the name 'online_news_popularity_data'; later cells index the
# tokenized result by 'train' and 'test'.
articles = load_dataset('online_news_popularity_data')

Found cached dataset online_news_popularity_data (/home/leepark/.cache/huggingface/datasets/online_news_popularity_data/online_news_popularity_data/1.0.0/f3e03630a13ebe013884d6a83c7ec52cb4eec2c0f6012f710c9dba58aa719fcd)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
def tokenize(batch):
    """Tokenize the ``title`` and ``content`` columns of a batch independently.

    Each tokenizer output key is suffixed with the column it came from
    (``<key>_title`` / ``<key>_content``) so both encodings can live side by
    side in a single mapping.

    Args:
        batch: mapping with ``'title'`` and ``'content'`` entries holding the
            texts to encode.

    Returns:
        dict with the suffixed title entries first, followed by the suffixed
        content entries.
    """
    def _with_suffix(encoding, suffix):
        # Re-key every tokenizer output as "<original key>_<suffix>".
        return {f"{name}_{suffix}": values for name, values in encoding.items()}

    # Titles are padded to the longest title in the batch.
    features = _with_suffix(
        tokenizer(batch['title'], padding = True, truncation = True),
        "title",
    )
    # Contents are padded/truncated to a fixed length of 512 tokens.
    features.update(
        _with_suffix(
            tokenizer(batch['content'], padding = 'max_length', max_length = 512, truncation = True),
            "content",
        )
    )
    return features

In [3]:
# Run tokenize() over the dataset in batched mode and drop the raw 'title', 'content'
# and 'shares' columns from the result.
# NOTE(review): batch_size=None presumably hands each whole split to tokenize() as a single
# batch, so the title padding length would be shared across the split -- confirm against
# the datasets documentation.
articles_encoded = articles.map(tokenize, remove_columns = ['title','content','shares'],
            batched = True, batch_size = None)

Loading cached processed dataset at /home/leepark/.cache/huggingface/datasets/online_news_popularity_data/online_news_popularity_data/1.0.0/f3e03630a13ebe013884d6a83c7ec52cb4eec2c0f6012f710c9dba58aa719fcd/cache-97268e3b8769f89b.arrow
Loading cached processed dataset at /home/leepark/.cache/huggingface/datasets/online_news_popularity_data/online_news_popularity_data/1.0.0/f3e03630a13ebe013884d6a83c7ec52cb4eec2c0f6012f710c9dba58aa719fcd/cache-60f76fa598c78ae0.arrow


In [4]:
import os
from dataclasses import dataclass
from typing import Optional

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from Transformer_Models import ContextDistilBert, ContextDistilBertwithData


@dataclass
class SequenceClassifierOutput:
    """Container for everything the classification model returns from ``forward``.

    Every field defaults to ``None`` so callers can populate only what they have.

    Attributes:
        loss: training loss, or ``None`` when no labels were supplied.
        logits: raw classifier outputs.
        preds: NumPy copy of the classifier outputs.
        labels: the labels the model was called with, if any.
        hidden_states: output of the model body that feeds the classifier head.
    """
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    # np.ndarray is the array type; np.array is only the constructor function.
    preds: Optional[np.ndarray] = None
    labels: Optional[torch.Tensor] = None
    hidden_states: Optional[torch.FloatTensor] = None


class ContextDistilBertwithDataforClassification(nn.Module):
    """Classification head on top of a ``ContextDistilBertwithData`` body.

    The body output is passed through ``Linear -> GELU -> Dropout -> Linear`` to
    produce one logit per class.

    Args:
        model_dir: forwarded unchanged to ``ContextDistilBertwithData``.
        hidden_dim: width of the body output and of the pre-classifier layer
            (default 4398, the value previously hard-coded here).
        num_labels: number of output classes (default 2).
        dropout: dropout probability applied before the final layer (default 0.1).
    """

    def __init__(self, model_dir, hidden_dim=4398, num_labels=2, dropout=0.1):
        super().__init__()
        # Attribute names are part of the checkpoint format (state_dict keys) -- keep them.
        self.body = ContextDistilBertwithData(model_dir)
        self.pre_classifier = nn.Linear(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, num_labels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, **kwargs):
        """Run the body and the classification head.

        Args:
            **kwargs: forwarded unchanged to the body. An optional
                ``shares_class`` entry holds the class index of each example
                and, when present, is used to compute the loss.

        Returns:
            SequenceClassifierOutput with ``logits``, ``preds`` (the detached
            logits as a NumPy array), ``hidden_states`` (the raw body output),
            ``labels`` and ``loss`` (``None`` when no labels were given).
        """
        # .get() so that label-free inference works; indexing raised KeyError and made
        # the `labels is not None` guard below unreachable.
        labels = kwargs.get('shares_class')
        distilbert_output = self.body(**kwargs)

        pooled_output = self.pre_classifier(distilbert_output)
        pooled_output = F.gelu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        # NOTE: these are raw logits, not softmax probabilities; the value is kept as-is
        # for callers that already consume `preds`.
        preds = logits.detach().cpu().numpy()

        loss = None
        if labels is not None:
            # One logit per class with integer class targets calls for cross-entropy.
            # BCEWithLogitsLoss requires the target to have the same shape as the input,
            # so (N, num_labels) logits against (N,) labels always raised ValueError.
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                logits.view(-1, self.classifier.out_features),
                labels.view(-1).long(),
            )

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=distilbert_output,
            preds=preds,
            labels=labels,
        )

In [25]:
import smtplib
from email.mime.text import MIMEText
from functools import partial


def send_email(subject, body, sender, recipients, password):
    """Send a plain-text e-mail through Gmail's SMTP-over-SSL endpoint.

    Args:
        subject: value of the ``Subject`` header.
        body: plain-text message body.
        sender: address used both for the ``From`` header and for the SMTP login.
        recipients: iterable of destination addresses.
        password: password used to log in as ``sender``.

    Raises:
        smtplib.SMTPException: if the login or the delivery fails.
    """
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = sender
    msg['To'] = ', '.join(recipients)
    # The context manager closes the connection even when login() or sendmail() raises,
    # which the previous explicit quit() at the end did not.
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as smtp_server:
        smtp_server.login(sender, password)
        smtp_server.sendmail(sender, recipients, msg.as_string())

subject = "Attention: here is the progress of your run"

# The SMTP password and the contact addresses are read from the environment instead of
# being committed with the notebook. The password that used to be hard-coded here should
# be treated as leaked and revoked.
#   NOTIFY_SENDER      - address used for login and for the From header
#   NOTIFY_RECIPIENTS  - comma-separated destination addresses
#   NOTIFY_PASSWORD    - password for NOTIFY_SENDER
# Unset variables fall back to empty values, in which case sending fails at login time.
sender = os.environ.get("NOTIFY_SENDER", "")
recipients = [addr.strip() for addr in os.environ.get("NOTIFY_RECIPIENTS", "").split(",") if addr.strip()]
password = os.environ.get("NOTIFY_PASSWORD", "")

# Manual smoke test (sends a real message):
# send_email(subject, "This is the body of the text message", sender, recipients, password)

# Only `body` is left to supply: send_text(body="...")
send_text = partial(send_email, subject = subject, sender = sender, recipients = recipients, password = password)


In [None]:
from tqdm import tqdm
import csv


# Texting function

# NOTE(review): this repeats the send_email definition from an earlier cell and silently
# shadows it; it also relies on `smtplib` and `MIMEText` being imported by that earlier
# cell. Consider deleting this copy.
def send_email(subject, body, sender, recipients, password):
    """Send a plain-text e-mail through Gmail's SMTP-over-SSL endpoint.

    Args:
        subject: value of the ``Subject`` header.
        body: plain-text message body.
        sender: address used both for the ``From`` header and for the SMTP login.
        recipients: iterable of destination addresses.
        password: password used to log in as ``sender``.

    Raises:
        smtplib.SMTPException: if the login or the delivery fails.
    """
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = sender
    msg['To'] = ', '.join(recipients)
    # The context manager closes the connection even when login() or sendmail() raises,
    # which the previous explicit quit() at the end did not.
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as smtp_server:
        smtp_server.login(sender, password)
        smtp_server.sendmail(sender, recipients, msg.as_string())

subject = "Attention: here is the progress of your run"

# The SMTP password and the contact addresses are read from the environment instead of
# being committed with the notebook. The password that used to be hard-coded here should
# be treated as leaked and revoked.
#   NOTIFY_SENDER      - address used for login and for the From header
#   NOTIFY_RECIPIENTS  - comma-separated destination addresses
#   NOTIFY_PASSWORD    - password for NOTIFY_SENDER
# Unset variables fall back to empty values, in which case sending fails at login time.
sender = os.environ.get("NOTIFY_SENDER", "")
recipients = [addr.strip() for addr in os.environ.get("NOTIFY_RECIPIENTS", "").split(",") if addr.strip()]
password = os.environ.get("NOTIFY_PASSWORD", "")

# Manual smoke test, disabled: the original call referenced an undefined `body`
# (NameError) and would send a real message on every run of this cell.
# send_email(subject, "This is the body of the text message", sender, recipients, password)

# extracting_hidden_layers() calls `send_text`; the partial used to be bound only to the
# misspelled `sent_text` and referenced an undefined `recipient`.
send_text = partial(send_email, subject = subject, sender = sender, recipients = recipients, password = password)
sent_text = send_text  # alias kept for anything still using the original misspelled name


def extracting_hidden_layers(h_training_dataset, model, batch_size, device, fname, verbose = False):
    """Run ``model`` over a dataset in batches and write ``hidden_states`` to a CSV file.

    The file gets one header row of 4398 column names (``h_0..h_1535``,
    ``xx_0..xx_2808``, ``x_0..x_52``) followed by one row per example. A
    progress message is sent through the module-level ``send_text`` every
    25 batches, starting with the first.

    Args:
        h_training_dataset: dataset exposing ``shuffle()``, ``num_rows`` and
            slice indexing that returns a mapping of column name -> values.
        model: ``nn.Module`` called as ``model(**batch)``; its result must
            expose ``hidden_states``.
        batch_size: number of rows fed to the model per step.
        device: device each batch is moved to before the forward pass.
        fname: path of the CSV file to (over)write.
        verbose: when true, also print each progress message.

    Raises:
        ValueError: if the hidden-state width does not match the CSV header.
    """
    # NOTE(review): rows are written in shuffled order with no identifier or label column,
    # so the CSV cannot be re-aligned with the source dataset -- confirm this is intended.
    h_training_dataset = h_training_dataset.shuffle()
    total_rows = h_training_dataset.num_rows
    # Ceiling division: a trailing partial batch still needs its own step.
    steps = (total_rows + batch_size - 1) // batch_size

    # Each comprehension now uses its own loop variable; `f'xx_{i}' for x in ...` referenced
    # an undefined `i` and could never produce distinct column names.
    fieldnames = (
        [f'h_{i}' for i in range(768 * 2)]
        + [f'xx_{i}' for i in range(2809)]
        + [f'x_{i}' for i in range(53)]
    )

    # Extract features in eval mode so dropout does not perturb them; the previous mode is
    # restored afterwards.
    was_training = model.training
    model.eval()
    try:
        # newline='' is what the csv module requires to avoid blank lines on some platforms.
        with open(fname, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(fieldnames)

            for batch_idx in range(steps):
                start = batch_idx * batch_size
                end = min(start + batch_size, total_rows)

                with torch.no_grad():
                    # as_tensor leaves tensors untouched and converts plain lists, which is
                    # presumably what an unformatted dataset slice returns -- TODO confirm.
                    batch = {
                        k: torch.as_tensor(v).to(device)
                        for k, v in h_training_dataset[start:end].items()
                    }
                    result = model(**batch)

                # .cpu() is required before .numpy() when the model runs on a GPU.
                hidden_layer = result.hidden_states.detach().cpu().numpy()
                if hidden_layer.shape[-1] != len(fieldnames):
                    raise ValueError(
                        f"hidden state width {hidden_layer.shape[-1]} does not match "
                        f"the {len(fieldnames)} CSV columns"
                    )
                writer.writerows(hidden_layer)

                if batch_idx % 25 == 0:
                    current = end  # rows written so far
                    message = "PROGRESS : " + f"[{current:>5d}/{total_rows:>5d}]"
                    if verbose:
                        print(message)
                    send_text(body = message)
    finally:
        model.train(was_training)
            

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the cell still runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ContextDistilBertwithDataforClassification('domain_adaptation_final_body_2').to(device)
# map_location remaps the checkpoint tensors onto `device`, so weights saved from a GPU
# session can also be loaded on a CPU-only machine.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
model.load_state_dict(torch.load('finetuned_model/model.pth', map_location = device))

In [None]:
# Each split gets its own file; both calls previously wrote to 'hidden_states_training.csv',
# so the test split overwrote the training features.
extracting_hidden_layers(articles_encoded['train'], model, 8, device, 'hidden_states_training.csv')
extracting_hidden_layers(articles_encoded['test'], model, 8, device, 'hidden_states_test.csv')