In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm

articles = load_dataset('online_news_popularity_data')


tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

Found cached dataset online_news_popularity_data (/home/leepark/.cache/huggingface/datasets/online_news_popularity_data/online_news_popularity_data/1.0.0/f3e03630a13ebe013884d6a83c7ec52cb4eec2c0f6012f710c9dba58aa719fcd)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
def tokenize(batch):
    # full_text_encoded = tokenizer([title + ' \n' + content for title, content in zip(batch['title'],batch['content'])],
    #                              return_tensors = 'pt', padding = 'max_length', max_length = 512, truncation = True)
    title_encoded = tokenizer(batch['title'], padding = True,  truncation = True)
    title_encoded_renamed = {f"{k}_title":v for k,v in title_encoded.items()}
    content_encoded = tokenizer(batch['content'], padding = 'max_length', max_length = 512, truncation = True)
    content_encoded_renamed = {f"{k}_content":v for k,v in content_encoded.items()}
#     return {k:torch.Tensor(v) for k,v in full_text_encoded.items()}
    title_encoded_renamed.update(content_encoded_renamed)
    return title_encoded_renamed

In [3]:
articles_encoded = articles.map(tokenize, remove_columns = ['title','content','shares'],
            batched = True, batch_size = None)

Loading cached processed dataset at /home/leepark/.cache/huggingface/datasets/online_news_popularity_data/online_news_popularity_data/1.0.0/f3e03630a13ebe013884d6a83c7ec52cb4eec2c0f6012f710c9dba58aa719fcd/cache-97268e3b8769f89b.arrow


Map:   0%|          | 0/7922 [00:00<?, ? examples/s]

In [4]:
import torch.nn as nn
import torch.nn.functional as F
from transformers.modeling_outputs import SequenceClassifierOutput
from Transformer_Models import ContextDistilBert, ContextDistilBertwithData
import torch


In [5]:
articles_original = articles.remove_columns(['title','content','shares'])

In [25]:
from dataclasses import dataclass
from typing import Optional
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


@dataclass
class NN_Classifier_Output:
    loss: torch.FloatTensor = None
    logits: torch.FloatTensor = None
    preds: np.array = None
    labels: torch.FloatTensor = None

class simple_NN(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_1 = nn.Linear(2809, 2809*2); self.linear_1.weight = nn.init.normal_(self.linear_1.weight)
        self.linear_2 = nn.Linear(2809*2, 2809); self.linear_2.weight = nn.init.normal_(self.linear_2.weight)
        self.linear_3 = nn.Linear(2809, 2);      self.linear_3.weight = nn.init.normal_(self.linear_3.weight)
        self.dropout = nn.Dropout(.15)
        self.relu   = nn.GELU()

    
    def forward(self, **kwargs):
        input_tensor = torch.cat([v.reshape(-1, 1) for k,v in kwargs.items() if 'shares' not in k], dim = -1)
        input_tensor = torch.vstack([torch.flatten(torch.kron(v, v)) for v in input_tensor])
        labels = kwargs['shares_class'].long()
        output = self.linear_1(F.normalize(input_tensor))
        output = self.relu(output)
        output = self.dropout(output)
        output = self.linear_2(output)
        logits = self.linear_3(self.dropout(self.relu(output)))
        
        
        softmax = F.softmax(logits, dim = -1).detach().numpy()
        
#         calculate losses
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(label_smoothing = .1)
            loss = loss_fct(logits, labels)

        result = NN_Classifier_Output(
            loss=loss,
            logits=logits,
            preds = softmax,
            labels = labels
        )
        
        # return model output object
        return result

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def compute_metrics(labels, preds):
    f1 = f1_score(labels, preds)
    acc = accuracy_score(labels, preds)
    recall = recall_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    precision = precision_score(labels, preds)
    return {'accuracy': acc, 'precision': precision, 'recall':recall, 'f1':f1, 'auc':auc}

In [27]:
def train_loop(h_training_dataset, model, optimizer, batch_size, verbose = False):
    h_training_dataset = h_training_dataset.with_format('pt').shuffle()
    total_rows = h_training_dataset.num_rows
    steps = total_rows // batch_size
    if verbose:
        pbar = tqdm(range(steps +1))
        for batch_idx in pbar:
            start = batch_idx*batch_size; end = batch_idx*batch_size + batch_size
            if end > total_rows:
                result = model(**{k:v for k,v in h_training_dataset[start:].items()})
            else:
                result = model(**{k:v for k,v in h_training_dataset[start:end].items()})
            optimizer.zero_grad()
            result.loss.backward()
            optimizer.step()
            if batch_idx % 100 == 0:
                loss, current = result.loss.item(), (batch_idx + 1) * steps
                pbar.set_postfix({"mean(loss)":""f"{loss:>7f}  [{current:>5d}/{batch_size:>5d}]"})
    else:
        for batch_idx in range(steps +1):
            start = batch_idx*batch_size; end = batch_idx*batch_size + batch_size
            if end > total_rows:
                result = model(**{k:v for k,v in h_training_dataset[start:].items()})
            else:
                result = model(**{k:v for k,v in h_training_dataset[start:end].items()})
            optimizer.zero_grad()
            result.loss.backward()
            optimizer.step()
            if batch_idx % 100 == 0:
                loss, current = result.loss.item(), (batch_idx + 1) * batch_size
                print({"mean(loss)":""f"{loss:>7f}  [{current:>5d}/{total_rows:>5d}]"})


In [28]:
def test_loop(h_test_dataset, model, batch_size, verbose = False):
    h_test_dataset = h_test_dataset.with_format('pt').shuffle()
    total_rows = h_test_dataset.num_rows
    steps = total_rows // batch_size
    from sklearn.metrics import accuracy_score
    preds = []
    labels = h_test_dataset.with_format('np')['shares_class']
    loss = 0
    with torch.no_grad():
        if verbose:
            pbar = tqdm(range(steps +1))
            for batch_idx in pbar:
                start = batch_idx*batch_size; end = batch_idx*batch_size + batch_size
                if end > total_rows:
                    result = model(**{k:v for k,v in h_test_dataset[start:].items()})
                else:
                    result = model(**{k:v for k,v in h_test_dataset[start:end].items()})
                loss += result.loss.item()
                preds.append(np.argmax(result.preds, -1))
            preds = np.concatenate(preds)
            accuracy = accuracy_score(preds.reshape(-1,1), labels.reshape(-1,1))
            loss /= total_rows
            metrics = compute_metrics(labels, preds)
            print(f"loss : {loss}, metrics: {metrics}")
            return metrics
        else:
            for batch_idx in range(steps +1):
                start = batch_idx*batch_size; end = batch_idx*batch_size + batch_size
                if end > total_rows:
                    result = model(**{k:v for k,v in h_test_dataset[start:].items()})
                else:
                    result = model(**{k:v for k,v in h_test_dataset[start:end].items()})
                loss += result.loss.item()
                preds.append(np.argmax(result.preds, -1))
            preds = np.concatenate(preds)
            accuracy = accuracy_score(preds.reshape(-1,1), labels.reshape(-1,1))
            loss /= total_rows
            metrics = compute_metrics(labels, preds)
            print(f"loss : {loss}, metrics: {metrics}")
            return metrics


In [None]:
nn_model = simple_NN()
optimizer = torch.optim.AdamW(nn_model.parameters(), lr = 5e-1)
lr_scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, 1, 1e-5, total_iters = 18)

for _ in range(25):
    train_loop(articles_original['train'], nn_model, optimizer, 100)
    test_loop(articles_original['validation'], nn_model, 500)
    lr_scheduler.step()

{'mean(loss)': '1142.896484  [  100/31686]'}
{'mean(loss)': '0.700288  [10100/31686]'}
{'mean(loss)': '0.693363  [20100/31686]'}
{'mean(loss)': '0.694805  [30100/31686]'}
loss : 0.00141434062613709, metrics: {'accuracy': 0.493436001009846, 'precision': 0.4931766489764973, 'recall': 0.9994878361075544, 'f1': 0.6604619680175987, 'auc': 0.5004907440433216}
{'mean(loss)': '0.699744  [  100/31686]'}
{'mean(loss)': '0.729113  [10100/31686]'}
{'mean(loss)': '13.712405  [20100/31686]'}
{'mean(loss)': '0.694754  [30100/31686]'}


  _warn_prf(average, modifier, msg_start, len(result))


loss : 1.7655081995747601, metrics: {'accuracy': 0.5070689219893966, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'auc': 0.5}
{'mean(loss)': '925.087341  [  100/31686]'}
{'mean(loss)': '0.704156  [10100/31686]'}
{'mean(loss)': '44718.429688  [20100/31686]'}
{'mean(loss)': '0.696389  [30100/31686]'}
loss : 0.0014005806508674613, metrics: {'accuracy': 0.5074476142388286, 'precision': 1.0, 'recall': 0.0007682458386683739, 'f1': 0.0015353121801432957, 'auc': 0.5003841229193342}
{'mean(loss)': '0.694826  [  100/31686]'}
{'mean(loss)': '0.694073  [10100/31686]'}
{'mean(loss)': '0.693048  [20100/31686]'}
{'mean(loss)': '1.203115  [30100/31686]'}
loss : 0.0014116143719832784, metrics: {'accuracy': 0.4929310780106034, 'precision': 0.4929310780106034, 'recall': 1.0, 'f1': 0.6603534285955864, 'auc': 0.5}
{'mean(loss)': '0.714902  [  100/31686]'}
{'mean(loss)': '0.692634  [10100/31686]'}
{'mean(loss)': '0.692545  [20100/31686]'}
{'mean(loss)': '0.685506  [30100/31686]'}
loss : 0.0014111322379720178

  _warn_prf(average, modifier, msg_start, len(result))


loss : 0.0014070258902588507, metrics: {'accuracy': 0.5070689219893966, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'auc': 0.5}
{'mean(loss)': '0.689244  [  100/31686]'}
{'mean(loss)': '0.689116  [10100/31686]'}
{'mean(loss)': '22280.070312  [20100/31686]'}
{'mean(loss)': '0.695439  [30100/31686]'}
loss : 0.0022994311988669735, metrics: {'accuracy': 0.506942691239586, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'auc': 0.4998755290017426}
{'mean(loss)': '0.691729  [  100/31686]'}
{'mean(loss)': '0.704612  [10100/31686]'}
{'mean(loss)': '0.695528  [20100/31686]'}
{'mean(loss)': '0.720691  [30100/31686]'}


  _warn_prf(average, modifier, msg_start, len(result))


loss : 0.001400689176163629, metrics: {'accuracy': 0.5070689219893966, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'auc': 0.5}
{'mean(loss)': '0.690225  [  100/31686]'}
{'mean(loss)': '0.697556  [10100/31686]'}
{'mean(loss)': '0.700776  [20100/31686]'}
{'mean(loss)': '0.685719  [30100/31686]'}


  _warn_prf(average, modifier, msg_start, len(result))


loss : 0.001455016876224074, metrics: {'accuracy': 0.5070689219893966, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'auc': 0.5}
{'mean(loss)': '0.740849  [  100/31686]'}
