In [1]:
! pip install transformers datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [2]:
from google.colab import files

files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"stepantita","key":"1fba5adcfa0373779b7e875ade980fd2"}\n'}

In [3]:
! mkdir ~/.kaggle

In [4]:
! cp kaggle.json ~/.kaggle/

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
! cat ~/.kaggle/kaggle.json

{"username":"stepantita","key":"1fba5adcfa0373779b7e875ade980fd2"}


In [7]:
! kaggle datasets download -d oliviervha/crypto-news
! kaggle datasets download -d stepantita/crypto-news-price-impact

Downloading crypto-news.zip to /content
100% 2.34M/2.34M [00:00<00:00, 4.57MB/s]
100% 2.34M/2.34M [00:00<00:00, 3.94MB/s]
Downloading crypto-news-price-impact.zip to /content
100% 399k/399k [00:00<00:00, 1.09MB/s]
100% 399k/399k [00:00<00:00, 1.09MB/s]


In [8]:
! unzip crypto-news.zip

Archive:  crypto-news.zip
  inflating: cryptonews.csv          


In [9]:
! unzip crypto-news-price-impact.zip

Archive:  crypto-news-price-impact.zip
  inflating: news-1.csv              
  inflating: news-2.csv              
  inflating: news-3.csv              
  inflating: news-5.csv              
  inflating: news-7.csv              
  inflating: news-8.csv              
  inflating: news-9.csv              
  inflating: source_title_description_bin_clf.csv  


In [1]:
import math
import json
from collections import Counter
import random
import os

from io import StringIO
from html.parser import HTMLParser

import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split

from tqdm import tqdm

import matplotlib.pyplot as plt
import plotly.graph_objects as go

from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

from datasets import load_dataset, Dataset, DatasetDict

In [2]:
SEED = 42

In [3]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=SEED)

In [4]:
def on_gpu(f):
    def wrapper(*args):
        if torch.cuda.is_available():
            return f(*args)
        else:
            print('cuda unavailable')
    return wrapper

In [5]:
if torch.cuda.is_available():
    ! pip install pynvml
    from pynvml import *
    from numba import cuda

@on_gpu
def print_gpu_utilization(dev_id):
    try:
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(dev_id)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    except Exception as e:
        print(e)

@on_gpu
def free_gpu_cache(dev_id=0):
    print("Initial GPU Usage")
    print_gpu_utilization(dev_id)

    torch.cuda.empty_cache()

    print("GPU Usage after emptying the cache")
    print_gpu_utilization(dev_id)

def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()



In [6]:
device_id = 0

In [7]:
device = torch.device(f'cuda:{device_id}' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [8]:
if torch.cuda.is_available():
    torch.cuda.set_device(device)

In [9]:
MODEL_NAME = 'bert-base-cased'
MODELS_PATH = 'models'
DATASET_NAME = 'crypto-news-plus' # hate_speech18, hatexplain
NUM_LABELS = 2

NUM_EPOCHS = 1
BATCH_SIZE = 32
MAX_SEQ_LEN = 512
LEARNING_RATE = 2e-5
MAX_GRAD_NORM = 1000

In [10]:
data = pd.read_csv('source_title_description_bin_clf.csv')
data

Unnamed: 0,text,label
0,Source: CoinTelegraph Title: Bitcoin options d...,1
1,Source: The Daily Hodl Title: Crypto Exchange ...,0
2,Source: The Daily Hodl Title: BitKeep Wallet G...,0
3,Source: Bitcoin Magazine Title: Kazakhstan Pre...,0
4,Source: CoinTelegraph Title: The blockchain tr...,0
...,...,...
1313,Source: U.Today Title: Ethereum (ETH) Price An...,0
1314,Source: ZyCrypto Title: Digital Assets Amongst...,0
1315,"Source: The Daily Hodl Title: Cardano, Litecoi...",1
1316,Source: ZyCrypto Title: Binance plans to inves...,1


In [11]:
dataset = DatasetDict({
    'train': Dataset.from_list(data.to_dict('records'))
})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1318
    })
})

In [12]:
train_split, test_split = train_test_split(dataset['train'], test_size=0.5, random_state=SEED)

In [13]:
dataset['test'] = Dataset.from_dict(test_split)

test_split, val_split = train_test_split(dataset['test'], test_size=0.2, random_state=SEED)

In [14]:
dataset['train'] = Dataset.from_dict(train_split)
dataset['test'] = Dataset.from_dict(test_split)
dataset['val'] = Dataset.from_dict(val_split)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 659
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 527
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 132
    })
})

In [15]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer

BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [16]:
tokenized_dataset = dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length', max_length=MAX_SEQ_LEN, return_tensors='pt'), batched=True)
tokenized_dataset.set_format('torch', device=device)
tokenized_dataset

Map:   0%|          | 0/659 [00:00<?, ? examples/s]

Map:   0%|          | 0/527 [00:00<?, ? examples/s]

Map:   0%|          | 0/132 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 659
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 527
    })
    val: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 132
    })
})

In [28]:
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS).to(device)
base_model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [29]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [30]:
for p in base_model.bert.parameters():
    p.requires_grad = False
# for p in base_model.bert.encoder.layer[-1].parameters():
#     p.requires_grad = True

In [31]:
count_parameters(base_model)

1538

In [32]:
def eval(f):
    def wrapper(model, *args, **kwargs):
        model.eval()
        return f(model, *args, **kwargs)
    return wrapper

def train(f):
    def wrapper(model, *args, **kwargs):
        model.train()
        return f(model, *args, **kwargs)
    return wrapper

In [33]:
@train
def train_epoch(model, train_dataloader, optimizer):
    train_loss = 0.0
    train_preds = []
    train_labels = []

    for step, batch in enumerate(tqdm(train_dataloader, total=len(train_dataloader))):
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        targets = batch['label'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets) # (B, Seq_Len, 2)

        loss, logits = outputs.loss, outputs.logits

        probs = F.softmax(logits, dim=-1).cpu()
        pred = torch.argmax(probs, dim=-1) # (B)
        train_preds += pred.detach().tolist()
        train_labels += [l.item() for l in targets.cpu()]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
    return train_loss, train_preds, train_labels

@eval
def eval_epoch(model, val_dataloader):
    val_loss = 0.0
    val_preds = []
    val_labels = []

    with torch.no_grad():

        for step, batch in enumerate(tqdm(val_dataloader, total=len(val_dataloader))):
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            targets = batch['label'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)

            loss, logits = outputs.loss, outputs.logits

            probs = F.softmax(logits, dim=-1).cpu()
            pred = torch.argmax(probs, dim=-1) # (B)
            val_preds += pred.detach().tolist()
            val_labels += [l.item() for l in targets.cpu()]

            val_loss += loss.item()
    return val_loss, val_preds, val_labels

In [34]:
def training(model, train_data, val_data, config, balance='weighted'):
    model = model.to(device)

    optimizer = torch.optim.Adam(
        params=model.parameters(),
        lr=config['lr'],
        weight_decay=config['weight_decay']
    )

    num_train_steps = int(len(train_data) / config['batch_size'] * config['epochs'])

    print(f'Train steps: {num_train_steps}')

    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=config['batch_size'], shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=2 * config['batch_size'])

    history = {
        'train_losses': [],
        'val_losses': [],
        'train_acc': [],
        'val_acc': [],
        'train_f1': [],
        'val_f1': [],
        'train_precision': [],
        'val_precision': [],
        'train_recall': [],
        'val_recall': [],
        'train_roc_auc': [],
        'val_roc_auc': [],
        'train_precision_recall': [],
        'val_precision_recall': [],
    }

    for epoch_num in range(config['epochs']):
        print(f'Epoch: {epoch_num + 1}')

        # train stage
        train_loss, train_preds, train_labels = train_epoch(model, train_dataloader, optimizer)

        # eval stage
        val_loss, val_preds, val_labels = eval_epoch(model, val_dataloader)

        # metrics
        train_acc = accuracy_score(train_labels, train_preds)
        val_acc = accuracy_score(val_labels, val_preds)
        train_f1 = f1_score(train_labels, train_preds, average=balance)
        val_f1 = f1_score(val_labels, val_preds, average=balance)
        train_precision = precision_score(train_labels, train_preds, average=balance)
        val_precision = precision_score(val_labels, val_preds, average=balance)
        train_recall = recall_score(train_labels, train_preds, average=balance)
        val_recall = recall_score(val_labels, val_preds, average=balance)
        train_roc_auc = roc_auc_score(train_labels, train_preds, average=balance)
        val_roc_auc = roc_auc_score(val_labels, val_preds, average=balance)
        train_precision_recall = average_precision_score(train_labels, train_preds, average=balance)
        val_precision_recall = average_precision_score(val_labels, val_preds, average=balance)

        history['train_losses'].append(train_loss / len(train_dataloader))
        history['val_losses'].append(val_loss / len(val_dataloader))
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_acc)
        history['train_f1'].append(train_f1)
        history['val_f1'].append(val_f1)
        history['train_precision'].append(train_precision)
        history['val_precision'].append(val_precision)
        history['train_recall'].append(train_recall)
        history['val_recall'].append(val_recall)
        history['train_roc_auc'].append(train_roc_auc)
        history['val_roc_auc'].append(val_roc_auc)
        history['train_precision_recall'].append(train_precision_recall)
        history['val_precision_recall'].append(val_precision_recall)

        print()
        print(f'Train loss: {train_loss / len(train_dataloader)} | Val loss: {val_loss / len(val_dataloader)}')
        print(f'Train acc: {train_acc} | Val acc: {val_acc}')
        print(f'Train f1: {train_f1} | Val f1: {val_f1}')
        print(f'Train precision: {train_precision} | Val precision: {val_precision}')
        print(f'Train recall: {train_recall} | Val recall: {val_recall}')
        print(f'Train AUCROC: {train_roc_auc} | Val AUCROC: {val_roc_auc}')
        print(f'Train AUPRC: {train_precision_recall} | Val AUPRC: {val_precision_recall}')

    free_gpu_cache(device_id)
    return history

In [35]:
ids = tokenized_dataset['test'][0]['input_ids'].unsqueeze(0)
mask = tokenized_dataset['test'][0]['attention_mask'].unsqueeze(0)
targets = tokenized_dataset['test'][0]['label'].unsqueeze(0)

ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)

out = base_model(ids, mask, labels=targets)
out.loss

tensor(0.4402, device='cuda:0', grad_fn=<NllLossBackward0>)

In [36]:
config = {
    'epochs': NUM_EPOCHS,
    'batch_size': BATCH_SIZE,
    'gradient_accumulation_steps': 1,
    'fp16': False,
    'lr': LEARNING_RATE,
    'max_grad_norm': MAX_GRAD_NORM,
    'weight_decay': 0.01,
}

In [37]:
history = training(base_model, tokenized_dataset['train'], tokenized_dataset['val'], config)

Train steps: 20
Epoch: 1


100%|██████████| 21/21 [00:26<00:00,  1.25s/it]
100%|██████████| 3/3 [00:04<00:00,  1.60s/it]
  _warn_prf(average, modifier, msg_start, len(result))



Train loss: 0.6722888265337262 | Val loss: 0.6906612118085226
Train acc: 0.6191198786039454 | Val acc: 0.6287878787878788
Train f1: 0.4800406377732167 | Val f1: 0.48548273431994354
Train precision: 0.5370336855489506 | Val precision: 0.39537419651056016
Train recall: 0.6191198786039454 | Val recall: 0.6287878787878788
Train AUCROC: 0.5003325183374083 | Val AUCROC: 0.5
Train AUPRC: 0.3795277693474962 | Val AUPRC: 0.3712121212121212
Initial GPU Usage
GPU memory occupied: 4867 MB.
GPU Usage after emptying the cache
GPU memory occupied: 2559 MB.


In [38]:
def plot_results(history, do_val=True):
    fig, ax = plt.subplots(figsize=(8, 8))

    x = list(range(0, len(history['train_losses'])))

    # loss

    ax.plot(x, history['train_losses'], label='train_loss')

    if do_val:
        ax.plot(x, history['val_losses'], label='val_loss')

    plt.title('Train / Validation Loss')
    plt.legend(loc='upper right')

    # accuracy

    if 'train_acc' in history:
        fig, ax = plt.subplots(figsize=(8, 8))

        ax.plot(x, history['train_acc'], label='train_acc')

        if do_val:
            ax.plot(x, history['val_acc'], label='val_acc')

    plt.title('Train / Validation Accuracy')
    plt.legend(loc='upper right')

    # f1-score

    if 'train_f1' in history:
        fig, ax = plt.subplots(figsize=(8, 8))

        ax.plot(x, history['train_f1'], label='train_f1')

        if do_val:
            ax.plot(x, history['val_f1'], label='val_f1')

        plt.title('Train / Validation F1')
        plt.legend(loc='upper right')

    # precision

    if 'train_precision' in history:
        fig, ax = plt.subplots(figsize=(8, 8))

        ax.plot(x, history['train_precision'], label='train_precision')

        if do_val:
            ax.plot(x, history['val_precision'], label='val_precision')

        plt.title('Train / Validation Precision')
        plt.legend(loc='upper right')

    # recall

    if 'train_recall' in history:
        fig, ax = plt.subplots(figsize=(8, 8))

        ax.plot(x, history['train_recall'], label='train_recall')

        if do_val:
            ax.plot(x, history['val_recall'], label='val_recall')

        plt.title('Train / Validation Recall')
        plt.legend(loc='upper right')

    fig.show()

In [39]:
# plot_results(history)

In [40]:
# torch.save(base_model.state_dict(), f'{MODELS_PATH}/{DATASET_NAME}_{MODEL_NAME.replace("/", "_")}_{NUM_EPOCHS}.bin')

In [41]:
val_dataset = tokenized_dataset['test']

In [42]:
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=2 * config['batch_size'])

In [43]:
val_loss, val_preds, val_labels = eval_epoch(base_model, val_dataloader)

100%|██████████| 9/9 [00:18<00:00,  2.01s/it]


In [44]:
val_acc = accuracy_score(val_labels, val_preds)
val_f1 = f1_score(val_labels, val_preds, average='weighted')
val_precision = precision_score(val_labels, val_preds, average='weighted')
val_recall = recall_score(val_labels, val_preds, average='weighted')
val_roc_auc = roc_auc_score(val_labels, val_preds, average='weighted')
val_precision_recall = average_precision_score(val_labels, val_preds, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
print(f'Val loss: {val_loss / len(val_dataloader)}')
print(f'Val acc: {val_acc}')
print(f'Val f1: {val_f1}')
print(f'Val precision: {val_precision}')
print(f'Val recall: {val_recall}')
print(f'Val ROCAUC: {val_roc_auc}')
print(f'Val AUPRC: {val_precision_recall}')

Val loss: 0.6434246632787917
Val acc: 0.6622390891840607
Val f1: 0.527674525400085
Val precision: 0.43856061124333434
Val recall: 0.6622390891840607
Val ROCAUC: 0.5
Val AUPRC: 0.3377609108159393
