In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import random
import os
import shutil

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus any CUDA devices), so sampling, data splits, weight
    initialisation and dropout are repeatable across runs.

    Args:
        seed: integer seed shared by all generators.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the already-running
    # interpreter is fixed at start-up and is not changed by this line.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # Without these, model initialisation, dropout and DataLoader shuffling
    # would still differ between runs.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(42)

In [3]:
MAX_LEN = 128
NUM_LABELS = 3
VALID_SPLIT = 0.2
BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 1e-5
DR_RATE = 0.3

## Load Data

In [4]:
# The CSV has no header row; its two columns are named target and title on load.
df = pd.read_csv('news.csv', encoding='latin-1', header=None, names=['target', 'title'])
df = df[['title', 'target']]

label_enc = {'neutral':0,'positive':1,'negative':2}
# Encode by explicit assignment rather than an in-place replace on `df.target`,
# which writes through a temporary Series and is not guaranteed to reach `df`.
encoded_target = df['target'].map(label_enc)
unknown_labels = df.loc[encoded_target.isna(), 'target'].unique()
if len(unknown_labels):
    # Fail early instead of letting unmapped strings reach the one-hot encoder.
    raise ValueError(f'Labels missing from label_enc: {list(unknown_labels)}')
df['target'] = encoded_target.astype('int64')

print(df.shape)
df.head()

(4846, 2)


Unnamed: 0,title,target
0,"According to Gran , the company has no plans t...",0
1,Technopolis plans to develop in stages an area...,0
2,The international electronic industry company ...,2
3,With the new production plant the company woul...,1
4,According to the company 's updated strategy f...,1


In [5]:
df.isna().sum()

title     0
target    0
dtype: int64

In [6]:
print('Duplitcated:', df.duplicated().sum())
df.drop_duplicates(inplace=True)

Duplitcated: 6


## FinBERT Demo

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

MODEL_NAME = 'yiyanghkust/finbert-tone'
finbert = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
nlp = pipeline('sentiment-analysis', model=finbert, tokenizer=tokenizer)

In [None]:
from sklearn.metrics import accuracy_score

# The pipeline returns string labels while df.target holds the integer codes
# from label_enc, so predictions are lower-cased and mapped through label_enc
# to put both sides in the same label space before scoring.
preds = [label_enc[str(result['label']).lower()] for result in nlp(df.title.tolist())]
print('Accuracy:', round(accuracy_score(df.target.tolist(), preds), 5))

Accuracy: 0.79236


## Test

In [4]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
from IPython.display import clear_output
# from urllib import request
# request.urlretrieve('https://drive.google.com/uc?id=11_M4ootuT7I1G0RlihcC0cA3Elqotlc-', 'imdbs.csv')

dataset = load_dataset('csv', data_files='./imdbs.csv', split='train')
dataset = dataset.train_test_split(test_size=0.3)
train_set = dataset['train']
test_set = dataset['test']
clear_output()

In [5]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
clear_output()

In [6]:
def preprocess(data):
    """Tokenize the ``text`` field of ``data`` with the module-level ``tokenizer``.

    Padding and truncation are both enabled; the tokenizer output is returned
    unchanged so it can be merged into the dataset by ``map``.
    """
    encoded = tokenizer(data['text'], truncation=True, padding=True)
    return encoded

train_set = train_set.map(preprocess, batched=True, batch_size=len(train_set))
test_set = test_set.map(preprocess, batched=True, batch_size=len(test_set))
clear_output()

In [7]:
train_set.set_format('torch', columns=['input_ids','attention_mask','label'])
test_set.set_format('torch', columns=['input_ids','attention_mask','label'])

In [8]:
batch_size = 8
epochs = 2
warmup_steps = 500
weight_decay = 0.01

In [9]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    evaluation_strategy='epoch',
    logging_dir='./logs'
)

In [10]:
# Pick the best available accelerator instead of assuming Apple's MPS backend,
# so the cell also runs on CUDA machines and on CPU-only hosts.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
model.to(device)
clear_output()

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set
)

In [12]:
trainer.train()

***** Running training *****
  Num examples = 70
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18


  0%|          | 0/18 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.6928791403770447, 'eval_runtime': 8.6955, 'eval_samples_per_second': 3.45, 'eval_steps_per_second': 0.46, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


  0%|          | 0/4 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.6928918361663818, 'eval_runtime': 10.6884, 'eval_samples_per_second': 2.807, 'eval_steps_per_second': 0.374, 'epoch': 2.0}
{'train_runtime': 217.4936, 'train_samples_per_second': 0.644, 'train_steps_per_second': 0.083, 'train_loss': 0.7065895398457845, 'epoch': 2.0}


TrainOutput(global_step=18, training_loss=0.7065895398457845, metrics={'train_runtime': 217.4936, 'train_samples_per_second': 0.644, 'train_steps_per_second': 0.083, 'train_loss': 0.7065895398457845, 'epoch': 2.0})

In [13]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.6928918361663818,
 'eval_runtime': 11.2858,
 'eval_samples_per_second': 2.658,
 'eval_steps_per_second': 0.354,
 'epoch': 2.0}

## Make Datasets

In [7]:
from transformers import BertTokenizer, BertModel

In [8]:
# Short alias -> Hugging Face checkpoint id. The mapping is iterated below to
# build one tokenizer and one train/valid dataset per checkpoint.
MODEL_NAMES = {
    'bert_base': 'bert-base-uncased',
    'bert_large': 'bert-large-uncased',
    'finbert': 'ProsusAI/finbert',
    # NOTE(review): the alias says "tone" but the checkpoint is
    # 'finbert-pretrain', while the FinBERT demo cell loads
    # 'yiyanghkust/finbert-tone' — confirm which one is intended here.
    'finbert_tone': 'yiyanghkust/finbert-pretrain',
}

In [9]:
class NewsDataset(Dataset):
    """Financial News Sentiment Corpus Dataset.

    Wraps a DataFrame with a ``title`` column (headline text) and a ``target``
    column (integer class id) and tokenizes one headline per item.

    Args:
        df: frame providing the ``title`` and ``target`` columns.
        tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: length every sequence is padded or truncated to.
        num_labels: number of classes, i.e. the width of the one-hot target.
    """
    def __init__(self, df: pd.DataFrame, tokenizer: "BertTokenizer", max_len: int, num_labels: int):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.num_labels = num_labels
        self.title = self.df.title
        self.target = self.df.target

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the tokenizer output for row ``index`` plus a ``'targets'`` entry."""
        # Positional lookup: DataLoader indices run 0..len-1, which only match
        # the frame's labels when its index has been reset. `.iloc` removes
        # that hidden requirement.
        title = self.title.iloc[index]
        target = self.target.iloc[index]
        inputs = self.bert_tokenize(title)
        # Tensors are left exactly as the tokenizer returned them
        # (return_tensors='pt'); presumably shaped (1, max_len) — TODO confirm.
        inputs.update({'targets': self.one_hot_encoding(target)})
        return inputs

    def bert_tokenize(self, text):
        """Encode a single text, padded/truncated to ``max_len``, as PyTorch tensors."""
        encoded_dict = self.tokenizer.encode_plus(
            text=text,
            text_pair=None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return encoded_dict

    def one_hot_encoding(self, label):
        """Return ``label`` as a float one-hot vector of length ``num_labels``."""
        # Encode the label directly instead of building an identity matrix
        # on every call and indexing into it.
        label_tensor = torch.as_tensor(label, dtype=torch.long)
        return F.one_hot(label_tensor, num_classes=self.num_labels).to(torch.float)

In [10]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split on the target column; each part is re-indexed
# from zero with the old index discarded.
train_df, valid_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=VALID_SPLIT, stratify=df.target)
)

In [14]:
train_datasets = dict()
valid_datasets = dict()

for name, path in MODEL_NAMES.items():
    tokenizer = BertTokenizer.from_pretrained(path)
    train_datasets[name] = NewsDataset(train_df, tokenizer, MAX_LEN, NUM_LABELS)
    valid_datasets[name] = NewsDataset(valid_df, tokenizer, MAX_LEN, NUM_LABELS)

## BERT Embedding

In [None]:
# Run the first training example of every dataset through the encoder that
# matches its tokenizer. The previous version always loaded 'bert_base', so
# tokens from the other tokenizers were fed to the wrong checkpoint.
embedding_results = {}
for name, path in MODEL_NAMES.items():
    tokens = train_datasets[name][0]
    encoder = BertModel.from_pretrained(path, output_hidden_states=True)
    result = encoder(tokens['input_ids'], attention_mask=tokens['attention_mask'])
    embedding_results[name] = result
    if name == 'bert_base':
        # The next cell feeds bert_base tokens to `model`, so keep that
        # name bound to the bert_base encoder as before.
        model = encoder

In [26]:
test_data = train_datasets['bert_base'].__getitem__(0)
del test_data['targets']
result = model(**test_data)
last_hidden_state, pooler_output, hidden_states = result[0], result[1], result[2]

In [None]:
test_data = train_datasets['bert_base'].__getitem__(0)
model = BertModel.from_pretrained(MODEL_NAMES['bert_base'], output_hidden_states=True)
result = model(test_data['input_ids'],attention_mask=test_data['attention_mask'])
last_hidden_state, pooler_output, hidden_states = result[0], result[1], result[2]

In [None]:
class BertBaseModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        dropout: dropout probability applied to the pooled encoder output.
        num_labels: number of output classes (size of the logit vector).
        model_name: checkpoint passed to ``BertModel.from_pretrained``. Defaults
            to ``'bert-base-uncased'``, the value that used to be hard-coded.
    """
    def __init__(self, dropout: float, num_labels: int, model_name: str = 'bert-base-uncased'):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(model_name, return_dict=True)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the checkpoint's config rather than assuming 768,
        # so wider encoders (bert-large reports 1024 in this notebook) also work.
        self.linear = nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return the unnormalised class logits for a batch of encoded inputs."""
        output = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        output = self.dropout(output.pooler_output)
        output = self.linear(output)
        return output

# Pick the best available accelerator instead of hard-coding Apple's MPS
# backend, so the cell also runs on CUDA machines and on CPU-only hosts.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
model = BertBaseModel(DR_RATE, NUM_LABELS)
model.to(device)

In [None]:
class BertEmbedding(nn.Module):
    """Unfinished stub for an embedding module built on a named checkpoint.

    NOTE(review): the original cell stopped after the ``__init__`` signature
    (no colon, no body), which is a SyntaxError. Only the constructor argument
    is stored here; the encoder and ``forward`` still have to be written.
    """
    def __init__(self, model_name):
        super().__init__()
        self.model_name = model_name

## BERT

In [7]:
from transformers import BertTokenizer, BertForSequenceClassification

MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [8]:
class NewsDataset(Dataset):
    """Financial News Sentiment Corpus Dataset.

    Wraps a DataFrame with a ``title`` column (headline text) and a ``target``
    column (integer class id). Each item is a dict of flattened tensors
    (``input_ids``, ``attention_mask``, ``token_type_ids``) plus a one-hot
    ``targets`` vector.

    Args:
        df: frame providing the ``title`` and ``target`` columns.
        tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: length every sequence is padded or truncated to.
        num_labels: number of classes, i.e. the width of the one-hot target.
    """
    def __init__(self, df: pd.DataFrame, tokenizer: "BertTokenizer", max_len: int, num_labels: int):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.num_labels = num_labels
        self.title = self.df.title
        self.target = self.df.target

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the encoded headline and one-hot target for row ``index``."""
        # Positional lookup: DataLoader indices run 0..len-1, which only match
        # the frame's labels when its index has been reset. `.iloc` removes
        # that hidden requirement.
        title = self.title.iloc[index]
        target = self.target.iloc[index]
        inputs = self.bert_tokenize(title)
        # flatten() collapses each tokenizer tensor to 1-D before batching.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'targets': self.one_hot_encoding(target),
        }

    def bert_tokenize(self, text):
        """Encode a single text, padded/truncated to ``max_len``, as PyTorch tensors."""
        encoded_dict = self.tokenizer.encode_plus(
            text=text,
            text_pair=None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return encoded_dict

    def one_hot_encoding(self, label):
        """Return ``label`` as a float one-hot vector of length ``num_labels``."""
        # Encode the label directly instead of building an identity matrix
        # on every call and indexing into it.
        label_tensor = torch.as_tensor(label, dtype=torch.long)
        return F.one_hot(label_tensor, num_classes=self.num_labels).to(torch.float)

In [9]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split on the target column; each part is re-indexed
# from zero with the old index discarded.
train_df, val_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=VALID_SPLIT, stratify=df.target)
)

In [10]:
train_dataset = NewsDataset(train_df, tokenizer, MAX_LEN, NUM_LABELS)
valid_dataset = NewsDataset(val_df, tokenizer, MAX_LEN, NUM_LABELS)

In [None]:
MODEL_NAME = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
from transformers import BertModel
model = BertModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

In [78]:
sentence = 'I love Paris'
tokens = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=7, padding='max_length', truncation=True)
print(tokens)

{'input_ids': [101, 1045, 2293, 3000, 102, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 0, 0]}


In [79]:
token_ids = torch.tensor(tokens['input_ids']).unsqueeze(0)
attention_mask = torch.tensor(tokens['attention_mask']).unsqueeze(0)

In [91]:
token_ids

tensor([[ 101, 1045, 2293, 3000,  102,    0,    0]])

In [80]:
result = model(token_ids, attention_mask=attention_mask)
# The first target used to be `hidden_states` as well, so result[0] was
# immediately overwritten by result[2]. It now gets its own name, matching
# the unpacking used in the earlier embedding cells.
last_hidden_state, pooler_output, hidden_states = result[0], result[1], result[2]

In [82]:
hidden_states[0].shape

torch.Size([1, 7, 1024])

In [81]:
len(hidden_states)

25

In [74]:
hidden_states[-1].shape

torch.Size([1, 7, 768])

In [None]:
from transformers import BertTokenizer, BertModel

FINBERT = 'ProsusAI/finbert'
finbert = BertModel.from_pretrained(FINBERT, output_hidden_states=True)
fintokenizer = BertTokenizer.from_pretrained(FINBERT)

In [88]:
result = finbert(token_ids, attention_mask=attention_mask)
# The first target used to be `hidden_states` as well, so result[0] was
# immediately overwritten by result[2]. It now gets its own name, matching
# the unpacking used in the earlier embedding cells.
last_hidden_state, pooler_output, hidden_states = result[0], result[1], result[2]

In [90]:
hidden_states[-1]

tensor([[[ 0.3517,  0.8137, -0.9715,  ..., -1.2517, -0.5206,  0.5874],
         [ 0.7348,  1.1075,  0.0664,  ..., -0.7516, -0.4270,  0.2203],
         [ 0.9962,  1.3483,  0.3890,  ..., -0.9823,  0.1299, -0.2494],
         ...,
         [ 0.3348,  0.3485,  0.0019,  ..., -0.2372, -0.6656, -0.1062],
         [ 0.2788,  0.7250,  0.1129,  ..., -0.3339, -0.3293,  0.1269],
         [ 0.3271,  0.4002,  0.0916,  ..., -0.2062, -0.3955, -0.0705]]],
       grad_fn=<NativeLayerNormBackward0>)

In [87]:
hidden_states[-1]

tensor([[[ 0.4045, -0.6752, -0.3544,  ...,  0.3486,  0.8672,  0.8592],
         [-1.2629,  0.2834,  0.7925,  ...,  0.3055,  0.4203,  1.9711],
         [-0.3013, -0.6540, -0.3854,  ..., -0.3721,  1.0705,  1.1328],
         ...,
         [ 1.0212, -0.9028,  0.7644,  ..., -0.5486,  0.2132,  1.7447],
         [ 0.5257, -0.3719,  0.1031,  ..., -0.4782,  0.5302,  1.3596],
         [ 0.2073, -0.4750,  0.1476,  ...,  0.2096,  0.8126,  1.1591]]],
       grad_fn=<NativeLayerNormBackward0>)

In [89]:
len(hidden_states)

13

In [11]:
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

In [None]:
def load_checkpoint(checkpoint_fpath, model, optimizer, map_location=None):
    """Restore model and optimizer state from a checkpoint file.

    The checkpoint is expected to be a dict with the keys ``'state_dict'``,
    ``'optimizer'``, ``'epoch'`` and ``'valid_loss_min'``.

    Args:
        checkpoint_fpath: path of the checkpoint to read.
        model: module whose parameters are overwritten in place.
        optimizer: optimizer whose state is overwritten in place.
        map_location: forwarded to ``torch.load``; lets a checkpoint saved on
            one device be loaded on another. ``None`` keeps the default.

    Returns:
        ``(model, optimizer, epoch, valid_loss_min)`` where ``valid_loss_min``
        is a plain Python number.
    """
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    # The stored minimum may be a tensor/NumPy scalar (has .item()) or already
    # a Python float; calling .item() unconditionally crashed on the latter.
    if hasattr(valid_loss_min, 'item'):
        valid_loss_min = valid_loss_min.item()
    return model, optimizer, checkpoint['epoch'], valid_loss_min

def save_checkpoint(state, is_best, checkpoint_path, best_model_path):
    """Write ``state`` to ``checkpoint_path`` and optionally keep it as the best model.

    Args:
        state: object serialised with ``torch.save``.
        is_best: when truthy, the saved file is also copied to ``best_model_path``.
        checkpoint_path: destination of the regular checkpoint.
        best_model_path: destination of the best-model copy.
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    shutil.copyfile(checkpoint_path, best_model_path)

In [None]:
class BertBaseModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        dropout: dropout probability applied to the pooled encoder output.
        num_labels: number of output classes (size of the logit vector).
        model_name: checkpoint passed to ``BertModel.from_pretrained``. Defaults
            to ``'bert-base-uncased'``, the value that used to be hard-coded.
    """
    def __init__(self, dropout: float, num_labels: int, model_name: str = 'bert-base-uncased'):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(model_name, return_dict=True)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the checkpoint's config rather than assuming 768,
        # so wider encoders (bert-large reports 1024 in this notebook) also work.
        self.linear = nn.Linear(self.bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return the unnormalised class logits for a batch of encoded inputs."""
        output = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        output = self.dropout(output.pooler_output)
        output = self.linear(output)
        return output

# Pick the best available accelerator instead of hard-coding Apple's MPS
# backend, so the cell also runs on CUDA machines and on CPU-only hosts.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
model = BertBaseModel(DR_RATE, NUM_LABELS)
model.to(device)

In [None]:
# def loss_fn(outputs, targets):
#     return nn.BCEWithLogitsLoss()(outputs, targets)

def loss_fn(outputs, targets, num_labels=None):
    """Cross-entropy loss between logits and targets.

    Args:
        outputs: logits of shape ``(batch, classes)``.
        targets: either class indices (long tensor of shape ``(batch,)``) or
            class probabilities / one-hot rows with the same shape as ``outputs``.
        num_labels: unused; kept so existing call sites stay valid.

    Returns:
        Scalar loss tensor. The earlier version returned the un-called
        ``nn.CrossEntropyLoss`` module and never looked at its inputs.
    """
    return nn.CrossEntropyLoss()(outputs, targets)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
optimizer = optim.Adam(model.parameters(), lr=1e-6)

itr = 1
p_itr = 500  # report running metrics every p_itr iterations
epochs = 1
total_loss = 0
total_len = 0
total_correct = 0

model.train()
for epoch in range(epochs):
    # train_loader yields the dict batches built by NewsDataset.__getitem__
    # (already tokenized), not (text, label) pairs, and the model's forward
    # takes input_ids / attention_mask / token_type_ids and returns logits.
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        targets = batch['targets'].to(device)  # one-hot float rows

        logits = model(input_ids, attention_mask, token_type_ids)
        loss = F.cross_entropy(logits, targets)

        # argmax over the class dimension; softmax is monotonic so it is not
        # needed to pick the predicted class.
        pred = logits.argmax(dim=1)
        labels = targets.argmax(dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, total_correct/total_len))
            total_loss = 0
            total_len = 0
            total_correct = 0

        itr+=1

In [None]:
# evaluation
# NOTE(review): the original cell built its loader from `NsmcDataset(test_df)`,
# neither of which is defined in this notebook. It now scores the held-out
# `valid_loader` defined above — confirm this is the intended evaluation set.
model.eval()

total_len = 0
total_correct = 0

# Gradients are not needed for scoring; skipping them saves memory and time.
with torch.no_grad():
    for batch in valid_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        targets = batch['targets'].to(device)  # one-hot float rows

        logits = model(input_ids, attention_mask, token_type_ids)

        pred = logits.argmax(dim=1)
        labels = targets.argmax(dim=1)
        total_correct += pred.eq(labels).sum().item()
        total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)

In [None]:
MODEL_NAME = 'bert-base-uncased'

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.encode_plus

In [None]:
from transformers import BertTokenizer, BertModel
model = BertModel

In [None]:
model_nm = 'yiyanghkust/finbert-pretrain'

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# The tokenization helper and both Trainer cells below refer to the tokenizer
# as `tokz`, which was never assigned; bind both names to the same object.
tokz = tokenizer = AutoTokenizer.from_pretrained(model_nm)

In [None]:
def tok_func(x):
    """Tokenize ``x["text"]`` with ``tokz``: padded, truncated, at most 128 tokens."""
    texts = x["text"]
    return tokz(texts, padding=True, truncation=True, max_length=128)

In [None]:
tok_ds = ds.map(tok_func, batched=True)

In [None]:
tok_ds = tok_ds.remove_columns('text')
tok_ds

In [None]:
from transformers import TrainingArguments,Trainer,EarlyStoppingCallback

In [None]:
bs = 64
lr = 2e-5
epochs = 4

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for a prediction object.

    ``pred.label_ids`` supplies the reference labels and ``pred.predictions``
    the per-class scores; the predicted class is the argmax over the last axis.
    Returns a dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = scores[2]
    metrics['precision'] = scores[0]
    metrics['recall'] = scores[1]
    return metrics

In [None]:
# load_best_model_at_end needs checkpoints written on the same schedule as
# evaluation, so save_strategy is set to 'epoch' to match evaluation_strategy.
# NOTE: fp16=True presumably requires a CUDA device — confirm for the target machine.
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*4,
    weight_decay=0.01,
    report_to='none',
    num_train_epochs=epochs,
    load_best_model_at_end=True,
    logging_strategy='epoch',
)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=3)

In [None]:
trainer = Trainer(model, args, train_dataset=tok_ds['train'], eval_dataset=tok_ds['test'],
                  tokenizer=tokz,compute_metrics=compute_metrics)

In [None]:
trainer.train();

In [None]:
trainer.evaluate()

In [None]:
preds = trainer.predict(tok_ds['test'])
preds = np.argmax(preds.predictions, axis=-1)
preds[:5]

Let's examine which examples the model gets wrong.

In [None]:
val = ds['test'].to_pandas()

In [None]:
val.head(2)

In [None]:
assert len(val)==len(preds)

In [None]:
val['preds'] = preds

In [None]:
val.head(2)

In [None]:
(val.label==val.preds).mean()

In [None]:
d

In [None]:
val[val.label!=val.preds].label.value_counts()

In [None]:
wrong=val[val.label!=val.preds]
wrong.sample(10)

## Finding similar data to fine-tune the model on
Let's find key phrases that the current model may be confused by.

In [None]:
!pip install -Uqq sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
news = pd.read_csv("../input/news-aggregator-dataset/uci-news-aggregator.csv")
print(news.shape[0])

# Keep only the business headlines (CATEGORY 'b'), drop every metadata
# column and expose the headline under the name `text`.
news = (
    news[news.CATEGORY == 'b']
    .drop(['ID', 'URL', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP', 'PUBLISHER'], axis=1)
    .rename({'TITLE': 'text'}, axis=1)
)
news.sample(3)

In [None]:
# Corpus with example sentences
corpus = news.text.tolist()
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

In [None]:
# Query sentences:
queries = val.text.tolist()
query_embeddings = embedder.encode(queries, convert_to_tensor=True)

In [None]:
#https://www.sbert.net/examples/applications/semantic-search/README.html
# Use the GPU when one is present and fall back to CPU otherwise; the
# hard-coded 'cuda' target failed on machines without CUDA.
search_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Both sides are normalised before the dot-product search in the next cell.
corpus_embeddings = corpus_embeddings.to(search_device)
corpus_embeddings = util.normalize_embeddings(corpus_embeddings)

query_embeddings = query_embeddings.to(search_device)
query_embeddings = util.normalize_embeddings(query_embeddings)

In [None]:
hits = util.semantic_search(query_embeddings, corpus_embeddings, score_function=util.dot_score,top_k=10)

In [None]:
# Flatten the per-query hit lists into one list of corpus row ids.
res = [match['corpus_id'] for matches in hits for match in matches]

In [None]:
# De-duplicate while keeping first-seen order. list(set(...)) returned the
# headlines in an order that changes between interpreter sessions, which made
# the pseudo-labelled training set differ from run to run.
final = list(dict.fromkeys(corpus[x] for x in res))

In [None]:
len(final)

## labelling the additional training dataset
Typically at this point we would want to hand-label some data, but who has time for that? For convenience, let's use another FinBERT model to label these data points. This is a form of semi-supervised learning often called pseudo-labelling: a teacher model assigns labels to an unlabelled dataset.

In [None]:
from transformers import pipeline

import torch
# The pipeline takes a device index (0 when CUDA is available, -1 otherwise).
# It is stored under its own name so the `device` variable that earlier cells
# set to a torch.device is not replaced by an int.
pipeline_device = 0 if torch.cuda.is_available() else -1

nlp=pipeline('sentiment-analysis',model='ProsusAI/finbert',device=pipeline_device)

#from https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/5.%20Testing%20transformer%20pipelines.ipynb
def chunks(l, n):
    """Lazily yield consecutive slices of ``l`` holding at most ``n`` items each."""
    yield from (l[start:start + n] for start in range(0, len(l), n))

def batch_predict(nlp, data, batch_size=128):
    """Call ``nlp`` on ``data`` one chunk at a time and return the results as one flat list."""
    return [
        prediction
        for piece in chunks(data, batch_size)
        for prediction in nlp(piece)
    ]

def pred_and_conf(data):
    """Label ``data`` with the module-level ``nlp`` pipeline.

    Returns a ``(labels, scores)`` pair of parallel lists taken from the
    ``'label'`` and ``'score'`` entries of each prediction.
    """
    outputs = batch_predict(nlp, data)
    labels = [out['label'] for out in outputs]
    scores = [out['score'] for out in outputs]
    return labels, scores

In [None]:
toLabel=pd.DataFrame({'text':final})

In [None]:
pred,conf=pred_and_conf(toLabel['text'].tolist())

toLabel['label'] = pred
toLabel['conf']  = conf

Add only the higher-confidence pseudo-labelled examples to the training set.

In [None]:
# toTrain = toLabel
# Keep only confident pseudo-labels and build the encoded label column on a
# new frame; assigning into the filtered slice triggered SettingWithCopyWarning
# and is not guaranteed to stick.
# NOTE(review): `d` is not assigned in any visible cell; it is presumably the
# label-name -> id mapping of the training set — confirm before running.
toTrain = toLabel[toLabel.conf > 0.7].assign(label=lambda kept: kept.label.replace(d))
len(toTrain)

In [None]:
# Convert the pseudo-labelled rows (text + label, index reset) into a dataset object.
# NOTE(review): the only `Dataset` imported in this notebook is
# torch.utils.data.Dataset (first cell), which presumably has no `from_pandas`;
# this call looks like it expects the Hugging Face `datasets.Dataset` — confirm
# and import it explicitly.
add_trn = Dataset.from_pandas(toTrain[['text','label']].reset_index(drop=True))

In [None]:
ds

In [None]:
ds['train'] = concatenate_datasets([ds['train'],add_trn])
ds

In [None]:
tok_ds = ds.map(tok_func, batched=True)

We reinitialize the model and train again, this time possibly for longer.

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=3)

In [None]:
bs = 64
lr = 2e-5
epochs = 10

In [None]:
# load_best_model_at_end needs checkpoints written on the same schedule as
# evaluation, so save_strategy is set to 'epoch' to match evaluation_strategy.
# NOTE: fp16=True presumably requires a CUDA device — confirm for the target machine.
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*4,
    weight_decay=0.01,
    report_to='none',
    num_train_epochs=epochs,
    load_best_model_at_end=True,
    logging_strategy='epoch',
)

In [None]:
trainer = Trainer(model, args, train_dataset=tok_ds['train'], eval_dataset=tok_ds['test'],
                  tokenizer=tokz,compute_metrics=compute_metrics, callbacks=[EarlyStoppingCallback(5)])

In [None]:
trainer.train();

In [None]:
trainer.evaluate()