In [1]:
%env TOKENIZERS_PARALLELISM=false
%env WANDB_DISABLED=true
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AdamW, 
    AutoConfig, 
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
    EarlyStoppingCallback
)

from datasets import Dataset, load_metric

import torch
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import DataLoader

env: TOKENIZERS_PARALLELISM=false
env: WANDB_DISABLED=true


In [2]:
training_headline_df = pd.read_csv('../input/reddit-fin-headlines-data/2k_new_labelled.csv')[['sentiment', 'title']]
serving_headline_df = pd.read_csv('../input/reddit-fin-headlines-data/8k-labelled-vader.csv')[['title']]

kaggle_headline_df = pd.read_csv('../input/sentiment-analysis-for-financial-news/all-data.csv', sep=',', encoding='latin-1',names = ["sentiment","title"])
combined_df = pd.concat([training_headline_df, kaggle_headline_df]).sample(frac=1)

# Dataset

In [3]:
def preprocess_data(headline_df, tokenizer):
    def tokenize_fn(examples):
        return tokenizer(examples['text'], truncation=True)
    
    headline_df['sentiment'] = LabelEncoder().fit_transform(headline_df['sentiment'])
    X, y = headline_df['title'].values, headline_df['sentiment'].values
    # train : val : test = 0.8 : 0.1 : 0.1
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.1, stratify=y)
    xtrain, xval, ytrain, yval = train_test_split(xtrain, ytrain, test_size=0.1, stratify=ytrain)
    
    train_dataset_raw = Dataset.from_dict({'text':xtrain, 'labels':ytrain})
    valid_dataset_raw = Dataset.from_dict({'text':xval, 'labels': yval})
    test_dataset_raw = Dataset.from_dict({'text':xtest, 'labels': ytest})

    train_dataset = train_dataset_raw.map(tokenize_fn, batched=True)
    valid_dataset = valid_dataset_raw.map(tokenize_fn, batched=True)
    test_dataset = test_dataset_raw.map(tokenize_fn, batched=True)
    
    return train_dataset, valid_dataset, test_dataset

In [4]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall}

In [5]:
def create_trainer(model, tokenizer, train_dataset, valid_dataset):
    data_collator = DataCollatorWithPadding(tokenizer)
    
    train_args = TrainingArguments(
        './Finbert Trained/',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=2*16,
        num_train_epochs=50,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_ratio=0.1,    
        do_eval=True,
        do_train=True,
        do_predict=True,
        evaluation_strategy='epoch',
        load_best_model_at_end = True,
        metric_for_best_model='f1',
        save_strategy="epoch",
        save_total_limit = 2
    )

    trainer = Trainer(
        model,
        train_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(5)]
    )

    return trainer

In [6]:
def get_test_metric(trainer, valid_dataset):
    output = trainer.predict(
        test_dataset=valid_dataset,
        )
    return compute_metrics(output)

In [7]:
model_data = {
    "train_on_kg": kaggle_headline_df,
    "train_on_reddit": training_headline_df,
    "transfer_learning_on_reddit": training_headline_df,
    "combined_data": combined_df 
}

val_dfs = {}

for model_name in model_data:
    print("---------------------------------------------------")
    print(f"Start training for {model_name}")
    df = model_data[model_name]
    
    if model_name == "transfer_learning_on_reddit":
        pretrained = 'finbert_finetuned_train_on_kg.bin'
        
    else:
        pretrained = 'ProsusAI/finbert'
    model = AutoModelForSequenceClassification.from_pretrained(pretrained)
    tokenizer = AutoTokenizer.from_pretrained(pretrained)
    
    train_dataset, valid_dataset, test_dataset = preprocess_data(df, tokenizer)
    
    trainer = create_trainer(model, tokenizer, train_dataset, valid_dataset)
    trainer.train()
    
    trainer.save_model(f'finbert_finetuned_{model_name}.bin')
    
    print("---------------------------------------------------")
    print("Test Metric")
    trainer.evaluate()
    test_metric = get_test_metric(trainer, test_dataset)
    print(test_metric)
    print(f"Finish {model_name}")
    print("---------------------------------------------------")

---------------------------------------------------
Start training for train_on_kg


Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 3924
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 12300


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.639529,0.771167,0.603667,0.76279,0.596839
2,No log,0.408779,0.853547,0.824192,0.83531,0.818276
3,1.187800,0.307583,0.894737,0.8772,0.87942,0.876803
4,1.187800,0.375778,0.894737,0.877863,0.874373,0.881908
5,0.216200,0.512539,0.87643,0.86266,0.868409,0.864119
6,0.216200,0.643479,0.855835,0.843819,0.833356,0.855615
7,0.090300,0.569112,0.887872,0.87682,0.870265,0.884166
8,0.090300,0.68973,0.871854,0.855497,0.839542,0.875192
9,0.047000,0.688024,0.874142,0.850791,0.87347,0.836134


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 437
  Batch size = 32
Saving model checkpoint to ./Finbert Trained/checkpoint-246
Configuration saved in ./Finbert Trained/checkpoint-246/config.json
Model weights saved in ./Finbert Trained/checkpoint-246/pytorch_model.bin
tokenizer config file saved in ./Finbert Trained/checkpoint-246/tokenizer_config.json
Special tokens file saved in ./Finbert Trained/checkpoint-246/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 437
  Batch size = 32
Saving model checkpoint to ./Finbert Trained/checkpoint-492
Configuration saved in ./Finbert Trained/checkpoint-492/config.json
Model weights saved in ./Finbert Trained/checkpoint-

---------------------------------------------------
Test Metric


The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 485
  Batch size = 32


{'accuracy': 0.8927835051546392, 'f1': 0.8885811643208309, 'precision': 0.8792651061168448, 'recall': 0.9001694256937748}
Finish train_on_kg
---------------------------------------------------
---------------------------------------------------
Start training for train_on_reddit


loading configuration file https://huggingface.co/ProsusAI/finbert/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2120f4f96b5830e5a91fe94d242471b0133b0976c8d6e081594ab837ac5f17bc.ef97278c578016c8bb785f15296476b12eae86423097fed78719d1c8197a3430
Model config BertConfig {
  "_name_or_path": "ProsusAI/finbert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 1657
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5200


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.066427,0.481081,0.246914,0.237297,0.320349
2,No log,0.841094,0.632432,0.446441,0.415527,0.484506
3,No log,0.821173,0.686486,0.483289,0.449134,0.523131
4,No log,0.796888,0.702703,0.64099,0.647266,0.637152
5,0.933400,0.755874,0.697297,0.656859,0.660033,0.690835
6,0.933400,0.885028,0.745946,0.715309,0.706738,0.729296
7,0.933400,1.127332,0.72973,0.681284,0.708723,0.66754
8,0.933400,1.287906,0.756757,0.714135,0.744141,0.699291
9,0.933400,1.563088,0.72973,0.691019,0.688479,0.693781
10,0.075400,1.546788,0.72973,0.674858,0.70527,0.661429


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 185
  Batch size = 32
Saving model checkpoint to ./Finbert Trained/checkpoint-104
Configuration saved in ./Finbert Trained/checkpoint-104/config.json
Model weights saved in ./Finbert Trained/checkpoint-104/pytorch_model.bin
tokenizer config file saved in ./Finbert Trained/checkpoint-104/tokenizer_config.json
Special tokens file saved in ./Finbert Trained/checkpoint-104/special_tokens_map.json
Deleting older checkpoint [Finbert Trained/checkpoint-984] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 185
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./Finbert 

---------------------------------------------------
Test Metric


The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 205
  Batch size = 32
loading configuration file finbert_finetuned_train_on_kg.bin/config.json
Model config BertConfig {
  "_name_or_path": "finbert_finetuned_train_on_kg.bin",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "posit

{'accuracy': 0.7219512195121951, 'f1': 0.6860349880019401, 'precision': 0.675643654591023, 'recall': 0.7061843010118872}
Finish train_on_reddit
---------------------------------------------------
---------------------------------------------------
Start training for transfer_learning_on_reddit


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at finbert_finetuned_train_on_kg.bin.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
Didn't find file finbert_finetuned_train_on_kg.bin/added_tokens.json. We won't load it.
loading file finbert_finetuned_train_on_kg.bin/vocab.txt
loading file finbert_finetuned_train_on_kg.bin/tokenizer.json
loading file None
loading file finbert_finetuned_train_on_kg.bin/special_tokens_map.json
loading file finbert_finetuned_train_on_kg.bin/tokenizer_config.json


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 1657
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5200


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.646263,0.783784,0.770128,0.79127,0.766776
2,No log,0.552866,0.8,0.78187,0.776773,0.789634
3,No log,0.592814,0.783784,0.75076,0.794314,0.727878
4,No log,0.680369,0.805405,0.78292,0.814443,0.763775
5,0.450000,0.769139,0.789189,0.765741,0.790066,0.749973
6,0.450000,0.84494,0.810811,0.788452,0.829978,0.764157
7,0.450000,1.171009,0.767568,0.719123,0.743722,0.704583
8,0.450000,1.187228,0.762162,0.715258,0.731814,0.707583
9,0.450000,1.187822,0.767568,0.731043,0.737888,0.726514
10,0.073800,1.343778,0.783784,0.748039,0.7805,0.735788


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 185
  Batch size = 32
Saving model checkpoint to ./Finbert Trained/checkpoint-104
Configuration saved in ./Finbert Trained/checkpoint-104/config.json
Model weights saved in ./Finbert Trained/checkpoint-104/pytorch_model.bin
tokenizer config file saved in ./Finbert Trained/checkpoint-104/tokenizer_config.json
Special tokens file saved in ./Finbert Trained/checkpoint-104/special_tokens_map.json
Deleting older checkpoint [Finbert Trained/checkpoint-624] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 185
  Batch size = 32
Saving model checkpoint to ./Finbert Trained/checkpoint-208
Configuration saved in ./Finbert

---------------------------------------------------
Test Metric


The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 205
  Batch size = 32


{'accuracy': 0.7951219512195122, 'f1': 0.7454313270827032, 'precision': 0.7668409087509908, 'recall': 0.7308183515080066}
Finish transfer_learning_on_reddit
---------------------------------------------------
---------------------------------------------------
Start training for combined_data


loading configuration file https://huggingface.co/ProsusAI/finbert/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2120f4f96b5830e5a91fe94d242471b0133b0976c8d6e081594ab837ac5f17bc.ef97278c578016c8bb785f15296476b12eae86423097fed78719d1c8197a3430
Model config BertConfig {
  "_name_or_path": "ProsusAI/finbert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",


  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 5582
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 17450


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.698301,0.714976,0.602082,0.675391,0.580817
2,1.418800,0.38905,0.855072,0.836472,0.854693,0.829547
3,0.348800,0.408046,0.845411,0.834123,0.823168,0.848047
4,0.348800,0.488727,0.851852,0.834572,0.845363,0.829117
5,0.197000,0.608906,0.847021,0.833166,0.834663,0.831785
6,0.126700,0.679744,0.847021,0.825773,0.841353,0.812887
7,0.126700,0.833472,0.827697,0.809765,0.811847,0.80802


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 621
  Batch size = 32
Saving model checkpoint to ./Finbert Trained/checkpoint-349
Configuration saved in ./Finbert Trained/checkpoint-349/config.json
Model weights saved in ./Finbert Trained/checkpoint-349/pytorch_model.bin
tokenizer config file saved in ./Finbert Trained/checkpoint-349/tokenizer_config.json
Special tokens file saved in ./Finbert Trained/checkpoint-349/special_tokens_map.json
Deleting older checkpoint [Finbert Trained/checkpoint-624] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 621
  Batch size = 32
Saving model checkpoint to ./Finbert Trained/checkpoint-698
Configuration saved in ./Finbert

---------------------------------------------------
Test Metric


The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 690
  Batch size = 32


{'accuracy': 0.8652173913043478, 'f1': 0.8512346671553813, 'precision': 0.8524576884822576, 'recall': 0.8563947748158274}
Finish combined_data
---------------------------------------------------


In [8]:
id2label = {
    0: "positive",
    1: "negative",
    2: "neutral"
  }

# Test

In [9]:
def prep_serving_data(headline_df, tokenizer):

    def tokenize_fn(examples):
        return tokenizer(examples['text'], truncation=True)

    X = headline_df['title'].values
    
    test_dataset_raw = Dataset.from_dict({'text':X})

    test_dataset = test_dataset_raw.map(tokenize_fn, batched=True)
    
    return test_dataset

In [10]:
def get_pred(dataset, tokenizer):
    data_collator = DataCollatorWithPadding(tokenizer)
    
    # arguments for Trainer
    test_args = TrainingArguments(
        output_dir = pretrained,
        do_train = False,
        do_predict = True,
        per_device_eval_batch_size = 64,   
        dataloader_drop_last = False    
    )

    # init trainer
    trainer = Trainer(
                  model = model, 
                  args = test_args, 
                  data_collator=data_collator,
                  compute_metrics = compute_metrics)
    
    test_results = trainer.predict(dataset)
    
    return test_results

In [11]:
metric_dict = {}
df_dict = {}

In [12]:
model_cpt = {
    "train_on_kg": './finbert_finetuned_train_on_kg.bin',
    "train_on_reddit": './finbert_finetuned_train_on_reddit.bin',
    "transfer_learning_on_reddit": './finbert_finetuned_transfer_learning_on_reddit.bin',
    "combined_data": './finbert_finetuned_combined_data.bin' 
}

In [13]:
training_headline_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2047 entries, 0 to 2046
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  2047 non-null   int64 
 1   title      2047 non-null   object
dtypes: int64(1), object(1)
memory usage: 32.1+ KB


In [14]:
for model_name in model_cpt:
    print("---------------------------------------------------")
    print(f"Start serving for {model_name} using serving_headline_reddit")
    pretrained = model_cpt[model_name]
    
    model = AutoModelForSequenceClassification.from_pretrained(pretrained)
    tokenizer = AutoTokenizer.from_pretrained(pretrained)
    
    test_dataset = prep_serving_data(serving_headline_df, tokenizer)
    test_results = get_pred(test_dataset, tokenizer)
    preds = test_results.predictions.argmax(-1)

    print("---------------------------------------------------")
    
    submission = pd.DataFrame()
    submission['title'] = test_dataset['text']
    submission['prediction'] = preds
    submission['prediction'] = submission['prediction'].apply(lambda x:id2label[x])
    submission.to_csv(f'finbert_{model_name}_serving.csv', index=False)
    df_dict[f'finbert_{model_name}'] = submission

    
    print(f"Finish {model_name}")
    print("---------------------------------------------------")

loading configuration file ./finbert_finetuned_train_on_kg.bin/config.json
Model config BertConfig {
  "_name_or_path": "./finbert_finetuned_train_on_kg.bin",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522


---------------------------------------------------
Start serving for train_on_kg using serving_headline_reddit


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at ./finbert_finetuned_train_on_kg.bin.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
Didn't find file ./finbert_finetuned_train_on_kg.bin/added_tokens.json. We won't load it.
loading file ./finbert_finetuned_train_on_kg.bin/vocab.txt
loading file ./finbert_finetuned_train_on_kg.bin/tokenizer.json
loading file None
loading file ./finbert_finetuned_train_on_kg.bin/special_tokens_map.json
loading file ./finbert_finetuned_train_on_kg.bin/tokenizer_config.json


  0%|          | 0/9 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 8409
  Batch size = 64


loading configuration file ./finbert_finetuned_train_on_reddit.bin/config.json
Model config BertConfig {
  "_name_or_path": "./finbert_finetuned_train_on_reddit.bin",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size"

---------------------------------------------------
Finish train_on_kg
---------------------------------------------------
---------------------------------------------------
Start serving for train_on_reddit using serving_headline_reddit


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at ./finbert_finetuned_train_on_reddit.bin.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
Didn't find file ./finbert_finetuned_train_on_reddit.bin/added_tokens.json. We won't load it.
loading file ./finbert_finetuned_train_on_reddit.bin/vocab.txt
loading file ./finbert_finetuned_train_on_reddit.bin/tokenizer.json
loading file None
loading file ./finbert_finetuned_train_on_reddit.bin/special_tokens_map.json
loading file ./finbert_finetuned_train_on_reddit.bin/tokenizer_config.json


  0%|          | 0/9 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 8409
  Batch size = 64


loading configuration file ./finbert_finetuned_transfer_learning_on_reddit.bin/config.json
Model config BertConfig {
  "_name_or_path": "./finbert_finetuned_transfer_learning_on_reddit.bin",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cach

---------------------------------------------------
Finish train_on_reddit
---------------------------------------------------
---------------------------------------------------
Start serving for transfer_learning_on_reddit using serving_headline_reddit


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at ./finbert_finetuned_transfer_learning_on_reddit.bin.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
Didn't find file ./finbert_finetuned_transfer_learning_on_reddit.bin/added_tokens.json. We won't load it.
loading file ./finbert_finetuned_transfer_learning_on_reddit.bin/vocab.txt
loading file ./finbert_finetuned_transfer_learning_on_reddit.bin/tokenizer.json
loading file None
loading file ./finbert_finetuned_transfer_learning_on_reddit.bin/special_tokens_map.json
loading file ./finbert_finetuned_transfer_learning_on_reddit.bin/tokenizer_config.json


  0%|          | 0/9 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 8409
  Batch size = 64


loading configuration file ./finbert_finetuned_combined_data.bin/config.json
Model config BertConfig {
  "_name_or_path": "./finbert_finetuned_combined_data.bin",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30

---------------------------------------------------
Finish transfer_learning_on_reddit
---------------------------------------------------
---------------------------------------------------
Start serving for combined_data using serving_headline_reddit


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at ./finbert_finetuned_combined_data.bin.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.
Didn't find file ./finbert_finetuned_combined_data.bin/added_tokens.json. We won't load it.
loading file ./finbert_finetuned_combined_data.bin/vocab.txt
loading file ./finbert_finetuned_combined_data.bin/tokenizer.json
loading file None
loading file ./finbert_finetuned_combined_data.bin/special_tokens_map.json
loading file ./finbert_finetuned_combined_data.bin/tokenizer_config.json


  0%|          | 0/9 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 8409
  Batch size = 64


---------------------------------------------------
Finish combined_data
---------------------------------------------------


In [15]:
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)

  This is separate from the ipykernel package so we can avoid doing imports until
