In [20]:
!pip install transformers==4.19.4
!pip install datasets==2.13.1
!pip install evaluate==0.4.0
!pip install rouge-score==0.1.2
!pip install py7zr



In [21]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

from datasets import DatasetDict, Dataset, load_dataset, concatenate_datasets

from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
os.environ['TOKENIZERS_PARALLELISM'] = 'false' # there might be interferences with the parallelism of the Hugging Face Trainer

In [24]:
# model = AutoModelForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

In [None]:
folds_dict = {}

for i in range(10):
    df_name = f'fold_{i+1}'
    df = pd.read_csv(f'/content/drive/MyDrive/citation_sentiment_data/folds/{df_name}.csv')
    folds_dict[df_name] = df

for f in folds_dict:
  folds_dict[f] = Dataset.from_pandas(folds_dict[f])

dataset = DatasetDict(folds_dict)

def tokenize_seqs(examples):
    return tokenizer(examples['citation'], padding = True, truncation=True, max_length=512, return_tensors="pt")

tokenized_data = dataset.map(tokenize_seqs, batched=True)

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Labels: <br>
> O - neutral <br>
> 1 - opinionated<br>
<br>

In [None]:
tokenized_data = tokenized_data.rename_column('label', 'labels')

In [None]:
def compute_metrics(eval_preds):
    y_true = eval_preds.label_ids
    y_pred = eval_preds.predictions.argmax(axis=1)
    f1 = f1_score(y_true, y_pred, average='weighted')
    return {'f1': f1}

In [None]:
# creating a list of all folds
fold_list = [tokenized_data[f'fold_{i+1}'] for i in range(10)]

In [None]:
# selecting one fold as eval set based on its index
eval_data = fold_list[0]

In [None]:
train_folds = fold_list.copy()
del train_folds[0]
train_data = concatenate_datasets(train_folds)

In [None]:
per_device_train_batch_size = 4

In [None]:
training_args = TrainingArguments(
    output_dir ='./logs/fold1',
    per_device_train_batch_size = per_device_train_batch_size,
    per_device_eval_batch_size  = per_device_train_batch_size*4,
    learning_rate = 2e-5,
    weight_decay = 1e-3,
    num_train_epochs = 3,
    evaluation_strategy = 'epoch',
    logging_strategy = 'steps',
    logging_steps = len(train_data) / per_device_train_batch_size,
    save_strategy = 'epoch',
    save_total_limit = 1,
    seed = 42,
    data_seed = 42,
    fp16 = True,
    dataloader_num_workers = 2,
    load_best_model_at_end = True
)

In [None]:
trainer = Trainer(model=model, args=training_args, compute_metrics = compute_metrics, train_dataset = train_data, eval_dataset = eval_data, tokenizer = tokenizer)

Using amp half precision backend


In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `XLNetForSequenceClassification.forward` and have been ignored: citation. If citation are not expected by `XLNetForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7182
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5388


Epoch,Training Loss,Validation Loss,F1
1,No log,0.396571,0.890061
2,0.430900,0.37194,0.916939
3,0.430900,0.373322,0.917722


The following columns in the evaluation set don't have a corresponding argument in `XLNetForSequenceClassification.forward` and have been ignored: citation. If citation are not expected by `XLNetForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 798
  Batch size = 16
Saving model checkpoint to ./logs/fold1/checkpoint-1796
Configuration saved in ./logs/fold1/checkpoint-1796/config.json
Model weights saved in ./logs/fold1/checkpoint-1796/pytorch_model.bin
tokenizer config file saved in ./logs/fold1/checkpoint-1796/tokenizer_config.json
Special tokens file saved in ./logs/fold1/checkpoint-1796/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `XLNetForSequenceClassification.forward` and have been ignored: citation. If citation are not expected by `XLNetForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num exa

TrainOutput(global_step=5388, training_loss=0.37478910699630547, metrics={'train_runtime': 1024.3523, 'train_samples_per_second': 21.034, 'train_steps_per_second': 5.26, 'total_flos': 6138030503227392.0, 'train_loss': 0.37478910699630547, 'epoch': 3.0})

In [25]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
from huggingface_hub import create_repo
create_repo("Velkymoss/impact-cite", private=True)

HfHubHTTPError: ignored

In [None]:
!mkdir model_f1
model.save_pretrained('model_f1')

In [None]:
model.push_to_hub('impact-cite')

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Configuration saved in impact-cite/config.json
Model weights saved in impact-cite/pytorch_model.bin


Upload file pytorch_model.bin:   0%|          | 1.00/448M [00:00<?, ?B/s]

To https://huggingface.co/Velkymoss/impact-cite
   52ccae8..af0790e  main -> main

   52ccae8..af0790e  main -> main



'https://huggingface.co/Velkymoss/impact-cite/commit/af0790e16b670891611ae38327eaa1b4f3b5bf94'