## ENVIRONMENT SETUP

In [11]:
!nvidia-smi

Sun Apr  9 04:57:05 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
!git clone https://github.com/allenai/scirepeval.git
%cd scirepeval
!pip install --upgrade pip
!pip install -q -r requirements.txt

Cloning into 'scirepeval'...
remote: Enumerating objects: 1500, done.[K
remote: Counting objects: 100% (434/434), done.[K
remote: Compressing objects: 100% (230/230), done.[K
remote: Total 1500 (delta 239), reused 263 (delta 204), pack-reused 1066[K
Receiving objects: 100% (1500/1500), 14.17 MiB | 18.14 MiB/s, done.
Resolving deltas: 100% (1001/1001), done.
/content/scirepeval
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-23.0.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
Successfully installed pip-23.0.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.4/123.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:

## TRAINING

- **SKIP THIS PART** if you only want to try out inferencing !!

### Dataset Creation and Preprocessing



In [None]:
import datasets

fos = datasets.load_dataset('allenai/scirepeval', 'fos')
# high_inf_cite = datasets.load_dataset('allenai/scirepeval', 'high_influence_cite')

In [None]:
fos

DatasetDict({
    evaluation: Dataset({
        features: ['doc_id', 'corpus_id', 'title', 'abstract', 'labels', 'labels_text'],
        num_rows: 68147
    })
    train: Dataset({
        features: ['doc_id', 'corpus_id', 'title', 'abstract', 'labels', 'labels_text'],
        num_rows: 541218
    })
    validation: Dataset({
        features: ['doc_id', 'corpus_id', 'title', 'abstract', 'labels', 'labels_text'],
        num_rows: 67631
    })
})

In [None]:
# Finding no. of unique labels
import itertools

labels = list(itertools.chain.from_iterable(fos['train']['labels']))
label_texts = list(itertools.chain.from_iterable(fos['train']['labels_text']))

labels_dict = dict()
temp = set()

for label_text, label in zip(label_texts, labels):
    temp.add((label_text, label))

labels_dict = {label_text:label for label_text, label in list(temp)}
num_labels = len(labels_dict)

print(labels_dict)

{'Computer science': 5, 'Psychology': 21, 'Business': 3, 'Art': 1, 'Law': 13, 'Mathematics': 16, 'Materials science': 15, 'Linguistics': 14, 'Physics': 19, 'Philosophy': 18, 'Political science': 20, 'History': 12, 'Geography': 10, 'Education': 7, 'Sociology': 22, 'Medicine': 17, 'Geology': 11, 'Chemistry': 4, 'Agricultural and Food sciences': 0, 'Biology': 2, 'Economics': 6, 'Engineering': 8, 'Environmental science': 9}


In [None]:
from datasets import concatenate_datasets

def pick_balanced_dataset(dataset, num_papers_to_pick, num_labels):
  '''
    Function to pick a sample fraction of a dataset, with papers from each category 
    selected in the same proportion as the original.
  '''
  num_total_papers = len(dataset)
  bal_dict = dict()  

  for label in range(num_labels):
    fos_subj = dataset.filter(lambda example: label in example['labels'])
    subj_count = len(fos_subj)
    small_fos_subj = fos_subj.shuffle(seed=42).select(range(round(subj_count/num_total_papers*num_papers_to_pick)))
    temp_dict = dict()

    temp_dict['data'] = small_fos_subj
    temp_dict['count'] = len(small_fos_subj)
    bal_dict[label] = temp_dict  
 
  print("Total papers picked : {}\n".format(num_papers_to_pick))

  bal_dataset = list()
  for label_text in labels_dict.keys():
    label = labels_dict[label_text]
    num_papers = bal_dict[label]['count']
    print("{} : {} ({:.2f}%)".format(label_text, num_papers, num_papers/num_papers_to_pick*100))
    bal_dataset.append(bal_dict[label]['data'])

  # Finally, concatenate datasets for each subject and then shuffle the contents
  bal_dataset = concatenate_datasets(bal_dataset).shuffle(seed=40)  
  return bal_dataset

In [None]:
fos_train = fos['train']
fos_eval = fos['evaluation']
num_train_papers_to_pick = 20000
num_eval_papers_to_pick = 5000

# Pick sample, balanced train and eval datasets
fos_small_train = pick_balanced_dataset(fos_train, num_train_papers_to_pick, num_labels)
fos_small_eval = pick_balanced_dataset(fos_eval, num_eval_papers_to_pick, num_labels)

  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 19822


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 3256


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 54948


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 8354


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 41384


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 35961


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 22788


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 30086


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 36055


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 53551


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 9933


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 11890


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 43016


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 5656


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 1839


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 23750


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 35177


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 69507


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 7198


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 71124


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 25118


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 29582


  0%|          | 0/542 [00:00<?, ?ba/s]

Subject count : 6446
Total papers picked : 20000

Computer science : 1329 (6.64%)
Psychology : 1093 (5.46%)
Business : 309 (1.54%)
Art : 120 (0.60%)
Law : 209 (1.04%)
Mathematics : 1300 (6.50%)
Materials science : 878 (4.39%)
Linguistics : 68 (0.34%)
Physics : 2628 (13.14%)
Philosophy : 266 (1.33%)
Political science : 928 (4.64%)
History : 1590 (7.95%)
Geography : 367 (1.84%)
Education : 1112 (5.56%)
Sociology : 238 (1.19%)
Medicine : 2569 (12.85%)
Geology : 439 (2.20%)
Chemistry : 1529 (7.65%)
Agricultural and Food sciences : 732 (3.66%)
Biology : 2031 (10.15%)
Economics : 842 (4.21%)
Engineering : 1332 (6.66%)
Environmental science : 1979 (9.89%)


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 2503


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 430


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 6918


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 1064


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 5201


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 4525


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 2865


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 3784


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 4524


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 6714


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 1231


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 1519


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 5406


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 726


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 245


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 2998


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 4422


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 8730


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 919


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 8912


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 3156


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 3722


  0%|          | 0/69 [00:00<?, ?ba/s]

Subject count : 839
Total papers picked : 5000

Computer science : 332 (6.64%)
Psychology : 273 (5.46%)
Business : 78 (1.56%)
Art : 32 (0.64%)
Law : 53 (1.06%)
Mathematics : 324 (6.48%)
Materials science : 220 (4.40%)
Linguistics : 18 (0.36%)
Physics : 654 (13.08%)
Philosophy : 67 (1.34%)
Political science : 232 (4.64%)
History : 397 (7.94%)
Geography : 90 (1.80%)
Education : 278 (5.56%)
Sociology : 62 (1.24%)
Medicine : 641 (12.82%)
Geology : 111 (2.22%)
Chemistry : 382 (7.64%)
Agricultural and Food sciences : 184 (3.68%)
Biology : 508 (10.16%)
Economics : 210 (4.20%)
Engineering : 332 (6.64%)
Environmental science : 493 (9.86%)


In [None]:
fos_small_train[8]

{'doc_id': '10964810',
 'corpus_id': 10964810,
 'title': 'A method for the ultrastructural examination of cell monolayers cultured in plastic microtitre plates.',
 'abstract': 'A method is presented by which cells growing as monolayers cultured in microtitre plates can be embedded for electron microscopy. The technique has the following advantages: numerous specimens may be prepared with relatively small numbers of cells, cell-cell interactions remain undisturbed and may be enumerated, and morphology can be studied under circumstances identical with those used in cell-mediated cytotoxicity assays.',
 'labels': [2, 17],
 'labels_text': ['Biology', 'Medicine']}

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset

max_length = 400

def preprocess(dataset, tokenizer):
    '''
      Function to preprocess dataset in the correct format.
    '''
    
    dataset_ = dict()
    labels, texts = list(), list()

    for paper_dict in dataset:
        text = paper_dict['title'] + ' ' + paper_dict['abstract']
        for label in paper_dict['labels']:      # For each label of a paper, add a separate record to the dataset
            labels.append(label)
            texts.append(text)

    dataset_['label'] = labels
    dataset_['text'] = texts
    preprocessed_dataset = Dataset.from_dict(dataset_)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding=True, truncation=True, max_length=max_length)

    tokenized_dataset = preprocessed_dataset.map(tokenize_function, batched=True) 
    return tokenized_dataset

In [None]:
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

train = preprocess(fos_small_train, tokenizer)
eval = preprocess(fos_small_eval, tokenizer)

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]

  0%|          | 0/33 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

In [None]:
eval[3]['label']

20

### Importing the Model

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("allenai/scibert_scivocab_uncased", num_labels=num_labels)

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [None]:
# Needed to fix some version conflict bug with protobuf
!pip install --upgrade protobuf
!cp /usr/local/lib/python3.9/dist-packages/google/protobuf/internal/builder.py /content/scirepeval
!pip install protobuf==3.19.5
!mv /content/scirepeval/builder.py /usr/local/lib/python3.9/dist-packages/google/protobuf/internal

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting protobuf
  Downloading protobuf-4.22.1-cp37-abi3-manylinux2014_x86_64.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.19.4
    Uninstalling protobuf-3.19.4:
      Successfully uninstalled protobuf-3.19.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.10.0 which is incompatible.
tensorflow-metadata 1.13.0 requires protobuf<4,>=3.13, but you have protobuf 4.22.1 which is incompatible.
tensorboardx 2.5.1 requires protobuf<=3.20.1,>=3.8.0, but you have protobuf 4.22.1 

In [None]:
# JUST A CODE CELL FOR INFINITE RUNNING :)

# %%shell
# source activate env
# python -m pip install huggingface_hub

# huggingface-cli login

### Training the model

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [None]:
import numpy as np

def compute_accuracy(pred_eval):
    '''
      Function to compute evaluation metrics like
      overall accuracy, cs-specific and non-cs accuracy.
    '''
    logits, labels = pred_eval
    predictions = np.argmax(logits, axis=-1)

    # Compute the accuracy
    accuracy = np.mean(predictions == labels)
    cs_label = labels_dict['Computer science']

    num_total_papers = len(labels)
    num_cs_papers = sum(np.array(labels == cs_label))
    num_non_cs_papers = num_total_papers - num_cs_papers

    cs_accuracy = np.mean((labels == cs_label) & (predictions == labels)) * (num_total_papers / num_cs_papers)
    non_cs_accuracy = np.mean((labels != cs_label) & (predictions == labels)) * (num_total_papers / num_non_cs_papers)

    return {"accuracy": accuracy, "cs_accuracy": cs_accuracy, "non_cs_accuracy": non_cs_accuracy}

In [None]:
# # Tiny sample for DEBUGGING ONLY
# fos_tiny_train = pick_balanced_dataset(fos_train, 100, num_labels)
# fos_tiny_eval = pick_balanced_dataset(fos_eval, 100, num_labels)

# tiny_train = preprocess(fos_tiny_train, tokenizer)
# tiny_eval = preprocess(fos_tiny_eval, tokenizer)



Subject count : 19822
Subject count : 3256
Subject count : 54948
Subject count : 8354
Subject count : 41384
Subject count : 35961




Subject count : 22788
Subject count : 30086
Subject count : 36055
Subject count : 53551
Subject count : 9933




Subject count : 11890
Subject count : 43016
Subject count : 5656
Subject count : 1839
Subject count : 23750
Subject count : 35177
Subject count : 69507




Subject count : 7198
Subject count : 71124
Subject count : 25118
Subject count : 29582
Subject count : 6446
Total papers picked : 100

Computer science : 7 (7.00%)
Psychology : 5 (5.00%)
Business : 2 (2.00%)
Art : 1 (1.00%)
Law : 1 (1.00%)
Mathematics : 6 (6.00%)
Materials science : 4 (4.00%)
Linguistics : 0 (0.00%)
Physics : 13 (13.00%)
Philosophy : 1 (1.00%)
Political science : 5 (5.00%)
History : 8 (8.00%)
Geography : 2 (2.00%)
Education : 6 (6.00%)
Sociology : 1 (1.00%)
Medicine : 13 (13.00%)
Geology : 2 (2.00%)
Chemistry : 8 (8.00%)
Agricultural and Food sciences : 4 (4.00%)
Biology : 10 (10.00%)
Economics : 4 (4.00%)
Engineering : 7 (7.00%)
Environmental science : 10 (10.00%)
Subject count : 2503




Subject count : 430
Subject count : 6918
Subject count : 1064
Subject count : 5201
Subject count : 4525
Subject count : 2865
Subject count : 3784




Subject count : 4524
Subject count : 6714
Subject count : 1231
Subject count : 1519
Subject count : 5406
Subject count : 726




Subject count : 245
Subject count : 2998
Subject count : 4422
Subject count : 8730
Subject count : 919




Subject count : 8912
Subject count : 3156
Subject count : 3722
Subject count : 839
Total papers picked : 100

Computer science : 7 (7.00%)
Psychology : 5 (5.00%)
Business : 2 (2.00%)
Art : 1 (1.00%)
Law : 1 (1.00%)
Mathematics : 6 (6.00%)
Materials science : 4 (4.00%)
Linguistics : 0 (0.00%)
Physics : 13 (13.00%)
Philosophy : 1 (1.00%)
Political science : 5 (5.00%)
History : 8 (8.00%)
Geography : 2 (2.00%)
Education : 6 (6.00%)
Sociology : 1 (1.00%)
Medicine : 13 (13.00%)
Geology : 2 (2.00%)
Chemistry : 8 (8.00%)
Agricultural and Food sciences : 4 (4.00%)
Biology : 10 (10.00%)
Economics : 4 (4.00%)
Engineering : 7 (7.00%)
Environmental science : 10 (10.00%)


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# # DEBUGGING
# tiny_trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tiny_train,
#     eval_dataset=tiny_eval,
#     compute_metrics=compute_accuracy,
# )

# tiny_trainer.train()

In [None]:
# Whole dataset will take 52 hours for training on GPU, 10k random samples around 1 hr
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=eval,
    compute_metrics=compute_accuracy,
)

In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 32152
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 12057


Epoch,Training Loss,Validation Loss,Accuracy,Cs Accuracy,Non Cs Accuracy
1,0.9849,0.947053,0.612356,0.525581,0.617236
2,0.722,0.902171,0.63167,0.448837,0.641951
3,0.5125,0.889598,0.641451,0.713953,0.637374


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json
Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2500
Configuration saved in test_trainer/checkpoint-2500/config.json
Model weights saved in test_trainer/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-30

TrainOutput(global_step=12057, training_loss=0.8227285946879783, metrics={'train_runtime': 8200.2262, 'train_samples_per_second': 11.763, 'train_steps_per_second': 1.47, 'total_flos': 1.98308008685376e+16, 'train_loss': 0.8227285946879783, 'epoch': 3.0})

### Saving the Model

In [None]:
# Save the files that can be directly loaded for model inference
trainer.save_model("/content/scirepeval/final_checkpoint/")

Saving model checkpoint to /content/scirepeval/final_checkpoint/
Configuration saved in /content/scirepeval/final_checkpoint/config.json
Model weights saved in /content/scirepeval/final_checkpoint/pytorch_model.bin


In [None]:
# Zip the final checkpoint
!zip -r /content/scirepeval/final_checkpoint.zip /content/scirepeval/final_checkpoint/ 

  adding: content/scirepeval/final_checkpoint/ (stored 0%)
  adding: content/scirepeval/final_checkpoint/pytorch_model.bin (deflated 7%)
  adding: content/scirepeval/final_checkpoint/config.json (deflated 64%)
  adding: content/scirepeval/final_checkpoint/training_args.bin (deflated 48%)


In [None]:
# Download the checkpoint
from google.colab import files
files.download('/content/scirepeval/final_checkpoint.zip')

In [None]:
# Or, add it to Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!cp /content/scirepeval/final_checkpoint_small.zip '/content/gdrive/MyDrive/'

## INFERENCE

In [2]:
!pip install gdown # Needed to install files from Google Drive

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
# Download the classifier model checkpoints
!gdown https://drive.google.com/uc?id=1yGO03wbwVPyGe1HeSaQlYklXBgm0L0WG  

Downloading...
From: https://drive.google.com/uc?id=1yGO03wbwVPyGe1HeSaQlYklXBgm0L0WG
To: /content/scirepeval/classifier_checkpoint.zip
100% 408M/408M [00:02<00:00, 197MB/s]


In [17]:
!unzip classifier_checkpoint.zip -d /content/scirepeval/

Archive:  classifier_checkpoint.zip
   creating: /content/scirepeval/classifier_checkpoint/
  inflating: /content/scirepeval/classifier_checkpoint/pytorch_model.bin  
  inflating: /content/scirepeval/classifier_checkpoint/config.json  
  inflating: /content/scirepeval/classifier_checkpoint/training_args.bin  


Some weights of the model checkpoint at /content/scirepeval/classifier_checkpoint/ were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
# Needed to fix some version conflict bug with protobuf
!pip install --upgrade protobuf
!cp /usr/local/lib/python3.9/dist-packages/google/protobuf/internal/builder.py /content/scirepeval
!pip install protobuf==3.19.5
!mv /content/scirepeval/builder.py /usr/local/lib/python3.9/dist-packages/google/protobuf/internal

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting protobuf
  Downloading protobuf-4.22.1-cp37-abi3-manylinux2014_x86_64.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.19.4
    Uninstalling protobuf-3.19.4:
      Successfully uninstalled protobuf-3.19.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.12.0 requires tensorboard<2.13,>=2.12, but you have tensorboard 2.10.0 which is incompatible.
tensorflow-metadata 1.13.0 requires protobuf<4,>=3.13, but you have protobuf 4.22.1 which is incompatible.
tensorboardx 2.5.1 requires protobuf<=3.20.1,>=3.8.0, but you have protobuf 4.22.1 

In [None]:
from transformers import AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the configuration from the config.json file
pretrained_config = AutoConfig.from_pretrained("classifier_checkpoint/config.json")

# Load the model from the pytorch_model.bin file
pretrained_model = AutoModelForSequenceClassification.from_pretrained("classifier_checkpoint/pytorch_model.bin", config=pretrained_config)

# Load the training arguments from the training_args.bin file
# pretrained_args = TrainingArguments.load("classifier_checkpoint/training_args.bin")

# Create a Trainer instance with the loaded model and training arguments
trainer = Trainer(
    model=pretrained_model
)

loading configuration file classifier_checkpoint/config.json
Model config BertConfig {
  "_name_or_path": "classifier_checkpoint/config.json",
  "adapters": {
    "adapters": {},
    "config_map": {},
    "fusion_config_map": {},
    "fusions": {}
  },
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22"
  },
  "initializer_range": 0.02,
  "in

In [33]:
# Trying out inference on a single prompt
import torch
from transformers import AutoModel, AutoTokenizer, PretrainedConfig, Trainer

# Original category : Biology
text = "A method for the ultrastructural examination of cell monolayers cultured in plastic microtitre plates. A method is presented by which cells growing as monolayers cultured in microtitre plates can be embedded for electron microscopy. The technique has the following advantages: numerous specimens may be prepared with relatively small numbers of cells, cell-cell interactions remain undisturbed and may be enumerated, and morphology can be studied under circumstances identical with those used in cell-mediated cytotoxicity assays. "

# Load pretrained model from checkpoint folder
pretrained_config = PretrainedConfig.from_json_file('classifier_checkpoint/config.json')
pretrained_model = AutoModel.from_pretrained('/content/scirepeval/classifier_checkpoint/', config=pretrained_config)
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
pretrained_args = torch.load('classifier_checkpoint/training_args.bin')

trainer = Trainer(
    model=pretrained_model,
    args=pretrained_args,
)

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

logits = outputs.logits
logits

ValueError: ignored

In [20]:
import torch

pred = torch.argmax(logits, axis = 1)[0]

NameError: ignored

In [21]:
for label_text in labels_dict.keys():
  if labels_dict[label_text] == pred:
    print(label_text)   # Actual text category : Biology

NameError: ignored