In [1]:
!pip install transformers datasets
!pip3 install rouge_score



In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import re
import transformers
import nltk
nltk.download('punkt')
from sklearn.model_selection import train_test_split

from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer, DataCollatorForSeq2Seq, BigBirdPegasusPreTrainedModel
import datasets
from datasets import load_dataset, list_metrics, load_metric
from datasets import Features, Sequence, Value
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from torch.utils.checkpoint import checkpoint

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Run configuration.
# EPOCH and MODEL_VERSION are interpolated into the checkpoint directory the
# evaluation model is loaded from.
EPOCH, MODEL_VERSION = 4, 2

# Not read by any cell in this part of the notebook.
TRAIN = True

# Tab-separated inputs (train / test) read by the data-loading cell.
TRAIN_PATH, TEST_PATH = (
    '/content/drive/MyDrive/Text-Mining/Data/sentence_selection/train_with_shortened_sent_sel_3072.tsv',
    '/content/drive/MyDrive/Text-Mining/Data/sentence_selection/test_with_shortened_sent_sel_3072.tsv',
)

In [4]:
# Read only the three columns used downstream, selected by position. The
# displayed frame shows them as id, highlights and shortened_articles.
train_df = pd.read_csv(TRAIN_PATH, sep='\t', usecols=[2, 6, 12])
test_df = pd.read_csv(TEST_PATH, sep='\t', usecols=[2, 6, 12])

# Hold out 20% of the complete (NaN-free) training rows for validation.
# The split is seeded: without `random_state` every run would write a different
# clean_val.tsv, so scores computed on it would not be comparable across runs.
SPLIT_SEED = 42
train_df, val_df = train_test_split(
    train_df.dropna(),
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# Persist the cleaned splits as TSV (without the pandas index) so they can be
# re-loaded through `datasets.load_dataset`.
train_df.to_csv('clean_train.tsv', sep='\t', index=False)
val_df.to_csv('clean_val.tsv', sep='\t', index=False)
test_df.dropna().to_csv('clean_test.tsv', sep='\t', index=False)

train_df

Unnamed: 0,id,highlights,shortened_articles
4595,S0377221715003239,Anticipation is different in the open-loop com...,We find that under an open-loop information st...
2028,S0167839613000502,Two special syzygies for complex rational curv...,We present a fast algorithm for finding a μ-ba...
2052,S0167839614000211,Explicit Representations of three μ-basis elem...,We provide explicit representations of three m...
8292,S1568494614000052,A decentralized machine learning method that d...,It is thus demonstrated that the proposed lear...
1304,S0020019014001744,A unified framework is proposed for mutual exc...,Mutual exclusion is a fundamental process sync...
...,...,...,...
5436,S0885230814001211,We propose three different types of curriculum...,This paper addresses the issue of language mod...
898,S0010482514002042,Coupled bioheat and blood flow model has been ...,Graphical abstract specific heat (Jkg−1 K−1)\n...
2960,S0262885614001012,We present the first 3D dynamic spontaneous fa...,Most publically available databases are limite...
4119,S0377221714005876,We propose a robust revenue management model w...,While the criterion is defined on a myriad of ...


In [5]:
# Explicit schema for the cleaned TSV files: each of the three columns is
# declared as a plain string.
features = Features({
    column: Value('string')
    for column in ('id', 'highlights', 'shortened_articles')
})

In [6]:
# Load the cleaned validation and test files with the generic CSV builder,
# tab-delimited and typed with the explicit `features` schema. The training
# file is not loaded in this cell.
dataset = load_dataset(
    'csv',
    features=features,
    delimiter='\t',
    data_files=dict(val='clean_val.tsv', test='clean_test.tsv'),
)

Using custom data configuration default-b357e8fa7ac76c91


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-b357e8fa7ac76c91/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-b357e8fa7ac76c91/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Tokenizer loaded by name from the `google/bigbird-pegasus-large-arxiv`
# checkpoint; the cells below use it to encode the articles and to decode the
# generated token ids.
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")

In [8]:
# Directory of the checkpoint to evaluate, chosen by the MODEL_VERSION and
# EPOCH constants from the configuration cell.
eval_checkpoint_dir = f"/content/drive/MyDrive/Text-Mining/model_v{MODEL_VERSION}/epoch{EPOCH}"

# Load the saved weights, requesting block-sparse attention.
model_for_eval = BigBirdPegasusForConditionalGeneration.from_pretrained(
    eval_checkpoint_dir,
    attention_type="block_sparse",
)

In [9]:
# Source articles of the validation split; these are the inputs summarised by
# the generation loop below.
full_text = dataset['val']['shortened_articles']

In [10]:
# Generate summaries for the first N_EVAL_SAMPLES validation articles and
# collect the raw generated token-id tensors (one tensor per article).
N_EVAL_SAMPLES = 150
GENERATION_SEED = 42

# Generation below samples (do_sample=True), so seed the RNGs first: otherwise
# every run produces different summaries and the scores cannot be reproduced.
transformers.set_seed(GENERATION_SEED)

all_preds = []

for text in tqdm(full_text[:N_EVAL_SAMPLES]):
    # Named `encoded` rather than `input` so the Python builtin is not shadowed.
    # Articles longer than 3072 tokens are truncated to that length.
    encoded = tokenizer(text, padding=True, truncation=True, max_length=3072, return_tensors='pt')

    # Top-k / nucleus sampling with a repetition penalty; output length is
    # constrained to between 50 and 100 tokens.
    # NOTE(review): the captured log shows the model switching to
    # 'original_full' attention for a short input -- confirm whether it
    # switches back to block-sparse for the longer inputs that follow.
    prediction_ids = model_for_eval.generate(
        **encoded,
        repetition_penalty=1.3,
        min_length=50,
        do_sample=True,
        max_length=100,
        top_k=20,
        top_p=0.95,
        temperature=0.8
    )

    all_preds.append(prediction_ids)

  * num_indices_to_pick_from
 13%|█▎        | 20/150 [13:41<1:27:55, 40.58s/it]Attention type 'block_sparse' is not possible if sequence_length: 161 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
 29%|██▊       | 43/150 [31:28<1:18:18, 43.91s/it]


KeyboardInterrupt: ignored

In [16]:
# Spot-check: decode row 0 of the prediction stored at index 18 back into
# text, dropping special tokens.
tokenizer.decode(all_preds[18][0], skip_special_tokens=True)

'a new approach to the construction of piecewise polynomial spaces is presented.<n> it is based on the idea that each element of the space can be represented by a polynomial function.<n> the coefficients of such functions are analyzed.<n> the method is applied to the case of piecewise polynomial spaces generated by knot-spline.'

In [12]:
# Show only the first few prediction tensors; displaying the whole list writes
# every generated id tensor into the saved notebook output.
all_preds[:3]

[tensor([[   2,  109,  486,  113,  109, 1474,  752,  407,  140, 5221,  115,  109,
           908,  135, 5851,  112, 2191,  110,  107,  333,  136,  908,  110,  108,
           109,  344,  113,  910, 1519,  140, 1222,  221,  991,  110,  107,  106,
           109, 1077,  564,  113,  910, 1519,  140, 1222,  221,  991,  110,  107,
           106,  109, 1077, 8605,  564,  113,  910, 1519,  140, 1222,  221,  991,
           130,  210,  110,  107,    1]]),
 tensor([[    2,   145, 10287,   114,   501,   725,  7680,   143, 19765,   586,
          17758,  1356,   110,   158,   120, 12990,   116,   109,  4421,   725,
           5661,   113,   391,  4803, 12036,   725,   111,  1122,  4803, 12036,
            725,   110,   107,   109,  1356,   117,   451,   124,   391,  4803,
          12036,   725,   111,  1122,  4803, 12036,   725,   110,   107,   106,
            109,   637,   117,  1711,   464, 11779,  2489,   110,   107,     1]]),
 tensor([[    2,   136,   800, 12414,   109,   637,   113,   114