In [409]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re


In [410]:
import os

#print(os.getcwd())       # Confirm folder
#print(os.listdir())  

## Importing and cleaning data set

In [411]:
section_path = "section_train_0.parquet" #file path
section = pd.read_parquet(section_path)

In [412]:
documention_path_0 = "document_train_0.parquet" #file path
document_0 = pd.read_parquet(documention_path_0)

documention_path_1 = "document_train_1.parquet"
document_1 = pd.read_parquet(documention_path_1)

documention_path_2 = "document_train_2.parquet"
document_2 = pd.read_parquet(documention_path_2)

documention_path_3 = "document_train_3.parquet"
document_3 = pd.read_parquet(documention_path_3)

In [413]:
document = pd.concat([document_0, document_1, document_2, document_3])

In [414]:
document['abstract'] = document['abstract'].str.replace(' \n', '', regex=False) #remove \n
document['abstract'] = document['abstract'].str.replace(r'@\w+', '', regex=True) # remove @ formulas
document['abstract'] = document['abstract'].str.replace('  ', ' ', regex=False) #replace double space with single space

document['article'] = document['article'].str.replace(r'@\w+', '', regex=True) # remove @ formulas
document['article'] = document['article'].str.replace('  ', ' ', regex=False) #replace double space with single space

## Remove samples with missing info and samples with too much noise

In [415]:
article_summary = document['article'].str.len().describe()
article_summary['25%']
article_summary['75%']

article_summary

count     54144.000000
mean      30196.147292
std       22630.985389
min           0.000000
25%       16117.500000
50%       24903.000000
75%       38290.500000
max      607645.000000
Name: article, dtype: float64

In [416]:
abstract_summary = document['abstract'].str.len().describe()
abstract_summary['25%']
abstract_summary['75%']

abstract_summary

count    54144.000000
mean      1490.463542
std       2872.869021
min          7.000000
25%        659.000000
50%        938.000000
75%       1324.000000
max      69493.000000
Name: abstract, dtype: float64

In [417]:
document = document[document['article'].str.len() >= article_summary['25%']] 
document = document[document['article'].str.len() <= article_summary['75%']]  

document = document[document['abstract'].str.len() >= abstract_summary['25%']]
document = document[document['abstract'].str.len() <= abstract_summary['75%']]

document = document.drop_duplicates()
document = document.reset_index(drop=True)

In [418]:
document_sorted = (
    document
    .assign(article_length=document['article'].str.len())
    .sort_values(by='article_length', ascending=True)
)

document_sorted

Unnamed: 0,article,abstract,article_length
3651,the luminous blue variable ( lbv ) phase is be...,the paradigmatic luminous blue variable r127 i...,16118
9703,although the discovery of `` a higgs - like bo...,we consider the most general set of invariant ...,16120
15248,the formulation of gauge theories on discrete ...,the phase diagram of lattice gauge theory is i...,16122
12159,magnetic fields can greatly influence stellar ...,we present 2d mhd simulations of the radiative...,16124
13160,rare b decays are mediated by flavor changing ...,we calculate the zeroes of angular observables...,16126
...,...,...,...
6870,for more than two decades astrophysicists have...,we present results from the first and simulati...,38270
6218,vortex arrays ( va s ) in type ii superconduct...,the flow properties of confined vortex matter ...,38271
13153,"networked systems , such as social , biologica...",we consider three distinct and well studied pr...,38272
11051,a wireless relay network is one in which a set...,"in this paper , a cooperative transmission des...",38274


In [419]:
document_sorted = (
    document
    .assign(abstract_length=document['abstract'].str.len())
    .sort_values(by='abstract_length', ascending=True)
)

document_sorted

Unnamed: 0,article,abstract,abstract_length
9484,half - metallicity is the property of some spi...,we report on first - principles calculations o...,659
8298,one of the important tools to study the proper...,the paper presents analysis of the single top ...,659
14987,the growing flux of partons at high energy wil...,at the lhc multiple parton interactions will r...,659
6839,is a nearby ( distance to the earth ) young m...,the red spectral shape of the visible to near ...,659
4328,solar flares have been historically divided in...,gamma - ray production cross sections have bee...,659
...,...,...,...
8901,it has long been recognized that by studying t...,the magic collaboration has recently reported ...,1324
1293,the nature of unidentified high - energy -ray ...,we report the detection of -ray pulsations ( g...,1324
6305,the liberalised electricity market poses new c...,hydro storage system optimization is becoming ...,1324
6352,many of the ideas presented in this paper have...,the path integral by which quantum field theor...,1324


In [420]:
document['article'][0]

'additive models provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability when compared to fully nonparametric models . it is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models . many examples of such estimators belong to the large class of regularized kernel based methods over a reproducing kernel hilbert space , see e.g. . in the last years many interesting results on learning rates of regularized kernel based models for additive models have been published when the focus is on sparsity and when the classical least squares loss function is used , see e.g. , , , , , and the references therein . of course , the least squares loss function is differentiable and has many nice ma

In [421]:
document['abstract'][0]

'additive models play an important role in semiparametric statistics . this paper gives learning rates for regularized kernel based methods for additive models . these learning rates compare favourably in particular in high dimensions to recent results on optimal learning rates for purely nonparametric regularized kernel based quantile regression using the gaussian radial basis function kernel , provided the assumption of an additive model is valid . additionally , a concrete example is presented to show that a gaussian function depending only on one variable lies in a reproducing kernel hilbert space generated by an additive gaussian kernel , but does not belong to the reproducing kernel hilbert space generated by the multivariate gaussian kernel of the same variance .  * key words and phrases . * additive model , kernel , quantile regression , semiparametric , rate of convergence , support vector machine .'

## BART (No training yet)

In [422]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

model_name = "facebook/bart-large-cnn"

tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name, output_attentions=True)

model.config.output_attentions = True
model.config.return_dict = True

model.eval()

Loading weights: 100%|██████████| 511/511 [00:00<00:00, 1004.17it/s, Materializing param=model.encoder.layers.11.self_attn_layer_norm.weight]  


BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        

In [423]:
text = document['article'][0]

inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)

generated = model.generate(
    inputs["input_ids"],
    num_beams=4,
    max_length=500,
    min_length=100,
    #length_penalty=2.0, # can change min and max length as well as this to see if we want to have the summary be more brief     
    early_stopping=True,
    no_repeat_ngram_size=3,
    do_sample=False,
    return_dict_in_generate=True
)

generated_ids = generated.sequences

summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(summary)

#for attention values
outputs = model(
    input_ids=inputs["input_ids"],
    decoder_input_ids=generated_ids,
    output_attentions=True,
    return_dict=True
)

cross_attentions = outputs.cross_attentions

additive models provide an important family of models for semiparametric regression or classification. Some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability. In this paper we address the open question whether an svm with an additive kernel can provide a substantially better learning rate in high dimensions. We will not address the question how to check whether the assumption of an additive model is satisfied because this would be a topic of a paper of its own.


In [425]:
document['abstract'][0]

'additive models play an important role in semiparametric statistics . this paper gives learning rates for regularized kernel based methods for additive models . these learning rates compare favourably in particular in high dimensions to recent results on optimal learning rates for purely nonparametric regularized kernel based quantile regression using the gaussian radial basis function kernel , provided the assumption of an additive model is valid . additionally , a concrete example is presented to show that a gaussian function depending only on one variable lies in a reproducing kernel hilbert space generated by an additive gaussian kernel , but does not belong to the reproducing kernel hilbert space generated by the multivariate gaussian kernel of the same variance .  * key words and phrases . * additive model , kernel , quantile regression , semiparametric , rate of convergence , support vector machine .'

In [424]:
print(len(cross_attentions))
print(cross_attentions[0].shape)

12
torch.Size([1, 16, 105, 1024])


In [406]:
cross_attentions

(tensor([[[[1.9530e-01, 6.6491e-07, 8.6813e-10,  ..., 4.6027e-08,
            1.8740e-08, 3.3442e-02],
           [7.8566e-02, 3.6558e-05, 1.2620e-08,  ..., 3.2075e-08,
            1.8732e-09, 4.3937e-02],
           [1.4969e-03, 9.6271e-01, 2.0434e-06,  ..., 9.9358e-10,
            1.5781e-08, 1.2946e-03],
           ...,
           [2.9099e-03, 1.7626e-07, 4.7626e-07,  ..., 3.8807e-09,
            3.5607e-07, 9.4704e-04],
           [1.6940e-01, 9.5440e-07, 6.9736e-07,  ..., 5.7952e-09,
            1.4324e-08, 6.0313e-02],
           [7.8984e-02, 1.0426e-08, 5.9409e-11,  ..., 3.7038e-09,
            1.4941e-07, 8.5193e-02]],
 
          [[7.5060e-02, 1.1381e-04, 1.7044e-04,  ..., 1.7028e-04,
            1.5922e-04, 3.0086e-02],
           [8.7623e-02, 6.6379e-04, 3.1574e-04,  ..., 4.5471e-05,
            4.4424e-05, 2.2373e-02],
           [2.5617e-01, 1.4498e-03, 4.5393e-04,  ..., 5.4525e-05,
            8.1151e-05, 2.6895e-02],
           ...,
           [2.5342e-01, 7.4780e-04, 4.