In [30]:
import numpy as np
import pandas as pd
import bibtexparser
import arxiv
from tqdm import tqdm
# Take first page from each PDF
from PyPDF2 import PdfFileReader, PdfFileWriter


In [2]:
# BibTeX file to analyse; it is opened and parsed by the loading cell that follows.
path = 'inspire.bib'

In [3]:
# Parse the BibTeX file into a bibtexparser database; the handle is closed on exit.
with open(path) as bib_file:
    database = bibtexparser.load(bib_file)

In [4]:
# Tabulate the parsed entries (the shape shown below suggests one row per entry).
my_df = pd.DataFrame(database.entries)

In [5]:
# (rows, columns) of the full bibliography table.
my_df.shape

(215, 16)

In [6]:
# Distinct journal abbreviations present in the bibliography.
my_df['journal'].unique()

array(['JHEP', 'Phys. Rev. D', 'Eur. Phys. J. C', 'Phys. Rev. Lett.',
       'Phys. Lett. B', 'Phys. Rev. C', 'JINST',
       'Comput. Softw. Big Sci.', 'Nature Phys.', 'PoS'], dtype=object)

In [7]:
# Journal abbreviation (as found in the 'journal' column) -> (full journal title, rating).
# Only the ratings 'A1', 'A2' and 'B1' are kept by the article filter further down,
# so entries rated 'B5' or '0' are effectively excluded from the selection.
# NOTE(review): 'Comput. Softw. Big Sci.' occurs in the data but has no entry here,
# so it gets NaN from the lookup helpers -- confirm that this is intended.
qualis = {
    'Phys. Rev. D': (
        'PHYSICAL REVIEW D (PARTICLES, FIELDS, GRAVITATION AND COSMOLOGY)',
        'A2',
    ),
    # NOTE(review): the title below is "MODERN PHYSICS LETTERS B", which looks like a
    # different journal from Physics Letters B -- confirm the intended title and rating.
    'Phys. Lett. B': ('MODERN PHYSICS LETTERS B', 'B5'),  # rating not accepted
    'Eur. Phys. J. C': ('The European Physical Journal C', '0'),
    'Phys. Rev. Lett.': ('PHYSICAL REVIEW LETTERS', 'A1'),
    'Phys. Rev. C': ('PHYSICAL REVIEW C', 'A2'),
    'Nature Phys.': ('NATURE PHYSICS (PRINT)', 'A1'),
    'JHEP': ('Journal of High Energy Physics', 'A1'),
    'JINST': ('Journal of Instrumentation', 'B1'),
    'PoS': ('Proceeding of science', '0'),  # rating not accepted
}

In [8]:
# Shape re-check before the rating columns are added.
my_df.shape

(215, 16)

In [9]:
def add_qualis_factor(row):
    """Look up the rating of the journal named in ``row['journal']``.

    Returns the second element of the matching ``qualis`` entry, or
    ``np.nan`` when the journal has no entry in ``qualis``.
    """
    entry = qualis.get(row['journal'])
    return np.nan if entry is None else entry[1]
def add_qualis_full_name(row):
    """Look up the full journal title for the journal named in ``row['journal']``.

    Returns the first element of the matching ``qualis`` entry, or
    ``np.nan`` when the journal has no entry in ``qualis``.
    """
    entry = qualis.get(row['journal'])
    if entry is None:
        return np.nan
    return entry[0]
    

In [10]:
# Annotate every entry with its rating and full journal title (NaN when the journal is unmapped).
my_df['Factor'] = my_df.apply(add_qualis_factor, axis='columns')
my_df['Journal name'] = my_df.apply(add_qualis_full_name, axis='columns')

In [11]:
# Keep only the entries whose rating is one of the accepted grades.
my_articles = my_df.loc[my_df['Factor'].isin(['A1', 'A2', 'B1'])]

In [12]:
# (rows, columns) of the selection that survived the rating filter.
my_articles.shape

(127, 18)

In [13]:
# Preview the filtered articles.
my_articles.head()

Unnamed: 0,year,pages,volume,journal,doi,reportnumber,primaryclass,archiveprefix,eprint,title,collaboration,author,ENTRYTYPE,ID,number,note,Factor,Journal name
0,2022,87,8,JHEP,10.1007/JHEP08(2022)087,CERN-EP-2022-025,hep-ex,arXiv,2203.01808,{Study of $ {\mathrm{B}}_{\mathrm{c}}^{+}\to \...,ATLAS,"Aad, Georges and others",article,ATLAS:2022aiy,,,A1,Journal of High Energy Physics
1,2022,5,6,JHEP,10.1007/JHEP06(2022)005,CERN-EP-2022-002,hep-ex,arXiv,2203.01009,{Search for neutral long-lived particles in $p...,ATLAS,"Aad, Georges and others",article,ATLAS:2022zhj,,,A1,Journal of High Energy Physics
2,2022,32005,106,Phys. Rev. D,10.1103/PhysRevD.106.032005,CERN-EP-2021-195,hep-ex,arXiv,2203.00587,{Search for events with a pair of displaced ve...,ATLAS,"Aad, Georges and others",article,ATLAS:2022gbw,3.0,,A2,"PHYSICAL REVIEW D (PARTICLES, FIELDS, GRAVITAT..."
3,2022,32008,106,Phys. Rev. D,10.1103/PhysRevD.106.032008,CERN-EP-2021-116,hep-ex,arXiv,2202.13901,{Measurements of jet observables sensitive to ...,ATLAS,"Aad, Georges and others",article,ATLAS:2022miz,3.0,,A2,"PHYSICAL REVIEW D (PARTICLES, FIELDS, GRAVITAT..."
4,2022,63,6,JHEP,10.1007/JHEP06(2022)063,CERN-EP-2022-003,hep-ex,arXiv,2202.12134,{Measurements of differential cross-sections i...,ATLAS,"Aad, Georges and others",article,ATLAS:2022xfj,,,A1,Journal of High Energy Physics


In [14]:
# Order the selection by rating so that equal grades are grouped together.
my_articles = my_articles.sort_values('Factor')

In [15]:
# Preview the selection after sorting by rating.
my_articles.head()

Unnamed: 0,year,pages,volume,journal,doi,reportnumber,primaryclass,archiveprefix,eprint,title,collaboration,author,ENTRYTYPE,ID,number,note,Factor,Journal name
0,2022,87,8,JHEP,10.1007/JHEP08(2022)087,CERN-EP-2022-025,hep-ex,arXiv,2203.01808,{Study of $ {\mathrm{B}}_{\mathrm{c}}^{+}\to \...,ATLAS,"Aad, Georges and others",article,ATLAS:2022aiy,,,A1,Journal of High Energy Physics
130,2020,222002,124,Phys. Rev. Lett.,10.1103/PhysRevLett.124.222002,CERN-EP-2020-030,hep-ex,arXiv,2004.0354,{Measurement of the Lund Jet Plane Using Charg...,ATLAS,"Aad, Georges and others",article,ATLAS:2020bbn,22.0,,A1,PHYSICAL REVIEW LETTERS
127,2020,61802,125,Phys. Rev. Lett.,10.1103/PhysRevLett.125.061802,CERN-EP-2020-046,hep-ex,arXiv,2004.04545,{$CP$ Properties of Higgs Boson Interactions w...,ATLAS,"Aad, Georges and others",article,ATLAS:2020ior,6.0,,A1,PHYSICAL REVIEW LETTERS
126,2020,5,10,JHEP,10.1007/JHEP10(2020)005,CERN-EP-2019-204,hep-ex,arXiv,2004.10894,{Search for direct production of electroweakin...,ATLAS,"Aad, Georges and others",article,ATLAS:2020qlk,,,A1,Journal of High Energy Physics
122,2020,131801,125,Phys. Rev. Lett.,10.1103/PhysRevLett.125.131801,CERN-EP-2020-062,hep-ex,arXiv,2005.02983,{Dijet resonance search with weak supervision ...,ATLAS,"Aad, Georges and others",article,ATLAS:2020iwa,13.0,,A1,PHYSICAL REVIEW LETTERS


In [16]:
# Export the rated selection to a spreadsheet with a single sheet named 'Articles'.
my_articles.to_excel(excel_writer="articles.xlsx", sheet_name='Articles')

In [36]:
# Proof of concept: copy only the first page of one downloaded paper into test.pdf.
# A dedicated name is used here so that the BibTeX `path` defined earlier is not
# overwritten; re-running the loading cell would otherwise try to parse this PDF.
# NOTE(review): PdfFileReader/PdfFileWriter/getPage/addPage are the legacy PyPDF2
# names; newer releases renamed them -- confirm against the installed version.
pdf_path = '2203.01808v2.Study_of_B_c_to_J_ψD_s_and_B_c_to_J_ψD_s_decays_in_pp_collisions_at_sqrt_s_13_TeV_with_the_ATLAS_detector.pdf'
writer = PdfFileWriter()
reader = PdfFileReader(pdf_path)
# Page indices are zero-based, so index 0 is the first page.
page = reader.getPage(0)
writer.addPage(page)
with open("test.pdf", "wb") as output:
    writer.write(output)

In [33]:
# Download the arXiv PDF of every selected article into the working directory.
# iterrows() is a generator without a length, so `total` is passed explicitly to let
# tqdm draw a real progress bar with an ETA instead of a bare iteration counter.
for _, row in tqdm(my_articles.iterrows(), total=len(my_articles)):
    # Entries without an arXiv identifier cannot be fetched; skip them.
    if pd.isna(row.eprint):
        continue
    paper = next(arxiv.Search(id_list=[row.eprint]).results(), None)
    if paper is None:
        # The search yielded nothing for this identifier; move on rather than abort the loop.
        continue
    # Download the PDF to the PWD with a default filename.
    paper.download_pdf()

1it [00:12, 12.49s/it]


KeyboardInterrupt: 

In [22]:
# Single-paper check of the arxiv client: take the first result for one fixed identifier.
paper = next(arxiv.Search(id_list=["1605.08386v1"]).results())
# Download the PDF to the PWD with a default filename; the call's return value
# (the saved file path, per the output below) is displayed as the cell result.
paper.download_pdf()

'./1605.08386v1.Heat_bath_random_walks_with_Markov_bases.pdf'

## How many points?

In [17]:
# Divisor applied to each per-rating article count before it is weighted.
# NOTE(review): presumably "how many articles count as one" -- confirm the intended meaning.
many_to_one = 1

In [18]:
# Points awarded per article for each accepted rating.
QUALIS_POINTS = {'A1': 2, 'A2': 1.8, 'B1': 0.5}

# Count the articles per rating on `my_articles` itself. The previous A2 term built its
# mask from `my_df`, which only worked through index alignment between the two frames.
articles_per_factor = my_articles['Factor'].value_counts()

# Each per-rating count is divided by `many_to_one` and truncated to an integer
# before being weighted; the terms are summed in A1, A2, B1 order.
count = sum(
    int(articles_per_factor.get(factor, 0) / many_to_one) * points
    for factor, points in QUALIS_POINTS.items()
)
print(count)

231.1


In [19]:
 # Number of selected articles rated A1.
 len(my_articles.loc[my_articles['Factor'] == 'A1'])

84

In [20]:
 # Number of selected articles rated A2.
 len(my_articles.loc[my_articles['Factor'] == 'A2'])

32

In [21]:
 # Number of selected articles rated B1.
 len(my_articles.loc[my_articles['Factor'] == 'B1'])

11

In [None]:
def plot(x, n_bins=20):
    """Draw a density-normalised bar histogram of ``x`` with ``plt.hist``.

    Parameters
    ----------
    x : array-like
        Values to histogram; passed unchanged to ``plt.hist``.
    n_bins : int, optional
        Number of bins. Defaults to 20, the value that used to be hard-coded.

    Notes
    -----
    Relies on the module-level name ``plt`` (``matplotlib.pyplot``), which must
    have been imported before this function is called. Returns ``None``.
    """
    plt.hist(x, n_bins, density=True, histtype='bar')

In [38]:
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# 1000 samples drawn uniformly between -1 and 0.
s = np.random.uniform(low=-1, high=0, size=1000)