In [52]:
import requests
import pandas as pd
import json, os
import xml.etree.ElementTree as ET

ARXIV_NAMESPACE = '{http://www.w3.org/2005/Atom}'

# Endpoint and request timeout used by extract_from_arxiv.
_ARXIV_API_URL = 'http://export.arxiv.org/api/query'
_ARXIV_TIMEOUT_SECONDS = 30

# Keys of every paper record, in output column order.
_PAPER_FIELDS = ['title', 'summary', 'authors', 'arxiv_id', 'url', 'pdf_link']


def _parse_arxiv_entry(entry):
    """Convert one Atom ``<entry>`` element into a dict keyed by ``_PAPER_FIELDS``."""
    # findtext() returns the default instead of raising when a child element
    # is absent, so an incomplete entry yields empty fields rather than
    # aborting the whole extraction.
    title = entry.findtext(f'{ARXIV_NAMESPACE}title', default='').strip()
    summary = entry.findtext(f'{ARXIV_NAMESPACE}summary', default='').strip()

    # Authors without a usable <name> child are skipped.
    author_names = (
        author.findtext(f'{ARXIV_NAMESPACE}name')
        for author in entry.findall(f'{ARXIV_NAMESPACE}author')
    )
    authors = [name for name in author_names if name]

    paper_url = entry.findtext(f'{ARXIV_NAMESPACE}id', default='')
    # Only the last path segment is kept, so an old-style identifier such as
    # ".../abs/cs/9308101v1" becomes "9308101v1" (the archive prefix is dropped).
    arxiv_id = paper_url.split('/')[-1]

    # The PDF location is the <link> whose title attribute is "pdf";
    # None when the entry has no such link.
    pdf_link = next(
        (link.get('href')
         for link in entry.findall(f'{ARXIV_NAMESPACE}link')
         if link.get('title') == 'pdf'),
        None,
    )

    return {
        'title': title,
        'summary': summary,
        'authors': authors,
        'arxiv_id': arxiv_id,
        'url': paper_url,
        'pdf_link': pdf_link,
    }


def extract_from_arxiv(search_query='cat:cs.AI', max_results=100, json_file_path='arxiv_dataset.json'):
    """Query the arXiv export API and return the matching papers as a DataFrame.

    Args:
        search_query: Value sent as the ``search_query`` request parameter.
        max_results: Value sent as the ``max_results`` request parameter.
        json_file_path: Where to write the parsed records as JSON (UTF-8,
            4-space indent). Pass ``None`` to skip writing a file.

    Returns:
        pandas.DataFrame with one row per feed entry and the columns
        ``title``, ``summary``, ``authors``, ``arxiv_id``, ``url`` and
        ``pdf_link``. The columns are present even when the feed is empty.

    Raises:
        requests.HTTPError: the API answered with an error status.
        requests.Timeout: no response within the request timeout.
        xml.etree.ElementTree.ParseError: the response body is not valid XML.
    """
    # Passing the query through `params` lets requests URL-encode it, so
    # spaces, '&' or '#' inside search_query cannot corrupt the request.
    response = requests.get(
        _ARXIV_API_URL,
        params={'search_query': search_query, 'max_results': max_results},
        timeout=_ARXIV_TIMEOUT_SECONDS,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    root = ET.fromstring(response.content)
    papers = [
        _parse_arxiv_entry(entry)
        for entry in root.findall(f'{ARXIV_NAMESPACE}entry')
    ]

    if json_file_path is not None:
        # ensure_ascii=False writes non-ASCII characters verbatim, so the file
        # encoding must be fixed explicitly rather than left to the locale.
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(papers, f, ensure_ascii=False, indent=4)
        # Report only after the file has been closed (and therefore flushed).
        print(f'Data saved to {json_file_path} ...')

    return pd.DataFrame(papers, columns=_PAPER_FIELDS)

In [53]:
# Small smoke run: fetch just two papers to confirm the extraction works end to end.
df = extract_from_arxiv(max_results=2)

Data saved to arxiv_dataset.json ...


In [39]:
# Reload the saved records from disk.
file_name = "arxiv_dataset.json"
# JSON text is UTF-8 by specification; decoding it explicitly keeps non-ASCII
# titles and author names intact regardless of the platform's locale encoding.
with open(file_name, 'r', encoding='utf-8') as file:
    data = json.load(file)

data

[{'title': 'Dynamic Backtracking',
  'summary': 'Because of their occasional need to return to shallow points in a search\ntree, existing backtracking methods can sometimes erase meaningful progress\ntoward solving a search problem. In this paper, we present a method by which\nbacktrack points can be moved deeper in the search space, thereby avoiding this\ndifficulty. The technique developed is a variant of dependency-directed\nbacktracking that uses only polynomial space while still providing useful\ncontrol information and retaining the completeness guarantees provided by\nearlier approaches.',
  'authors': ['M. L. Ginsberg'],
  'arxiv_id': '9308101v1',
  'url': 'http://arxiv.org/abs/cs/9308101v1',
  'pdf_link': 'http://arxiv.org/pdf/cs/9308101v1'},
 {'title': 'A Market-Oriented Programming Environment and its Application to\n  Distributed Multicommodity Flow Problems',
  'summary': 'Market price systems constitute a well-understood class of mechanisms that\nunder certain conditions 

In [40]:
# Rebuild the table from the list of records held in `data`: one row per paper.
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,title,summary,authors,arxiv_id,url,pdf_link
0,Dynamic Backtracking,Because of their occasional need to return to ...,[M. L. Ginsberg],9308101v1,http://arxiv.org/abs/cs/9308101v1,http://arxiv.org/pdf/cs/9308101v1
1,A Market-Oriented Programming Environment and ...,Market price systems constitute a well-underst...,[M. P. Wellman],9308102v1,http://arxiv.org/abs/cs/9308102v1,http://arxiv.org/pdf/cs/9308102v1


In [45]:
# PDF URL of the row labelled 0 (the first paper under the default integer index).
pdf_link = df.loc[0, 'pdf_link']

In [47]:
# Show the selected PDF URL.
pdf_link

'http://arxiv.org/pdf/cs/9308101v1'

In [46]:
# Local file name: the last path segment of the PDF URL with a ".pdf" suffix.
newfile = f"{pdf_link.rsplit('/', maxsplit=1)[-1]}.pdf"
newfile

'9308101v1.pdf'