In [None]:
#| default_exp helper.arxiv

In [None]:
#| export
import datetime
import json
import os
from os import PathLike
from pathlib import Path
import re
from typing import Callable, Iterator, Optional, Union

import arxiv
from arxiv import Client, Search, Result
from pathvalidate import sanitize_filename

from trouver.helper.files_and_folders import file_is_compressed, uncompress_file


In [None]:
import glob
import tempfile
from unittest.mock import patch, MagicMock
import shutil


from fastcore.test import *
from fastcore.test import test_is
from nbdev.showdoc import show_doc

from trouver.helper.tests import _test_directory

# helper.arxiv
> Functions for downloading (the source code of) articles from arXiv  

In [None]:
#| export
def arxiv_id(
        arxiv_id_or_url: str,
        ) -> str:
    """
    Return the arxiv id from a str which is either the arxiv id itself or the url
    to the arxiv article.

    Leading and trailing whitespace is ignored. Only identifiers made of four digits,
    a dot and four or five digits, optionally followed by a version suffix such as `v2`,
    are recognized; identifiers of any other shape (e.g. `math/0601001`) are rejected.

    **Raises**
    - `ValueError`
        - If the input does not contain a valid arXiv ID.
    """
    id_pattern = r'\d{4}\.\d{4,5}(?:v\d+)?'
    text = arxiv_id_or_url.strip()

    # Check if input is a URL and extract the ID
    if 'arxiv.org' in text:
        # The lookarounds stop the match from beginning or ending in the middle
        # of a longer run of digits, which would yield a truncated, wrong ID.
        match = re.search(rf'(?<!\d)({id_pattern})(?!\d)', text)
        if match:
            return match.group(1)
        raise ValueError("Invalid arXiv URL provided.")

    # Not a URL: the *entire* string has to be an ID. (`re.match` would only
    # anchor the start and let trailing garbage through.)
    if re.fullmatch(id_pattern, text):
        return text

    raise ValueError("Invalid input. Please provide a valid arXiv ID or URL.")

In [None]:

test_eq(arxiv_id("1234.5678"), "1234.5678")
test_eq(arxiv_id("https://arxiv.org/abs/1234.5678"), "1234.5678")
test_eq(arxiv_id("1234.5678v1"), "1234.5678v1")
test_eq(arxiv_id("https://arxiv.org/abs/1234.5678v1"), "1234.5678v1")

The `arxiv_search` function can be used to obtain the search results for one or more arXiv IDs (or, with `results=False`, the underlying `arxiv.Search` object); the results are used for downloading arxiv files.

In [None]:
#| export
def arxiv_search(
        arxiv_ids: Union[str, list[str]], # The ID of a single arXiv article or the IDs of multiple arXiv articles
        client: Optional[Client] = None,  # An arxiv API `Client`. If `None`, create one on the spot.
        results: bool = True, # If `True`, return an iterator of `Result` objects. Otherwise, return the `Search` object.
        ) -> Union[Iterator[Result], Search]: # The results of running the search with `client`, or the `Search` itself.
    """
    Build an `arxiv.Search` for the given arXiv ID(s) and, by default, run it.

    When `results` is `True`, the return value is whatever `client.results(search)`
    gives back: an iterator of `Result` objects, not a single `Result` and not a
    list. Wrap it in `list(...)` to obtain the individual results.
    """
    # A lone ID is wrapped; any other iterable of IDs (list, tuple, ...) is copied
    # into a list so that it is never mistaken for one single ID.
    if isinstance(arxiv_ids, str):
        id_list = [arxiv_ids]
    else:
        id_list = list(arxiv_ids)
    search = Search(id_list=id_list)
    if not results:
        return search
    if client is None:
        client = Client()
    return client.results(search)

In [None]:
#| notest
# Specify the arXiv ID of the paper you want to download.
# NOTE: the variable is deliberately not called `arxiv_id`; that name would shadow
# the `arxiv_id` function defined earlier in this notebook.
example_arxiv_id = "2106.10586"  # Replace with your desired arXiv ID

# `arxiv_search` creates the `Client` and the `Search` and runs the search.
results = arxiv_search(example_arxiv_id, results=True)
results

<itertools.islice>

In [None]:
#| notest
listy = list(results)
print(listy)

[arxiv.Result(entry_id='http://arxiv.org/abs/2106.10586v4', updated=datetime.datetime(2024, 6, 28, 1, 36, 47, tzinfo=datetime.timezone.utc), published=datetime.datetime(2021, 6, 19, 23, 50, 56, tzinfo=datetime.timezone.utc), title='Global $\\mathbb{A}^1$ degrees of covering maps between modular curves', authors=[arxiv.Result.Author('Hyun Jong Kim'), arxiv.Result.Author('Sun Woo Park')], summary="Given a projective smooth curve $X$ over any field $k$, we discuss two\nnotions of global $\\mathbb{A}^1$ degree of a finite morphism of smooth curves\n$f: X \\to \\mathbb{P}^1_k$ satisfying certain conditions. One originates from\ncomputing the Euler number of the pullback of the line bundle\n$\\mathscr{O}_{\\mathbb{P}^1}(1)$ as a generalization of Kass and Wickelgren's\nconstruction of Euler numbers. The other originates from the construction of\nglobal $\\mathbb{A}^1$ degree of morphisms of projective curves by Kass, Levine,\nSolomon, and Wickelgren as a generalization of Morel's constructio

In [None]:
#| notest
listy[0].entry_id

'http://arxiv.org/abs/2106.10586v4'

In [None]:
mock_result_2 = arxiv.Result(
    entry_id='http://arxiv.org/abs/2106.10586v4',
    updated=datetime.datetime(2024, 6, 28, 1, 36, 47, tzinfo=datetime.timezone.utc),
    published=datetime.datetime(2021, 6, 19, 23, 50, 56, tzinfo=datetime.timezone.utc),
    title='Global $\\mathbb{A}^1$ degrees of covering maps between modular curves',
    authors=[arxiv.Result.Author('Hyun Jong Kim'), arxiv.Result.Author('Sun Woo Park')],
    summary="Given a projective smooth curve $X$ over any field $k$, we discuss two\nnotions of global $\\mathbb{A}^1$ degree of a finite morphism of smooth curves\n$f: X \\to \\mathbb{P}^1_k$ satisfying certain conditions. One originates from\ncomputing the Euler number of the pullback of the line bundle\n$\\mathscr{O}_{\\mathbb{P}^1}(1)$ as a generalization of Kass and Wickelgren's\nconstruction of Euler numbers. The other originates from the construction of\nglobal $\\mathbb{A}^1$ degree of morphisms of projective curves by Kass, Levine,\nSolomon, and Wickelgren as a generalization of Morel's construction of\n$\\mathbb{A}^1$-Brouwer degree of a morphism $f: \\mathbb{P}^1_k \\to\n\\mathbb{P}^1_k$. We prove that under certain conditions on $N$, both notions of\nglobal $\\mathbb{A}^1$ degrees of covering maps between modular curves $X_0(N)\n\\to X(1)$, $X_1(N) \\to X(1)$, and $X(N) \\to X(1)$ agree to be equal to sums of\nhyperbolic elements $\\langle 1 \\rangle + \\langle -1 \\rangle$ in the\nGrothendieck-Witt ring $\\mathrm{GW}(k)$ for any field $k$ whose characteristic\nis coprime to $N$ and the pullback of $\\mathscr{O}_{\\mathbb{P}^1}(1)$ is\nrelatively oriented.",
    comment='35 pages. Modified various statements to more precisely speak of\n  "relatively oriented" maps or vector bundles instead of "relatively\n  orientable" maps or vector bundles where appropriate --- the former phrasing\n  suggests that a relative orientation is fixed. Additional minor edits',
    journal_ref=None,
    doi=None,
    primary_category='math.AG',
    categories=['math.AG', 'math.NT', '14F42, 14G35'],
    links=[arxiv.Result.Link('http://arxiv.org/abs/2106.10586v4', title=None, rel='alternate', content_type=None), arxiv.Result.Link('http://arxiv.org/pdf/2106.10586v4', title='pdf', rel='related', content_type=None)])


## Metadata extraction

In [None]:
#| export
def extract_metadata(
        results: Union[list[Result], Result],
        ) -> list[dict]: # Each dict corresponds to the metadata for each result.
    """
    Return the metadata from the arxiv search results.

    A single `Result` is treated like a list of length one. The values are taken
    from the results as they are (e.g. `published`/`updated` and `links` are not
    converted to strings), so use `ArxivMetadataEncoder` when dumping to `json`.
    """
    result_list = results if isinstance(results, list) else [results]
    return [
        dict(
            arxiv_id=entry.get_short_id(),
            authors=[author.name for author in entry.authors],
            title=entry.title,
            summary=entry.summary,
            primary_category=entry.primary_category,
            categories=entry.categories,
            published=entry.published,
            updated=entry.updated,
            doi=entry.doi,
            comment=entry.comment,
            journal_ref=entry.journal_ref,
            links=entry.links,
        )
        for entry in result_list
    ]

In [None]:
mock_result = arxiv.Result(
    entry_id='http://arxiv.org/abs/1605.08386v1',
    updated=datetime.datetime(2016, 5, 26, 17, 59, 46, tzinfo=datetime.timezone.utc),
    published=datetime.datetime(2016, 5, 26, 17, 59, 46, tzinfo=datetime.timezone.utc),
    title='Heat-bath random walks with Markov bases',
    authors=[arxiv.Result.Author('Caprice Stanley'), arxiv.Result.Author('Tobias Windisch')],
    summary='Graphs on lattice points are studied whose edges come from a finite set of\nallowed moves of arbitrary length. We show that the diameter of these graphs on\nfibers of a fixed integer matrix can be bounded from above by a constant. We\nthen study the mixing behaviour of heat-bath random walks on these graphs. We\nalso state explicit conditions on the set of moves so that the heat-bath random\nwalk, a generalization of the Glauber dynamics, is an expander in fixed\ndimension.',
    comment='20 pages, 3 figures',
    journal_ref=None,
    doi=None,
    primary_category='math.CO',
    categories=['math.CO', 'math.ST', 'stat.TH', 'Primary: 05C81, Secondary: 37A25, 11P21'],
    links=[arxiv.Result.Link('http://arxiv.org/abs/1605.08386v1',
                             title=None, rel='alternate', content_type=None),
           arxiv.Result.Link('http://arxiv.org/pdf/1605.08386v1', title='pdf', rel='related',
                             content_type=None),])
extract_metadata(mock_result)

[{'arxiv_id': '1605.08386v1',
  'authors': ['Caprice Stanley', 'Tobias Windisch'],
  'title': 'Heat-bath random walks with Markov bases',
  'summary': 'Graphs on lattice points are studied whose edges come from a finite set of\nallowed moves of arbitrary length. We show that the diameter of these graphs on\nfibers of a fixed integer matrix can be bounded from above by a constant. We\nthen study the mixing behaviour of heat-bath random walks on these graphs. We\nalso state explicit conditions on the set of moves so that the heat-bath random\nwalk, a generalization of the Glauber dynamics, is an expander in fixed\ndimension.',
  'primary_category': 'math.CO',
  'categories': ['math.CO',
   'math.ST',
   'stat.TH',
   'Primary: 05C81, Secondary: 37A25, 11P21'],
  'published': datetime.datetime(2016, 5, 26, 17, 59, 46, tzinfo=datetime.timezone.utc),
  'updated': datetime.datetime(2016, 5, 26, 17, 59, 46, tzinfo=datetime.timezone.utc),
  'doi': None,
  'comment': '20 pages, 3 figures',
  'j

In [None]:
#| export
class ArxivMetadataEncoder(json.JSONEncoder):
    """
    `json` encoder to accompany the `extract_metadata` function when using `json.dump`.

    `datetime.datetime` values are written as ISO 8601 strings and `arxiv.Result.Link`
    values are written as their `href`. Anything else is deferred to `json.JSONEncoder`.
    """
    def default(self, obj):
        # Dates first: serialize as an ISO 8601 string.
        if isinstance(obj, datetime.datetime):
            return obj.isoformat()
        # Links are reduced to the URL they point to.
        if isinstance(obj, arxiv.Result.Link):
            return obj.href
        # Let the base class raise its usual `TypeError` for unsupported objects.
        return super().default(obj)

In [None]:
# Your dictionary with datetime and arxiv.Result.Link objects
data = {
    "timestamp": datetime.datetime.now(),
    "link": arxiv.Result.Link(href="https://example.com", title="Example")
}

# Convert the dictionary to JSON
json_data = json.dumps(data, cls=ArxivMetadataEncoder, indent=4)
print(json_data)

{
    "timestamp": "2024-12-05T17:29:43.843085",
    "link": "https://example.com"
}


## Downloading arxiv files

In [None]:
#| export
def extract_last_names(
        authors: list[str] # The full names of the authors, e.g. `"Pieter de Jong"`
        ) -> list[str]: # The last name of each author, in order. Blank names are skipped.
    """
    Return the last (family) name of each author.

    The last whitespace-separated word of a name is taken as the last name. If the
    name has more than two words and the second-to-last word is one of the particles
    `de`, `van`, `von`, `del`, `della`, `di`, `da`, `dos` (case-insensitive), then the
    particle is kept as part of the last name (e.g. `"de Jong"`). Commas and periods
    are removed from the result.

    Names that are empty or consist only of whitespace contribute nothing to the
    output instead of raising an `IndexError`.
    """
    particles = {'de', 'van', 'von', 'del', 'della', 'di', 'da', 'dos'}
    last_names = []
    for author in authors:
        # Split the name into parts
        parts = author.split()
        if not parts:
            # Nothing to extract from a blank name.
            continue
        # Handle special cases like "de Jong"
        if len(parts) > 2 and parts[-2].lower() in particles:
            last_name = f"{parts[-2]} {parts[-1]}"
        else:
            last_name = parts[-1]

        # Remove any commas or periods
        last_names.append(re.sub(r'[,.]', '', last_name))
    return last_names


The `extract_last_names` function is a convenient helper function for naming downloaded arxiv files.

In [None]:
# Example usage
authors = ["John Smith", "Maria Garcia-Lopez", "Pieter de Jong", "Xin Li"]
last_names = extract_last_names(authors)
print(last_names)

['Smith', 'Garcia-Lopez', 'de Jong', 'Li']


In [None]:
#| export
def folder_name_for_source(
        result: Result, # The arxiv search result of the article
        lowercase: bool = True # If `True`, lowercase the author part of the name. The acronym part keeps its own casing.
        ) -> str: # A folder name of the form `<family names>_<acronym of the title>`
    """
    Return a folder name for the source of an arxiv article, built from the family
    names of the authors and an acronym of the title, e.g. `stanley_windisch_hbrwmb`.

    If there are more than four authors, then only the first family name is used,
    followed by `_et_al`. Whitespace inside a family name (e.g. `de Jong`) is replaced
    by underscores so that the author part contains no spaces.
    """
    family_names = extract_last_names([author.name for author in result.authors])
    # `extract_last_names` returns multi-word names such as "de Jong" with a space.
    family_names = ['_'.join(name.split()) for name in family_names]
    if len(family_names) > 4:
        family_names_text = f'{family_names[0]}_et_al'
    else:
        family_names_text = '_'.join(family_names)
    if lowercase:
        family_names_text = family_names_text.lower()
    return f'{family_names_text}_{create_acronym(result.title)}'


def create_acronym(
        title: str # The title of an article
        ) -> str: # The acronym of the title
    """
    Return an acronym of `title` made of the initials of its words.

    - Common short words (`a`, `the`, `of`, `on`, ...) are skipped.
    - A word that is a single uppercase letter (likely a mathematical symbol) is kept
      as is; this rule is checked first, so a lone `I`, `V` or `X` stays uppercase.
    - Other Roman numerals from `ii` to `ix` are included in full, in lowercase.
    - Each part of a hyphenated word contributes its initial. Empty parts, which arise
      from consecutive hyphens such as the LaTeX en dash `--`, are ignored.
    - Every other word contributes its first character in lowercase.
    """
    # Words to exclude from acronym
    exclude_words = {'a', 'an', 'the', 'on', 'and', 'of', 'to', 'over', 'in', 'for', 'with', 'by', 'at', 'from'}
    roman_numerals = {'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x'}

    def initial(token: str) -> str:
        # A single uppercase letter is kept as is; otherwise the lowercase first character.
        if len(token) == 1 and token.isupper():
            return token
        return token[0].lower()

    acronym = ''
    # Split the title into words (runs of word characters and hyphens)
    for word in re.findall(r'\b[\w-]+\b', title):
        lowered = word.lower()
        if lowered in exclude_words:
            continue
        if len(word) == 1 and word.isupper():
            # Keep single uppercase letters (likely mathematical symbols) as is
            acronym += word
        elif lowered in roman_numerals:
            # Handle Roman numerals
            acronym += lowered
        elif '-' in word:
            # Handle hyphenated words; `if part` drops the empty strings that
            # `split` produces for consecutive hyphens ("Riemann--Roch").
            acronym += ''.join(initial(part) for part in word.split('-') if part)
        else:
            # Take the first letter of other words, always lowercase
            acronym += word[0].lower()

    return acronym

`folder_name_for_source` and `create_acronym` are convenient helper functions for naming folders newly created when downloading source code for arxiv files; the author of `trouver` roughly uses these conventions for organizing source code files.

In [None]:
# Test cases
titles = [
    "Lectures on K3 surfaces",
    "Positivity in Algebraic Geometry I",
    "On the Cohomology of Finite Groups",
    "An Introduction to A-infinity Algebras",
    "Quantum Field Theory and the Standard Model",
    "Category O for gl(n,C) and the Cohomology of Flag Varieties"
]

for title in titles:
    print(f"Title: {title}")
    print(f"Acronym: {create_acronym(title)}")
    print()

Title: Lectures on K3 surfaces
Acronym: lks

Title: Positivity in Algebraic Geometry I
Acronym: pagI

Title: On the Cohomology of Finite Groups
Acronym: cfg

Title: An Introduction to A-infinity Algebras
Acronym: iAia

Title: Quantum Field Theory and the Standard Model
Acronym: qftsm

Title: Category O for gl(n,C) and the Cohomology of Flag Varieties
Acronym: cOgnCcfv



In [None]:
mock_result = arxiv.Result(
    entry_id='http://arxiv.org/abs/1605.08386v1',
    updated=datetime.datetime(2016, 5, 26, 17, 59, 46, tzinfo=datetime.timezone.utc),
    published=datetime.datetime(2016, 5, 26, 17, 59, 46, tzinfo=datetime.timezone.utc),
    title='Heat-bath random walks with Markov bases',
    authors=[arxiv.Result.Author('Caprice Stanley'), arxiv.Result.Author('Tobias Windisch')],
    summary='Graphs on lattice points are studied whose edges come from a finite set of\nallowed moves of arbitrary length. We show that the diameter of these graphs on\nfibers of a fixed integer matrix can be bounded from above by a constant. We\nthen study the mixing behaviour of heat-bath random walks on these graphs. We\nalso state explicit conditions on the set of moves so that the heat-bath random\nwalk, a generalization of the Glauber dynamics, is an expander in fixed\ndimension.',
    comment='20 pages, 3 figures',
    journal_ref=None,
    doi=None,
    primary_category='math.CO',
    categories=['math.CO', 'math.ST', 'stat.TH', 'Primary: 05C81, Secondary: 37A25, 11P21'],
    links=[arxiv.Result.Link('http://arxiv.org/abs/1605.08386v1',
                             title=None, rel='alternate', content_type=None),
           arxiv.Result.Link('http://arxiv.org/pdf/1605.08386v1', title='pdf', rel='related',
                             content_type=None),])
extract_metadata(mock_result)
print(mock_result.title)
output = folder_name_for_source(mock_result)
print(output)
assert ' ' not in output
assert output.startswith('stanley_windisch')

Heat-bath random walks with Markov bases
stanley_windisch_hbrwmb


In [None]:
#| export
def file_name_for_pdf(
        result: Result
        ) -> str:
    """
    Return a file name (without extension) of the form `<family names> - <title>`
    for the pdf of an arxiv article.

    With more than four authors, only the first family name is used, followed by
    `et al`; otherwise the family names are joined by commas. The name is passed
    through `sanitize_filename` before it is returned.
    """
    surnames = extract_last_names([author.name for author in result.authors])
    author_text = f'{surnames[0]} et al' if len(surnames) > 4 else ', '.join(surnames)
    return sanitize_filename(f'{author_text} - {result.title}')


`file_name_for_pdf` could be a good convention for naming downloaded pdf files of arxiv articles. Pass this as the `file_or_folder_names` parameter for `download_from_results`.

In [None]:
mock_result = arxiv.Result(
    entry_id='http://arxiv.org/abs/1605.08386v1',
    updated=datetime.datetime(2016, 5, 26, 17, 59, 46, tzinfo=datetime.timezone.utc),
    published=datetime.datetime(2016, 5, 26, 17, 59, 46, tzinfo=datetime.timezone.utc),
    title='Heat-bath random walks with Markov bases',
    authors=[arxiv.Result.Author('Caprice Stanley'), arxiv.Result.Author('Tobias Windisch')],
    summary='Graphs on lattice points are studied whose edges come from a finite set of\nallowed moves of arbitrary length. We show that the diameter of these graphs on\nfibers of a fixed integer matrix can be bounded from above by a constant. We\nthen study the mixing behaviour of heat-bath random walks on these graphs. We\nalso state explicit conditions on the set of moves so that the heat-bath random\nwalk, a generalization of the Glauber dynamics, is an expander in fixed\ndimension.',
    comment='20 pages, 3 figures',
    journal_ref=None,
    doi=None,
    primary_category='math.CO',
    categories=['math.CO', 'math.ST', 'stat.TH', 'Primary: 05C81, Secondary: 37A25, 11P21'],
    links=[arxiv.Result.Link('http://arxiv.org/abs/1605.08386v1',
                             title=None, rel='alternate', content_type=None),
           arxiv.Result.Link('http://arxiv.org/pdf/1605.08386v1', title='pdf', rel='related',
                             content_type=None),])
file_name_for_pdf(mock_result)

'Stanley, Windisch - Heat-bath random walks with Markov bases'

In [None]:
#| export
def download_from_results(
        results: Result | list[Result],
        dir: PathLike, # The directory into which to download the files
        source: bool = True, # If `True`, download the source file. Otherwise, download a pdf file.
        decompress_compressed_file: bool = True, # If `True` and if `source` is `True`, then decompress the source file after downloading it.
        file_or_folder_names: None | str | list[str] | Callable[[Result], str] = folder_name_for_source, # If `None`, then the file/folder is named the arxiv id. If a `str` (in which case `results` must be a single `Result` or a `list[Result]` of length 1) or `list[str]` (whose length must equal that of `results`), then each file/folder is named by the specified corresponding `str`. If a `Callable`, then each file/folder is named by calling it on the corresponding `Result`.
        delete_compressed_file: bool = True, # If `True` and if `source` and `decompress_compressed_file` are `True`, then delete the compressed source file after downloading and then uncompressing it.
        download_metadata: bool = True, # If `True`, and if `source` is `True`, then create a file called `metadata.json` and put it into the newly created folder, unless a file called `metadata.json` already exists, in which case, a unique file name is created
        verbose: bool = False, # If `True` and if `source` is `True`, print the path of each downloaded source file.
        ) -> list[Path]: # Each `Path` is the folder in which the source files are newly downloaded or the path to the pdf file that is newly downloaded.
    """
    Download either the source files or pdfs of the arxiv article encoded in the results.

    - If `source = True`, then
        - Download the source file into a newly created folder (whose name is specified
          by `file_or_folder_names`) within `dir`.
        - If `decompress_compressed_file = True`, decompress the source (if applicable)
          in this newly created folder.
        - If `delete_compressed_file = True`, then delete the compressed file after
          decompressing it.
    - If `source = False`, then just download a pdf.

    For `file_or_folder_names`, the recommended `Callable` arguments are `folder_name_for_source`
    for downloading source files and `file_name_for_pdf` for downloading pdf files.

    **Raises**
    - `ValueError`
        - If `file_or_folder_names` is a `str` or a `list[str]` whose number of names
          differs from the number of results. (Previously, the surplus results or
          names were silently ignored.)
    """
    if not isinstance(results, list):
        results = [results]

    # Normalize `file_or_folder_names` to one name (or `None`) per result.
    if file_or_folder_names is None:
        names = [None] * len(results)
    elif isinstance(file_or_folder_names, str):
        names = [file_or_folder_names]
    elif isinstance(file_or_folder_names, list):
        names = file_or_folder_names
    else:
        names = [file_or_folder_names(result) for result in results]
    if len(names) != len(results):
        # `zip` below would otherwise drop the unmatched results without notice.
        raise ValueError(
            f'Got {len(names)} file/folder name(s) for {len(results)} result(s); '
            'the numbers must be equal.')

    downloaded_paths = []
    for result, file_or_folder_name in zip(results, names):
        if source:
            downloaded = _download_source(
                result, dir, file_or_folder_name, decompress_compressed_file,
                delete_compressed_file, download_metadata, verbose)
        else:
            downloaded = _download_pdf_with_name(result, dir, file_or_folder_name)
        downloaded_paths.append(downloaded)
    return downloaded_paths



def _download_pdf_with_name(
        result: Result,
        dir: PathLike,
        file_or_folder_name: str|None # The name of the pdf file. If `None`, then the default name for the pdf file is used.
        ) -> Path:
    """
    Download the pdf for `result` into `dir` and return the path that
    `result.download_pdf` reports.

    When a name is given, `.pdf` is appended to it to form the file name.

    Helper function to `download_from_results`
    """
    if file_or_folder_name is None:
        # No explicit name: let `download_pdf` choose its default file name.
        written_path = result.download_pdf(dir)
    else:
        written_path = result.download_pdf(dir, filename=f'{file_or_folder_name}.pdf')
    return Path(written_path)
    

            

def _create_folder_for_source_download(
        result: Result,
        dir: PathLike,
        file_or_folder_name: str|None # The name of the folder in which the source should be downloaded. If `None`, then the arxiv id for `result` is used.
        ) -> Path: # The newly created folder
    """
    Creates a new folder inside `dir` in which to download the source for `result`.

    If the desired name is already taken, then a different name is chosen:
    first the arxiv id is appended to the name (or `_dupl`, if the name already is
    the arxiv id), and after that `_dupl` is appended until the name is free.

    `dir` itself must already exist; otherwise `os.mkdir` raises.

    Helper function to `download_from_results`
    """
    # `result.entry_id` is a URL such as `http://arxiv.org/abs/1605.08386v1` and so
    # cannot serve as a folder name; `get_short_id()` gives `1605.08386v1`.
    # Any `/` in the id (presumably only in old-style ids) is replaced for the same reason.
    short_id = result.get_short_id().replace('/', '_')
    if file_or_folder_name is None:
        file_or_folder_name = short_id
    parent = Path(dir)
    new_folder = parent / file_or_folder_name
    # `exists` rather than `isdir`: a plain file of that name blocks `mkdir` as well.
    if new_folder.exists():
        # TODO: warn that folder exists
        if file_or_folder_name == short_id:
            file_or_folder_name = f'{file_or_folder_name}_dupl'
        else:
            file_or_folder_name = f'{file_or_folder_name}_{short_id}'
        new_folder = parent / file_or_folder_name
    while new_folder.exists(): # If the name is still taken
        file_or_folder_name = f'{file_or_folder_name}_dupl'
        new_folder = parent / file_or_folder_name
    os.mkdir(new_folder)
    return new_folder


def _download_source(
        result: Result,
        dir: PathLike,
        file_or_folder_name: str|None, # The name of the folder to create. If `None`, the name is chosen by `_create_folder_for_source_download`.
        decompress_compressed_file: bool,
        delete_compressed_file: bool,
        download_metadata: bool,
        verbose: bool
        ) -> Path: # The newly created folder containing the downloaded source
    """
    Download source file into folder, decompress (as needed) the source, and download metadata

    Helper function to `download_from_results`
    """
    new_source_folder = _create_folder_for_source_download(result, dir, file_or_folder_name)
    # NOTE(review): `download_source` is expected to return the path of the written
    # file, folder included (`_download_pdf_with_name` relies on the same convention
    # for `download_pdf`) - confirm. Joining that path onto the folder again would
    # duplicate the folder part whenever `dir` is relative, so the join is only a
    # fallback for the case that the returned path does not exist as given.
    source_file_path = Path(result.download_source(new_source_folder))
    if not source_file_path.exists():
        source_file_path = new_source_folder / source_file_path
    if verbose:
        print(source_file_path)
    if decompress_compressed_file and file_is_compressed(source_file_path):
        uncompressed = uncompress_file(source_file_path)
        if delete_compressed_file:
            os.remove(source_file_path)
        # If decompressing yields exactly one file that is itself compressed
        # (presumably a `.tar` inside a `.gz`), decompress once more.
        if len(uncompressed) == 1 and file_is_compressed(uncompressed[0]):
            uncompress_file(uncompressed[0])
            if delete_compressed_file:
                os.remove(uncompressed[0])
    if download_metadata:
        metadata_file_name = _unique_metadata_file_name(new_source_folder)
        metadata = extract_metadata(result)[0]
        with open(new_source_folder / metadata_file_name, 'w', encoding='utf-8') as json_file:
            json.dump(metadata, json_file, cls=ArxivMetadataEncoder, indent=4)
    return new_source_folder


def _unique_metadata_file_name(
        new_source_folder: PathLike # The folder in which to make the metadata file.
        ) -> str: #s tr: A unique file name for the metadata file within `new_source_folder`.

    """
    Identify a name to name the metadata file within `new_source_folder`; 
    the default name is `metadata.json` unless there is already a file with that name.
    
    If `metadata.json` exists, it appends a numeric suffix to create a unique file name 
    (e.g., `metadata_1.json`, `metadata_2.json`, etc.).
    """
    # Ensure the input is a Path object
    folder = Path(new_source_folder)
    
    # Default file name
    base_name = "metadata"
    extension = ".json"
    candidate = folder / f"{base_name}{extension}"
    
    # Check if the default file name exists
    counter = 1
    while candidate.exists():
        # TODO: warn that metadata.json exists
        # Generate a new candidate with a numeric suffix
        candidate = folder / f"{base_name}_{counter}{extension}"
        counter += 1
    
    return str(candidate.name)  # Return only the file name, not the full path


`download_from_results` downloads an arxiv article (the source or a pdf).

In [None]:
mock_result_1 = arxiv.Result(
    entry_id='http://arxiv.org/abs/1605.08386v1',
    updated=datetime.datetime(2016, 5, 26, 17, 59, 46, tzinfo=datetime.timezone.utc),
    published=datetime.datetime(2016, 5, 26, 17, 59, 46, tzinfo=datetime.timezone.utc),
    title='Heat-bath random walks with Markov bases',
    authors=[arxiv.Result.Author('Caprice Stanley'), arxiv.Result.Author('Tobias Windisch')],
    summary='Graphs on lattice points are studied whose edges come from a finite set of\nallowed moves of arbitrary length. We show that the diameter of these graphs on\nfibers of a fixed integer matrix can be bounded from above by a constant. We\nthen study the mixing behaviour of heat-bath random walks on these graphs. We\nalso state explicit conditions on the set of moves so that the heat-bath random\nwalk, a generalization of the Glauber dynamics, is an expander in fixed\ndimension.',
    comment='20 pages, 3 figures',
    journal_ref=None,
    doi=None,
    primary_category='math.CO',
    categories=['math.CO', 'math.ST', 'stat.TH', 'Primary: 05C81, Secondary: 37A25, 11P21'],
    links=[arxiv.Result.Link('http://arxiv.org/abs/1605.08386v1',
                            title=None, rel='alternate', content_type=None),
        arxiv.Result.Link('http://arxiv.org/pdf/1605.08386v1', title='pdf', rel='related',
                            content_type=None),])
mock_result_2 = arxiv.Result(
    entry_id='http://arxiv.org/abs/2106.10586v4',
    updated=datetime.datetime(2024, 6, 28, 1, 36, 47, tzinfo=datetime.timezone.utc),
    published=datetime.datetime(2021, 6, 19, 23, 50, 56, tzinfo=datetime.timezone.utc),
    title='Global $\\mathbb{A}^1$ degrees of covering maps between modular curves',
    authors=[arxiv.Result.Author('Hyun Jong Kim'), arxiv.Result.Author('Sun Woo Park')],
    summary="Given a projective smooth curve $X$ over any field $k$, we discuss two\nnotions of global $\\mathbb{A}^1$ degree of a finite morphism of smooth curves\n$f: X \\to \\mathbb{P}^1_k$ satisfying certain conditions. One originates from\ncomputing the Euler number of the pullback of the line bundle\n$\\mathscr{O}_{\\mathbb{P}^1}(1)$ as a generalization of Kass and Wickelgren's\nconstruction of Euler numbers. The other originates from the construction of\nglobal $\\mathbb{A}^1$ degree of morphisms of projective curves by Kass, Levine,\nSolomon, and Wickelgren as a generalization of Morel's construction of\n$\\mathbb{A}^1$-Brouwer degree of a morphism $f: \\mathbb{P}^1_k \\to\n\\mathbb{P}^1_k$. We prove that under certain conditions on $N$, both notions of\nglobal $\\mathbb{A}^1$ degrees of covering maps between modular curves $X_0(N)\n\\to X(1)$, $X_1(N) \\to X(1)$, and $X(N) \\to X(1)$ agree to be equal to sums of\nhyperbolic elements $\\langle 1 \\rangle + \\langle -1 \\rangle$ in the\nGrothendieck-Witt ring $\\mathrm{GW}(k)$ for any field $k$ whose characteristic\nis coprime to $N$ and the pullback of $\\mathscr{O}_{\\mathbb{P}^1}(1)$ is\nrelatively oriented.",
    comment='35 pages. Modified various statements to more precisely speak of\n  "relatively oriented" maps or vector bundles instead of "relatively\n  orientable" maps or vector bundles where appropriate --- the former phrasing\n  suggests that a relative orientation is fixed. Additional minor edits',
    journal_ref=None,
    doi=None,
    primary_category='math.AG',
    categories=['math.AG', 'math.NT', '14F42, 14G35'],
    links=[arxiv.Result.Link('http://arxiv.org/abs/2106.10586v4', title=None, rel='alternate', content_type=None), arxiv.Result.Link('http://arxiv.org/pdf/2106.10586v4', title='pdf', rel='related', content_type=None)])

single_result = mock_result_1
multiple_results = [mock_result_1, mock_result_2] 
folder_name_1 = folder_name_for_source(mock_result_1)
folder_name_2 = folder_name_for_source(mock_result_2)

In [None]:
#| notest
with tempfile.TemporaryDirectory(prefix='temp_dir', dir=os.getcwd()) as temp_dir:
    temp_vault = Path(temp_dir) / 'arxiv_file_download_example_folder'
    shutil.copytree(_test_directory() / 'arxiv_file_download_example_folder', temp_vault)
    # 1. Single Result vs. List of Results
    # Test with single Result
    downloaded_paths = download_from_results(mock_result_1, temp_vault, source=True)
    assert (temp_vault / folder_name_1).exists()
    assert downloaded_paths


We can also pass multiple results to `download_from_results`.

In [None]:
#| notest
with tempfile.TemporaryDirectory(prefix='temp_dir', dir=os.getcwd()) as temp_dir:
    temp_vault = Path(temp_dir) / 'arxiv_file_download_example_folder'
    shutil.copytree(_test_directory() / 'arxiv_file_download_example_folder', temp_vault)
    # Test with multiple Results
    download_from_results(multiple_results, temp_vault, source=True)
    # os.startfile(temp_vault)
    # input()
    assert (temp_vault / folder_name_1).exists()
    assert (temp_vault / folder_name_2).exists()

Specifying `source=False` downloads the pdf instead of the source files.

In [None]:
#| notest
with tempfile.TemporaryDirectory(prefix='temp_dir', dir=os.getcwd()) as temp_dir:
    temp_vault = Path(temp_dir) / 'arxiv_file_download_example_folder'
    shutil.copytree(_test_directory() / 'arxiv_file_download_example_folder', temp_vault)
    # 2. Source vs. PDF download
    download_from_results(single_result, temp_vault, source=False)
    assert (temp_vault / f'{folder_name_1}.pdf').exists()


By specifying `source=True` and `decompress_compressed_file=False`, we can just download the compressed file.

In [None]:
#| notest
with tempfile.TemporaryDirectory(prefix='temp_dir', dir=os.getcwd()) as temp_dir:
    temp_vault = Path(temp_dir) / 'arxiv_file_download_example_folder'
    shutil.copytree(_test_directory() / 'arxiv_file_download_example_folder', temp_vault)
    # 3. Decompression options
    download_from_results(single_result, temp_vault, source=True, decompress_compressed_file=False)
    tar_gz_files = glob.glob(str(temp_vault / folder_name_1 / '*.tar.gz')) 
    assert len(tar_gz_files) > 0


The folder or pdf file can be given a custom name via `file_or_folder_names`.

In [None]:
#| notest
with tempfile.TemporaryDirectory(prefix='temp_dir', dir=os.getcwd()) as temp_dir:
    temp_vault = Path(temp_dir) / 'arxiv_file_download_example_folder'
    shutil.copytree(_test_directory() / 'arxiv_file_download_example_folder', temp_vault)
    # 4. File/folder naming
    download_from_results(single_result, temp_vault, file_or_folder_names='custom_name')
    assert (temp_vault / 'custom_name').exists()

    download_from_results(multiple_results, temp_vault, file_or_folder_names=['name1', 'name2'])
    assert (temp_vault / 'name1').exists()
    assert (temp_vault / 'name2').exists()


`delete_compressed_file` can be set to `False` to preserve the compressed file after decompressing.

In [None]:
#| notest
with tempfile.TemporaryDirectory(prefix='temp_dir', dir=os.getcwd()) as temp_dir:
    temp_vault = Path(temp_dir) / 'arxiv_file_download_example_folder'
    shutil.copytree(_test_directory() / 'arxiv_file_download_example_folder', temp_vault)
    # 5. Compressed file handling
    download_from_results(single_result, temp_vault, delete_compressed_file=False)
    tar_gz_files = glob.glob(str(temp_vault / folder_name_1 / '*.tar.gz')) 
    assert len(tar_gz_files) > 0

By default, if the source is downloaded into a folder, then the metadata of the arxiv article is stored in a `json` file.

In [None]:
#| notest
with tempfile.TemporaryDirectory(prefix='temp_dir', dir=os.getcwd()) as temp_dir:
    temp_vault = Path(temp_dir) / 'arxiv_file_download_example_folder'
    shutil.copytree(_test_directory() / 'arxiv_file_download_example_folder', temp_vault)

    # 6. Metadata file
    download_from_results(single_result, temp_vault, download_metadata=True)
    assert (temp_vault / folder_name_1 / 'metadata.json').exists()

    # 7. Edge cases
    download_from_results([], temp_vault)  # Empty list
    # Test with non-existent arxiv ID (should handle gracefully)
    # non_existent = next(arxiv.Search(id_list=['0000.00000']).results())
    # download_from_results(non_existent, temp_vault)


In [None]:
#| notest
# This cell downloads from arXiv, so it is excluded from automated test runs like
# the other download examples.
with tempfile.TemporaryDirectory(prefix='temp_dir', dir=os.getcwd()) as temp_dir:
    temp_vault = Path(temp_dir) / 'arxiv_file_download_example_folder'
    shutil.copytree(_test_directory() / 'arxiv_file_download_example_folder', temp_vault)
    # 8. Folder creation (duplicate handling)
    first_paths = download_from_results(single_result, temp_vault)
    second_paths = download_from_results(single_result, temp_vault)  # Should create a duplicate folder
    assert (temp_vault / folder_name_1).exists()
    # The second download must not reuse the folder of the first one.
    assert first_paths[0] != second_paths[0]
    assert first_paths[0].exists()
    assert second_paths[0].exists()

    # 9. File types (if you have examples of different source types)
    # This would require specific known arxiv IDs with different source types

    # 10. Error handling
    with ExceptionExpected(Exception):
        download_from_results(single_result, '/non/existent/path')
