In [1]:
import pandas as pd

In [4]:
# NOTE: despite the "_dir" name, this holds the path of a parquet file; it is
# handed straight to pd.read_parquet in the next cell.
data_dir = "train.parquet"

In [5]:
# Load the dataset into a DataFrame and print its shape and column index
# as a quick sanity check of what was read.
df = pd.read_parquet(data_dir)
print(df.shape, df.columns)

(63357, 7) Index(['id', 'title', 'abstract', 'authors', 'published_date', 'link',
       'markdown'],
      dtype='object')


In [6]:
# Preview the first rows to see what each column contains.
df.head()

Unnamed: 0,id,title,abstract,authors,published_date,link,markdown
0,2305.00379,Image Completion via Dual-path Cooperative Fil...,Given the recent advances with image-generatin...,"Pourya Shamsolmoali, Masoumeh Zareapoor, Eric ...",2023-04-30T03:54:53Z,http://arxiv.org/abs/2305.00379v1,# Image Completion via Dual-Path Cooperative F...
1,2307.16362,High Sensitivity Beamformed Observations of th...,We analyzed four epochs of beamformed EVN data...,"Rebecca Lin, Marten H. van Kerkwijk",2023-07-31T01:36:55Z,http://arxiv.org/abs/2307.16362v2,# High Sensitivity Beamformed Observations of ...
2,2301.07687,"Maybe, Maybe Not: A Survey on Uncertainty in V...",Understanding and evaluating uncertainty play ...,Krisha Mehta,2022-12-14T00:07:06Z,http://arxiv.org/abs/2301.07687v1,"# Maybe, Maybe Not: A Survey on Uncertainty in..."
3,2309.09088,Enhancing GAN-Based Vocoders with Contrastive ...,Vocoder models have recently achieved substant...,"Haoming Guo, Seth Z. Zhao, Jiachen Lian, Gopal...",2023-09-16T20:04:16Z,http://arxiv.org/abs/2309.09088v2,# Enhancing Gan-Based Vocoders with Contrastiv...
4,2307.16404,Nonvolatile Magneto-Thermal Switching in MgB2,Ongoing research explores thermal switching ma...,"Hiroto Arima, Yoshikazu Mizuguchi",2023-07-31T04:59:19Z,http://arxiv.org/abs/2307.16404v1,# Nonvolatile Magneto-Thermal Switching in MgB...


## Test arxiv-downloader

In [9]:
import os
import arxiv

# from arxiv-downloader
def download_article(article_id, directory, download_source=False):
    """
    Download an arXiv article's PDF (and optionally its source files) into a directory.

    The article is looked up by ID through the ``arxiv`` client. When the lookup
    yields no result, "Article not found." is printed and nothing is downloaded.

    Args:
        article_id: The arXiv ID of the article to download.
        directory: The directory where the article will be saved; created if missing.
        download_source: Whether to also download the source files of the article.
    """
    os.makedirs(directory, exist_ok=True)
    search_result = arxiv.Client().results(arxiv.Search(id_list=[article_id]))

    # Pass a default to next(): without it an empty result set raises
    # StopIteration and the "not found" message below is never reached.
    article = next(search_result, None)
    if not article:
        print("Article not found.")
        return

    print(f'Starting download of article: "{article.title}" ({article_id})')
    pdf_path = article.download_pdf(dirpath=directory)
    print(f"Download finished! Result saved at:\n{pdf_path}")

    if download_source:
        print(f'Starting download of article source files: "{article.title}" ({article_id})')
        article.download_source(dirpath=directory)

In [10]:
# Smoke-test the downloader on the first paper listed in the dataset.
save_dir = "cache"
article_id = df["id"].iloc[0]
download_article(article_id=article_id, directory=save_dir)

Starting download of article: "Image Completion via Dual-path Cooperative Filtering" (2305.00379)
Download finished! Result saved at:
cache/2305.00379v1.Image_Completion_via_Dual_path_Cooperative_Filtering.pdf


In [11]:
# List the cache directory to confirm the downloaded PDF is there.
os.listdir("cache")

['2305.00379v1.Image_Completion_via_Dual_path_Cooperative_Filtering.pdf']

## Test pdf2image (pdf->img)

In [13]:
from pdf2image import convert_from_path

In [12]:
# PDF saved by the download step, and the per-article folder for its page images.
fname = "cache/2305.00379v1.Image_Completion_via_Dual_path_Cooperative_Filtering.pdf"
png_path = f"cache/pdf2img/{article_id}"

# exist_ok=True replaces the check-then-create pattern: no race between the
# existence check and the creation, and the cell stays safe to re-run.
os.makedirs(png_path, exist_ok=True)

In [14]:
def convert_pdf_to_png(pdf_path, png_path):
    """
    Convert a PDF to PNG files, one per image returned by ``convert_from_path``.

    Files are named by their zero-based position (``0.png``, ``1.png``, ...)
    and each saved path is printed.

    Args:
        pdf_path: Path of the PDF file to convert.
        png_path: Directory that receives the PNG files; created if missing.
    """
    # Create the output directory here so the function does not rely on an
    # earlier cell having made it; saving into a missing directory would fail.
    os.makedirs(png_path, exist_ok=True)

    images = convert_from_path(pdf_path)
    for i, img in enumerate(images):
        png_fpath = os.path.join(png_path, f"{i}.png")
        img.save(png_fpath, "PNG")
        print(f"Saved {png_fpath}")

In [15]:
# Convert the downloaded PDF, then list what was written.
convert_pdf_to_png(pdf_path=fname, png_path=png_path)
# os.listdir returns entries in arbitrary order; sort so the cell output is
# reproducible. The sort is lexicographic, so "10.png" comes before "2.png".
sorted(os.listdir(png_path))

Saved cache/pdf2img/2305.00379/0.png
Saved cache/pdf2img/2305.00379/1.png
Saved cache/pdf2img/2305.00379/2.png
Saved cache/pdf2img/2305.00379/3.png
Saved cache/pdf2img/2305.00379/4.png


['0.png', '1.png', '2.png', '3.png', '4.png']