In [None]:
import os

# Root directory under which the per-date folders (each with html/ and pdf/
# sub-folders) are read and the selection outputs are written.
arxiv_output_dir = './arxiv'

In [None]:
# Names of the per-date folders to process, in processing order.
# NOTE(review): these look like arXiv YYMM identifiers (every month of 2000,
# then January of 2005/2010/2015/2020) — confirm against the download step.
processing_dates = [f'00{month:02d}' for month in range(1, 13)]
processing_dates += ['0501', '1001', '1501', '2001']

In [None]:
import os
import random

def save_random_selection(path, data_list, force=False, sample_size=150):
    """
    Randomly pick up to ``sample_size`` '.html' entries and save them, one per line.

    If ``path`` already exists and ``force`` is false, the saved selection is
    loaded and returned instead of drawing a new one, so repeated runs keep
    working on the same sample.

    Parameters:
    - path (str): The file the selection is written to (and reloaded from).
    - data_list (list): Candidate entries (strings); only those ending in
      '.html' are eligible for selection.
    - force (bool): When true, ignore an existing file and draw a new sample,
      overwriting the file.
    - sample_size (int): Maximum number of entries to select (default 150).
      If fewer eligible entries exist, all of them are selected.

    Returns:
    - list: The selected entries, sorted (or the entries loaded from the
      existing file, in file order).

    Raises:
    - ValueError: If ``sample_size`` is negative (raised by ``random.sample``).
    """
    # If the file already exists, read its content and return it
    if os.path.exists(path) and not force:
        with open(path, "r") as file:
            # Drop blank lines so a stray empty line never becomes a bogus filename.
            existing_content = [line.strip() for line in file if line.strip()]
        print('Existing file loaded.')
        return existing_content

    html_entries = [i for i in data_list if i.endswith('.html')]

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of eligible entries.
    if len(html_entries) < sample_size:
        print(f'Only {len(html_entries)} HTML entries available; selecting all of them.')
    selected_entries = random.sample(html_entries, min(sample_size, len(html_entries)))
    selected_entries.sort()

    # Save the selected entries to the file
    with open(path, "w") as file:
        file.writelines(f'{entry}\n' for entry in selected_entries)

    print('Random selection saved.')

    return selected_entries

In [None]:
import shutil
from tqdm.notebook import tqdm

def transfer_selected_files(selection_list, datedir):
    """
    Copy each selected HTML file and its same-named PDF into the
    'html_selected' / 'pdf_selected' folders of a date directory.

    Files are copied, not moved: the originals under 'html' and 'pdf' are left
    in place. Targets that already exist are not overwritten, so the function
    can be re-run safely.

    Parameters:
    - selection_list (list): HTML file names (e.g. 'foo.html') located in
      ``<arxiv_output_dir>/<datedir>/html``.
    - datedir (str): Name of the date folder under ``arxiv_output_dir``.

    An entry is skipped with a printed message when its PDF is missing, or when
    its HTML source is missing and has not already been copied.
    """
    # All paths share the same date directory; resolve the folders once.
    date_root = os.path.join(arxiv_output_dir, datedir)
    html_dir = os.path.join(date_root, 'html')
    pdf_dir = os.path.join(date_root, 'pdf')
    html_selected_dir = os.path.join(date_root, 'html_selected')
    pdf_selected_dir = os.path.join(date_root, 'pdf_selected')

    os.makedirs(html_selected_dir, exist_ok=True)
    os.makedirs(pdf_selected_dir, exist_ok=True)

    pbar = tqdm(selection_list, desc="Moving selected files")
    for html_filename in pbar:
        pdf_filename = os.path.splitext(html_filename)[0] + '.pdf'
        html_path = os.path.join(html_dir, html_filename)
        pdf_path = os.path.join(pdf_dir, pdf_filename)
        target_html_path = os.path.join(html_selected_dir, html_filename)
        target_pdf_path = os.path.join(pdf_selected_dir, pdf_filename)

        # Only complete HTML/PDF pairs are transferred.
        if not os.path.exists(pdf_path):
            print(f'PDF not exists. ({pdf_path})')
            continue
        if not os.path.exists(target_html_path):
            # The selection may come from a cached list that is out of date;
            # report a missing source instead of aborting the whole batch.
            if not os.path.exists(html_path):
                print(f'HTML not exists. ({html_path})')
                continue
            shutil.copy(html_path, target_html_path)
        if not os.path.exists(target_pdf_path):
            shutil.copy(pdf_path, target_pdf_path)

In [None]:
# For every date folder: report how many HTML/PDF files it holds, obtain the
# HTML selection (reused from html_selection.txt when that file exists), then
# copy the selected HTML/PDF pairs into the *_selected folders.
for date_folder_name in processing_dates:
    date_dir = os.path.join(arxiv_output_dir, date_folder_name)
    curr_html = os.listdir(os.path.join(date_dir, 'html'))
    curr_pdf = os.listdir(os.path.join(date_dir, 'pdf'))
    print(date_folder_name, len(curr_html), len(curr_pdf))

    selection_path = os.path.join(date_dir, 'html_selection.txt')
    curr_html_selection = save_random_selection(selection_path, curr_html)
    transfer_selected_files(curr_html_selection, date_folder_name)

    print(date_folder_name, 'done!')