In [4]:
import gzip
import os
import re
import tarfile
import tempfile
from pathlib import Path

import requests

def get_arxiv_tex(identifier):
    """
    Download and extract arXiv source files, returning the main tex content.
    
    Args:
        identifier (str): Either full arXiv URL or just the paper number (e.g. '2412.06264')
        
    Returns:
        str: Content of the main tex file
        
    Raises:
        ValueError: If the identifier is invalid or source files cannot be accessed
        RuntimeError: If no main tex file is found
    """
    # Extract paper number from URL if needed
    if identifier.startswith('http'):
        match = re.search(r'arxiv.org/(?:abs|pdf)/(\d+\.\d+)', identifier)
        if not match:
            raise ValueError("Invalid arXiv URL format")
        paper_number = match.group(1)
    else:
        # Verify the paper number format
        if not re.match(r'^\d+\.\d+$', identifier):
            raise ValueError("Invalid arXiv identifier format")
        paper_number = identifier
    
    # Create source URL
    source_url = f'https://arxiv.org/src/{paper_number}'
    
    # Download the source files
    response = requests.get(source_url)
    if response.status_code != 200:
        raise ValueError(f"Failed to download source files (Status code: {response.status_code})")
    
    # Create a temporary directory for extraction
    with tempfile.TemporaryDirectory() as temp_dir:
        # Save the downloaded tar file
        tar_path = Path(temp_dir) / 'source.tar.gz'
        with open(tar_path, 'wb') as f:
            f.write(response.content)
        
        # Extract the tar file using the new filter parameter
        with tarfile.open(tar_path, 'r:gz') as tar:
            tar.extractall(path=temp_dir, filter='data')
        
        # Look for main tex file
        tex_files = list(Path(temp_dir).rglob('*.tex'))
        if not tex_files:
            raise RuntimeError("No tex files found in the source")
        
        # Common main file names (all lowercase for comparison)
        main_candidates = [
            'main.tex',
            'paper.tex',
            'article.tex',
            'manuscript.tex',
            'submission.tex',
            'arxiv.tex',
            'document.tex',
            'draft.tex',
            'preprint.tex',
            'source.tex',
            'neurips.tex',
            'icml.tex',
            'iclr.tex',
            'aaai.tex',
            'ijcai.tex',
            f'{paper_number}.tex'
        ]
        
        main_file = None
        
        # First try common file names (case-insensitive)
        for candidate in main_candidates:
            for tex_file in tex_files:
                if tex_file.name.lower() == candidate:
                    main_file = tex_file
                    break
            if main_file:
                break
                
        # If no common names found, try the directory name (case-insensitive)
        if not main_file:
            for tex_file in tex_files:
                if tex_file.parent.name.lower() + '.tex' == tex_file.name.lower():
                    main_file = tex_file
                    break
        
        # If still no match, look for file with \documentclass
        if not main_file:
            for tex_file in tex_files:
                with open(tex_file, 'r', encoding='utf-8') as f:
                    content = f.read()
                    if r'\documentclass' in content:
                        main_file = tex_file
                        break
        
        # If still no main file found, use the first tex file
        if not main_file and tex_files:
            main_file = tex_files[0]
        
        if not main_file:
            raise RuntimeError("Could not identify main tex file")
        
        # Read and return the content
        with open(main_file, 'r', encoding='utf-8') as f:
            return f.read()
            
# Example usage:
# tex_content = get_arxiv_tex('2412.06264')
# tex_content = get_arxiv_tex('https://arxiv.org/abs/2412.06264')

In [3]:
# Usage cell: fetch the main tex source of arXiv paper 2412.06264 and pass it to
# clean_latex_file. get_arxiv_tex must already be defined in the kernel, so the
# cell that defines it has to be run before this one.
# clean_latex_file comes from the local `latex_converter` module, whose
# implementation is not shown in this notebook.
from latex_converter import clean_latex_file

#input_file = './maintext.tex'  # Replace with your .tex file path
# NOTE(review): despite its name, `input_file` holds the return value of
# get_arxiv_tex, which is documented as the *content* of the main tex file, not
# a file path — confirm that clean_latex_file accepts content rather than a path.
input_file = get_arxiv_tex('2412.06264')
clean_latex_file(input_file)

Cleaned file has been written to: output.txt
