#### cheatsheet: paper datatype attributes

- paper.title: Title of the paper
- paper.authors: List of authors (each author has a .name attribute)
- paper.summary: Abstract/summary of the paper
- paper.published: Publication date
- paper.updated: Last update date
- paper.pdf_url: URL to download the PDF
- paper.entry_id: arXiv entry URL (e.g. `http://arxiv.org/abs/2411.19274v1`); the arXiv ID is the last path segment
- paper.primary_category: Main category
- paper.categories: All categories the paper belongs to
- paper.links: All related links
- paper.comment: Author comments (if any)
- paper.journal_ref: Journal reference (if published)
- paper.doi: Digital Object Identifier (if available)

In [1]:
%%capture
! pip install arxiv
! pip install networkx
! pip install PyPDF2
!pip install transformers
!pip install torch
import arxiv
import networkx as nx
import requests
import os
import re
import nltk
import PyPDF2
import requests
from transformers import pipeline

In [2]:
# Define the API call: a client plus the search specification that is
# handed to client.results() when the graph is built.
client = arxiv.Client()

search = arxiv.Search(
    query="artificial intelligence",
    max_results=100,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)


In [3]:
# Build nx graph: one node per paper returned by the search, keyed by the
# paper's entry_id and carrying its metadata as node attributes.
G = nx.DiGraph()

results = client.results(search)

for paper in results:
    G.add_node(
        paper.entry_id,
        title=paper.title,
        authors=paper.authors,
        summary=paper.summary,
        url=paper.pdf_url,
    )

In [4]:
def download_pdfs(timeout=30):
    """Download the PDF of every node in the global graph ``G``.

    Files are written to the ``arxiv_papers`` directory (relative to the
    current working directory) as ``<paper_id>.pdf``, where ``paper_id`` is
    the last ``/``-separated segment of the node key. Files that already
    exist are left untouched, so the function can be re-run safely.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block the loop
            indefinitely.

    Returns:
        None. Progress and failures are reported with ``print``; a failure
        for one paper does not stop the remaining downloads.
    """
    pdf_dir = "arxiv_papers"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(pdf_dir, exist_ok=True)

    for node in G.nodes():
        pdf_url = G.nodes[node]['url']
        # Use the last path segment of the node key as the filename.
        paper_id = node.split('/')[-1]
        pdf_path = os.path.join(pdf_dir, f"{paper_id}.pdf")

        # Skip papers that were downloaded by an earlier run.
        if os.path.exists(pdf_path):
            continue

        try:
            response = requests.get(pdf_url, timeout=timeout)
            if response.status_code == 200:
                with open(pdf_path, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded: {paper_id}")
            else:
                print(f"Failed to download {paper_id}: HTTP {response.status_code}")
        except Exception as e:
            # Deliberately best-effort: report and move on to the next paper.
            print(f"Error downloading {paper_id}: {str(e)}")

#download_pdfs()

In [5]:
# Fetch NLTK's 'punkt' resource.
# NOTE(review): no nltk call other than this download is visible in this part
# of the notebook — confirm the resource is still needed.
nltk.download('punkt')

# extract text from PDF using PyPDF2
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string with the text of every page, in page order and with
        no separator inserted between pages.

    Raises:
        OSError: If the file cannot be opened. Errors raised by PyPDF2 while
            parsing are propagated unchanged.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` keeps a page that yields no text (None or empty) from
        # breaking the concatenation; join avoids repeated string copies.
        return "".join(page.extract_text() or "" for page in reader.pages)


# find reference section
def extract_references(text):
    """Return everything that follows the final "References" word in ``text``.

    The word is matched as a whole word, ignoring case. When it does not
    occur at all, ``None`` is returned.
    """
    final_hit = None
    for final_hit in re.finditer(r'\bReferences\b', text, re.IGNORECASE):
        pass  # walk through all matches; only the last one is kept

    if final_hit is None:
        return None

    # Slice from the end of the last match to the end of the document.
    return text[final_hit.end():]

In [6]:
# Try out references extraction for first paper.
# The path is relative to the notebook's working directory and points into
# the "arxiv_papers" folder that download_pdfs() writes to, so the cell no
# longer depends on one user's absolute home-directory layout.
pdf_path = os.path.join("arxiv_papers", "2411.19274v1.pdf")
text = extract_text_from_pdf(pdf_path)
references = extract_references(text)
print(references)


1. Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin,
M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur,
M., Levenberg, J., Man´ e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B.,
Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi´ egas, F., Vinyals, O., Warden, P., Wattenberg,
M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015),
https://www.tensorflow.org/, software available from tensorflow.org
2. Alhaija, H., Mustikovela, S., Mescheder, L., Geiger, A., Rother, C.: Augmented reality meets computer vision:
Efficient data generation for urban driving scenes. International Journal of Computer Vision (IJCV) (2018)
3. AnandTech: Arm reveals cortex-a72 architecture details (2015), https://www.anandtech.com/show/9184/
arm-reveals-cortex-a72-architecture-det

### TITLE EXTRACTION APPROACH 1

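Regex-based heuristics over the references text (the helper is named `extract_titles_nltk`, but it only uses `re`)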

In [7]:
# TITLE EXTRACTION APPROACH 1: NLTK

# extract paper titles from reference section
def extract_titles_nltk(text):
    """Guess paper titles from a references section, one line at a time.

    Each non-blank line is tested against a sequence of regex heuristics.
    The first heuristic that matches decides the line: its capture is kept
    if it is longer than 10 characters, and no later heuristic is tried.

    Args:
        text: The references section as one string; lines are separated
            by ``'\\n'``. Falsy input (``None`` or ``""``) yields ``[]``.

    Returns:
        A list of cleaned candidate titles in order of appearance.
    """
    if not text:
        return []

    # Tried in order: text after a "(YYYY)" year, text after a colon,
    # text inside double quotes.
    primary_patterns = (
        r'\((?:19|20)\d{2}\)\s*[.,]?\s*([^.]+)',
        r':\s*([^.]+)',
        r'"([^"]+)"',
    )
    # Last resort: text after a period (assumes the authors come first).
    after_period_pattern = r'\.\s+([^.]+?)\s*(?:\(|$)'

    candidates = []
    for entry in text.split('\n'):
        if entry.strip() == '':
            continue

        hit = None
        for pattern in primary_patterns:
            hit = re.search(pattern, entry)
            if hit:
                break

        if hit:
            candidate = hit.group(1).strip()
            if len(candidate) > 10:  # minimum length to avoid fragments
                candidates.append(candidate)
            # A primary match settles this line even when it was too short.
            continue

        hit = re.search(after_period_pattern, entry)
        if hit is None:
            continue
        candidate = hit.group(1).strip()
        starts_with_year = re.search(r'^(19|20)\d{2}', candidate)
        if len(candidate) > 10 and not starts_with_year:
            candidates.append(candidate)

    # Normalise the collected candidates.
    cleaned_titles = []
    for candidate in candidates:
        # Drop one leading filler word such as "titled", "in" or "the".
        without_prefix = re.sub(
            r'^(titled|called|entitled|on|in|the)\s+',
            '',
            candidate,
            flags=re.IGNORECASE,
        )
        # Drop a single trailing separator.
        tidy = re.sub(r'[,;:]$', '', without_prefix.strip())
        if len(tidy) > 10:
            cleaned_titles.append(tidy)

    return cleaned_titles

In [8]:

# Run the regex-based extractor on the references text of the sample paper.
nltk_titles = extract_titles_nltk(references)
print(nltk_titles)

['Augmented reality meets computer vision', 'International Journal of Computer Vision', 'https://www', 'Learn the architecture - introducing neon', 'Achieving better category separability for hyperspectral image', 'A spatial–spectral approach', 'Hsi-drive: A dataset for the', '2021 IEEE Intelligent Vehicles', 'https://doi', 'A survey on terrain traversability analysis for autonomous ground vehicles: Methods, sensors, and challenges', 'Semantic object classes in video: A high-definition ground truth database', 'Segmentation and recognition using structure from motion', 'Computationally efficient target classification in multispectral', 'Target and Background Signatures II', 'Deeplab: Semantic image segmentation with', 'IEEE transactions on pattern analysis and', 'Spectral reflectance characterization of the road environment to optimize', '2019 IEEE Intelligent Transportation Systems Conference (ITSC)', 'Title Suppressed Due to Excessive Length 35', 'Proceedings of the IEEE conference on

### TITLE EXTRACTION APPROACH 2

Running a Llama model locally through the `transformers` text-generation pipeline

In [64]:
def extract_titles_with_llm(references_text, pipe=None):
    """Ask a locally run text-generation model to list the titles in a references section.

    Args:
        references_text: The references section as a string. Only the first
            2000 characters are placed in the prompt. Falsy input (``None``
            or ``""``) returns ``[]`` without loading a model.
        pipe: Optional callable with the interface of a ``transformers``
            text-generation pipeline. Pass one to reuse an already loaded
            model across calls; when omitted, a Llama-2 chat pipeline is
            created, which loads the model on every call.

    Returns:
        A list of stripped output lines longer than 10 characters.
    """
    # extract_references() can return None; slicing None would raise.
    if not references_text:
        return []

    # Initialize the pipeline only when the caller did not supply one.
    if pipe is None:
        pipe = pipeline(
            "text-generation",
            model="meta-llama/Llama-2-7b-chat-hf",  # or another suitable model
            token=True  # You'll need a HuggingFace token
        )

    # Craft the prompt
    prompt = """Extract the titles of the academic papers from the following references section. 
    Output only the titles, one per line.
    
    References:
    {references}
    
    Titles:""".format(references=references_text[:2000])  # Limit input length

    # Generate response. return_full_text=False makes the pipeline return
    # only the continuation; otherwise the prompt (instructions plus the
    # raw references) is echoed back and would be parsed as "titles".
    response = pipe(
        prompt,
        max_length=2048,
        temperature=0.1,  # Keep it focused
        num_return_sequences=1,
        return_full_text=False,
    )

    # One candidate title per output line; drop blanks and short fragments.
    generated_lines = response[0]['generated_text'].split('\n')
    titles = [t.strip() for t in generated_lines if len(t.strip()) > 10]

    return titles

In [66]:
# Run the local-pipeline extractor on the sample references.
# NOTE(review): the recorded output below shows the model download being
# interrupted (KeyboardInterrupt), so this cell has no verified result.
llm_titles = extract_titles_with_llm(references)
print(llm_titles)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

KeyboardInterrupt: 

### TITLE EXTRACTION APPROACH 3

Calling a Llama model through the Hugging Face Inference API

In [32]:
def extract_titles_with_llm_api(references_text):
    """Ask the hosted Llama-2 chat model to list the titles in a references section.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable. It must never be written into the notebook: a token that was
    committed in source has to be treated as leaked and revoked.

    Args:
        references_text: The references section as a string. Only the first
            3900 characters are placed in the prompt. Falsy input (``None``
            or ``""``) returns ``[]`` without any network call.

    Returns:
        A list of stripped output lines longer than 10 characters, or ``[]``
        when the token is missing, the request fails, or the response cannot
        be parsed (the reason is printed).
    """
    # extract_references() can return None; slicing None would raise.
    if not references_text:
        return []

    api_token = os.environ.get("HF_TOKEN")
    if not api_token:
        print("Error making API call: HF_TOKEN environment variable is not set")
        return []

    # API configuration for Llama-2
    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
    headers = {
        "Authorization": f"Bearer {api_token}"
    }

    # Craft the prompt using Llama-2's instruction format
    prompt = f"""[INST] You are a helpful assistant that extracts paper titles from academic references. Given the following references section, list ONLY the titles of the papers. Output one title per line. Do not include any other information like authors, years, or numbers.

    References:
    {references_text[:3900]}  

    [/INST]"""

    try:
        response = requests.post(
            API_URL,
            headers=headers,
            json={
                "inputs": prompt,
                "parameters": {
                    # NOTE(review): the text-generation task takes
                    # max_new_tokens rather than max_length — confirm the
                    # allowed upper bound against the current API docs.
                    "max_new_tokens": 512,
                    "temperature": 0.1,
                    "top_p": 0.95,
                    # Return only the continuation so the prompt lines are
                    # not parsed as titles.
                    "return_full_text": False
                }
            },
            timeout=60,  # do not hang the notebook on a stalled connection
        )
        response.raise_for_status()

        # Parse response
        output = response.json()[0]['generated_text']
        titles = [
            title.strip()
            for title in output.split('\n')
            if len(title.strip()) > 10
        ]

        return titles

    except requests.exceptions.RequestException as e:
        print(f"Error making API call: {e}")
        return []
    except (KeyError, IndexError) as e:
        print(f"Error parsing API response: {e}")
        return []

In [33]:
# Call the hosted-API extractor on the sample references.
# NOTE(review): the recorded run below returned HTTP 400 and an empty list.
titles = extract_titles_with_llm_api(references)
print(titles)


Error making API call: 400 Client Error: Bad Request for url: https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf
[]


In [24]:
# Show the raw references text that the extraction prompts are built from.
print(references)


1. Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin,
M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur,
M., Levenberg, J., Man´ e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B.,
Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi´ egas, F., Vinyals, O., Warden, P., Wattenberg,
M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015),
https://www.tensorflow.org/, software available from tensorflow.org
2. Alhaija, H., Mustikovela, S., Mescheder, L., Geiger, A., Rother, C.: Augmented reality meets computer vision:
Efficient data generation for urban driving scenes. International Journal of Computer Vision (IJCV) (2018)
3. AnandTech: Arm reveals cortex-a72 architecture details (2015), https://www.anandtech.com/show/9184/
arm-reveals-cortex-a72-architecture-det

In [31]:
# Build a plain instruction prompt from the first 10000 characters of the
# references text and print it for inspection.
prompt = """Extract the titles of academic papers from this references section. 
Output only the titles, one per line:

{references}""".format(references=references[:10000])

print(prompt)


Extract the titles of academic papers from this references section. 
Output only the titles, one per line:


1. Abadi, M., Agarwal, A., Barham, P., Brevdo, E., Chen, Z., Citro, C., Corrado, G.S., Davis, A., Dean, J., Devin,
M., Ghemawat, S., Goodfellow, I., Harp, A., Irving, G., Isard, M., Jia, Y., Jozefowicz, R., Kaiser, L., Kudlur,
M., Levenberg, J., Man´ e, D., Monga, R., Moore, S., Murray, D., Olah, C., Schuster, M., Shlens, J., Steiner, B.,
Sutskever, I., Talwar, K., Tucker, P., Vanhoucke, V., Vasudevan, V., Vi´ egas, F., Vinyals, O., Warden, P., Wattenberg,
M., Wicke, M., Yu, Y., Zheng, X.: TensorFlow: Large-scale machine learning on heterogeneous systems (2015),
https://www.tensorflow.org/, software available from tensorflow.org
2. Alhaija, H., Mustikovela, S., Mescheder, L., Geiger, A., Rother, C.: Augmented reality meets computer vision:
Efficient data generation for urban driving scenes. International Journal of Computer Vision (IJCV) (2018)
3. AnandTech: Arm reveals cortex-a