# Intro

## Description

This notebook contains the pipeline for querying the arXiv API and storing the results in CSV files.

## Imports

In [1]:
import pandas as pd
import numpy as np
import requests
import feedparser
from datetime import datetime

## Global Variables and Script Setup

In [2]:
from helper import MASTER_CSV_COLUMNS, DEFAULT_SEARCH_QUERY, ARXIV_KEYS

# Location of the arXiv API: the host prefix and the method name appended to it.
base_url = "https://export.arxiv.org/api/"
endpoint = "query"
# Full request URL read by the fetch functions defined later in this notebook.
url = f"{base_url}{endpoint}"

# Code

## Fetching Request

In [3]:
def fetch_initial_papers(query: str = DEFAULT_SEARCH_QUERY, verbose:int = 0) -> None:
    """
    Fetch the most recent papers for `query` and (re)create the master CSV.

    A single request is sent to the module-level `url`, sorted by submission
    date with the newest first. No `max_results` is sent, so the API's default
    page size applies (the recorded run of this notebook returned 10 entries).

    WARNING: ../data/arxiv.csv is overwritten, not appended to.

    Parameters
    ----------
    query : str
        Value sent as the `search_query` request parameter.
    verbose : int
        When 1, print one line per missing key; otherwise only the total
        number of missing keys is printed.

    Returns
    -------
    None
        Results are written to disk; progress and errors are printed.

    Raises
    ------
    requests.exceptions.RequestException
        Network failures (including the request timeout) are not caught here.
    """
    # Set request params
    params = {
        "search_query": query,
        "sortBy": 'submittedDate',
        "sortOrder": 'descending'
    }

    # Fetching request; the timeout keeps a stalled connection from hanging
    # the kernel indefinitely.
    response = requests.get(url, params=params, timeout=30)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return
    feed = feedparser.parse(response.content)
    print(f"Fetched {len(feed.entries)} entries.")

    # Parsing request: one row per paper, one cell per key in ARXIV_KEYS
    all_papers = []
    num_missing_keys = 0
    for paper in feed.entries:
        paper_data = []
        for key in ARXIV_KEYS:
            try:
                paper_data.append(paper[key])
            except KeyError:
                # A missing field becomes NaN so every row keeps the same width
                paper_data.append(np.nan)
                num_missing_keys += 1
                if verbose == 1:
                    # .get() so that a missing 'id' cannot raise inside the handler
                    print(f"{paper.get('id', '<unknown id>')} does not have {key} key")
        all_papers.append(paper_data)
    print(f"{num_missing_keys} missing keys.")

    # Saving data to csv (default write mode: replaces any existing file)
    df = pd.DataFrame(data=all_papers, columns=MASTER_CSV_COLUMNS)
    try:
        df.to_csv("../data/arxiv.csv", index=False)
    except OSError as err:
        # Report why the write failed instead of hiding the cause
        print(f"Failed to save... ({err})")
    else:
        print("Saved!")

    # Return
    return

In [4]:
def fetch_more_papers(query: str = DEFAULT_SEARCH_QUERY, verbose:int = 0, n:int = 50) -> None:
    """
    Fetch up to `n` further papers for `query` and append them to the master CSV.

    Results are requested newest-first by submission date, skipping as many
    results as there are rows already stored in ../data/arxiv.csv, so each
    call continues where the stored data ends.

    NOTE(review): this assumes the CSV holds the leading results of the same
    query in the same order. If new papers were submitted since the last
    fetch, the offset shifts and some rows may be fetched again; de-duplicate
    on the paper id downstream if that matters.

    Parameters
    ----------
    query : str
        Value sent as the `search_query` request parameter.
    verbose : int
        When 1, print one line per missing key; otherwise only the total
        number of missing keys is printed.
    n : int
        Value sent as `max_results`, i.e. the maximum number of papers to fetch.

    Returns
    -------
    None
        Results are appended to disk; progress and errors are printed.

    Raises
    ------
    FileNotFoundError
        If ../data/arxiv.csv does not exist yet (raised by `pd.read_csv`).
    requests.exceptions.RequestException
        Network failures (including the request timeout) are not caught here.
    """
    # The number of rows already stored is the offset of the next unseen result.
    # A header-only CSV simply yields an offset of 0.
    stored = pd.read_csv("../data/arxiv.csv")
    start = len(stored)

    # Set request params. Pagination in the arXiv query API is driven by
    # `start` (zero-based offset of the first returned result) together with
    # `max_results`; there is no date-based paging parameter.
    params = {
        "search_query": query,
        "start": start,
        "sortBy": 'submittedDate',  # relevance, lastUpdatedDate, submittedDate
        "max_results": n,
        "sortOrder": 'descending'
    }

    # Fetching request; the timeout keeps a stalled connection from hanging
    # the kernel indefinitely.
    response = requests.get(url, params=params, timeout=30)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return
    feed = feedparser.parse(response.content)
    print(f"Fetched {len(feed.entries)} entries.")

    # Parsing request: one row per paper, one cell per key in ARXIV_KEYS
    all_papers = []
    num_missing_keys = 0
    for paper in feed.entries:
        paper_data = []
        for key in ARXIV_KEYS:
            try:
                paper_data.append(paper[key])
            except KeyError:
                # A missing field becomes NaN so every row keeps the same width
                paper_data.append(np.nan)
                num_missing_keys += 1
                if verbose == 1:
                    # .get() so that a missing 'id' cannot raise inside the handler
                    print(f"{paper.get('id', '<unknown id>')} does not have {key} key")
        all_papers.append(paper_data)
    print(f"{num_missing_keys} missing keys.")

    # Saving data to csv: append without a header so the existing file stays
    # a single table
    df = pd.DataFrame(data=all_papers, columns=MASTER_CSV_COLUMNS)
    try:
        df.to_csv("../data/arxiv.csv", mode="a", index=False, header=False)
    except OSError as err:
        # Report why the write failed instead of hiding the cause
        print(f"Failed to save... ({err})")
    else:
        print("Saved!")

    # Return
    return

In [5]:
def fetch_papers(query: str = DEFAULT_SEARCH_QUERY, verbose:int = 0, n:int = 50) -> None:
    """
    Fetch `n` more papers by calling `fetch_more_papers` in batches of at most 10.

    `n` is split into `n // 10` full batches of 10 plus one final batch holding
    the remainder (skipped when the remainder is 0), so the batch sizes add up
    to exactly `n`.

    Parameters
    ----------
    query : str
        Search query forwarded to `fetch_more_papers`.
    verbose : int
        Verbosity flag forwarded to `fetch_more_papers`.
    n : int
        Total number of papers to request; must be positive.

    Returns
    -------
    None
        A message is printed and nothing is fetched when `n` is not positive.
    """
    if n <= 0:
        print("n cannot be negative or 0.")
        return

    # NOTE(review): arXiv's API terms reportedly ask for a pause between
    # consecutive requests — consider throttling this loop; confirm against
    # the current terms of use.
    batch_size = 10
    full_batches, remainder = divmod(n, batch_size)

    for _ in range(full_batches):
        # Pass the batch size explicitly; fetch_more_papers defaults to 50
        fetch_more_papers(query, verbose, batch_size)
    if remainder > 0:
        fetch_more_papers(query, verbose, remainder)
    print("Finished loops!")
    return

In [6]:
# WARNING: running this cell resets the database — fetch_initial_papers
# overwrites ../data/arxiv.csv with a fresh set of the most recent papers.
fetch_initial_papers(query=DEFAULT_SEARCH_QUERY, verbose=0)

Fetched 10 entries.
16 missing keys.
Saved!


In [7]:
# Append a further batch of up to 1000 papers to the existing CSV.
fetch_more_papers(query=DEFAULT_SEARCH_QUERY, verbose=0, n=1000)

Fetched 1000 entries.
1698 missing keys.
Saved!
