In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
with open("../arxiv-public-datasets/pdf.txt") as f:
    pdf = f.read()

In [20]:
import re
from typing import List
import difflib


def f(k):
    return k**0.5


def rogue_lcs(X, Y, weighted=True):
    m, n = len(X), len(Y)

    # Initialize the c-table
    c_table = [[0]*(n+1) for i in range(m+1)]
    # Initialize the w-table
    w_table = [[0]*(n+1) for i in range(m+1)]

    for i in range(m+1):
        for j in range(n+1):
            if i == 0 or j == 0:
                continue
            # The length of consecutive matches at
            # position i-1 and j-1
            elif X[i-1] == Y[j-1]:
                # Increment would be +1 for normal LCS
                k = w_table[i-1][j-1]
                increment = f(k+1) - f(k) if weighted else 1
                # Add the increment
                c_table[i][j] = c_table[i-1][j-1] + increment
                w_table[i][j] = k + 1
            else:
                if c_table[i-1][j] > c_table[i][j-1]:
                    c_table[i][j] = c_table[i-1][j]
                    w_table[i][j] = 0  # no match at i,j
                else:
                    c_table[i][j] = c_table[i][j-1]
                    w_table[i][j] = 0  # no match at i,j
    return c_table[m][n]

def find_references(text: str, fig_num: int, caption_blacklist: str, window_size: int = 200) -> List[str]:
    """
    Finds references to Figure (fig_num) in a provided text corpus.

    Parameters
    ----------
    text : str
        Text corpus
    fig_num : int
        Figure number to look for
    window_size : int
        Number of characters before and after the match to include in the
        context window
    """
    contexts = []
    regex = rf"(?i)((?:Fig\.?|Figure|Table|Tab\.?) {fig_num})"
    for match in re.finditer(regex, text):
        start, end = match.span()
        window = text[start - window_size:end + window_size]

        lcs_length = rogue_lcs(window, caption_blacklist, weighted=False)
        if lcs_length <= 0.8 * len(caption_blacklist):
            contexts.append(window)
        else:
            print("Skipping", window, "due to LCS=", lcs_length)
            pass

    return contexts

import pprint

pprint.pprint(find_references(pdf, 1, "FIG. 1: (a) T-dependence of the resistance of samples grown at PO2 = 10"))

Skipping ng as a function of the distance from the
film/substrate interface) would lead, in the PMR configuration, to a superposition of different frequencies and to
a broadening or blurring of the spectrum.

FIG. 1: (a) T-dependence of the resistance of samples grown
at PO2 = 10−6 - 10−3 mbar; grey symbols correspond to the
sample grown at 10−6 mbar after removing the LAO film by
mechanical polishing (see text) due to LCS= 70
Skipping  grey symbols correspond to the
sample grown at 10−6 mbar after removing the LAO film by
mechanical polishing (see text) and (b) dependence of the
mobility at 4K µH,4K on the deposition pressure.

In Fig. 1 we present the dependence on PO2 of the
transport properties of LAO/STO samples with thickness t = 20 nm. The temperature (T) dependence of the
resistance and mobility of our LAO/STO samples (Fig.
1a due to LCS= 57
['ur LAO/STO samples (Fig.\n'
 '1a) grown at low pressure (PO2 < 10−5 mbar) are similar\n'
 'to those reported in other works [2, 3, 4, 5,

In [5]:
# Converter
from pathlib import Path
import json

root = Path("../scicap_data")
scicap_metadata = Path("../scicap_data/SciCap-Caption-All/train")
text_dir = Path("/data/kevin/arxiv/fulltext")
references_dir = root / "references"

references_dir.mkdir(exist_ok=True)

for json_file in scicap_metadata.iterdir():
    with open(json_file) as f:
        metadata = json.load(f)
    
    paper_id = metadata["paper-ID"]
    fig_num = re.search(r"Figure(\d+)", metadata["figure-ID"]).group(1)

    print("Processing Figure", fig_num, "for", paper_id, metadata["figure-ID"])

    with open(text_dir / (paper_id + ".txt")) as f:
        text = f.read()
    
    # Find all references
    references = find_references(text, metadata["figure-number"])
    
    # Write to file
    with open(root / "references" / json_file.name, "w") as f:
        json.dump({"references": references}, f)

Processing Figure 15 for 1207.3749v1 1207.3749v1-Figure15-1.png


FileNotFoundError: [Errno 2] No such file or directory: '/data/kevin/arxiv/fulltext/1207.3749v1.txt'