In [1]:
import bs4
import requests
from pydantic import BaseModel
import pandas as pd
from arxivist.citations import CitationExtractor

pd.set_option('display.max_colwidth', 500)

In [2]:
example_url = "https://browse.arxiv.org/html/2210.14986v2"

citations_df = CitationExtractor.get_citations_df(example_url)
citations_df



  return bs4.BeautifulSoup(content)


Unnamed: 0,title,year,source_type,source,authors,ref_name
26,Resolving indirect referring expressions for entity selection.,2023,In Proceedings of the 61st Annual Meeting of the Association for\nComputational Linguistics (Volume 1: Long Papers),"In Proceedings of the 61st Annual Meeting of the Association for\nComputational Linguistics (Volume 1: Long Papers), pages 12313–12335,\nToronto, Canada. Association for Computational Linguistics.\n\n","Hosseini, M. J., Radlinski, F., Pareti, S., and Louis, A. (2023).","Hosseini et al., (2023)"
0,Boosting search engines with interactive agents.,2022,Transactions on Machine Learning Research.,Transactions on Machine Learning Research.\n\n,"Adolphs, L., Börschinger, B., Buck, C., Huebscher, M. C., Ciaramita, M.,\nEspeholt, L., Hofmann, T., Kilcher, Y., Rothe, S., Sessa, P. G., and\nSestorain, L. (2022).","Adolphs et al., (2022)"
21,Improving alignment of dialogue agents via targeted human judgements.,2022,,,"Glaese, A., McAleese, N., Trebacz, M., Aslanides, J., Firoiu, V., Ewalds, T.,\nRauh, M., Weidinger, L., Chadwick, M., Thacker, P., Campbell-Gillingham, L.,\nUesato, J., Huang, P.-S., Comanescu, R., Yang, F., See, A., Dathathri, S.,\nGreig, R., Chen, C., Fritz, D., Sanchez Elias, J., Green, R., Mokrá, S.,\nFernando, N., Wu, B., Foley, R., Young, S., Gabriel, I., Isaac, W., Mellor,\nJ., Hassabis, D., Kavukcuoglu, K., Hendricks, L. A., and Irving, G. (2022).","Glaese et al., (2022)"
70,Opt: Open pre-trained transformer language models.,2022,,,"Zhang, S., Roller, S., Goyal, N., Artetxe, M., Chen, M., Chen, S., Dewan, C.,\nDiab, M., Li, X., Lin, X. V., Mihaylov, T., Ott, M., Shleifer, S., Shuster,\nK., Simig, D., Koura, P. S., Sridhar, A., Wang, T., and Zettlemoyer, L.\n(2022).","Zhang et al., (2022)"
66,Finetuned language models are zero-shot learners.,2022,In International Conference on Learning Representations.,In International Conference on Learning Representations.\n\n,"Wei, J., Bosma, M., Zhao, V., Guu, K., Yu, A. W., Lester, B., Du, N., Dai,\nA. M., and Le, Q. V. (2022).","Wei et al., (2022)"
...,...,...,...,...,...,...
25,Logic and conversation.,1975,In Cole,"In Cole, P. and Morgan, J. L., editors, Syntax and Semantics:\nVol. 3: Speech Acts, pages 41–58. Academic Press, New York.\n\n","Grice, H. P. (1975).","Grice, (1975)"
68,Philosophical Investigations.,1953,Philosophical Investigations,Philosophical Investigations,"Wittgenstein, L. (1953).","Wittgenstein, (1953)"
67,Tractatus logico-philosophicus.,1921,London: Routledge,"London: Routledge, 1981.\n\n","Wittgenstein, L. (1921).","Wittgenstein, (1921)"
55,Prompt programming for large language models: Beyond the few-shot\nparadigm.,56,In Extended Abstracts of the 2021 CHI Conference on Human\nFactors in Computing Systems,"In Extended Abstracts of the 2021 CHI Conference on Human\nFactors in Computing Systems, CHI EA ’21, New York, NY, USA. Association for\nComputing Machinery.\n\n","Reynolds, L. and McDonell, K. (2021b).",(56)


In [2]:
class ArxivExtractor:

    @classmethod
    def get_arxiv_html(cls, url):
        content = requests.get(url).content
        return bs4.BeautifulSoup(content)

    @classmethod
    def extract_bibliography_elements(cls, parsed_html):
        ref_fragment = parsed_html.find("ul", {"class": "ltx_biblist"})
        return ref_fragment.find_all("li")

    @classmethod
    def get_bibliography_elements(cls, url):
        return cls.extract_bibliography_elements(cls.get_arxiv_html(url))


class BibliographyExtractor:

    @classmethod
    def bib_element_to_record(cls, element):
        element_items = [e.get_text() for e in element.find_all("span")]
        ref_name, authors_str, title = element_items[:3]
        if len(element_items) > 3:
            source = element_items[3]
        else:
            source = None
        return {"title": title, "ref_name": ref_name, "authors_str": authors_str, "source": source}

    @classmethod
    def get_citations(cls, url) -> List[ArXivCitation]:
        bib_records = [cls.bib_element_to_record(e) for e in ArxivExtractor.get_bibliography_elements(example_url)]
        return [ArXivCitation.create(**s) for s in bib_records]

    @classmethod
    def get_citations_df(cls, url, sort_by_year=True):
        df = pd.DataFrame([s.dict() for s in cls.get_citations(url)])
        if sort_by_year:
            return df.sort_values("year", ascending=False)
        else:
            return df

In [3]:
example_url = "https://browse.arxiv.org/html/2210.14986v2"

In [4]:
bib_elements = ArxivExtractor.get_bibliography_elements(example_url)

In [8]:
from typing import List, Optional
import re
from returns.maybe import Maybe

class ArXivCitation(BaseModel):
    title: str
    year: Optional[int]
    source_type: Optional[str]
    source: Optional[str]
    authors: str
    ref_name: str
    
    @classmethod
    def create(cls, title, ref_name, authors_str, source):
        year_result = re.findall(r"\d+", ref_name)
        if len(year_result) == 0:
            year = None
        else:
            year = year_result[0]

        source_type = cls.get_source_type(Maybe.from_optional(source))
        return ArXivCitation(
            title=title.strip(),
            ref_name=ref_name.strip(),
            authors=authors_str.strip(),
            year=year,
            source=source,
            source_type=source_type
        )

    @classmethod
    def get_source_type(cls, source: Maybe[str]):
        return source.map(lambda s: s.strip().split(", ")[0]).value_or(None)

In [10]:
citations_df = BibliographyExtractor.get_citations_df(example_url)
citations_df

/tmp/ipykernel_119831/2471463130.py:20: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  df = pd.DataFrame([s.dict() for s in cls.get_citations(url)])


Unnamed: 0,title,year,source_type,source,authors,ref_name
26,Resolving indirect referring expressions for entity selection.,2023,In Proceedings of the 61st Annual Meeting of the Association for\nComputational Linguistics (Volume 1: Long Papers),"In Proceedings of the 61st Annual Meeting of the Association for\nComputational Linguistics (Volume 1: Long Papers), pages 12313–12335,\nToronto, Canada. Association for Computational Linguistics.\n\n","Hosseini, M. J., Radlinski, F., Pareti, S., and Louis, A. (2023).","Hosseini et al., (2023)"
0,Boosting search engines with interactive agents.,2022,Transactions on Machine Learning Research.,Transactions on Machine Learning Research.\n\n,"Adolphs, L., Börschinger, B., Buck, C., Huebscher, M. C., Ciaramita, M.,\nEspeholt, L., Hofmann, T., Kilcher, Y., Rothe, S., Sessa, P. G., and\nSestorain, L. (2022).","Adolphs et al., (2022)"
21,Improving alignment of dialogue agents via targeted human judgements.,2022,,,"Glaese, A., McAleese, N., Trebacz, M., Aslanides, J., Firoiu, V., Ewalds, T.,\nRauh, M., Weidinger, L., Chadwick, M., Thacker, P., Campbell-Gillingham, L.,\nUesato, J., Huang, P.-S., Comanescu, R., Yang, F., See, A., Dathathri, S.,\nGreig, R., Chen, C., Fritz, D., Sanchez Elias, J., Green, R., Mokrá, S.,\nFernando, N., Wu, B., Foley, R., Young, S., Gabriel, I., Isaac, W., Mellor,\nJ., Hassabis, D., Kavukcuoglu, K., Hendricks, L. A., and Irving, G. (2022).","Glaese et al., (2022)"
70,Opt: Open pre-trained transformer language models.,2022,,,"Zhang, S., Roller, S., Goyal, N., Artetxe, M., Chen, M., Chen, S., Dewan, C.,\nDiab, M., Li, X., Lin, X. V., Mihaylov, T., Ott, M., Shleifer, S., Shuster,\nK., Simig, D., Koura, P. S., Sridhar, A., Wang, T., and Zettlemoyer, L.\n(2022).","Zhang et al., (2022)"
66,Finetuned language models are zero-shot learners.,2022,In International Conference on Learning Representations.,In International Conference on Learning Representations.\n\n,"Wei, J., Bosma, M., Zhao, V., Guu, K., Yu, A. W., Lester, B., Du, N., Dai,\nA. M., and Le, Q. V. (2022).","Wei et al., (2022)"
...,...,...,...,...,...,...
25,Logic and conversation.,1975,In Cole,"In Cole, P. and Morgan, J. L., editors, Syntax and Semantics:\nVol. 3: Speech Acts, pages 41–58. Academic Press, New York.\n\n","Grice, H. P. (1975).","Grice, (1975)"
68,Philosophical Investigations.,1953,Philosophical Investigations,Philosophical Investigations,"Wittgenstein, L. (1953).","Wittgenstein, (1953)"
67,Tractatus logico-philosophicus.,1921,London: Routledge,"London: Routledge, 1981.\n\n","Wittgenstein, L. (1921).","Wittgenstein, (1921)"
55,Prompt programming for large language models: Beyond the few-shot\nparadigm.,56,In Extended Abstracts of the 2021 CHI Conference on Human\nFactors in Computing Systems,"In Extended Abstracts of the 2021 CHI Conference on Human\nFactors in Computing Systems, CHI EA ’21, New York, NY, USA. Association for\nComputing Machinery.\n\n","Reynolds, L. and McDonell, K. (2021b).",(56)
