In [None]:
import os
from doi2pdf import doi2pdf
import string
from dotenv import load_dotenv
import requests
from app.v1.utils.constants import APIConstants


class ExtractResearchArticles:
    """Search Crossref for articles, keep the open-access ones, download their PDFs.

    Configuration is read from environment variables (after ``load_dotenv()``):

    * ``CROSSREF_BASE_URL`` - URL queried by ``get_dois_from_crossref``.
    * ``UNPAYWALL_BASE_URL`` - prefix of the per-DOI Unpaywall lookup URL.
    * ``EMAIL`` - inserted into the User-Agent header and the Unpaywall URL.
    * ``JOURNAL_ARTICLE_DIRECTORY`` - directory the PDF files are written to.
    """

    # Environment variables that must be set (and non-empty) before use.
    REQUIRED_ENV_VARS = (
        "CROSSREF_BASE_URL",
        "EMAIL",
        "JOURNAL_ARTICLE_DIRECTORY",
        "UNPAYWALL_BASE_URL",
    )

    # Number of leading title characters used to build a PDF file name.
    MAX_TITLE_LENGTH = 40

    def __init__(self, max_articles=10, request_timeout=30):
        """Load the environment configuration.

        Args:
            max_articles: Number of rows requested from Crossref per query.
            request_timeout: Seconds to wait on each HTTP request before
                giving up, so a stalled server cannot hang the caller.

        Raises:
            Exception: If any required environment variable is missing.
        """
        self.max_articles = max_articles
        self.request_timeout = request_timeout

        load_dotenv()

        # Validation lives in one place (_configure_from_env) so the list of
        # required variables cannot drift between two copies.
        self._configure_from_env()

    def _configure_from_env(self):
        """Validate the required environment variables and copy them onto self.

        Raises:
            Exception: Listing every required variable that is unset or empty.
        """
        missing_vars = [var for var in self.REQUIRED_ENV_VARS if not os.getenv(var)]

        if missing_vars:
            raise Exception(
                f"Missing required environment variables: {', '.join(missing_vars)}"
            )

        self.email = os.getenv("EMAIL")
        self.crossref_base_url = os.getenv("CROSSREF_BASE_URL")
        self.unpaywall_base_url = os.getenv("UNPAYWALL_BASE_URL")
        self.output_directory = os.getenv("JOURNAL_ARTICLE_DIRECTORY")

        self.user_agent = APIConstants.USER_AGENT_TEMPLATE.format(email=self.email)

    def _get_crossref_headers(self):
        """Return headers for Crossref API requests"""
        return {"User-Agent": self.user_agent}

    def _get_crossref_params(self, query):
        """Return standardized parameters for Crossref API"""
        return {
            "query": query,
            "filter": APIConstants.CROSSREF_FILTER,
            "rows": self.max_articles,
        }

    def _get_unpaywall_url(self, doi):
        """Generate the appropriate Unpaywall URL for a given DOI"""
        return f"{self.unpaywall_base_url}/{doi}?email={self.email}"

    @staticmethod
    def _parse_crossref_item(item):
        """Convert one Crossref result item into the article dict used downstream.

        Only ``DOI`` is required. Every other field is optional in the input,
        so a missing one becomes ``None`` (or an empty list for ``title`` and
        ``author``) instead of raising ``KeyError``.
        """
        published = item.get("published") or {}
        date_parts = published.get("date-parts") or []
        # "date-parts" is indexed as a nested list; the year is its first value.
        year = date_parts[0][0] if date_parts and date_parts[0] else None

        return {
            "doi": item["DOI"],
            "title": item.get("title") or [],
            "author": item.get("author") or [],
            "year_published": year,
            "url": item.get("URL"),
            "abstract": item.get("abstract"),
        }

    def get_dois_from_crossref(self, query):
        """Query Crossref and return one article dict per result that has a DOI.

        Args:
            query: Free-text search string sent as the ``query`` parameter.

        Returns:
            list[dict]: Dicts with the keys ``doi``, ``title``, ``author``,
            ``year_published``, ``url`` and ``abstract``.

        Raises:
            Exception: If the request fails, times out or returns an HTTP
                error status.
        """
        params = self._get_crossref_params(query)
        headers = self._get_crossref_headers()

        try:
            response = requests.get(
                self.crossref_base_url,
                params=params,
                headers=headers,
                timeout=self.request_timeout,
            )
            # Turn 4xx/5xx responses into errors instead of parsing an error
            # payload as if it were a result list.
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as e:
            raise Exception(f"Error fetching DOIs: {e}") from e

        items = (data.get("message") or {}).get("items") or []

        # Items without a DOI cannot be looked up or downloaded later, so skip them.
        return [self._parse_crossref_item(item) for item in items if item.get("DOI")]

    def check_for_open_access(self, article_list):
        """Return the articles that Unpaywall reports as open access.

        Lookups are best effort: a failed or non-200 lookup for one DOI skips
        that article instead of aborting the whole batch.

        Args:
            article_list: Article dicts, each with a ``doi`` key.

        Returns:
            list[dict]: The subset of ``article_list`` whose Unpaywall record
            has a truthy ``is_oa`` field.

        Raises:
            Exception: If no article is open access. The last lookup error,
                if any, is attached as the cause.
        """
        open_article_list = []
        last_error = None

        for article in article_list:
            url = self._get_unpaywall_url(article["doi"])

            try:
                response = requests.get(url, timeout=self.request_timeout)
                if response.status_code != 200:
                    continue
                data = response.json()
            except (requests.exceptions.RequestException, ValueError) as error:
                last_error = error
                continue

            if isinstance(data, dict) and data.get("is_oa"):
                open_article_list.append(article)

        if not open_article_list:
            raise Exception("No open-access articles found.") from last_error

        return open_article_list

    @staticmethod
    def create_file_name(title):
        """Build a PDF file name from a title.

        Punctuation is removed, the text is lower-cased and spaces become
        underscores, e.g. ``"Hello, World!"`` -> ``"hello_world.pdf"``.
        """
        remove_punctuation = str.maketrans("", "", string.punctuation)

        return title.translate(remove_punctuation).lower().replace(" ", "_") + ".pdf"

    @classmethod
    def _short_title(cls, article):
        """Return the text used to name an article's PDF file.

        Uses the first ``MAX_TITLE_LENGTH`` characters of the first title.
        Falls back to the DOI when the article has no title.
        """
        raw_title = article.get("title") or ""
        if not isinstance(raw_title, str):
            # The title is normally a list of strings; use the first entry.
            raw_title = raw_title[0] if raw_title else ""

        return raw_title[: cls.MAX_TITLE_LENGTH] or article["doi"]

    def download_papers(self, open_article_list):
        """Download a PDF for each article into the output directory.

        Each article dict is updated in place with ``file_name`` and
        ``output_path``. A failed download skips that article instead of
        aborting the remaining ones.

        Args:
            open_article_list: Article dicts with ``doi`` and ``title`` keys.

        Returns:
            list[dict]: The articles whose PDF exists on disk afterwards.

        Raises:
            Exception: If no PDF was downloaded. The last download error,
                if any, is attached as the cause.
        """
        # Create the output directory itself. dirname() would only create its
        # parent and fails outright for a bare directory name.
        os.makedirs(self.output_directory, exist_ok=True)

        exported_articles = []
        last_error = None

        for article in open_article_list:
            doi = article["doi"]
            file_name = self.create_file_name(self._short_title(article))
            output_path = os.path.join(self.output_directory, file_name)
            article["file_name"] = file_name
            article["output_path"] = output_path

            try:
                doi2pdf(doi, output=output_path)
            except Exception as error:
                # Deliberately broad: the downloader's own error types are not
                # imported here. The error is kept and chained below.
                last_error = error
                continue

            if os.path.exists(output_path):
                exported_articles.append(article)

        if not exported_articles:
            raise Exception("No articles downloaded.") from last_error

        return exported_articles

    def search_and_download_open_papers(self, query):
        """Run the full pipeline: search, open-access filter, download.

        This is a regular synchronous method; call it without ``await``.

        Args:
            query: Free-text search string passed to Crossref.

        Returns:
            list[dict]: The article dicts whose PDFs were downloaded.

        Raises:
            Exception: If the search fails, nothing is open access, or
                nothing could be downloaded.
        """
        article_list = self.get_dois_from_crossref(query)
        open_article_list = self.check_for_open_access(article_list)

        return self.download_papers(open_article_list)

In [None]:
extract_article_cls = ExtractResearchArticles(max_articles=10)

# search_and_download_open_papers is a regular synchronous method that returns
# a list, so it is called directly; awaiting its result raises TypeError.
article_result = extract_article_cls.search_and_download_open_papers(
    "multi-agent workflows"
)