## Database setup

Connect to the database, extract the necessary data and close the connection.


In [4]:
import psycopg2

conn = psycopg2.connect("dbname=hn user=julien")
cur = conn.cursor()

cur.execute("""
SELECT
	hn_post.id,
	hn_post.url
FROM
	hn_post
	LEFT JOIN hn_article ON hn_post.id = hn_article.id -- We do a left join to get all posts
WHERE
	hn_article IS NULL -- only get posts that have not been processed yet
	AND hn_post.score > 100 -- only get posts with a score higher than 700
    AND (hn_post.url IS NOT NULL); -- only get posts with a url
""")

posts = cur.fetchall()

print("Number of posts to process: {}".format(len(posts)))

cur.close()

Number of posts to process: 53


## Text extraction
Using the Diffbot API, extract the text from the URLs and save it in the database.

In [5]:
from os import getenv
import aiohttp
from dotenv import load_dotenv
from urllib.parse import urlparse, quote
from fitz import open as open_pdf
from yarl import URL


load_dotenv(".env", override=True)

requests = aiohttp.ClientSession()


async def get_text_Article(url: str) -> (str, str, str):
    """
    Fetch the text of an article using Diffbot's Article API.

    Args:
        url (str): The URL of the article.

    Returns:
        (str, str, str) : title, language, text
    """

    """ params = {
        "token": getenv("DIFFBOT_API_KEY"),
        "url": url,
    } """
    # params = "token={}&url={}".format(getenv("DIFFBOT_API_KEY"), quote(url, safe=":/"))
    params = "?url={}&token={}".format(quote(url, safe=""), getenv("DIFFBOT_API_KEY"))

    headers = {
        "Accept": "application/json",
    }
    """ 
    Two hours of work to find out that the URL was encoded twice.
    Diffbot doesn't accept / in the params. But aiohttp leaves them unencoded.
    And any attempts to pre encode the URL with quote() would result to escaping twice the url.
     
    """

    url = URL("https://api.diffbot.com/v3/article"+params, encoded=True)

    response = await requests.get(
        url, headers=headers)
    

    # We check if the request was successful.
    if (response.status != 200):
        raise Exception("Error while fetching the text of the article. Status code: {}".format(
            response.status))

    data = (await response.json())

    if "error" in data:
        raise Exception("Error {} while fetching the text of the article: {}".format(data["errorCode"],
            data["error"]))

    data = data["objects"][0]

    # We check if the text is returned by the API.
    if ("text" not in data):
        raise Exception("Error no text for the article: {}".format(
            data["error"]))

    return data["title"], data["humanLanguage"], data["text"]


def set_null_in_database(id: int):
    """
    Set the text of the article to NULL in the database.

    Args:
        id (int): The id of the article.
    """
    try:
        cur = conn.cursor()

        cur.execute("""
        INSERT INTO hn_article (id, title, language, text) VALUES (%s, NULL, NULL, NULL);""", (id,))

        conn.commit()

        cur.close()
    except Exception as e:
        print(e)
        conn.rollback()


async def addArticleToDatabase(params: (int, str)):
    """

    Args:
        params (int, str): The id and the url of the article.
    """

    id, url = params

    parsed = urlparse(url)

    # These websites are not supported by Diffbot.
    if parsed.hostname == "www.youtube.com" or parsed.hostname == "youtu.be" or parsed.hostname == "youtube.com" or parsed.hostname == "twitter.com" or parsed.hostname == "x.com":
        set_null_in_database(id)
        print("Unsupported website ({})".format(id))

    if parsed.hostname == "arxiv.org":
        documentCode = url.split("/")[-1]
        url = "https://arxiv.org/pdf/{}.pdf".format(documentCode)

    try:
        title, language, text = await get_text_Article(url)
        if text == "":
            raise Exception("Empty text for article {}".format(id))
    except Exception as e:
        print("Error", id, str(e), url)
        set_null_in_database(id)
        return

    cur = conn.cursor()
    try:
        cur.execute("""
        INSERT INTO hn_article (id, title, language, text)
        VALUES (%s, %s, %s, %s);""", (id, title, language, text))
    except Exception as e:
        conn.rollback()

    conn.commit()

    cur.close()

    print("Success ({})".format(id))
    return

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x1055521d0>


### Run the extraction

In [6]:
import aiometer

await aiometer.run_on_each(addArticleToDatabase, posts, max_per_second=5)


Success (38968269)
Success (38968550)
Success (38965509)
Success (38967262)
Success (38965306)
Success (38977128)
Success (38971966)
Unsupported website (38966306)
Success (38966601)
Success (38966306)
Success (38975453)
Success (38971178)
Success (38964675)
Success (38969985)
Success (38972735)
Success (38969348)
Success (38976955)
Success (38965310)
Success (38966145)
Success (38975204)
Success (38969114)
Success (38975226)
Success (38969759)
Success (38966875)
Success (38969461)
Success (38977692)
Success (38978705)
Success (38972362)
Success (38963678)
Success (38964958)
Success (38964983)
Error 38971012 Error 500 while fetching the text of the article: Error processing page. https://www.amazon.com/fulfill-request-respectful-information-users-Brown/dp/B0CM82FJL2
Success (38976254)
Success (38968619)
Success (38971221)
Success (38978665)
Unsupported website (38974802)
Success (38972358)
Success (38965003)
Success (38978289)
Unsupported website (38968359)
Success (38969893)
Success (