<h1>IMFDB Connector</h1>
<h3>Python notebook for populating a PostgreSQL database with article data from the Internet Movie Firearms Database (IMFDB) wiki</h3>

In [103]:
# Imports
import psycopg2
import requests
import os

In [104]:
# Establish a connection to the "imfdb" PostgreSQL database on localhost.
# The password is read from the PG_IMFDB_PASSWORD environment variable so that
# no credential is stored in the notebook; if the variable is unset, None is
# passed as the password.
cnx = psycopg2.connect(
    host="localhost",
    user="imfdb",
    password=os.environ.get("PG_IMFDB_PASSWORD"),
    dbname="imfdb",  # psycopg2 documents "database" as a deprecated alias of "dbname"
)

# Create the module-level cursor that populate_actors_table() executes its
# INSERT statements on
cursor = cnx.cursor()

In [105]:
def api_request(url, timeout=30):
    """Send a GET request to ``url`` and return the decoded JSON body.

    Args:
        url: Full URL of the API endpoint, including any query string.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang the notebook.

    Returns:
        The parsed JSON payload, or ``None`` when the request could not be
        completed, the status code was not 200, or the body was not valid
        JSON. Every failure is reported as an ``ERROR:`` line on stdout.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts, malformed URLs, ...
        print(f"ERROR: api_request(): Request failed: {exc}")
        return None

    # Anything other than 200 is treated as a failure
    if response.status_code != 200:
        print(f"ERROR: api_request(): Request failed with status code: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # The JSON decoding errors raised by requests are ValueError subclasses
        print(f"ERROR: api_request(): Response is not valid JSON: {exc}")
        return None

In [106]:
def parse_page_by_id(pageid, prop, format):
    """Call the IMFDB ``action=parse`` endpoint for a single page.

    Example: parse_page_by_id("215875", "text", "json") requests the "text"
    property of page 215875 as JSON.

    Args:
        pageid: Id of the wiki page to parse.
        prop: Value of the ``prop`` query parameter (e.g. "text").
        format: Value of the ``format`` query parameter (e.g. "json").

    Returns:
        Whatever api_request() returned for the request, or None (after
        printing an error line) when api_request() returned None.
    """
    endpoint = (
        "https://www.imfdb.org/api.php"
        f"?action=parse&pageid={pageid}&prop={prop}&format={format}"
    )
    payload = api_request(endpoint)

    # Happy path first: hand the payload straight back to the caller
    if payload is not None:
        return payload

    print("ERROR: parse_page_by_id(): Data is None!")
    return None

In [107]:
def get_page_text_by_id(pageid):
    """Return the parsed text of the wiki page with the given id.

    The text is read from ``data["parse"]["text"]["*"]`` of the JSON payload
    returned by parse_page_by_id(pageid, "text", "json").

    Args:
        pageid: Id of the wiki page.

    Returns:
        The page text as a string, or None (after printing an error line)
        when the page could not be fetched or the payload does not contain
        the expected ``parse`` -> ``text`` -> ``*`` entry.
    """
    data = parse_page_by_id(pageid, "text", "json")

    # parse_page_by_id() returns None when the request failed; subscripting
    # None would raise an unhelpful TypeError
    if data is None:
        print(f"ERROR: get_page_text_by_id(): No data for page id {pageid}")
        return None

    try:
        return str(data["parse"]["text"]["*"])
    except (KeyError, TypeError):
        # The payload has a different shape than expected
        print(f"ERROR: get_page_text_by_id(): No page text in response for page id {pageid}")
        return None

In [108]:
def query_categorymembers(cmtitle, format, follow_continuation=False):
    """Query the members of a wiki category through the IMFDB API.

    Example: query_categorymembers("Category:Actor", "json") queries actor
    pages as JSON.

    Args:
        cmtitle: Title of the category, e.g. "Category:Actor".
        format: Value of the ``format`` query parameter. api_request() decodes
            the response as JSON, so this should be "json".
        follow_continuation: When False (the default) only the first batch
            returned by the API is collected. When True, further batches are
            requested for as long as the response contains a "continue"
            entry, whose key/value pairs are sent back as extra query
            parameters.

    Returns:
        A list with one entry per category member (each entry is the member
        object from the response and has at least a "title" key), or None
        (after printing an error line) when any request failed or a response
        did not contain ``query`` -> ``categorymembers``.
    """
    base_url = f"https://www.imfdb.org/api.php?action=query&list=categorymembers&cmtitle={cmtitle}&format={format}"

    # Initialize list
    categorymembers = []

    # Extra query parameters for the next batch; empty for the first request
    continuation = ""

    while True:
        data = api_request(base_url + continuation)

        # Error Handling
        if data is None:
            print("ERROR: query_categorymembers(): Data is None!")
            return None

        try:
            batch = data["query"]["categorymembers"]
        except (KeyError, TypeError):
            # The payload has a different shape than expected
            print("ERROR: query_categorymembers(): Response contains no category members!")
            return None

        # Loop through the current batch of category members
        for member in batch:
            print(f"Adding {member['title']}")
            categorymembers.append(member)

        # Stop after the first batch unless asked to continue, and stop once
        # the API no longer reports that more results are available
        if not follow_continuation or "continue" not in data:
            break

        continuation = "".join(
            f"&{key}={value}" for key, value in data["continue"].items()
        )

    return categorymembers

In [109]:
def populate_actors_table():
    """Insert one row per member of "Category:Actor" into the actors table.

    For every member returned by query_categorymembers() the page text is
    fetched with get_page_text_by_id() and a row (actorurl, actorpageid,
    actorpagecontent, actorname) is inserted through the module-level
    ``cursor``. All inserts are committed together on the module-level ``cnx``.

    If the list of actors cannot be fetched nothing is inserted. If anything
    fails while inserting, the transaction is rolled back before the exception
    is re-raised, so the connection is not left in a failed transaction.
    """
    actors = query_categorymembers("Category:Actor", "json")

    # query_categorymembers() returns None when the API request failed
    if actors is None:
        print("ERROR: populate_actors_table(): Could not fetch actors, nothing inserted!")
        return

    statement = "INSERT INTO actors (actorurl, actorpageid, actorpagecontent, actorname) VALUES (%s, %s, %s, %s)"

    try:
        for actor in actors:
            actorpageid = str(actor['pageid'])
            actorurl = f"https://www.imfdb.org/index.php?curid={actorpageid}"
            actorpagecontent = get_page_text_by_id(actorpageid)
            actorname = str(actor['title'])

            # Defensive: do not store a row without page content
            if actorpagecontent is None:
                print(f"ERROR: populate_actors_table(): No page content for {actorname}, skipping!")
                continue

            cursor.execute(statement, (actorurl, actorpageid, actorpagecontent, actorname))
    except Exception:
        # Discard the partial batch so the connection stays usable
        cnx.rollback()
        raise

    cnx.commit()


In [110]:
### Entry point: run the import ###

# Query the members of "Category:Actor" from the IMFDB API and insert one row
# per returned actor into the actors table.
populate_actors_table()

# Manual checks of the individual API helpers (uncomment to run):
#query_categorymembers("Category:Actor", "json")
#parse_page_by_id("215875", "text", "json")

Adding "Weird Al" Yankovic
Adding A Martinez
Adding A. Russell Andrews
Adding A.J. Buckley
Adding A.J. Cook
Adding A.J. Langer
Adding Aaliyah
Adding Aamir Khan
Adding Aare Laanemets
Adding Aarne Üksküla


In [111]:
# Close the cursor, then the database connection it was created from.
# Run this cell last: the functions above use the module-level cursor and cnx.
cursor.close()
cnx.close()