<h1>IMFDB Connector</h1>
<h3>Python script for populating a PostgreSQL database with article data from the Internet Movie Firearms Database (IMFDB) wiki</h3>

In [2063]:
# Imports
import psycopg2
import requests
import os
import re
from bs4 import BeautifulSoup

In [2064]:
# Establish a connection to the "imfdb" PostgreSQL database on localhost.
# The password is read from the PG_IMFDB_PASSWORD environment variable;
# os.environ.get() yields None when that variable is not set.
cnx = psycopg2.connect(
    host="localhost",
    user="imfdb",
    password=os.environ.get("PG_IMFDB_PASSWORD"),
    database="imfdb"
)

# Create a cursor object. The functions defined further down use this
# module-level `cursor` for their statements and `cnx` for commits.
cursor = cnx.cursor()

<h3> MediaWiki API related functions</h3>
These functions query the MediaWiki (MW) API for article data and populate the Postgres DB with the minimum data needed for further processing.

In [2065]:
def api_request(url, timeout=30):
    """Send a GET request to an API endpoint and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Fully assembled request URL.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so a stalled server
        cannot block the notebook forever. Defaults to 30.

    Returns
    -------
    The decoded JSON document when the server answers with HTTP 200.
    ``None`` (after printing an error line) when the request cannot be
    completed, the status code is not 200, or the body is not valid JSON.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Connection errors and timeouts follow the same "print + None"
        # contract as a bad status code, so callers need only one check.
        print(f"ERROR: api_request(): Request failed: {error}")
        return None

    # Check if the request was successful
    if response.status_code != 200:
        print(f"ERROR: api_request(): Request failed with status code: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as error:
        # A 200 response can still carry a non-JSON body (e.g. an HTML error page).
        print(f"ERROR: api_request(): Response is not valid JSON: {error}")
        return None

In [2066]:
def parse_page_by_id(pageid, prop, format):
    """Call the wiki's ``action=parse`` API for one page.

    Example: parse_page_by_id("215875", "text", "json") parses the wiki text
    of the Weird Al Yankovic page as JSON.

    Parameters
    ----------
    pageid : value interpolated into the ``pageid`` query parameter.
    prop : value interpolated into the ``prop`` query parameter.
    format : value interpolated into the ``format`` query parameter.

    Returns
    -------
    Whatever ``api_request`` returns for the assembled URL, or ``None``
    (after printing an error) when ``api_request`` returned ``None``.
    """
    endpoint = (
        "https://www.imfdb.org/api.php"
        f"?action=parse&pageid={pageid}&prop={prop}&format={format}"
    )
    parsed = api_request(endpoint)

    if parsed is not None:
        return parsed

    # Error Handling
    print("ERROR: parse_page_by_id(): Data is None!")
    return None

In [2067]:
def get_page_text_by_id(pageid, timeout=30):
    """Download the rendered HTML of a wiki page via ``index.php?curid=``.

    Parameters
    ----------
    pageid : value interpolated into the ``curid`` query parameter.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without it a stalled
        connection would hang the whole scrape. Defaults to 30.

    Returns
    -------
    str
        The response body as text. The HTTP status code is not inspected,
        so an error page is returned like any other page.
    """
    response = requests.get(f"https://www.imfdb.org/index.php?curid={pageid}", timeout=timeout)
    return response.text
# We don't use the API anymore for this due to issues with the HTML it responds with
#def get_page_text_by_id(pageid):
#    data = parse_page_by_id(pageid, "text", "json")
#    return str(data["parse"]["text"]["*"])

In [2068]:
def query_categorymembers(cmtitle, format):
    """Collect every member of a wiki category, following API continuation.

    Example: query_categorymembers("Category:Actor", "json") queries all
    actor pages as JSON.

    Parameters
    ----------
    cmtitle : value interpolated into the ``cmtitle`` query parameter.
    format : value interpolated into the ``format`` query parameter.

    Returns
    -------
    list
        The ``data["query"]["categorymembers"]`` entries of all batches, in
        the order they were received.
    None
        When any batch (first or continuation) could not be fetched; an
        error line is printed in that case.
    """
    base_url = f"https://www.imfdb.org/api.php?action=query&list=categorymembers&cmtitle={cmtitle}&format={format}"

    categorymembers = []
    # Continuation token of the batch about to be requested; None for the
    # first batch. It is kept in its own variable so that it is still
    # available for the error message when the request for it fails.
    cmcontinue = None

    while True:
        if cmcontinue is None:
            data = api_request(base_url)
        else:
            data = api_request(f"{base_url}&cmcontinue={cmcontinue}")

        # Error Handling
        if data is None:
            if cmcontinue is None:
                print("ERROR: query_categorymembers(): Data is None!")
            else:
                print(f"ERROR: query_categorymembers(): Data is None in continuation batch {cmcontinue}")
            return None

        for member in data["query"]["categorymembers"]:
            print(f"Adding {member['title']}")
            categorymembers.append(member)

        # Continue fetching while there is something to be fetched
        if "continue" not in data:
            return categorymembers
        cmcontinue = data["continue"]["cmcontinue"]

In [2069]:
def populate_actors_table():
    """Insert one row per member of "Category:Actor" into the actors table.

    For every category member whose title does not contain "Category:", the
    page HTML is downloaded with ``get_page_text_by_id`` and stored together
    with the page URL, page id and title. All inserts are committed once at
    the end. If the member list could not be retrieved, an error is printed
    and nothing is inserted or committed.
    """
    actors = query_categorymembers("Category:Actor", "json")
    if actors is None:
        # query_categorymembers() reports failure by returning None.
        print("ERROR: populate_actors_table(): No category members were returned!")
        return

    statement = "INSERT INTO actors (actorurl, actorpageid, actorpagecontent, actorname) VALUES (%s, %s, %s, %s)"

    for actor in actors:
        actorname = str(actor['title'])
        # Skip sub-category entries before downloading anything for them.
        if "Category:" in actorname:
            continue

        actorpageid = str(actor['pageid'])
        actorurl = f"https://www.imfdb.org/index.php?curid={actorpageid}"
        actorpagecontent = get_page_text_by_id(actorpageid)
        print(f"INSERTing: {actorname}, {actorpageid}")
        cursor.execute(statement, (actorurl, actorpageid, actorpagecontent, actorname))

    cnx.commit()

In [2070]:
def populate_movies_table():
    """Insert one row per member of "Category:Movie" into the movies table.

    For every category member whose title does not contain "Category:", the
    page HTML is downloaded with ``get_page_text_by_id`` and stored together
    with the page URL, page id and title. All inserts are committed once at
    the end. If the member list could not be retrieved, an error is printed
    and nothing is inserted or committed.
    """
    movies = query_categorymembers("Category:Movie", "json")
    if movies is None:
        # query_categorymembers() reports failure by returning None.
        print("ERROR: populate_movies_table(): No category members were returned!")
        return

    statement = "INSERT INTO movies (movieurl, moviepageid, moviepagecontent, movietitle) VALUES (%s, %s, %s, %s)"

    for movie in movies:
        movietitle = str(movie['title'])
        # Skip sub-category entries before downloading anything for them.
        if "Category:" in movietitle:
            continue

        moviepageid = str(movie['pageid'])
        movieurl = f"https://www.imfdb.org/index.php?curid={moviepageid}"
        moviepagecontent = get_page_text_by_id(moviepageid)
        print(f"INSERTing: {movietitle}, {moviepageid}")
        cursor.execute(statement, (movieurl, moviepageid, moviepagecontent, movietitle))

    cnx.commit()

In [2071]:
def populate_tvseries_table():
    """Insert one row per member of "Category:Television" into the tvseries table.

    For every category member whose title does not contain "Category:", the
    page HTML is downloaded with ``get_page_text_by_id`` and stored together
    with the page URL, page id and title. All inserts are committed once at
    the end. If the member list could not be retrieved, an error is printed
    and nothing is inserted or committed.
    """
    tvseries = query_categorymembers("Category:Television", "json")
    if tvseries is None:
        # query_categorymembers() reports failure by returning None.
        print("ERROR: populate_tvseries_table(): No category members were returned!")
        return

    statement = "INSERT INTO tvseries (tvseriesurl, tvseriespageid, tvseriespagecontent, tvseriestitle) VALUES (%s, %s, %s, %s)"

    for series in tvseries:
        tvseriestitle = str(series['title'])
        # Skip sub-category entries before downloading anything for them.
        if "Category:" in tvseriestitle:
            continue

        tvseriespageid = str(series['pageid'])
        tvseriesurl = f"https://www.imfdb.org/index.php?curid={tvseriespageid}"
        tvseriespagecontent = get_page_text_by_id(tvseriespageid)
        print(f"INSERTing: {tvseriestitle}, {tvseriespageid}")
        cursor.execute(statement, (tvseriesurl, tvseriespageid, tvseriespagecontent, tvseriestitle))

    cnx.commit()

In [2072]:
def populate_firearms_table_minimally():
    """Insert one row per member of "Category:Gun" into the firearms table.

    Only the URL, page id, page HTML and title columns are filled here. For
    every category member whose title does not contain "Category:", the page
    HTML is downloaded with ``get_page_text_by_id``. All inserts are
    committed once at the end. If the member list could not be retrieved, an
    error is printed and nothing is inserted or committed.
    """
    firearms = query_categorymembers("Category:Gun", "json")
    if firearms is None:
        # query_categorymembers() reports failure by returning None.
        print("ERROR: populate_firearms_table_minimally(): No category members were returned!")
        return

    statement = "INSERT INTO firearms (firearmurl, firearmpageid, firearmpagecontent, firearmtitle) VALUES (%s, %s, %s, %s)"

    for firearm in firearms:
        firearmtitle = str(firearm['title'])
        # Skip sub-category entries before downloading anything for them.
        if "Category:" in firearmtitle:
            continue

        firearmpageid = str(firearm['pageid'])
        firearmurl = f"https://www.imfdb.org/index.php?curid={firearmpageid}"
        firearmpagecontent = get_page_text_by_id(firearmpageid)
        print(f"INSERTing: {firearmtitle}, {firearmpageid}")
        cursor.execute(statement, (firearmurl, firearmpageid, firearmpagecontent, firearmtitle))

    cnx.commit()

<h3>Database related functions</h3>
Once we have downloaded the necessary article data, we extract useful information from the HTML to fill the remaining columns.

In [2073]:
# Some useful dictionaries #

# Maps each firearms column name to a position, numbered from 0 in the order
# listed. The positions agree with the numeric indices the functions in this
# notebook apply to `SELECT * FROM firearms` rows (e.g. 3 = firearmpageid).
firearms_dict = {
    column: position
    for position, column in enumerate((
        "firearmid",
        "firearmurl",
        "parentfirearmid",
        "firearmpageid",
        "firearmpagecontent",
        "specificationid",
        "firearmtitle",
        "firearmversion",
        "isfamily",
        "isfictional",
    ))
}

In [2074]:
def get_page_content_from_db(pageid, table):
    """Read the stored page HTML for one page id from one of the article tables.

    Parameters
    ----------
    pageid : page id to look up; it is sent to the database as text.
    table : str
        One of "actors", "movies", "tvseries" or "firearms".

    Returns
    -------
    The value of the table's ``...pagecontent`` column for the first
    matching row. ``None`` (after printing an error) when `table` is not one
    of the supported names or when no row matches.

    For the firearms table only rows without a parent firearm are
    considered, so child firearm rows are filtered out.
    """
    # Column-name prefix per table. It cannot be derived by stripping a
    # trailing "s": the tvseries columns keep theirs (tvseriespagecontent,
    # tvseriespageid), which rstrip("s") would turn into "tvserie...".
    column_prefixes = {
        "actors": "actor",
        "movies": "movie",
        "tvseries": "tvseries",
        "firearms": "firearm",
    }

    prefix = column_prefixes.get(table)
    if prefix is None:
        print("ERROR: get_page_content_from_db(): {} is not a valid table!".format(table))
        return None

    # Table and column names come from the whitelist above; the page id is
    # passed as a query parameter instead of being formatted into the SQL.
    statement = f"select {prefix}pagecontent from {table} where {prefix}pageid = %s"
    if table == "firearms": # If the firearms table is queried, filter out child firearm rows
        statement += " and parentfirearmid is null"
    statement += ";"

    cursor.execute(statement, (str(pageid),))
    row = cursor.fetchone()
    if row is None:
        print("ERROR: get_page_content_from_db(): No row found for page id {} in {}!".format(pageid, table))
        return None
    return row[0]

In [2075]:
def update_firearms_isfictional():
    """Set the ``isfictional`` flag on firearms rows from their title.

    Rows whose ``firearmtitle`` matches the SQL pattern ``(%) -%`` are
    flagged TRUE; rows whose title does not match are flagged FALSE. Both
    updates are committed together.
    """
    flag_statements = (
        "UPDATE firearms SET isfictional = FALSE WHERE NOT firearmtitle LIKE '(%) -%';",
        "UPDATE firearms SET isfictional = TRUE WHERE firearmtitle LIKE '(%) -%';",
    )
    for flag_statement in flag_statements:
        cursor.execute(flag_statement)

    cnx.commit()

In [2076]:
def is_multi_gun_page(pageid):
    """Decide whether a stored firearm page describes more than one gun.

    The page HTML is read from the firearms table. A page counts as
    multi-gun when it has a table of contents and, after removing "See Also"
    and "Specifications" headings that are ``<h1>`` elements, more than one
    ``<h1>`` remains.

    Parameters
    ----------
    pageid : page id of the firearm article; strings and integers are both
        accepted for the hard-coded exceptions below.

    Returns
    -------
    bool
    """
    # Exceptions: both of these have a table of contents despite being
    # singles. Compare as text so an integer page id matches as well.
    if str(pageid) in ("464719", "314208"):
        return False

    soup = BeautifulSoup(get_page_content_from_db(pageid, "firearms"), 'html.parser')

    # If there is no table of contents, we don't need to check further, it's not multi-gun
    if soup.find("div", class_="toctitle") is None:
        return False

    # "See Also" and "Specifications" headings do not introduce another gun,
    # so they are removed before the <h1> elements are counted.
    for heading_id in ("See_Also", "Specifications"):
        heading = soup.find(id=heading_id)
        if heading is not None and heading.parent.name == "h1":
            heading.parent.extract()

    return len(soup.find_all("h1")) > 1

In [2077]:
def update_firearms_isfamily():
    """Set the ``isfamily`` flag on every firearms row and commit.

    We assume a firearm is a family when it is named 'series' or has a
    multi-gun page:

    1. Rows whose title contains "series" or "Series", or equals
       "Air Guns", are flagged TRUE; rows whose title does not match are
       flagged FALSE.
    2. Every row is then re-read; rows not yet flagged whose page is
       multi-gun (per ``is_multi_gun_page``) are flagged TRUE.
    3. Rows that have a parent firearm are flagged FALSE.
    """
    title_predicate = (
        "(firearmtitle LIKE '%{lower}%' OR firearmtitle LIKE '%{upper}%' "
        "OR firearmtitle = 'Air Guns')"
    ).format(lower="series", upper="Series")

    cursor.execute("UPDATE firearms set isfamily = FALSE WHERE NOT " + title_predicate)
    cursor.execute("UPDATE firearms set isfamily = TRUE WHERE " + title_predicate)

    cursor.execute("SELECT * FROM firearms")
    for row in cursor.fetchall():
        row_id = row[0]

        # If we have determined the article is multi-gun, it's a family
        if is_multi_gun_page(pageid=row[3]) and row[8] == False:
            cursor.execute(f"UPDATE firearms set isfamily = TRUE WHERE firearmid = '{row_id}'")

        # Child firearms are never families
        if row[2] is not None:
            cursor.execute(f"UPDATE firearms set isfamily = FALSE WHERE firearmid = '{row_id}'")

    cnx.commit()

In [2078]:
def get_number_of_specifications(html_content):
    """Count the elements whose ``id`` attribute starts with "Specifications".

    Parameters
    ----------
    html_content : HTML markup handed to BeautifulSoup.

    Returns
    -------
    int
        Number of matching elements. The count is also printed.
    """
    def has_specifications_id(value):
        # The `value and` guard keeps an empty or missing id from matching.
        return value and value.startswith("Specifications")

    document = BeautifulSoup(html_content, 'html.parser')
    spec_count = len(document.find_all(id=has_specifications_id))

    print("The number of h2 tags with 'Specifications' is:", spec_count)
    return spec_count

In [2079]:
def strip_spec_list_item(item):
    """Return the part of `item` after its first colon, stripped of whitespace.

    Raises
    ------
    ValueError
        If `item` contains no colon.
    """
    _label, colon, value = item.partition(":")
    if not colon:
        # Same failure mode as looking the colon up with str.index().
        raise ValueError("substring not found")
    return value.strip()

In [2080]:
def get_single_specification(html_content, pageid=None):
    """Extract the first specification found in a piece of firearm page HTML.

    Parameters
    ----------
    html_content : HTML markup handed to BeautifulSoup.
    pageid : optional
        Only used to identify the page in the "no headers" error message.

    Returns
    -------
    dict
        Keys "production", "type", "caliber", "capacity" and "fire_modes".
        A key stays ``None`` when no matching value was found in the HTML.
    None
        When the HTML has no ``<h1>`` at all or no element with
        ``id="Specifications"``; an error line is printed.
    """
    spec_dict = {
        "production":None,
        "type":None,
        "caliber":None,
        "capacity":None,
        "fire_modes":None
    }

    soup = BeautifulSoup(html_content, 'html.parser')

    # Content without any <h1> is rejected whether or not a pageid was given.
    if not soup.find_all("h1"):
        subject = pageid if pageid is not None else "The given HTML"
        print(f"ERROR: get_single_specification(): {subject} does not contain any headers!")
        return None

    # Find the first Element with the Specifications id
    span = soup.find(id = "Specifications")
    if span is None:
        print("ERROR: get_single_specification(): No <span> tag with id = 'Specifications' was found!")
        return None

    # Each spec row is its own unordered list with only a single list item.
    # NOTE(review): this collects every <ul> sibling after the heading, not
    # only those directly below it; when a label occurs more than once, the
    # last occurrence wins.
    spec_lists = span.parent.find_next_siblings("ul")
    specifications = [li.text for ul in spec_lists for li in ul.find_all('li')]

    # Label -> spec_dict key, checked in this order; the first label found in
    # an item decides where that item's value is stored.
    labels = (
        ("Type:", "type"),
        ("Caliber:", "caliber"),
        ("Capacity:", "capacity"),
        ("Fire Modes:", "fire_modes"),
    )
    for item in specifications:
        for label, key in labels:
            if label in item:
                spec_dict[key] = strip_spec_list_item(item)
                break

    # Determine whether a <p> tag containing a date directly follows the heading.
    # NOTE(review): `\.*` matches literal dots only, so the paragraph must
    # start with "(" followed (after optional dots) by four digits. If `.*`
    # was intended, this pattern needs changing - confirm.
    possible_p_tag = span.parent.find_next_sibling()
    if (
        possible_p_tag is not None
        and possible_p_tag.name == "p"
        and re.match(r"\(\.*\d{4}.*\)", str(possible_p_tag.text))
    ):
        production = possible_p_tag.text
        match = re.search(r"\(.*(\d{4})s?\s*(-\s|–\s)(\d{4}|Present)s?.*\)", production)
        if match is not None:
            # Normalise to "(start - end)"; otherwise the raw paragraph text is kept.
            production_year = match.group(1).strip()
            production_end_year = match.group(3).strip()
            production = f"({production_year} - {production_end_year})"
        # Add it to the specification
        spec_dict["production"] = production.rstrip("\n")

    return spec_dict

In [2081]:
def articles_has_h1_variants(soup):
    """Guess whether a firearm page uses its h1 headings for variants.

    Some firearm pages have variants ("Military, Civilian, etc.") as their
    h1, instead of different models.

    Parameters
    ----------
    soup : parsed page (BeautifulSoup object).

    Returns
    -------
    True / False
        If an element with ``id="Specifications"`` exists: whether its
        parent is an ``<h3>``. Otherwise, if a table of contents exists:
        whether any entry is numbered three levels deep (e.g. "1.2.1").
    None
        When neither a Specifications element nor a table-of-contents list
        is available, i.e. the question cannot be decided.
    """
    specifications = soup.find(id = "Specifications")
    if specifications is not None:
        return specifications.parent.name == "h3"

    # If the table of contents has a depth of 3 or more, we also assume variants
    toctitle = soup.find("div", class_="toctitle")
    if toctitle is None:
        return None

    toc = toctitle.find_next_sibling("ul")
    if toc is None:
        # A table-of-contents title without its list gives nothing to inspect.
        return None

    depth_three_number = re.compile(r'(\d\.){2,}\d')
    return any(depth_three_number.search(li.text) for li in toc.find_all('li'))

In [2082]:
def generate_firearms_from_multi(html_content, url, pageid, parentuuid):
    """Split a multi-gun page into a child firearm and insert it as a firearms row.

    Parameters
    ----------
    html_content : HTML of the multi-gun page, handed to BeautifulSoup.
    url : stored in the child row's ``firearmurl`` column.
    pageid : stored in the child row's ``firearmpageid`` column and used in messages.
    parentuuid : stored in the child row's ``parentfirearmid`` column.

    Returns
    -------
    None. Rows are inserted through the module-level ``cursor``; nothing is
    committed here.

    NOTE(review): both branches ``return`` right after the first successful
    INSERT, so at most one child row is produced per call, although the loops
    and ``continue`` statements suggest every section was meant to be
    processed - confirm the intent. ``content`` is also never reset, so the
    HTML of sections skipped via ``continue`` is carried into the row that
    is eventually inserted.
    """
    # This function splits multi-gun pages at every h1, if it doesn't use h1s as variants, and at every h2, if it does.

    soup = BeautifulSoup(html_content, 'html.parser')

    firearmtitle = None
    content = ""
    version = None

    # Drop a "See Also" heading that is an <h1>, so it is not treated as a gun section.
    see_also = soup.find(id = "See_Also")
    if see_also is not None:
        if see_also.parent.name == "h1":
            see_also.parent.extract()

    # Same for a "Specifications" heading that is an <h1>.
    # NOTE(review): h1spec is assigned but never read afterwards.
    spec = soup.find(id = "Specifications")
    if spec is not None:
        if spec.parent.name == "h1":
            h1spec= spec.parent.extract()

    # The variant check runs on the soup after the two headings above were removed.
    if articles_has_h1_variants(soup): # If it has variants in h1...
        headers = soup.find_all("h1")
        headers.pop(0) #Skip the first one, since its the page heading
        for h1 in headers:
            # The h1 text becomes the version; the h2 sections are the firearms.
            version = h1.text
            # NOTE(review): this returns every later <h2> sibling, not only
            # those located before the next <h1>.
            headers2 = h1.find_next_siblings("h2")
            for h2 in headers2:
                slices = []
                slices.append(h2)
                firearmtitle = h2.text
                # Collect (and remove from the tree) everything up to the next h2/h1.
                for slice in h2.find_next_siblings():
                    if (slice.name == "h2" or slice.name == "h1"):
                        break
                    slices.append(slice.extract())
                # Now that we have built up our slices of html, it's time to insert
                for slice in slices:
                    content = content+str(slice)
                if (content == "" or firearmtitle is None or version is None):
                    print(f"ERROR: generate_firearm_from_multi(): {pageid} produced empty version, content or title!")
                    continue
                # Sections with these titles are skipped rather than inserted as firearms.
                if firearmtitle in ["Video Games", "Film", "Television", "Anime"]:
                    print(f"ERROR: generate_firearm_from_multi(): Version check failed for {pageid}!")
                    continue
                print(f"INSERTing: {firearmtitle}, {pageid}, {parentuuid}")
                statement = "INSERT INTO firearms (firearmurl, parentfirearmid, firearmpageid, firearmpagecontent, firearmtitle, isfamily, firearmversion) VALUES (%s, %s, %s, %s, %s, %s, %s)"
                cursor.execute(statement, (url, parentuuid, pageid, content,  firearmtitle, 'FALSE', version))
                # Leaves the function after the first inserted child (see NOTE in the docstring).
                return
                
    else: # If it doesn't have variants in h1...
        headers = soup.find_all("h1")
        headers.pop(0) # Skip the first one
        for h1 in headers:
            slices = []
            slices.append(h1)
            firearmtitle = h1.text
            # Collect (and remove from the tree) everything up to the next h1.
            for slice in h1.find_next_siblings():
                if slice.name == "h1":
                    break
                slices.append(slice.extract())
            # Now that we have built up our slices of html, it's time to insert
            for slice in slices:
                content = content+str(slice)
            if (content == "" or firearmtitle is None):
                print(f"ERROR: generate_firearm_from_multi(): {pageid} produced empty content or title!")
                continue
            # Sections with these titles are skipped rather than inserted as firearms.
            if firearmtitle in ["Video Games", "Film", "Television", "Anime"]:
                print(f"ERROR: generate_firearm_from_multi(): Version check failed for {pageid}!")
                continue
            print(f"INSERTing: {firearmtitle}, {pageid}, {parentuuid}")
            statement = "INSERT INTO firearms (firearmurl, parentfirearmid, firearmpageid, firearmpagecontent, firearmtitle, isfamily) VALUES (%s, %s, %s, %s, %s, %s)"
            cursor.execute(statement, (url, parentuuid, pageid, content,  firearmtitle, 'FALSE'))
            # Leaves the function after the first inserted child (see NOTE in the docstring).
            return

    # Only reached when nothing was inserted; a None result means the variant
    # check could not decide, which is reported here.
    if articles_has_h1_variants(soup) is None:
        print(f"generate_firearm_from_multi(): Version check failed for {pageid}!")

In [2083]:
def generate_firearms_from_multis():
    """Run ``generate_firearms_from_multi`` for every parent-less family row.

    Selects all firearms rows flagged as family that have no parent firearm,
    hands each row's content, URL, page id and id to
    ``generate_firearms_from_multi``, and commits once afterwards.
    """
    cursor.execute("SELECT * FROM firearms WHERE isfamily = 'True' AND parentfirearmid IS NULL;")

    for family_row in cursor.fetchall():
        generate_firearms_from_multi(
            html_content=family_row[4],
            url=family_row[1],
            pageid=family_row[3],
            parentuuid=family_row[0],
        )

    cnx.commit()

In [2084]:
def check_for_family_candidates():
    """Debugging function: list pages that look multi-gun but are not flagged.

    Writes the page id of every firearms row for which ``is_multi_gun_page``
    is true while its ``isfamily`` value equals False to ``candidates.txt``
    in the current working directory, one id per line. The file is
    overwritten on each call.
    """
    cursor.execute("SELECT * FROM firearms")
    rows = cursor.fetchall()

    with open('candidates.txt', 'w') as writer:
        # Generator, so lines are produced row by row while the file is open.
        writer.writelines(
            f"{row[3]}\n"
            for row in rows
            if is_multi_gun_page(pageid=row[3]) and row[8] == False
        )

In [2085]:
def populate_specs_for_singles():
    """Extract and store a specification for every non-family firearm.

    For each firearms row with ``isfamily = 'False'`` the stored page HTML
    is parsed with ``get_single_specification``. When a specification is
    returned it is inserted into the specifications table, linked to the
    firearm's id; rows for which ``None`` is returned are skipped. All
    inserts are committed once at the end.
    """
    cursor.execute("SELECT * FROM firearms WHERE isfamily = 'False';")
    firearms = cursor.fetchall()

    statement = "INSERT INTO specifications (firearmid, type, caliber, capacity, firemode, productiontimeframe) VALUES (%s, %s, %s, %s, %s, %s)"

    for firearm in firearms:
        firearmid, pageid, pagecontent = firearm[0], firearm[3], firearm[4]

        print(f"Fetching spec for: {pageid}")
        # Pass the page id along: the parser uses it to name the page in its
        # error output, and without it header-less content raises there
        # instead of being skipped.
        spec = get_single_specification(pagecontent, pageid=pageid)
        if spec is None:
            continue

        print(f"INSERTing: {pageid} specification")
        cursor.execute(statement, (firearmid, spec["type"], spec["caliber"], spec["capacity"], spec["fire_modes"], spec["production"]))

    cnx.commit()

In [2086]:
def populate_specs_for_multies():
    """Placeholder without an implementation yet; does nothing and returns None."""
    return None

In [2087]:
def populate_specifications_table():
    """Placeholder entry point; currently does nothing and returns None.

    The two calls below are commented out, so no specification is written
    when this function runs.
    """
    #populate_specs_for_singles()
    #populate_specs_for_multies()
    return None

<h3>Corner and edge cases:</h3>

In [2088]:
# Corner-case handling:

# https://www.imfdb.org/index.php?curid=62 has just a single Spec at the beginning of the page
#
# X https://www.imfdb.org/index.php?curid=464719 Sage BML-37 has a table of contents despite being a single
# https://www.imfdb.org/index.php?curid=348107 HK AG Grenade Launchers are a weird mix of versioned and non-versioned, ie live-fire models are non-versioned, non-firing replicas are versioned
# https://www.imfdb.org/index.php?curid=3564 SIG P210 has its Video Game table nested in the Television segment
# X https://www.imfdb.org/index.php?curid=314208 Flintlock Musket has a table of contents despite being a single


<h3>Main</h3>

In [2089]:
### We do stuff here: ###

#Populate the database skeleton:
#populate_actors_table()
#populate_movies_table()
#populate_tvseries_table()
#populate_firearms_table_minimally()

#Finalize the firearms table:
#update_firearms_isfictional()
#update_firearms_isfamily()
#generate_firearms_from_multis()

#populate_specifications_table()
#populate_specs_for_singles()
#populate_specs_for_multies()


In [2090]:
# Close database connection: the shared cursor first, then the connection it
# was created from. To use the functions above again afterwards, re-run the
# connection cell so that `cnx` and `cursor` are recreated.
cursor.close()
cnx.close()