# MongoDB Scraper for Metacritic movie data
DA 230 Midterm

Author: Heather Marie

Contributors: Ted Spence, Vincent Hong, Jamie Kirsila, Natalia Sadkov

In [1]:
# Import modules
import urllib3
import certifi
import re
import json
import pymongo
import pandas

# Define source link
link = 'https://www.metacritic.com/browse/movies/score/metascore/year'

# Define regex
# Each pattern has exactly one capture group, so findall() returns a flat list of strings.
title_regex = re.compile(r'class="title"><h3>(.+)</h3>')
date_regex = re.compile(r'class="clamp-details">\s+<span>(.+)</span>')
# The closing tag must be "</div>"; the previous "<\div>" was parsed as "<", a digit, then "iv>".
description_regex = re.compile(r'<div class="summary">\s*([\S\s]+?)\s*</div>')
score_regex = re.compile(r'<span class="title">Metascore:</span>\s+<a class="metascore_anchor" href="/movie/.*?/critic-reviews">\s+<div class="metascore_w large movie .+">(.*?)</div>')
# "<img src=" must be spelled literally; the previous "<img\src" turned "\s" into a whitespace class.
# Non-greedy groups stop at the first closing quote instead of the last one on the line.
image_regex = re.compile(r'<a href="/movie/.*?"><img src="(.*?)" alt')

# Build a python request pool that verifies TLS certificates against certifi's CA bundle.
# (Previously verification was switched off with CERT_NONE and the resulting warnings were silenced.)
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

# Initiate a web request, sending a browser-like User-Agent header
res = http.request('GET', link, headers={'User-Agent': 'Mozilla/5.0'})

# Convert results from raw bytes to text
datastring = str(res.data, "utf-8")

# Check for success and how much data was retrieved from the site
print(f'Status: {res.status}')
print(f'Fetched {len(datastring)} characters from {link}.')

Status: 200
Fetched 505994 characters from https://www.metacritic.com/browse/movies/score/metascore/year.


In [2]:
# Read the JSON credentials file that holds the secret MongoDB connection string
credentials_path = r'C:\Users\hsely\OneDrive\Documents\GitHub\DA320\Midterm\credentials.json'
with open(credentials_path) as credentials_file:
    data = json.load(credentials_file)

# The connection string is stored under the "mongodb" key
secret_key = data['mongodb']

# Only the key's length is printed, so the connection string itself never appears in the output.
key_length = len(secret_key)
print(f"My secret key is {key_length} characters in length.")

My secret key is 67 characters in length.


In [3]:
# Create the MongoDB client, passing certifi's CA bundle as the TLS CA file
client = pymongo.MongoClient(secret_key, tlsCAFile=certifi.where())

# Look up the "DA320" database and its "Metacritic" collection
da320_database = client["DA320"]
metacritic_data = da320_database["Metacritic"]

# List the collections that currently exist in the "DA320" database
allCollections = da320_database.list_collection_names()

# Report the server version and the collections present in DA320
server_version = client.server_info()['version']
print(f"Using MongoDB version {server_version}.")
print(f"This database has the collections {allCollections}")

Using MongoDB version 5.0.13.
This database has the collections ['IMDB', 'IMDB_Pipeline_View']


In [4]:
# Retrieve a list of movies from a particular year and page of Metacritic
def metacritic_scraper(year: int, page: int) -> pandas.DataFrame:
    """Scrape one page of Metacritic's by-year movie listing into a DataFrame.

    Uses the module-level ``http`` pool and the module-level compiled regexes
    (``title_regex``, ``date_regex``, ``description_regex``, ``score_regex``,
    ``image_regex``).

    Args:
        year: Value substituted into the ``year_selected`` query parameter.
        page: Value substituted into the ``page`` query parameter.

    Returns:
        A DataFrame with the columns ``title``, ``date``, ``description``,
        ``score`` and ``image``, one row per regex match. The frame has zero
        rows when none of the patterns match.

    Raises:
        ValueError: If the patterns return different numbers of matches, since
            the columns could then not be lined up row by row.
    """

    # Fetch webpage
    url = f"https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected={year}&sort=desc&view=detailed&page={page}"
    response = http.request('GET', url, headers={'User-Agent': 'Mozilla/5.0'})
    datastring = str(response.data, "utf-8")

    # Execute all regex
    dataset = {
        "title": title_regex.findall(datastring),
        "date": date_regex.findall(datastring),
        "description": description_regex.findall(datastring),
        "score": score_regex.findall(datastring),
        "image": image_regex.findall(datastring),
    }

    # Each pattern is matched independently, so verify the columns line up
    # before building the frame; pandas would otherwise raise a ValueError
    # that does not say which field is short.
    match_counts = {field: len(matches) for field, matches in dataset.items()}
    if len(set(match_counts.values())) > 1:
        raise ValueError(
            f"Mismatched field counts for year {year} page {page}: {match_counts}"
        )

    # Return a unified collection
    return pandas.DataFrame(dataset)

In [5]:
# Scrape every listing page for each year and store the movies in MongoDB
for year in range(2000, 2023):
    page = 0
    print(f"Collecting data for {year} page {page}...")

    # Walk through the pages in order; a page with zero rows ends the year
    data = metacritic_scraper(year, page)
    while len(data) != 0:

        # Turn each dataframe row into a document using the field names stored in MongoDB
        movies_to_insert = [
            {
                "title": row.title,
                "release_date": row.date,
                "description": row.description,
                "metascore": row.score,
                "image_url": row.image,
            }
            for row in data.itertuples()
        ]

        # Insert this page's documents into the Metacritic collection
        print(f"Inserting {len(movies_to_insert)} movies for the year {year} page {page}")
        metacritic_data.insert_many(movies_to_insert)

        # Move on to the next page
        page += 1
        data = metacritic_scraper(year, page)


Collecting data for 2000 page 0...
