In [23]:
#import libraries
import urllib3
import certifi
import pymongo
import pandas
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re

In [24]:
# Compile the regular expressions that pull each movie field out of the page HTML.
# Every pattern has exactly one capture group, so findall() returns a flat list of
# strings for that field.
title_regex = re.compile(
    r"class=\"title\"><h3>(.+)</h3>"
)
date_regex = re.compile(
    r"class=\"clamp-details\">\s+<span>(.+)</span>"
)
description_regex = re.compile(
    r"<div class=\"summary\">\s*([\S\s]+?)\s*</div>"
)
# The metascore pattern is long, so it is written as adjacent raw-string literals;
# Python joins them into one pattern string before it is compiled.
score_regex = re.compile(
    r"<span class=\"title\">Metascore:</span>\s+"
    r"<a class=\"metascore_anchor\" href=\"/movie/.*?/critic-reviews\">\s+"
    r"<div class=\"metascore_w large movie .+\">(.*?)</div>"
)
image_regex = re.compile(
    r"<a href=\"/movie/.*\"><img src=\"(.*)\" alt=\""
)

# Shared urllib3 connection pool used for every HTTP request in this notebook.
# It is configured with the CA bundle that certifi provides.
http = urllib3.PoolManager(
    ca_certs=certifi.where(),
)

In [25]:
import json
import os

# Demonstration of how to load a file that contains secrets without accidentally leaking those secrets.
# The location defaults to the original local path, but it can be overridden with the
# DA320_SECRETS_PATH environment variable so the notebook is not tied to one machine.
SECRETS_PATH = os.environ.get(
    'DA320_SECRETS_PATH',
    'C:\\Jodi_BC\\DA 320\\Week 5\\Secrets.json',
)

# Pass the encoding explicitly: without it open() falls back to the platform default
# (cp1252 on many Windows setups), while JSON files are expected to be UTF-8.
with open(SECRETS_PATH, encoding='utf-8') as f:
    data = json.load(f)

# If you want your data to be secure, don't print this variable out!
# Jupyter will retain a cached version of any printed data and it can be
# accidentally committed to version control.
secret_key = data['MongoDB']

# We can safely print the length of the secret key. That won't leak any sensitive information.
print(f"My secret key is {len(secret_key)} characters in length.")

My secret key is 69 characters in length.


In [26]:
# Create the MongoDB client; secret_key is passed as the connection string and
# certifi's CA bundle is supplied for TLS.
client = pymongo.MongoClient(secret_key, tlsCAFile=certifi.where())

# Look up the "DA320" database and, inside it, the "MetaCritic" collection
da320_database = client.get_database('DA320')
metacritic_data = da320_database.get_collection('MetaCritic')

In [28]:
# Retrieve a list of movies from a particular year and page of Metacritic
def metacritic_scraper(year: int, page: int) -> pandas.DataFrame:
    """Fetch one page of Metacritic's by-year movie listing and parse it.

    Args:
        year: Release year to request (sent as ``year_selected``).
        page: Page index of the listing (sent as ``page``).

    Returns:
        A DataFrame with the columns ``title``, ``date``, ``description``,
        ``score`` and ``image``, holding the strings captured by the
        module-level regular expressions. The frame is empty when none of
        the patterns match.

    Raises:
        urllib3.exceptions.MaxRetryError: If the server still answers with a
            throttling or server-error status after all retries.
        ValueError: If the field patterns matched a different number of
            times, so the columns cannot be lined up row by row.
    """
    # Retry throttled (429) and server-error (5xx) responses with backoff.
    # Without this, such a response is parsed like any other page and would
    # most likely yield zero rows, which the calling loop treats as the end
    # of the listing. 404 is deliberately not retried or raised here.
    retry_policy = urllib3.Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=(429, 500, 502, 503, 504),
    )

    # fetch the webpage
    url = f"https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected={year}&sort=desc&view=detailed&page={page}"
    response = http.request(
        'GET',
        url,
        headers={'User-Agent': 'Mozilla/5.0'},
        retries=retry_policy,
    )
    datastring = str(response.data, "utf-8")

    # Execute all the regular expressions
    titles = title_regex.findall(datastring)
    dates = date_regex.findall(datastring)
    descriptions = description_regex.findall(datastring)
    scores = score_regex.findall(datastring)
    images = image_regex.findall(datastring)

    # Return a unified collection
    dataset = {"title": titles, "date": dates, "description": descriptions, "score": scores, "image": images}

    # Each field is matched independently, so a movie missing one field would
    # leave the lists with different lengths. pandas rejects that with a
    # generic ValueError; raise the same type with the counts spelled out.
    counts = {field: len(values) for field, values in dataset.items()}
    if len(set(counts.values())) > 1:
        raise ValueError(
            f"Metacritic results for year {year} page {page} have mismatched field counts: {counts}"
        )

    return pandas.DataFrame(dataset)

In [29]:
import time

# An empty page normally marks the end of a year's listing, but a fetch that
# went wrong can also parse to zero rows. Re-request an empty page a few times
# before accepting it as the end.
MAX_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 2

# Scrape every listing page for each year and store the movies in MongoDB.
# NOTE: insert_many only appends, so re-running this cell inserts the same
# movies again.
for year in range(2000, 2023):
    page = 0
    print(f"Collecting data for {year} page {page}...")

    while True:
        # Retry a page multiple times if necessary
        for attempt in range(1, MAX_ATTEMPTS + 1):
            data = metacritic_scraper(year, page)
            if len(data) > 0 or attempt == MAX_ATTEMPTS:
                break
            time.sleep(RETRY_DELAY_SECONDS)

        # stop when we reach a page with zero rows
        if len(data) == 0:
            if page == 0:
                # A year with no movies at all is suspicious; make the gap visible.
                print(f"WARNING: no movies found for {year} after {MAX_ATTEMPTS} attempts")
            break

        # Convert the dataframe into a list of movies to insert into MongoDB,
        # renaming the columns to the field names used in the collection
        movies_to_insert = data.rename(
            columns={
                "date": "release_date",
                "score": "metascore",
                "image": "image_url",
            }
        ).to_dict("records")

        # Insert records into MongoDB
        print(f"Inserting {len(movies_to_insert)} movies for the year {year} page {page}")
        metacritic_data.insert_many(movies_to_insert)
        page = page + 1


Collecting data for 2000 page 0...
Inserting 100 movies for the year 2000 page 0
Inserting 100 movies for the year 2000 page 1
Inserting 100 movies for the year 2000 page 2
Inserting 65 movies for the year 2000 page 3
Collecting data for 2001 page 0...
Inserting 100 movies for the year 2001 page 0
Inserting 100 movies for the year 2001 page 1
Inserting 100 movies for the year 2001 page 2
Inserting 85 movies for the year 2001 page 3
Collecting data for 2002 page 0...
Collecting data for 2003 page 0...
Inserting 100 movies for the year 2003 page 0
Inserting 100 movies for the year 2003 page 1
Inserting 100 movies for the year 2003 page 2
Inserting 100 movies for the year 2003 page 3
Inserting 9 movies for the year 2003 page 4
Collecting data for 2004 page 0...
Inserting 100 movies for the year 2004 page 0
Inserting 100 movies for the year 2004 page 1
Inserting 100 movies for the year 2004 page 2
Inserting 100 movies for the year 2004 page 3
Inserting 76 movies for the year 2004 page 4
Co