### Import Required Libraries and Set Up Environment Variables

In [1]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
from pandas import json_normalize

In [2]:
# Set environment variables from the .env in the local environment
load_dotenv()

nyt_api_key = os.getenv("NYT_API_KEY")
tmdb_api_key = os.getenv("TMDB_API_KEY")

### Access the New York Times API

In [3]:
# Set the base URL
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL
url = f"{url}api-key={nyt_api_key}&q={filter_query}&sort={sort}&fl={field_list}&begin_date={begin_date}&end_date={end_date}"



In [4]:
# Create an empty list to store the reviews
reviews_list = []

# loop through pages 0-19
for page in range(20):
    # create query with a page number
    page_query = f"{url}&page={page}"
    
    # API results show 10 articles at a time

    
    # Make a "GET" request and retrieve the JSON
    response = requests.get(page_query).json()

    
    # Add a twelve second interval between queries to stay within API query limits
    time.sleep(12)

    
    # Try and save the reviews to the reviews_list
     try:
        reviews = response["response"]["docs"]
        if len(reviews) == 0:
            print(f"No results found on page {page}")
            break
        for review in reviews:
            reviews_list.append(review)
        print(f"Retrieved page {page}")
    except KeyError:
        print(f"No results found on page {page}")
        break

        # loop through the reviews["response"]["docs"] and append each review to the list

        # Print the page that was just retrieved


        # Print the page number that had no results then break from the loop


IndentationError: unexpected indent (2092400988.py, line 21)

In [5]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(reviews_list[:5], indent=4))


NameError: name 'reviews_list' is not defined

In [6]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
reviews_df = json_normalize(reviews_list)


NameError: name 'reviews_list' is not defined

In [7]:
# Extract the title from the "headline.main" column and
# save it to a new column "title"
# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early
reviews_df["title"] = reviews_df["headline.main"].apply(lambda x: x.split("\u2018")[1].split("\u2019")[0] + " Review")


NameError: name 'reviews_df' is not defined

In [8]:
# Extract 'name' and 'value' from items in "keywords" column
def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']};" 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords

# Fix the "keywords" column by converting cells from a list to a string
reviews_df["keywords"] = reviews_df["keywords"].apply(extract_keywords)
reviews_df["keywords"] = reviews_df["keywords"].apply(lambda x: ', '.join(x))


NameError: name 'reviews_df' is not defined

In [9]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database
titles_list = reviews_df["title"].to_list()


NameError: name 'reviews_df' is not defined

### Access The Movie Database API

In [10]:
# Prepare The Movie Database query
url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + tmdb_api_key

tmdb_url = "https://api.themoviedb.org/3/search/movie?query="
tmdb_key_string = "&api_key=" + tmdb_api_key

TypeError: can only concatenate str (not "NoneType") to str

In [12]:
# Create an empty list to store the results
tmdb_movies_list = []


# Create a request counter to sleep the requests after a multiple
# of 50 requests
request_counter = 0


# Loop through the titles

    # Check if we need to sleep before making a request

for title in titles_list:
    # Check if we need to sleep before making a request
    if request_counter % 50 == 0 and request_counter != 0:
        time.sleep(10)

    # Add 1 to the request counter
    request_counter += 1

    
    # Perform a "GET" request for The Movie Database
    tmdb_query = f"{tmdb_url}{title}{tmdb_key_string}"
    tmdb_response = requests.get(tmdb_query).json()

    # Include a try clause to search for the full movie details.
    # Use the except clause to print out a statement if a movie
    # is not found.
 try:
        # Get movie id
        movie_id = tmdb_response["results"][0]["id"]

        # Make a request for a the full movie details
        tmdb_movie_query = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb_api_key}"
        tmdb_movie_response = requests.get(tmdb_movie_query).json()

        # Extract the genre names into a list
        genres = [genre["name"] for genre in tmdb_movie_response["genres"]]

        # Extract the spoken_languages' English name into a list
        languages = [language["english_name"] for language in tmdb_movie_response["spoken_languages"]]

        # Extract the production_countries' name into a list
        countries = [country["name"] for country in tmdb_movie_response["production_countries"]]

        # Add the relevant data to a dictionary and
        # append it to the tmdb_movies_list list
        tmdb_movie_data = {
            "title": title,
            "tmdb_id": movie_id,
            "genres": genres,
            "languages": languages,
            "countries": countries
        }
        tmdb_movies_list.append(tmdb_movie_data)

        # Print out the title that was found
        print(f"Found movie: {title}")
    except IndexError:
        print(f"Movie not found for title: {title}")



IndentationError: unindent does not match any outer indentation level (<tokenize>, line 30)

In [13]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data


In [14]:
# Convert the results to a DataFrame
tmdb_movies_df = pd.DataFrame(tmdb_movies_list)


NameError: name 'tmdb_movies_list' is not defined

### Merge and Clean the Data for Export

In [15]:
# Merge the New York Times reviews and TMDB DataFrames on title
merged_df = pd.merge(reviews_df, tmdb_movies_df, on="title", how="left")


NameError: name 'reviews_df' is not defined

In [16]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing
columns_to_fix = ["genres", "languages", "countries"]

chars_to_remove = ['[', ']', '"']

# Create a list of characters to remove


# Loop through the list of columns to fix
for column in columns_to_fix:

    # Convert the column to type 'str'
    merged_df[column] = merged_df[column].astype(str)


    # Loop through characters to remove
    for char in chars_to_remove:

        merged_df[column] = merged_df[column].str.replace(char, '')

# Display the fixed DataFrame
print(merged_df.head())


NameError: name 'merged_df' is not defined

In [17]:
# Drop "byline.person" column
merged_df.drop(columns=["byline.person"], inplace=True)


NameError: name 'merged_df' is not defined

In [18]:
# Delete duplicate rows and reset index
merged_df.drop_duplicates(inplace=True)
merged_df.reset_index(drop=True, inplace=True)

NameError: name 'merged_df' is not defined

In [19]:
# Export data to CSV without the index
merged_df.to_csv("merged_data.csv", index=False)

NameError: name 'merged_df' is not defined