### Import Required Libraries and Set Up Environment Variables

In [2]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
from pandas import json_normalize
import json
import urllib.parse

In [3]:
# Set environment variables from the .env in the local environment
load_dotenv()

nyt_api_key = os.getenv("NYT_API_KEY")
tmdb_api_key = os.getenv("TMDB_API_KEY")

### Access the New York Times API

In [5]:
# Set the base URL
base_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Filter for movie reviews with "love" in the headline
# section_name should be "Movies"
# type_of_material should be "Review"
filter_query = 'section_name:"Movies" AND type_of_material:"Review" AND headline:"love"'

# Use a sort filter, sort by newest
sort = "newest"

# Select the following fields to return:
# headline, web_url, snippet, source, keywords, pub_date, byline, word_count
field_list = "headline,web_url,snippet,source,keywords,pub_date,byline,word_count"

# Search for reviews published between a begin and end date
begin_date = "20130101"
end_date = "20230531"

# Build URL
# Construct the query parameters
query_params = {
    "q": filter_query,
    "sort": sort,
    "fl": field_list,
    "begin_date": begin_date,
    "end_date": end_date,
    "api-key": nyt_api_key
}

# Encode the query parameters
encoded_params = urllib.parse.urlencode(query_params)

# Build the final URL
url = f"{base_url}?{encoded_params}"

print(url)

https://api.nytimes.com/svc/search/v2/articlesearch.json??q=section_name%3A%22Movies%22+AND+type_of_material%3A%22Review%22+AND+headline%3A%22love%22&sort=newest&fl=headline%2Cweb_url%2Csnippet%2Csource%2Ckeywords%2Cpub_date%2Cbyline%2Cword_count&begin_date=20130101&end_date=20230531&api-key=x0GkJw53TM2MuRP1T7lKmefz7QQKXL34


In [6]:
# Create an empty list to store the reviews
reviews_list =[]

# loop through pages 0-19
 # create query with a page number
    # API results show 10 articles at a time
for page in range(20):
    query_url = url
        
# Make a "GET" request and retrieve the JSON
    try:
        response = requests.get(query_url)
        response.raise_for_status()  # Raise an exception for HTTP errors
        reviews = response.json()

        # Try and save the reviews to the reviews_list
        for review in reviews.get("response", {}).get("docs", []):
            reviews_list.append(review)

        # Print the page that was just retrieved
        print(f"Page {page} retrieved successfully. {len(reviews.get('response', {}).get('docs', []))} reviews found.")

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        break

    except KeyError:
        # Print the page number that had no results then break from the loop
        print(f"No results on page {page}.")
        break

    # Add a twelve second interval between queries to stay within API query limits
    time.sleep(12)

# Print a summary of the results
print(f"Total reviews retrieved: {len(reviews_list)}")  

Page 0 retrieved successfully. 10 reviews found.
Page 1 retrieved successfully. 10 reviews found.
Page 2 retrieved successfully. 10 reviews found.
Page 3 retrieved successfully. 10 reviews found.
Page 4 retrieved successfully. 10 reviews found.
Page 5 retrieved successfully. 10 reviews found.
Page 6 retrieved successfully. 10 reviews found.
Page 7 retrieved successfully. 10 reviews found.
Page 8 retrieved successfully. 10 reviews found.
Page 9 retrieved successfully. 10 reviews found.
Page 10 retrieved successfully. 10 reviews found.
Page 11 retrieved successfully. 10 reviews found.
Page 12 retrieved successfully. 10 reviews found.
Page 13 retrieved successfully. 10 reviews found.
Page 14 retrieved successfully. 10 reviews found.
Page 15 retrieved successfully. 10 reviews found.
Page 16 retrieved successfully. 10 reviews found.
Page 17 retrieved successfully. 10 reviews found.
Page 18 retrieved successfully. 10 reviews found.
Page 19 retrieved successfully. 10 reviews found.
Total revi

In [7]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
print("\nPreview of the first 5 results:")
print(json.dumps(reviews_list[:5], indent=4))



Preview of the first 5 results:
[
    {
        "web_url": "https://www.nytimes.com/2023/05/31/us/politics/hunter-biden-bruen-justice-department.html",
        "snippet": "Hunter Biden\u2019s legal team is invoking a Supreme Court decision his father has denounced as an affront to \u201ccommon sense and the Constitution.\u201d",
        "source": "The New York Times",
        "headline": {
            "main": "Hunter Biden\u2019s Lawyers Cite Landmark Gun Ruling in Bid to Stave Off Charges",
            "kicker": null,
            "content_kicker": null,
            "print_headline": "Top Court\u2019s Ruling on Guns May Void Biden Son\u2019s Case",
            "name": null,
            "seo": null,
            "sub": null
        },
        "keywords": [
            {
                "name": "persons",
                "value": "Biden, Hunter",
                "rank": 1,
                "major": "N"
            },
            {
                "name": "subject",
                "value"

In [8]:
# Convert reviews_list to a Pandas DataFrame using json_normalize()
nyt_df = json_normalize(reviews_list)
nyt_df.head()

Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization
0,https://www.nytimes.com/2023/05/31/us/politics...,Hunter Biden’s legal team is invoking a Suprem...,The New York Times,"[{'name': 'persons', 'value': 'Biden, Hunter',...",2023-05-31T23:50:50+0000,863,Hunter Biden’s Lawyers Cite Landmark Gun Rulin...,,,Top Court’s Ruling on Guns May Void Biden Son’...,,,,By Glenn Thrush and Michael S. Schmidt,"[{'firstname': 'Glenn', 'middlename': None, 'l...",
1,https://www.nytimes.com/2023/05/31/us/texas-se...,"In Ken Paxton’s impeachment trial this summer,...",The New York Times,"[{'name': 'subject', 'value': 'Impeachment', '...",2023-05-31T23:36:36+0000,1502,How Angela Paxton Could Help Decide the Fate o...,,,Texas State Senator Could Help Decide Fate of ...,,,,By J. David Goodman,"[{'firstname': 'J.', 'middlename': 'David', 'l...",
2,https://www.nytimes.com/2023/05/31/us/politics...,"Gov. Greg Abbott chose John Scott, a former de...",The New York Times,"[{'name': 'subject', 'value': 'Impeachment', '...",2023-05-31T23:30:33+0000,590,Texas Governor Names Interim Attorney General ...,,,,,,,By David Montgomery,"[{'firstname': 'David', 'middlename': None, 'l...",
3,https://www.nytimes.com/2023/05/31/sports/tenn...,"In recent days, the Serbian tennis star Novak ...",The New York Times,"[{'name': 'subject', 'value': 'Tennis', 'rank'...",2023-05-31T23:28:46+0000,1253,"At the French Open, Djokovic Storms the Court ...",,,Djokovic Inserts Himself Into Another Fiery Co...,,,,By Matthew Futterman,"[{'firstname': 'Matthew', 'middlename': None, ...",
4,https://www.nytimes.com/2023/05/31/us/politics...,Investigators are trying to determine if there...,The New York Times,"[{'name': 'persons', 'value': 'Trump, Donald J...",2023-05-31T23:28:04+0000,1179,Prosecutors Scrutinize Handling of Security Fo...,,,Mar-a-Lago Workers Are Said to Be Questioned A...,,,,"By Alan Feuer, Maggie Haberman and Ben Protess","[{'firstname': 'Alan', 'middlename': None, 'la...",


In [9]:
# Extract the title from the "headline.main" column and  
# save it to a new column "title"
# Title is between unicode characters \u2018 and \u2019. 
# End string should include " Review" to avoid cutting title early

def extract_title(headline):
    start_marker = '\u2018'
    end_marker = '\u2019'
    
    start_index = headline.find(start_marker)
    end_index = headline.find(end_marker, start_index + 1)
    
    if start_index != -1 and end_index != -1:
        title = headline[start_index + 1:end_index]
        if not title.endswith(" Review"):
            title += " Review"
        return title
    return "Untitled Review"

# Add a new column "title" with extracted titles
for review in reviews_list:
    headline = review.get("headline", {}).get("main", "")
    review["title"] = extract_title(headline)



In [10]:
# Extract 'name' and 'value' from items in "keywords" column
reviews_df = pd.DataFrame(nyt_df)
def extract_keywords(keyword_list):
    extracted_keywords = ""
    for item in keyword_list:
        # Extract 'name' and 'value'
        keyword = f"{item['name']}: {item['value']};" 
        # Append the keyword item to the extracted_keywords list
        extracted_keywords += keyword
    return extracted_keywords

# Fix the "keywords" column by converting cells from a list to a string
reviews_df['keywords'] = reviews_df['keywords'].apply(extract_keywords)
reviews_df.head(5)


Unnamed: 0,web_url,snippet,source,keywords,pub_date,word_count,headline.main,headline.kicker,headline.content_kicker,headline.print_headline,headline.name,headline.seo,headline.sub,byline.original,byline.person,byline.organization
0,https://www.nytimes.com/2023/05/31/us/politics...,Hunter Biden’s legal team is invoking a Suprem...,The New York Times,"persons: Biden, Hunter;subject: Firearms;subje...",2023-05-31T23:50:50+0000,863,Hunter Biden’s Lawyers Cite Landmark Gun Rulin...,,,Top Court’s Ruling on Guns May Void Biden Son’...,,,,By Glenn Thrush and Michael S. Schmidt,"[{'firstname': 'Glenn', 'middlename': None, 'l...",
1,https://www.nytimes.com/2023/05/31/us/texas-se...,"In Ken Paxton’s impeachment trial this summer,...",The New York Times,subject: Impeachment;subject: Politics and Gov...,2023-05-31T23:36:36+0000,1502,How Angela Paxton Could Help Decide the Fate o...,,,Texas State Senator Could Help Decide Fate of ...,,,,By J. David Goodman,"[{'firstname': 'J.', 'middlename': 'David', 'l...",
2,https://www.nytimes.com/2023/05/31/us/politics...,"Gov. Greg Abbott chose John Scott, a former de...",The New York Times,subject: Impeachment;subject: Attorneys Genera...,2023-05-31T23:30:33+0000,590,Texas Governor Names Interim Attorney General ...,,,,,,,By David Montgomery,"[{'firstname': 'David', 'middlename': None, 'l...",
3,https://www.nytimes.com/2023/05/31/sports/tenn...,"In recent days, the Serbian tennis star Novak ...",The New York Times,subject: Tennis;subject: French Open (Tennis);...,2023-05-31T23:28:46+0000,1253,"At the French Open, Djokovic Storms the Court ...",,,Djokovic Inserts Himself Into Another Fiery Co...,,,,By Matthew Futterman,"[{'firstname': 'Matthew', 'middlename': None, ...",
4,https://www.nytimes.com/2023/05/31/us/politics...,Investigators are trying to determine if there...,The New York Times,"persons: Trump, Donald J;subject: Classified I...",2023-05-31T23:28:04+0000,1179,Prosecutors Scrutinize Handling of Security Fo...,,,Mar-a-Lago Workers Are Said to Be Questioned A...,,,,"By Alan Feuer, Maggie Haberman and Ben Protess","[{'firstname': 'Alan', 'middlename': None, 'la...",


In [15]:
# Create a list from the "title" column using to_list()
# These titles will be used in the query for The Movie Database
nyk_reviews = pd.DataFrame(reviews_list)
nyk_title_list = nyk_reviews['title'].to_list
nyk_reviews

Unnamed: 0,web_url,snippet,source,headline,keywords,pub_date,byline,word_count,title
0,https://www.nytimes.com/2023/05/31/us/politics...,Hunter Biden’s legal team is invoking a Suprem...,The New York Times,{'main': 'Hunter Biden’s Lawyers Cite Landmark...,"[{'name': 'persons', 'value': 'Biden, Hunter',...",2023-05-31T23:50:50+0000,{'original': 'By Glenn Thrush and Michael S. S...,863,Untitled Review
1,https://www.nytimes.com/2023/05/31/us/texas-se...,"In Ken Paxton’s impeachment trial this summer,...",The New York Times,{'main': 'How Angela Paxton Could Help Decide ...,"[{'name': 'subject', 'value': 'Impeachment', '...",2023-05-31T23:36:36+0000,"{'original': 'By J. David Goodman', 'person': ...",1502,Untitled Review
2,https://www.nytimes.com/2023/05/31/us/politics...,"Gov. Greg Abbott chose John Scott, a former de...",The New York Times,{'main': 'Texas Governor Names Interim Attorne...,"[{'name': 'subject', 'value': 'Impeachment', '...",2023-05-31T23:30:33+0000,"{'original': 'By David Montgomery', 'person': ...",590,Untitled Review
3,https://www.nytimes.com/2023/05/31/sports/tenn...,"In recent days, the Serbian tennis star Novak ...",The New York Times,"{'main': 'At the French Open, Djokovic Storms ...","[{'name': 'subject', 'value': 'Tennis', 'rank'...",2023-05-31T23:28:46+0000,"{'original': 'By Matthew Futterman', 'person':...",1253,Untitled Review
4,https://www.nytimes.com/2023/05/31/us/politics...,Investigators are trying to determine if there...,The New York Times,{'main': 'Prosecutors Scrutinize Handling of S...,"[{'name': 'persons', 'value': 'Trump, Donald J...",2023-05-31T23:28:04+0000,"{'original': 'By Alan Feuer, Maggie Haberman a...",1179,Untitled Review
...,...,...,...,...,...,...,...,...,...
195,https://www.nytimes.com/2023/05/31/movies/real...,A new docudrama starring Sydney Sweeney as Rea...,The New York Times,{'main': '‘Reality’ Review: An Unusual Suspect...,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-31T23:26:13+0000,"{'original': 'By Amy Nicholson', 'person': [{'...",499,Reality Review
196,https://www.nytimes.com/2023/05/31/us/gre-test...,Graduate school applicants will take the new v...,The New York Times,{'main': 'The GRE Test Is Cut in Half: Two Hou...,"[{'name': 'subject', 'value': 'Admissions Stan...",2023-05-31T23:22:47+0000,"{'original': 'Stephanie Saul', 'person': [{'fi...",538,Untitled Review
197,https://www.nytimes.com/2023/05/31/us/politics...,Federal prosecutors obtained the recording as ...,The New York Times,{'main': 'Trump Was Taped Discussing Sensitive...,"[{'name': 'persons', 'value': 'Trump, Donald J...",2023-05-31T23:21:44+0000,"{'original': 'By Maggie Haberman, Jonathan Swa...",1156,Untitled Review
198,https://www.nytimes.com/2023/05/31/arts/armie-...,A woman had accused the actor of assaulting an...,The New York Times,{'main': 'Armie Hammer Won’t Be Charged After ...,"[{'name': 'subject', 'value': 'Sex Crimes', 'r...",2023-05-31T23:04:30+0000,"{'original': 'By Eduardo Medina', 'person': [{...",587,Untitled Review


### Access The Movie Database API

In [18]:
# query for The Movie Database
api_key = tmdb_api_key
base_url = 'https://api.themoviedb.org/3'
search_endpoint = f'{base_url}/search/movie'

# Query parameters
query = 'Inception'
params = {
    'api_key': api_key,
    'query': query
}

In [20]:
if response.status_code == 200:
    data = response.json()
    if 'results' in data:
        for movie in data['results']:
            title = movie.get('title', 'N/A')
            release_date = movie.get('release_date', 'N/A')
            overview = movie.get('overview', 'N/A')
            print(f"Title: {title}")
            print(f"Release Date: {release_date}")
            print(f"Overview: {overview}")
            print('-' * 10)
    else:
        print("No results found")
else:
    print(f"Error: {response.status_code}")
    print(response.json())

No results found


In [22]:
# List of movie titles to search
movie_titles = ['Insane Fight Club', 'The Matrix', 'Interstellar', 'The Dark Knight', 'Fight Club']

# Create an empty list to store the results
tmdb_movies_list = []

# Create a request counter to sleep the requests after a multiple of 50 requests
request_counter = 0

# Loop through the titles
for title in movie_titles:
    # Search for the movie by title
    params = {
        'api_key': api_key,
        'query': title
    }
    response = requests.get(search_endpoint, params=params)
    
    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()
        if 'results' in data and len(data['results']) > 0:
            movie = data['results'][0]  # Get the first result
            movie_id = movie['id']
            
            # Get the movie details
            details_endpoint = f'{base_url}/movie/{movie_id}'
            details_params = {
                'api_key': api_key
            }
            details_response = requests.get(details_endpoint, params=details_params)
            
            if details_response.status_code == 200:
                details = details_response.json()
                
                # Extract genres
                genres = [genre['name'] for genre in details.get('genres', [])]
                
                # Extract spoken languages
                spoken_languages = [language['name'] for language in details.get('spoken_languages', [])]
                
                # Extract production countries
                production_countries = [country['name'] for country in details.get('production_countries', [])]
                
                # Append the movie details to the list
                tmdb_movies_list.append({
                    'Title': title,
                    'Release Date': movie.get('release_date', 'N/A'),
                    'Overview': movie.get('overview', 'N/A'),
                    'Genres': genres,
                    'Spoken Languages': spoken_languages,
                    'Production Countries': production_countries
                })
            
            # Increment the request counter and handle rate limiting
            request_counter += 1
            if request_counter % 50 == 0:
                time.sleep(10)  # Sleep for 10 seconds to respect rate limits
        else:
            print(f"No results found for title: {title}")
    else:
        print(f"Error: {response.status_code}")
        print(response.json())

# Preview the first five results
for movie in tmdb_movies_list[:5]:
    print(movie)

# Convert the results to a DataFrame
df = pd.DataFrame(tmdb_movies_list)

# Display the DataFrame
print(df)

{'Title': 'Insane Fight Club', 'Release Date': '2014-03-11', 'Overview': "A group of friends have created a brand new subculture that is taking over the streets of Glasgow. They've established their very own fight club, but this is no ordinary wrestling event - this is brutal, riotous chaos. Fights don't always stay inside the ring, people are bounced off the side of buses and thrown off balconies in pubs. They now plan the biggest show of their lives. The stakes are high, will it bring them the fame and recognition they need to survive?", 'Genres': ['Documentary'], 'Spoken Languages': ['English'], 'Production Countries': ['United Kingdom']}
{'Title': 'The Matrix', 'Release Date': '1999-03-31', 'Overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.', 'Genres': ['Action', 'Science Fiction'], 'Spoken Languages': ['English'], 'Production Countries': [

In [23]:
# Preview the first 5 results in JSON format
# Use json.dumps with argument indent=4 to format data
print(json.dumps(tmdb_movies_list[:5], indent=4))


[
    {
        "Title": "Insane Fight Club",
        "Release Date": "2014-03-11",
        "Overview": "A group of friends have created a brand new subculture that is taking over the streets of Glasgow. They've established their very own fight club, but this is no ordinary wrestling event - this is brutal, riotous chaos. Fights don't always stay inside the ring, people are bounced off the side of buses and thrown off balconies in pubs. They now plan the biggest show of their lives. The stakes are high, will it bring them the fame and recognition they need to survive?",
        "Genres": [
            "Documentary"
        ],
        "Spoken Languages": [
            "English"
        ],
        "Production Countries": [
            "United Kingdom"
        ]
    },
    {
        "Title": "The Matrix",
        "Release Date": "1999-03-31",
        "Overview": "Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting t

In [24]:
# Convert the results to a DataFrame
tmdb_df = pd.DataFrame(tmdb_movies_list)
tmdb_df

Unnamed: 0,Title,Release Date,Overview,Genres,Spoken Languages,Production Countries
0,Insane Fight Club,2014-03-11,A group of friends have created a brand new su...,[Documentary],[English],[United Kingdom]
1,The Matrix,1999-03-31,"Set in the 22nd century, The Matrix tells the ...","[Action, Science Fiction]",[English],[United States of America]
2,Interstellar,2014-11-05,The adventures of a group of explorers who mak...,"[Adventure, Drama, Science Fiction]",[English],"[United Kingdom, United States of America]"
3,The Dark Knight,2008-07-16,Batman raises the stakes in his war on crime. ...,"[Drama, Action, Crime, Thriller]","[English, 普通话]","[United Kingdom, United States of America]"
4,Fight Club,1999-10-15,A ticking-time-bomb insomniac and a slippery s...,[Drama],[English],"[Germany, United States of America]"


### Merge and Clean the Data for Export

In [33]:
nyk_reviews = pd.DataFrame(reviews_list)
nyk_title_list = df['Title'].to_list
nyk_reviews

Unnamed: 0,web_url,snippet,source,headline,keywords,pub_date,byline,word_count,title
0,https://www.nytimes.com/2023/05/31/us/politics...,Hunter Biden’s legal team is invoking a Suprem...,The New York Times,{'main': 'Hunter Biden’s Lawyers Cite Landmark...,"[{'name': 'persons', 'value': 'Biden, Hunter',...",2023-05-31T23:50:50+0000,{'original': 'By Glenn Thrush and Michael S. S...,863,Untitled Review
1,https://www.nytimes.com/2023/05/31/us/texas-se...,"In Ken Paxton’s impeachment trial this summer,...",The New York Times,{'main': 'How Angela Paxton Could Help Decide ...,"[{'name': 'subject', 'value': 'Impeachment', '...",2023-05-31T23:36:36+0000,"{'original': 'By J. David Goodman', 'person': ...",1502,Untitled Review
2,https://www.nytimes.com/2023/05/31/us/politics...,"Gov. Greg Abbott chose John Scott, a former de...",The New York Times,{'main': 'Texas Governor Names Interim Attorne...,"[{'name': 'subject', 'value': 'Impeachment', '...",2023-05-31T23:30:33+0000,"{'original': 'By David Montgomery', 'person': ...",590,Untitled Review
3,https://www.nytimes.com/2023/05/31/sports/tenn...,"In recent days, the Serbian tennis star Novak ...",The New York Times,"{'main': 'At the French Open, Djokovic Storms ...","[{'name': 'subject', 'value': 'Tennis', 'rank'...",2023-05-31T23:28:46+0000,"{'original': 'By Matthew Futterman', 'person':...",1253,Untitled Review
4,https://www.nytimes.com/2023/05/31/us/politics...,Investigators are trying to determine if there...,The New York Times,{'main': 'Prosecutors Scrutinize Handling of S...,"[{'name': 'persons', 'value': 'Trump, Donald J...",2023-05-31T23:28:04+0000,"{'original': 'By Alan Feuer, Maggie Haberman a...",1179,Untitled Review
...,...,...,...,...,...,...,...,...,...
195,https://www.nytimes.com/2023/05/31/movies/real...,A new docudrama starring Sydney Sweeney as Rea...,The New York Times,{'main': '‘Reality’ Review: An Unusual Suspect...,"[{'name': 'subject', 'value': 'Movies', 'rank'...",2023-05-31T23:26:13+0000,"{'original': 'By Amy Nicholson', 'person': [{'...",499,Reality Review
196,https://www.nytimes.com/2023/05/31/us/gre-test...,Graduate school applicants will take the new v...,The New York Times,{'main': 'The GRE Test Is Cut in Half: Two Hou...,"[{'name': 'subject', 'value': 'Admissions Stan...",2023-05-31T23:22:47+0000,"{'original': 'Stephanie Saul', 'person': [{'fi...",538,Untitled Review
197,https://www.nytimes.com/2023/05/31/us/politics...,Federal prosecutors obtained the recording as ...,The New York Times,{'main': 'Trump Was Taped Discussing Sensitive...,"[{'name': 'persons', 'value': 'Trump, Donald J...",2023-05-31T23:21:44+0000,"{'original': 'By Maggie Haberman, Jonathan Swa...",1156,Untitled Review
198,https://www.nytimes.com/2023/05/31/arts/armie-...,A woman had accused the actor of assaulting an...,The New York Times,{'main': 'Armie Hammer Won’t Be Charged After ...,"[{'name': 'subject', 'value': 'Sex Crimes', 'r...",2023-05-31T23:04:30+0000,"{'original': 'By Eduardo Medina', 'person': [{...",587,Untitled Review


In [35]:
# Merge the New York Times reviews and TMDB DataFrames on title
# Rename 'Title' to 'title' in TMDB DataFrame
tmdb_df = tmdb_df.rename(columns={'Title': 'title'})

merged_df = pd.merge(nyk_reviews, tmdb_df, on='title')
print("\nMerged DataFrame:")
print(merged_df)


Merged DataFrame:
Empty DataFrame
Columns: [web_url, snippet, source, headline, keywords, pub_date, byline, word_count, title, Release Date, Overview, Genres, Spoken Languages, Production Countries]
Index: []


In [37]:
# Remove list brackets and quotation marks on the columns containing lists
# Create a list of the columns that need fixing
columns_to_fix = ['Genres', 'Spoken Languages','Production Countries']

# Create a list of characters to remove
characters_to_remove = ['[', ']', "'"]

# Loop through the list of columns to fix
for column in columns_to_fix:
    # Convert the column to type 'str'
    merged_df[column] = merged_df[column].astype(str)

    # Loop through characters to remove
    for char in characters_to_remove:
        merged_df[column] = merged_df[column].str.replace(char, '')

# Display the fixed DataFrame
print("\nFixed DataFrame:")
print(merged_df.head())


Fixed DataFrame:
Empty DataFrame
Columns: [web_url, snippet, source, headline, keywords, pub_date, byline, word_count, title, Release Date, Overview, Genres, Spoken Languages, Production Countries]
Index: []


In [39]:
# Drop "byline.person" column
merged_df = merged_df.drop(columns=['byline.person'], errors='ignore')
merged_df

Unnamed: 0,web_url,snippet,source,headline,keywords,pub_date,byline,word_count,title,Release Date,Overview,Genres,Spoken Languages,Production Countries


In [41]:
# Delete duplicate rows and reset index
merged_df = merged_df.reset_index(drop=True)
merged_df

Unnamed: 0,web_url,snippet,source,headline,keywords,pub_date,byline,word_count,title,Release Date,Overview,Genres,Spoken Languages,Production Countries


In [43]:
# Export data to CSV without the index
merged_df.to_csv('merged_movie_data.csv', index=False)