In [1]:
import pandas as pd
import requests
import json
import time
from datetime import datetime
from pprint import pprint
from config import api_key

## 100 Most Popular Movies of Each Year Between 1990 and 2018


In [2]:
# Base URL of The Movie Database (TMDB) API; the cells below append an endpoint path to it
url = "https://api.themoviedb.org/3/"

In [3]:

# Lists that collect one dictionary per movie returned by the discover endpoint:
#   popular_100 - id, title, release year, overview and popularity rank
#   genre_codes - id, title and the raw list of genre ids
popular_100 = []
genre_codes = []

# Endpoint used to discover movies; the query itself is passed through `params`
# so that requests builds and escapes the query string
discover_url = url + "discover/movie"

# Loop for the years that we need data for (1990 through 2018 inclusive)
for year in range(1990, 2019):

    # Popularity ranking restarts at 1 for every year
    rank_counter = 1

    # Five result pages are requested per year
    # (presumably 20 results per page, giving the top 100 - confirm against the API)
    for page in range(1, 6):

        # Fetch the data
        response = requests.get(discover_url,
                                params={"api_key": api_key,
                                        "language": "en-US",
                                        "region": "US",
                                        "sort_by": "popularity.desc",
                                        "include_adult": "false",
                                        "include_video": "false",
                                        "page": page,
                                        "primary_release_year": year})

        # Fail with a clear HTTP error instead of a KeyError on 'results'
        # when the API answers with an error payload
        response.raise_for_status()
        popular_100_data = response.json()

        # add the info as dictionary
        for movie in popular_100_data['results']:
            popular_100.append({"movie id": movie['id'],
                                "title": movie['title'],
                                "year": year,
                                "overview": movie['overview'],
                                "popularity rank": rank_counter})

            genre_codes.append({"movie id": movie['id'],
                                "title": movie['title'],
                                "genre_ids": movie["genre_ids"]})

            rank_counter += 1

In [46]:
# Build a dataframe from the collected movie records and key it by movie id
popular_movies_df = pd.DataFrame(data=popular_100)
popular_movies_df = popular_movies_df.set_index("movie id")
popular_movies_df.head()

Unnamed: 0_level_0,overview,popularity rank,title,year
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
348350,Through a series of daring escapades deep with...,1,Solo: A Star Wars Story,2018
439079,When a young nun at a cloistered abbey in Roma...,2,The Nun,2018
346910,From the outer reaches of space to the small-t...,3,The Predator,2018
442249,To push the crime rate below one percent for t...,4,The First Purge,2018
299536,As the Avengers and their allies have continue...,5,Avengers: Infinity War,2018


In [55]:
# Print each movie id (the frame's index) next to its title
for movie_id, title in popular_movies_df['title'].items():
    print(movie_id, ": ", title)

348350 :  Solo: A Star Wars Story
439079 :  The Nun
346910 :  The Predator
442249 :  The First Purge
299536 :  Avengers: Infinity War
351286 :  Jurassic World: Fallen Kingdom
345940 :  The Meg
400535 :  Sicario: Day of the Soldado
383498 :  Deadpool 2
284054 :  Black Panther
353081 :  Mission: Impossible - Fallout
402900 :  Ocean's Eight
260513 :  Incredibles 2
530442 :  My Teacher, My Obsession
400155 :  Hotel Transylvania 3: Summer Vacation
335983 :  Venom
476292 :  Maquia: When the Promised Flower Blooms
460885 :  Mandy
363088 :  Ant-Man and the Wasp
466282 :  To All the Boys I've Loved Before


### Movie details
1. Budget and Revenue
2. Lead Actor

In [56]:
# Empty lists to capture budget/revenue and cast information
budget_revenue = []
cast_movies = []

# Integer gender codes found in the credits response, mapped to display labels;
# any other value is reported as "Unknown"
gender_labels = {1: "Female", 2: "Male"}

# Initialize counter to count number of runs in the loop
timeout_count = 1

# Loop across movies identified (the frame's index holds the movie id)
for movie_id, movie in popular_movies_df.iterrows():

    # ------- Budget/Revenue data -------
    # URL to get movie details; the key and language are sent as query parameters
    details_url = url + "movie/" + str(movie_id)

    details_data = requests.get(details_url,
                                params={"api_key": api_key,
                                        "language": "en-US"}).json()

    try:
        budget_revenue.append({"movie id": movie_id,
                               "budget": details_data["budget"],
                               "revenue": details_data["revenue"]})
    except KeyError:
        # A missing dictionary key raises KeyError (not IndexError), e.g. when
        # the response carries no budget/revenue fields
        print("Budget/Revenue info not available for movie: " + movie["title"])

    # ------- Cast (Lead Actor) data -------
    # URL to get credits details for movie
    credits_url = url + "movie/" + str(movie_id) + "/credits"

    casting_data = requests.get(credits_url, params={"api_key": api_key}).json()

    try:
        # The first cast entry is treated as the lead actor
        lead_actor = casting_data["cast"][0]

        cast_movies.append({"movie id": movie_id,
                            "character": lead_actor["character"],
                            "actor id": lead_actor["id"],
                            "actor name": lead_actor["name"],
                            "gender": gender_labels.get(lead_actor["gender"], "Unknown")
                            })

    except (KeyError, IndexError):
        # KeyError: no "cast" key (or a field is missing); IndexError: empty cast list
        print("Cast info not available for movie: " + movie["title"])

    # --- to avoid exceeding the limit on API calls set by the external source
    # Pause for 10 seconds whenever the counter reaches a multiple of 500
    timeout_count += 1
    if (timeout_count % 500) == 0:
        time.sleep(10)

# loop ends here

In [57]:
# Build a dataframe from the collected budget/revenue records, keyed by movie id
budget_revenue_df = pd.DataFrame(data=budget_revenue)
budget_revenue_df = budget_revenue_df.set_index("movie id")
budget_revenue_df.head()

Unnamed: 0_level_0,budget,revenue
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1
348350,250000000,392952373
439079,22000000,54470000
346910,88000000,56594362
442249,13000000,41953945
299536,300000000,2046239637


In [50]:
# Build a dataframe from the list of lead-actor records, keyed by movie id
cast_movies_df = pd.DataFrame(cast_movies)
cast_movies_df = cast_movies_df.set_index("movie id")
cast_movies_df.head()

Unnamed: 0_level_0,actor id,actor name,character,gender
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
348350,71375,Alden Ehrenreich,Han Solo,Male
439079,87287,Bonnie Aarons,The Nun / Valak,Female
346910,467645,Boyd Holbrook,Quinn McKenna,Male
442249,1694278,Y'lan Noel,Dmitri,Unknown
299536,3223,Robert Downey Jr.,Toni Stark / Temir odam,Male


In [8]:
# Combine all the movies details
# Drop detail columns left behind by an earlier run of this cell, so that
# re-running it does not create "_x"/"_y" suffixed duplicates (which would make
# the "gender" lookup below fail). On a first run nothing is dropped.
detail_columns = list(budget_revenue_df.columns) + list(cast_movies_df.columns)
popular_movies_df = popular_movies_df.drop(columns=detail_columns, errors="ignore")

# Left joins keep every popular movie even when no budget/cast record was collected
popular_movies_df = pd.merge(popular_movies_df, budget_revenue_df, on="movie id", how="left")
popular_movies_df = pd.merge(popular_movies_df, cast_movies_df, on="movie id", how="left")

# Clean the data to remove unknown gender
# (rows without any cast record have a missing gender and are kept)
popular_movies_df = popular_movies_df[popular_movies_df["gender"] != 'Unknown']

# reset_index() is intentionally not applied here: the cell that fetches movie
# details reads the movie ids from this frame's index.
popular_movies_df.head()

Unnamed: 0,movie id,overview,popularity rank,title,year,budget,revenue,actor id,actor name,character,gender
0,348350,Through a series of daring escapades deep with...,1,Solo: A Star Wars Story,2018,250000000,392952373,71375,Alden Ehrenreich,Han Solo,Male
1,439079,When a young nun at a cloistered abbey in Roma...,2,The Nun,2018,22000000,54470000,87287,Bonnie Aarons,The Nun / Valak,Female
2,346910,From the outer reaches of space to the small-t...,3,The Predator,2018,88000000,56594362,467645,Boyd Holbrook,Quinn McKenna,Male
4,299536,As the Avengers and their allies have continue...,5,Avengers: Infinity War,2018,300000000,2046239637,3223,Robert Downey Jr.,Toni Stark / Temir odam,Male
5,351286,Three years after the demise of Jurassic World...,6,Jurassic World: Fallen Kingdom,2018,170000000,1303459585,73457,Chris Pratt,Owen Grady,Male


In [15]:
# Write the combined movie details (index included) to a CSV file
popular_movies_df.to_csv(path_or_buf="Resources/Popular movies.csv")

###  Types of Genres

In [16]:
# URL to fetch the information on genre types; the key and language are sent
# as query parameters so that requests builds and escapes the query string
genre_url = url + "genre/movie/list"

# Fetch the data
genre_response = requests.get(genre_url,
                              params={"api_key": api_key,
                                      "language": "en-US"})

# Fail with a clear HTTP error instead of a KeyError on "genres"
# when the API answers with an error payload
genre_response.raise_for_status()
genre_data = genre_response.json()

# One record per genre: its numeric id and its display name
genre_type = [{"genre_id": genre["id"],
               "genre_name": genre["name"]}
              for genre in genre_data["genres"]]

# convert the list of records to a dataframe keyed by genre id
genre_types_df = pd.DataFrame(genre_type).set_index("genre_id")

In [17]:
# Write the genre id -> genre name table to a CSV file
genre_types_df.to_csv(path_or_buf="Resources/Genre types.csv")

### Add Genre info

In [20]:

# One row per movie, with its list of genre ids as collected during discovery
genre_codes_df = pd.DataFrame(data=genre_codes)
genre_codes_df.head(n=5)

Unnamed: 0,genre_ids,movie id,title
0,"[28, 12, 878]",348350,Solo: A Star Wars Story
1,"[27, 9648, 53]",439079,The Nun
2,"[27, 878, 28, 35]",346910,The Predator
3,"[28, 878, 53, 27]",442249,The First Purge
4,"[12, 878, 28]",299536,Avengers: Infinity War


In [42]:
# Find Genre names for every movie
# Series of genre names indexed by genre id, taken from the genre types frame
# (genre_types_df is the frame built from the genre list endpoint)
genre_names = genre_types_df["genre_name"]

# One record per (movie, genre) pair; a movie with several genre ids
# contributes several records
genre_movies = []
for _, row in genre_codes_df.iterrows():
    for genre_id in row["genre_ids"]:
        genre_movies.append({"movie id": row["movie id"],
                             "title": row["title"],
                             "genre": genre_names.loc[genre_id]
                             })
    

In [53]:
# Build a dataframe from the list of (movie, genre) records
genre_movies_df = pd.DataFrame(genre_movies)
genre_movies_df.head()

Unnamed: 0,genre,movie id,title
0,Action,348350,Solo: A Star Wars Story
1,Adventure,348350,Solo: A Star Wars Story
2,Science Fiction,348350,Solo: A Star Wars Story
3,Horror,439079,The Nun
4,Mystery,439079,The Nun


In [44]:
# Write the per-movie genre rows to a CSV file
genre_movies_df.to_csv(path_or_buf="Resources/Popular movies with genre.csv")