In [1]:
import pandas as pd
import requests
import json
import time
from config import api_key

## Most Popular 100 Movies of each year between 1990 and 2018


In [2]:
# The Movie Database URL 
url = "https://api.themoviedb.org/3/"

In [3]:

# Empty list to capture the movie details
popular_100 = []
genre_codes = []

# Loop for the years that we need data for
for year in range(1990,2019):
    
    # Initialize counter for listing popularity ranking in a given year
    rank_counter = 1 
    
    for page in range(1,6): 
        
        # URL to discover movies
        discover_url = url + "discover/movie?api_key="+ api_key \
        + "&language=en-US&region=US&sort_by=popularity.desc&include_adult=false&include_video=false&page=" \
        + str(page) + "&primary_release_year=" + str(year)
        
        # Fetch the data
        popular_100_data = requests.get(discover_url).json()
        
        # add the info as dictionary
        for movie in popular_100_data['results']:
            popular_100.append({"movie id": movie['id'], 
                                "title": movie['title'], 
                                "release year": year,
                               "overview": movie['overview'],
                               "popularity rank": rank_counter})
            
            genre_codes.append({"movie id": movie['id'], 
                                "title": movie['title'], 
                                "genre_ids": movie["genre_ids"]
                                })
            
            rank_counter+=1
            
time.sleep(10)            

In [4]:
# Convert the dictionary to a dataframe
popular_movies_df = pd.DataFrame(popular_100).set_index("movie id")
popular_movies_df.head()

Unnamed: 0_level_0,overview,popularity rank,release year,title
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
71805,A young Norwegian boy in 1850s England goes to...,1,1990,Shipwrecked
169,Ten years after a band of mercenaries first ba...,2,1990,Predator 2
162,A small suburban town receives a visit from a ...,3,1990,Edward Scissorhands
1573,Off-duty cop John McClane is gripped with a fe...,4,1990,Die Hard 2
242,In the midst of trying to legitimize his busin...,5,1990,The Godfather: Part III


### Add Movie details
1. Budget and Revenue
2. Lead Actor

In [5]:
# Empty lists to capture budget/revenue and cast information
budget_revenue = []
cast_movies = []

# Initialize counter to count number of runs in the loop
timeout_count = 1

# Loop across movies identified
for movie_id, movie in popular_movies_df.iterrows():

    # ------- Budget/Revenue data -------
    # URL to get movie details
    details_url = url + "movie/"+ str(movie_id) + "?api_key=" + api_key + "&language=en-US"

    details_data = requests.get(details_url).json()
    
    try:
        budget_revenue.append({"movie id": movie_id, 
                                "budget": details_data["budget"],
                                "revenue": details_data["revenue"]})
    except IndexError:
        print("Budget/Revenue info not available for movie: " + movie["title"])
    
    # ------- Cast (Lead Actor) data -------
    # URL to get credits details for movie
    credits_url = url + "movie/" + str(movie_id) + "/credits?api_key=" + api_key

    casting_data = requests.get(credits_url).json()
    
    try:
        if casting_data["cast"][0]["gender"] == 1:
            gender = "Female"
        elif casting_data["cast"][0]["gender"] == 2:
            gender = "Male"
        else:
            gender = "Unknown"

        cast_movies.append({"movie id": movie_id, 
                            "character": casting_data["cast"][0]["character"],
                            "actor id": casting_data["cast"][0]["id"],
                            "actor name": casting_data["cast"][0]["name"],
                            "gender": gender
                            })
        
    except IndexError:
        print("Cast info not available for movie: " + movie["title"])
                
    # --- to avoid exceeding the limit on API calls set by the external source
    timeout_count+=1
    if (timeout_count%1000) == 0:
        time.sleep(10)
        
# loop ends here

Cast info not available for movie: The Seventh Brother
Cast info not available for movie: Doraemon: Nobita's the Legend of the Sun King
Cast info not available for movie: One Man Band


In [6]:
# convert dictionary to dataframe
budget_revenue_df = pd.DataFrame(budget_revenue)
budget_revenue_df.head()

Unnamed: 0,budget,movie id,revenue
0,0,71805,0
1,35000000,169,57120318
2,20000000,162,53000000
3,70000000,1573,240031094
4,54000000,242,136766062


In [7]:
# convert dictionary to dataframe
cast_movies_df = pd.DataFrame.from_dict(cast_movies)
cast_movies_df.head()

Unnamed: 0,actor id,actor name,character,gender,movie id
0,563838,Stian Smestad,Haakon Haakonsen,Unknown,71805
1,1109,Kevin Peter Hall,The Predator,Male,169
2,85,Johnny Depp,Edward Scissorhands,Male,162
3,62,Bruce Willis,John McClane,Male,1573
4,1158,Al Pacino,Don Michael Corleone,Male,242


In [8]:
# Combine all the movies details
popular_movies_df = pd.merge(popular_movies_df, budget_revenue_df, on = "movie id", how = "left")
popular_movies_df = pd.merge(popular_movies_df, cast_movies_df, on = "movie id", how = "left")

# Clean the data to remove unknown gender
popular_movies_df = popular_movies_df[popular_movies_df["gender"] != 'Unknown']
popular_movies_df.reset_index()
popular_movies_df.head()

Unnamed: 0,movie id,overview,popularity rank,release year,title,budget,revenue,actor id,actor name,character,gender
1,169,Ten years after a band of mercenaries first ba...,2,1990,Predator 2,35000000,57120318,1109.0,Kevin Peter Hall,The Predator,Male
2,162,A small suburban town receives a visit from a ...,3,1990,Edward Scissorhands,20000000,53000000,85.0,Johnny Depp,Edward Scissorhands,Male
3,1573,Off-duty cop John McClane is gripped with a fe...,4,1990,Die Hard 2,70000000,240031094,62.0,Bruce Willis,John McClane,Male
4,242,In the midst of trying to legitimize his busin...,5,1990,The Godfather: Part III,54000000,136766062,1158.0,Al Pacino,Don Michael Corleone,Male
5,1669,"A new, technologically-superior Soviet sub, th...",6,1990,The Hunt for Red October,30000000,199200000,7447.0,Alec Baldwin,Jack Ryan,Male


In [9]:
# Save the information to .csv file
popular_movies_df.to_csv("Resources/Popular movies.csv")

###  Types of Genres

In [10]:
# URL to fetch the information on genre types
genre_url = url + "genre/movie/list?api_key=" + api_key+ "&language=en-US"

# Empty list to capture the data
genre_type = []

# Fetch the data
genre_data = requests.get(genre_url).json()

for genre in genre_data["genres"]:
    genre_type.append({"genre_id": genre["id"],
                       "genre_name": genre["name"]})

# convert the dictionary to dataframe    
genre_types_df = pd.DataFrame.from_dict(genre_type).set_index("genre_id")

In [11]:
# Save Genre types into a .csv file
genre_types_df.to_csv("Resources/Genre types.csv")

### Add Genre info

In [12]:
# convert genre details of movies dictionary to dataframe
genre_codes_df = pd.DataFrame(genre_codes)
genre_codes_df.head()

Unnamed: 0,genre_ids,movie id,title
0,"[12, 10751]",71805,Shipwrecked
1,"[878, 28, 53]",169,Predator 2
2,"[14, 18, 10749]",162,Edward Scissorhands
3,"[28, 53]",1573,Die Hard 2
4,"[80, 18, 53]",242,The Godfather: Part III


In [14]:
# Find Genre names for every movie
genre_movies = []
for index, row in genre_codes_df.iterrows():
    for genre_id in row["genre_ids"]:
            genre_name = genre_types_df.loc[genre_id, "genre_name"]
            genre_movies.append({"movie id": row["movie id"],
                                "title": row["title"],
                                 "genre": genre_name
                                })
    

In [15]:
# Convert data to dataframe
genre_movies_df = pd.DataFrame.from_dict(genre_movies)
genre_movies_df.head()

Unnamed: 0,genre,movie id,title
0,Adventure,71805,Shipwrecked
1,Family,71805,Shipwrecked
2,Science Fiction,169,Predator 2
3,Action,169,Predator 2
4,Thriller,169,Predator 2


In [16]:
# Save Movies Genre info into a .csv file
genre_movies_df.to_csv("Resources/Popular movies with genre.csv")