In [1]:
import requests
import os
import pandas as pd
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np

from api_keys import rapid_key, moviedb_key, omdb_key

# Data Pulls

In [2]:
# NOTE: this whole cell is commented out, so nothing below executes.
# It preserves the shape of a GET request (URL, query string, headers) against the
# Rapid API "internet video archive entertainment" search endpoint.
# # Rapid API data pull
# EDB_url="https://ivaee-internet-video-archive-entertainment-v1.p.rapidapi.com/entertainment/search/"

# querystring = {"YearRange_Start":"2005","YearRange_End":"2020","ReleaseTypes":"Theatrical_Limited_Release,Theatrical_Wide_Release"}

# headers = {
#     'x-rapidapi-host': "ivaee-internet-video-archive-entertainment-v1.p.rapidapi.com",
#     'x-rapidapi-key': rapid_key,
#     'content-type': "application/json"
#     }

# response = requests.request("GET", EDB_url, headers=headers, params=querystring).json()

# pprint(response)


In [4]:
# The Movie DB data pull
#
# Pages through the TMDB discover/movie endpoint and collects the id, title and
# release date of every returned movie into three parallel lists
# (movie_ids, movie_titles, movie_dates).

tmdb_url="https://api.themoviedb.org/3/discover/movie?"
params={
    "api_key":moviedb_key,
    "primary_release_date.gte":"2005-01-01",
    "sort_by":"primary_release_date.desc",
    "primary_release_date.lte":"2020-01-01",
    "region":"US",
    "with_release_type":"2,3",
    "language":"en-US"
}
movie_ids=[]
movie_titles=[]
movie_dates=[]
page=1
max_page=500  # upper bound on the number of pages requested

# Make sure the log folder exists so that logging a problem can never fail itself
tmdb_error_log=os.path.join("ErrorLogs","TMDB_Pull_Error_Log.txt")
os.makedirs(os.path.dirname(tmdb_error_log), exist_ok=True)

# Repeat the api ping for each page in the query because each ping only returns 20 results

while page<=max_page:
    params["page"]=page
    response = requests.get(tmdb_url, params=params).json()

    # A payload without a "results" entry cannot be processed: log it and stop
    # instead of dying on a KeyError. An empty page means there is nothing left to fetch.
    results = response.get("results")
    if not results:
        if results is None:
            with open(tmdb_error_log, "a") as f:
                f.write(f"No results returned on page {page}\n")
        break

    for movie in results:
        # Read all three fields BEFORE appending anything, so a movie with a missing
        # field is skipped as a whole and the three lists always stay the same length.
        try:
            movie_id=movie["id"]
            movie_title=movie["title"]
            movie_date=movie["release_date"]
        except (TypeError, KeyError):
            with open(tmdb_error_log, "a") as f:
                f.write(f"Missing information on page {page}\n")
        else:
            movie_ids.append(movie_id)
            movie_titles.append(movie_title)
            movie_dates.append(movie_date)
    page=page+1


Missing information on page 2


In [42]:
# Create the initial DataFrame used to drive the OMDB pull and export it to a csv file

Movie_df=pd.DataFrame({"TMDB ID":movie_ids,"Title":movie_titles,"Release Date":movie_dates})
export_file_path=os.path.join("Resources","MovieTitles.csv")

# to_csv does not create missing folders, so make sure the output directory exists first
os.makedirs(os.path.dirname(export_file_path), exist_ok=True)
Movie_df.to_csv(export_file_path)


In [43]:
# Add the columns needed to store the additional info; every one starts out as None

extra_columns=[
    "IMDB ID",
    "Rating",
    "Runtime",
    "Genre",
    "Director",
    "Writer",
    "Actors",
    "Plot",
    "Language",
    "Country",
    "Awards",
    "Poster",
    "IMDB Rating",
    "IMDB Votes",
    "Metascore",
    "Rating Dictionary",
    "Type",
    "DVD Release Date",
    "Box Office",
    "Production",
]

# Assigning one column at a time keeps the same left-to-right column order as the list
for column_name in extra_columns:
    Movie_df[column_name]=None

In [44]:
# OMDB API Data Pull
#
# Looks up each movie in Movie_df by title and copies the fields of the response
# into the matching Movie_df columns. Rows whose response lacks any expected field
# are recorded in the error log.

# NOTE(review): this URL is plain http, so the api key travels unencrypted - confirm whether https can be used
omdb_url="http://www.omdbapi.com/?"
params={
    "apikey":omdb_key
}

# Movie_df column -> key of the OMDB response that fills it.
# "Metascore" and "Rating Dictionary" are not listed: those columns are not populated yet.
omdb_fields={
    "Rating":"Rated",
    "Runtime":"Runtime",
    "Genre":"Genre",
    "Director":"Director",
    "Writer":"Writer",
    "Actors":"Actors",
    "Plot":"Plot",
    "Language":"Language",
    "Country":"Country",
    "Awards":"Awards",
    "Poster":"Poster",
    "IMDB Rating":"imdbRating",
    "IMDB ID":"imdbID",
    "IMDB Votes":"imdbVotes",
    "Type":"Type",
    "DVD Release Date":"DVD",
    "Box Office":"BoxOffice",
    "Production":"Production",
}

# Make sure the log folder exists so that logging a problem can never fail itself
omdb_error_log=os.path.join("ErrorLogs","IMDB_Pull_Error_Log.txt")
os.makedirs(os.path.dirname(omdb_error_log), exist_ok=True)

# pulls determines how many rows are iterated through. This is for testing purposes
# and is meant to be removed once the full pull is run.

pulls=100
for i,(index,row) in enumerate(Movie_df.iterrows(), start=1):
    # stop once the max number of pulls has been reached
    if i>pulls:
        break

    params["t"]=row["Title"]
    response=requests.get(omdb_url, params=params).json()

    # Copy every field that is present. A missing field no longer prevents the
    # remaining fields from being stored; it only marks the row for the error log.
    has_missing_field=False
    for column,key in omdb_fields.items():
        if key in response:
            Movie_df.loc[index,column]=response[key]
        else:
            has_missing_field=True

    if has_missing_field:
        with open(omdb_error_log, "a") as f:
            f.write(f"There was a problem with movie {row['TMDB ID']}\n")



# Data Pull Complete: Data Wrangling Begins Now

In [46]:
# Drop every row that has fewer than 4 non-null values (thresh=4 keeps only rows
# with at least 4 populated cells), then export the file

Movie_df.dropna(thresh=4, inplace=True)

precleanup_path=os.path.join("Resources","MovieData_PreCleanup.csv")
# to_csv does not create missing folders, so make sure the output directory exists first
os.makedirs(os.path.dirname(precleanup_path), exist_ok=True)
Movie_df.to_csv(precleanup_path)

# The preview has to be the last expression of the cell, otherwise the notebook discards it
Movie_df.head(50)
