In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import urllib.parse
from scipy.stats import linregress
from api_keys import omdb_api_key
from pprint import pprint
import scipy.stats as st

# NOTE: do not print omdb_api_key in this notebook -- cell outputs are saved
# with the file and would expose the key to anyone the notebook is shared with.

# Sets the path to our movie csv file (relative, so it is resolved against the
# kernel's current working directory)
path = 'Resources/boxoffice.csv'


In [None]:
# Loads the full list of ~16000 top movies of all time and discards the
# manually created 'index' column that was saved into the csv
top_boxoffice_df = pd.read_csv(path).drop(columns=['index'])

top_boxoffice_df

In [None]:
# Reads only the first 1000 data rows of the csv and discards the manually
# created 'index' column
# NOTE(review): presumably the file is sorted best-grossing first, which is
# what makes these rows the "top 1k" -- confirm against the csv
top1k_boxoffice_df = pd.read_csv(path, nrows=1000).drop(columns=['index'])

top1k_boxoffice_df

In [None]:
# Skips the first 15542 lines of the csv (the header line is one of them) and
# reads at most 1001 rows after that point. Because the header is skipped,
# the column names are supplied by hand.
# NOTE(review): 15542 and 1001 are tied to the current length of the csv --
# confirm they still select the intended rows if the file changes
bot1k_boxoffice_df = pd.read_csv(
    path,
    skiprows=15542,
    nrows=1001,
    names=['rank', 'index', 'title', 'studio', 'lifetime_gross', 'year'],
).drop(columns=['index'])  # discards the manually created index column

bot1k_boxoffice_df

In [None]:
# Retrieves the OMDb record for every title in the top 1k dataframe and stores
# each field in its own list; the lists are turned into a dataframe afterwards.

# Creates a list of all of the movie titles in the top 1k movies from the dataframe
movies_list = top1k_boxoffice_df['title'].tolist()

# Creates empty lists for all of the information we want from the json requests
req_imdbID = []
req_titles = []
req_studio = []
req_boxoffice = []
req_runtime = []
req_age_rating = []
req_release_date = []
req_country = []
req_genre = []
req_imdb_rating = []
req_imdb_votecount = []
req_critic_rating = []
req_directors = []

# Pairs each key of the OMDb json response with the list that collects it
omdb_fields = [
    ('imdbID', req_imdbID),
    ('Title', req_titles),
    ('BoxOffice', req_boxoffice),
    ('Runtime', req_runtime),
    ('Rated', req_age_rating),
    ('Released', req_release_date),
    ('Country', req_country),
    ('Genre', req_genre),
    ('imdbRating', req_imdb_rating),
    ('imdbVotes', req_imdb_votecount),
    ('Metascore', req_critic_rating),
    ('Director', req_directors),
]

# Base url for OMDb API; https keeps the api key out of plain-text traffic
omdb_url = 'https://www.omdbapi.com/'

print('Beginning OMDb data retriveal:')
print('______________________________')

# Walks titles and studios row by row, so a title that appears twice keeps the
# studio of its own row instead of the studio of the first match
for movie_title, studio_info in zip(movies_list, top1k_boxoffice_df['studio']):

    # A missing title (NaN) cannot be searched for
    if not isinstance(movie_title, str):
        print(f"Skipping non-text title: {movie_title!r}")
        continue

    # Only the title is printed: the full request url contains the api key and
    # notebook output is saved with the file
    print(f"Requesting '{movie_title}' from OMDb API")

    # Errors are handled per movie so one bad request does not end the whole run
    try:
        # requests url-encodes the title itself when it is passed via params;
        # the timeout stops a stalled connection from hanging the notebook
        response = requests.get(
            omdb_url,
            params={'t': movie_title, 'apikey': omdb_api_key},
            timeout=10,
        )
        request = response.json()
    except (requests.RequestException, ValueError) as e:
        # The exception text can contain the request url (and so the api key),
        # which is why only the error type is shown
        print(f"Failed to retrieve movie '{movie_title}' from OMDb API.")
        print(f"An error occurred: {type(e).__name__}")
        continue

    # Tells us what is wrong with a request if it is not good
    if (
        response.status_code != 200
        or not isinstance(request, dict)
        or request.get('Response') != 'True'
    ):
        print(f"Failed to retrieve movie '{movie_title}' from OMDb API.")
        print(f"Response: {request}")
        continue

    # Skips tv series from the data set
    if request.get('Type') == 'series':
        print(f"'{movie_title}' is a TV series. Skipping...")
        continue

    # Adds the information of each title to each list. A key that is missing
    # from the response is stored as 'N/A' (the placeholder OMDb itself uses),
    # so every list always grows by exactly one entry and they stay aligned.
    req_studio.append(studio_info)
    for omdb_key, collected in omdb_fields:
        collected.append(request.get(omdb_key, 'N/A'))

print('______________________________')
print('Data retrieval complete')
print(f"Retrieved {len(req_titles)} of {len(movies_list)} titles")

    

In [None]:
# Assembles the per-field lists gathered from OMDb into one dataframe, one
# column per list (all lists must have the same length)
total_movie_df = pd.DataFrame(
    dict(
        [
            ('IMDb ID', req_imdbID),
            ('Movie Title', req_titles),
            ('Studio', req_studio),
            ('Box Office Sales', req_boxoffice),
            ('Movie Runtime', req_runtime),
            ('Age Rating', req_age_rating),
            ('Release Date', req_release_date),
            ('Country', req_country),
            ('Genre', req_genre),
            ('IMDb Rating', req_imdb_rating),
            ('IMDb Vote Count', req_imdb_votecount),
            ('Critic Rating', req_critic_rating),
            ('Director(s)', req_directors),
        ]
    )
)

total_movie_df

In [None]:
#runtime vs box office (scatter)

#clean total_movie_df to better fit upcoming variables
# - the numeric-dtype check lets this cell be re-run: once a column has been
#   converted to float the .str accessor would raise
# - regex=False makes '$' a literal character; as a regex (the default before
#   pandas 2.0) '$' is the end-of-string anchor, so the dollar sign would stay
#   in the text and the float conversion would fail
# - 'N/A' is stored as 0 so the columns stay fully numeric for later cells
if not pd.api.types.is_numeric_dtype(total_movie_df['Movie Runtime']):
    total_movie_df['Movie Runtime'] = (
        total_movie_df['Movie Runtime']
        .str.replace(' min', '', regex=False)
        .str.replace('N/A', '0', regex=False)
        .astype(float)
    )
if not pd.api.types.is_numeric_dtype(total_movie_df['Box Office Sales']):
    total_movie_df['Box Office Sales'] = (
        total_movie_df['Box Office Sales']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .str.replace('N/A', '0', regex=False)
        .astype(float)
    )

#set runtime and box office
runtime = total_movie_df['Movie Runtime']
box_office = total_movie_df['Box Office Sales']

# A 0 in either column is the placeholder for a missing value, not a real
# observation, so those rows are left out of the statistics and the plot
has_both = (runtime > 0) & (box_office > 0)
runtime_known = runtime[has_both]
box_office_known = box_office[has_both]

#scatter plot, correlation coefficient and line regression
print(f"The correlation coefficient between movie runtime and box office sales is {round(st.pearsonr(runtime_known,box_office_known)[0],2)}")
m_slope, m_int, m_r, m_p, m_std_err = st.linregress(runtime_known, box_office_known)
m_fit = m_slope * runtime_known + m_int

# The y axis is labelled in millions, so dollar values are divided by 1e6 for
# plotting only (the regression itself stays in dollars)
plt.scatter(runtime_known, box_office_known / 1e6)
plt.plot(runtime_known, m_fit / 1e6, "-", color='red')
plt.ylim(0, 1000)
plt.xlabel('Movie Runtime (min)')
plt.ylabel('Box Office Sales (mil)')
plt.title('Movie Runtime vs. Box Office Sales')
plt.show()

#original observation: there is a weak positive correlation between movie runtime and how well they do in the box office.
#NOTE(review): confirm this against the coefficient printed above, which now excludes rows with missing values.

In [None]:
#genre vs box office (bar)

#take top listed genre for each film
total_movie_df['Genre'] = total_movie_df['Genre'].str.split(',').str[0]

#find average box office for each genre, set genre variable for x-axis
# - rows whose box office is 0 are skipped: 0 is the placeholder written for a
#   missing ('N/A') value and would pull the genre averages down
# - errors='ignore' keeps the drop from raising KeyError when one of the three
#   genres does not occur in the retrieved data
# NOTE(review): presumably 'Family', 'N/A' and 'Short' are excluded because
# they hold too few films to give a meaningful average -- confirm
genre_boxoffice = (
    total_movie_df[total_movie_df['Box Office Sales'] > 0]
    .groupby('Genre')['Box Office Sales']
    .mean()
    .drop(labels=['Family','N/A','Short'], errors='ignore')
    .sort_values(ascending=True)
)
genre = genre_boxoffice.index

#bar graph
# The y axis is labelled in millions, so dollar values are divided by 1e6 for
# plotting only (genre_boxoffice itself stays in dollars)
plt.bar(genre, genre_boxoffice / 1e6, align='center')
plt.xticks(rotation=90)
plt.xlabel('Genre')
plt.ylabel('Average Box Office Sales (mil)')
plt.title('Average Box Office Sales of Each Genre')
plt.show()

