## Raw code notebook

In [None]:
import pandas as pd
import sqlite3
import re
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

# Show floats without decimals so large dollar/vote figures stay readable in displayed frames
def _whole_number(value):
    return '%.f' % value

pd.set_option('display.float_format', _whole_number)

# Load the three compressed CSV sources; the tmdb file uses its first column as the index
bom_df = pd.read_csv('./zippedData/bom.movie_gross.csv.gz')
tmdb_df = pd.read_csv('./zippedData/tmdb.movies.csv.gz', index_col=0)
tn_df = pd.read_csv('./zippedData/tn.movie_budgets.csv.gz')
# Open the SQLite database that the SQL queries further down read from
conn = sqlite3.Connection('./zippedData/john/im.db')

## Cleaning the data from 'The Movie Database'

In [None]:
# The original title is not needed for this analysis
tmdb_df = tmdb_df.drop('original_title', axis=1)
# Movies with an empty genre list get the placeholder id 7, which maps to 'undecided' below.
# The result is assigned back to the column: calling replace(..., inplace=True) on a column
# selection is chained assignment, which recent pandas warns about and which has no effect
# once Copy-on-Write is enabled.
tmdb_df['genre_ids'] = tmdb_df['genre_ids'].replace(to_replace='[]', value='[7]')
# This dictionary translates the genre id numbers (as strings) into actual genre names
getGenre = {
    '7': 'undecided', '12': 'Adventure', '14': 'Fantasy', '16': 'Animation', '18': 'Drama', '27': 'Horror', '28': 'Action',
    '35': 'Comedy', '36': 'History', '37': 'Western', '53': 'Thriller', '80': 'Crime', '99': 'Documentary', 
    '878': 'Sci-Fi','9648': 'Mystery','10402': 'Music','10749':'Romance','10751': 'Family','10752': 'War','10770': 'TV'
}

def convert(glist, genre_names=None):
    """Translate a stringified list of genre ids into comma-separated genre names.

    Parameters
    ----------
    glist : str
        Text such as '[28, 12]'. Surrounding brackets and any whitespace
        around the ids are ignored.
    genre_names : mapping of str to str, optional
        Lookup from id to genre name. Defaults to the notebook-level
        ``getGenre`` dictionary.

    Returns
    -------
    str
        Genre names joined by ',' with no spaces, e.g. 'Action,Adventure'.
        An empty list ('[]') yields the placeholder label stored under id '7'
        ('undecided' if the mapping has no such entry).

    Raises
    ------
    KeyError
        If an id is not present in the mapping.
    """
    if genre_names is None:
        genre_names = getGenre
    # split on ',' and strip each piece so both '[28, 12]' and '[28,12]' are accepted
    pieces = [piece.strip() for piece in glist.strip().strip('][').split(',')]
    ids = [piece for piece in pieces if piece]
    if not ids:
        # an empty genre list previously failed with KeyError('')
        return genre_names.get('7', 'undecided')
    return ','.join(genre_names[genre_id] for genre_id in ids)

# Convert the genre id lists into readable genre names
tmdb_df['genre_ids'] = tmdb_df['genre_ids'].map(convert)
# Keep only the columns used later, in a tidier order. The explicit copy makes the result an
# independent frame, so the column assignment below does not write into a slice of the old one.
tmdb_df = tmdb_df[['title', 'release_date', 'genre_ids', 'popularity', 'vote_average', 'vote_count']].copy()
# Parse the release date text into datetime values
tmdb_df['release_date'] = pd.to_datetime(tmdb_df['release_date'])
# Drop fully duplicated rows
tmdb_df = tmdb_df.drop_duplicates()

In [None]:
# Preview the first rows to confirm the cleaned tmdb dataframe looks right
tmdb_df.head()

## Cleaning the data from 'The Numbers'

In [None]:
def _dollars_to_int(amount):
    """Turn a money string such as '$1,234,567' into the int 1234567.

    Values that are not strings (for example, already converted on an earlier
    run of this cell) are passed through int() unchanged, so re-running the
    cell does not fail.
    """
    if isinstance(amount, str):
        return int(amount.replace('$', '').replace(',', ''))
    return int(amount)

# Parse the release date text into datetime values
tn_df['release_date'] = pd.to_datetime(tn_df['release_date'])
# Convert each money column from formatted text into ints
for money_column in ('production_budget', 'domestic_gross', 'worldwide_gross'):
    tn_df[money_column] = tn_df[money_column].map(_dollars_to_int)
# Keep only the columns used later, in a tidier order (copied so later edits do not touch a slice)
tn_df = tn_df[['movie', 'release_date', 'production_budget', 'domestic_gross', 'worldwide_gross']].copy()

In [None]:
# Preview the first rows to confirm the cleaned tn dataframe looks right
tn_df.head()

## Clean the data from 'Box Office Mojo'

In [None]:
# Missing foreign grosses become the text '0' so every value can be parsed the same way
foreign_text = bom_df['foreign_gross'].fillna('0')
# Remove the thousands separators and convert the text to float
bom_df['foreign_gross'] = foreign_text.map(lambda amount: float(amount.replace(',', '')))
# Store the domestic gross as float as well
bom_df['domestic_gross'] = bom_df['domestic_gross'].astype(float)
# Keep only the columns used later, in a tidier order
kept_columns = ['title', 'year', 'domestic_gross', 'foreign_gross']
bom_df = bom_df[kept_columns]

In [None]:
# Preview the first rows to confirm the cleaned bom dataframe looks right
bom_df.head()

## Merge and clean the different tables from 'IMDB'
We read the relevant SQL tables and combine them to get each movie's title, release year,
runtime, genres, rating, number of votes, directors, and actors.

In [None]:
# Movie basics joined to their ratings on the shared movie_id column.
# The LEFT JOIN keeps movies that have no row in movie_ratings.
movie_query = '''
SELECT movie_id, (primary_title) AS title, (start_year) year, (runtime_minutes) AS runtime, genres,
(averagerating) AS avg_rating, numvotes
FROM movie_basics
LEFT JOIN movie_ratings
USING(movie_id)

'''
mvi = pd.read_sql(movie_query, conn)

def _names_per_movie(people, joiner):
    """Collapse a (movie_id, name) frame to one row per movie, names joined by `joiner`.

    Exact duplicate rows are removed first so a person is listed once per movie.
    """
    unique_people = people.drop_duplicates()
    grouped = unique_people.groupby('movie_id')
    return grouped.agg(lambda names: joiner.join(names.tolist())).reset_index()

# text placed between the names of one movie (the variable name is kept as spelled)
seperator = ','

# Directors: principals joined to persons, restricted to the 'director' category.
# A movie can have several directors, so they are merged into one comma-separated string.
director_rows = pd.read_sql('''
SELECT movie_id, (primary_name) AS director
FROM principals
JOIN persons
USING(person_id)
WHERE category = 'director'
''', conn)
directors = _names_per_movie(director_rows, seperator)

# Actors: the same join, restricted to the 'actor' category.
# NOTE(review): this is an exact match on 'actor'; if the table stores other performer
# categories (e.g. 'actress') they are not included here -- confirm that is intended.
actor_rows = pd.read_sql('''
SELECT movie_id, (primary_name) AS actors
FROM principals
JOIN persons
USING(person_id)
WHERE category = 'actor'
ORDER BY movie_id
''', conn)
actors = _names_per_movie(actor_rows, seperator)

# First attach the directors to the movies...
draft1 = pd.merge(mvi, directors, how='left', on='movie_id')
# ...then attach the actors to get the combined dataframe
sql_df = pd.merge(draft1, actors, how='left', on='movie_id')
# Fill missing runtimes with the median so those rows are not lost.
# The result is assigned back to the column: fillna(..., inplace=True) on a column selection
# is chained assignment, which recent pandas warns about and which has no effect once
# Copy-on-Write is enabled.
sql_df['runtime'] = sql_df['runtime'].fillna(sql_df['runtime'].median())
# movie_id was only needed for the merges
sql_df = sql_df.drop('movie_id', axis=1)
# Remove movies dated after 2024 (rows with a missing year are removed by this comparison too)
sql_df = sql_df[sql_df['year'] <= 2024]
# Drop rows with no genres, then drop exact duplicate rows. Each step returns a new frame
# instead of modifying the filtered slice in place.
sql_df = sql_df.dropna(subset=['genres']).drop_duplicates()

In [None]:
# Preview the first rows to check the dataframe (like the other check cells, not the whole frame)
sql_df.head()

## Merge 'Box Office Mojo' with 'The Numbers'
We merge these tables because they hold almost exactly the same data, making the result a convenient
source for the revenue of each movie.

In [None]:
# Merge the two revenue sources on the movie title, keeping rows that appear in either one
bomtn = pd.merge(bom_df, tn_df, how='outer', left_on='title', right_on='movie')

# 'title' (from bom_df) is the primary title; fill its gaps from 'movie' (from tn_df)
bomtn['title'] = bomtn['title'].fillna(bomtn['movie'])
# Fill gaps in the bom domestic gross with the tn domestic gross
bomtn['domestic_gross_x'] = bomtn['domestic_gross_x'].fillna(bomtn['domestic_gross_y'])
# Foreign revenue: tn worldwide minus tn domestic, falling back to the bom foreign gross
bomtn['foreign'] = (bomtn['worldwide_gross'] - bomtn['domestic_gross_y']).fillna(bomtn['foreign_gross'])
# Fill missing release dates with January 1st of the bom year.
# After the outer merge 'year' is float (NaN for rows without a bom match), so the present
# years are first turned into integer text such as '2010'; parsing then does not depend on
# how a float like 2010.0 is rendered. fillna aligns on the index, so rows without a year
# are simply left unfilled.
year_start = pd.to_datetime(bomtn['year'].dropna().astype(int).astype(str), format='%Y')
bomtn['release_date'] = bomtn['release_date'].fillna(year_start)
# Fill missing worldwide gross with domestic + bom foreign gross
bomtn['worldwide_gross'] = bomtn['worldwide_gross'].fillna(bomtn['domestic_gross_x'] + bomtn['foreign_gross'])
# Keep the useful columns, give the domestic column a readable name, and drop incomplete rows.
# Chained as one expression so nothing is modified in place on a column slice.
# NOTE(review): dropna() looks at every remaining column, including the raw 'foreign_gross',
# which exists only in bom_df -- so rows found only in tn_df are removed here even though
# their other columns were filled above. Confirm whether that is intended.
bomtn = (
    bomtn[['title', 'release_date', 'domestic_gross_x', 'foreign', 'foreign_gross', 'worldwide_gross']]
    .rename(columns={'domestic_gross_x': 'domestic_gross'})
    .dropna()
)
# Remove movies whose worldwide gross is exactly zero
bomtn = bomtn[bomtn['worldwide_gross'] != 0.0]

In [None]:
# Preview the first rows to check the merged revenue dataframe
bomtn.head()

In [None]:
# Show the first rows whose domestic gross is above 10,000,000 while 'foreign' is below 1,000,000.
# Presumably these are movies whose foreign revenue is missing rather than truly that low -- verify.
bomtn[(bomtn.domestic_gross > 10000000) & (bomtn.foreign < 1000000)].head()

## Merge the tmdb_df and sql_df dataframes
This gives us more rows of data to work with and fills some of the holes in each dataset.

In [None]:
# Merge the IMDB and tmdb dataframes on title, keeping movies that appear in either one
tmdbsql = pd.merge(sql_df, tmdb_df, how='outer', on='title')
# Fill gaps in the IMDB columns with the matching tmdb columns
tmdbsql['avg_rating'] = tmdbsql['avg_rating'].fillna(tmdbsql['vote_average'])
tmdbsql['numvotes'] = tmdbsql['numvotes'].fillna(tmdbsql['vote_count'])
tmdbsql['genres'] = tmdbsql['genres'].fillna(tmdbsql['genre_ids'])
# Turn the IMDB year into a date and fall back to the tmdb release date where it is missing.
# After the outer merge 'year' is float (NaN for tmdb-only rows), so the present years are
# first turned into integer text such as '2010'; parsing then does not depend on how a float
# like 2010.0 is rendered. reindex puts the parsed dates back on the full index (NaT elsewhere).
imdb_year = pd.to_datetime(tmdbsql['year'].dropna().astype(int).astype(str), format='%Y')
tmdbsql['year'] = imdb_year.reindex(tmdbsql.index).fillna(tmdbsql['release_date'])
# Drop the tmdb columns that are now duplicated or unused
tmdbsql = tmdbsql.drop(['release_date', 'genre_ids', 'popularity', 'vote_average', 'vote_count'], axis=1)
# Sort by number of votes, which we use as the indicator of popularity
tmdbsql = tmdbsql.sort_values('numvotes', ascending=False)
# Keep only movies with more than 100 votes (copied so the assignment below is not on a slice)
tmdbsql = tmdbsql[tmdbsql['numvotes'] > 100].copy()
# Reduce the date to just the year because that is all we care about
tmdbsql['year'] = pd.DatetimeIndex(tmdbsql['year']).year

In [None]:
# Preview the first rows of the merged IMDB/tmdb dataframe
tmdbsql.head()

## Analyze the data and make a graph showing the popularity of each genre
We measure popularity by the number of reviews, which is a strong indicator since the more people
watch a movie, the more people write reviews.

In [None]:
# Make a dataframe purely for this graph
genre_pop = tmdbsql.dropna(subset=['avg_rating', 'numvotes'])

# One row per (movie, genre): split the comma-separated genre text and explode it.
# Grouping on the exact genre avoids substring matching, where for example 'Music'
# would also pick up every 'Musical' movie.
genre_votes = (
    genre_pop[['genres', 'numvotes']]
    .assign(genres=lambda frame: frame['genres'].str.split(','))
    .explode('genres')
)

# Every individual genre occurrence, and the set of distinct genres
categories = genre_votes['genres'].tolist()
allGenres = set(categories)

# Mean number of votes per genre, most popular first
mean_votes = genre_votes.groupby('genres')['numvotes'].mean().sort_values(ascending=False)
popularity = mean_votes.to_dict()

# Names and values come from the same sorted Series, so they always stay paired
x = mean_votes.index.tolist()
y = mean_votes.tolist()

# Graph the average number of reviews per genre
fig, ax = plt.subplots(figsize=(10,5))
ax.grid(zorder=0)
ax.bar(x, y, zorder=3)
ax.set_xlabel('Genres', fontsize=15)
ax.set_ylabel('Average number of reviews', fontsize=15)
ax.set_title('Average number of reviews per genre', fontsize=20)
plt.xticks(rotation=45, ha='right')

plt.show()

## Analyze the directors to find the best of the best 
Using the previous graph we can find the most successful directors for each of the four most popular genres.
The first block defines a function that takes a genre name as a string and returns a nested list 
containing the names and the average number of reviews of the three most successful directors in that genre.
We then call it once for each of the four most popular genres.

In [None]:
def getBestDirectors(genre, movies_df=None, top_n=3):
    """Return the directors with the highest mean vote count within one genre.

    Parameters
    ----------
    genre : str
        Genre name; it must equal one of the comma-separated entries in a
        movie's 'genres' text (exact match, not a substring).
    movies_df : DataFrame, optional
        Frame with 'genres', 'director' (comma-separated names) and 'numvotes'
        columns. Defaults to the notebook-level ``tmdbsql``.
    top_n : int, optional
        How many directors to return (default 3).

    Returns
    -------
    list
        ``[names, means]``: two parallel lists ordered from highest to lowest
        mean number of votes. Both are empty when no movie matches.
    """
    source = tmdbsql if movies_df is None else movies_df
    candidates = source.dropna(subset=['director', 'numvotes', 'genres'])

    # exact membership in the genre list, so e.g. 'Music' does not match 'Musical'
    in_genre = candidates['genres'].str.split(',').map(lambda names: genre in names)

    # One row per (movie, director), then the mean votes per director. Matching whole names
    # avoids treating a name as a regex or substring ('Ann Lee' would also match 'Joann Lee').
    mean_votes = (
        candidates.loc[in_genre, ['director', 'numvotes']]
        .assign(director=lambda frame: frame['director'].str.split(','))
        .explode('director')
        .groupby('director')['numvotes']
        .mean()
        .sort_values(ascending=False, kind='mergesort')
        .head(top_n)
    )

    # names and values come from the same sorted Series, so they stay paired
    return [mean_votes.index.tolist(), mean_votes.tolist()]

In [None]:
# Top three directors for each of the four genres
action = getBestDirectors('Action')
adventure = getBestDirectors('Adventure')
scifi = getBestDirectors('Sci-Fi')
fantasy = getBestDirectors('Fantasy')

# Master set of the best directors, and the list of top genres
the_best = set(action[0] + adventure[0] + scifi[0] + fantasy[0])
top_genres = ['Action', 'Adventure', 'Sci-Fi', 'Fantasy']

# Narrow the dataframe to movies that mention a relevant director and a relevant genre.
# NOTE(review): rows without 'actors' are dropped here as well, although this frame is only
# about directors -- confirm that is intended.
bdf = tmdbsql.dropna(subset = ['director', 'actors', 'numvotes'])
bdf = bdf[['genres', 'numvotes', 'director']]
bdf = bdf[bdf['genres'].map(lambda x: any(substr in x for substr in top_genres))]
bdf = bdf[bdf['director'].map(lambda x: any(substr in x for substr in the_best))]

# One row per (movie, director): split the comma-separated names and explode them.
# This replaces appending rows one at a time with .loc, which is slow and leaves
# 'numvotes' as an object column.
splitdir = (
    bdf.assign(director=bdf['director'].str.split(','))
    .explode('director')
    .reset_index(drop=True)
)

# One row per (movie, director, genre): the same treatment for the genre list
newdf = (
    splitdir.assign(genres=splitdir['genres'].str.split(','))
    .explode('genres')
    .reset_index(drop=True)
)

# Splitting brings along co-directors and genres we are not looking at; keep exact matches only
wanted = newdf['genres'].isin(top_genres) & newdf['director'].isin(the_best)
newdf = newdf[wanted].reset_index(drop=True)

In [None]:
# Graph the popularity of each director (mean number of votes) per each of the four genres.

# Bars are drawn in the order directors first appear in newdf; pin that order explicitly so
# the tick labels below are built from the very same sequence instead of a hard-coded list.
director_order = newdf['director'].drop_duplicates().tolist()

# The movie each director is known for, keyed by the director's name as it appears in the data.
# A director missing from this table is labelled with the bare name.
director_known_for = {
    'Christopher Nolan': 'The Dark Knight', 'Joss Whedon': 'Firefly', 'Tim Miller': 'Deadpool',
    'Gary Ross': 'Hunger Games', 'J.J. Abrams': 'Star Trek',
    'Peter Jackson': 'Lord of The Rings', 'Alan Taylor': 'Thor',
}

def _director_votes_label(value, _position):
    """Format a y tick value as '0 reviews', '200,000', '1.2 million', ..."""
    if value == 0:
        return '0 reviews'
    if abs(value) >= 1000000:
        return '{:g} million'.format(value / 1000000)
    return '{:,.0f}'.format(value)

fig, ax = plt.subplots(figsize=(20,10))
g = sns.barplot(data=newdf, x='director', y='numvotes', hue='genres', order=director_order,
            ax=ax, ci=None, palette=['red','limegreen','royalblue', 'gold'])
ax.set_xlabel('Directors\n(and the movies they are known for)', fontsize=25)
ax.set_ylabel('Average number of reviews', fontsize = 20)
ax.set_title('Top Movie Directors by number of reviews', fontsize=30)
# Label the y ticks from their actual values, so the text is right wherever the ticks fall
ax.yaxis.set_major_formatter(plt.FuncFormatter(_director_votes_label))
ax.tick_params(axis='y', labelsize=15)
# the directors and the movies they are known for
ax.set_xticklabels(['{}\n({})'.format(name, director_known_for[name]) if name in director_known_for
                    else name for name in director_order], fontsize=15)

# rotate the xticks and make the legend bigger
plt.xticks(rotation=30, ha='center')
plt.setp(g.get_legend().get_texts(), fontsize='25') 
plt.setp(g.get_legend().get_title(), fontsize='25')

plt.show()

## Analysis of the best Actors for the top four genres
This graph does the same thing the directors graph did, but for the actors instead.

In [None]:
def getBestActors(genre, movies_df=None, top_n=3):
    """Return the actors with the highest mean vote count within one genre.

    Parameters
    ----------
    genre : str
        Genre name; it must equal one of the comma-separated entries in a
        movie's 'genres' text (exact match, not a substring).
    movies_df : DataFrame, optional
        Frame with 'genres', 'actors' (comma-separated names) and 'numvotes'
        columns. Defaults to the notebook-level ``tmdbsql``.
    top_n : int, optional
        How many actors to return (default 3).

    Returns
    -------
    list
        ``[names, means]``: two parallel lists ordered from highest to lowest
        mean number of votes. Both are empty when no movie matches.
    """
    source = tmdbsql if movies_df is None else movies_df
    candidates = source.dropna(subset=['actors', 'numvotes', 'genres'])

    # exact membership in the genre list, so e.g. 'Music' does not match 'Musical'
    in_genre = candidates['genres'].str.split(',').map(lambda names: genre in names)

    # One row per (movie, actor), then the mean votes per actor. Matching whole names
    # avoids treating a name as a regex or substring of a longer name.
    mean_votes = (
        candidates.loc[in_genre, ['actors', 'numvotes']]
        .assign(actors=lambda frame: frame['actors'].str.split(','))
        .explode('actors')
        .groupby('actors')['numvotes']
        .mean()
        .sort_values(ascending=False, kind='mergesort')
        .head(top_n)
    )

    # names and values come from the same sorted Series, so they stay paired
    return [mean_votes.index.tolist(), mean_votes.tolist()]

In [None]:
# Top three actors for each of the four most popular genres
actor_action = getBestActors('Action')
actor_adventure = getBestActors('Adventure')
actor_scifi = getBestActors('Sci-Fi')
actor_fantasy = getBestActors('Fantasy')

# Master set of the best actors, and the list of top genres
best_actors = set(actor_action[0] + actor_adventure[0] + actor_scifi[0] + actor_fantasy[0])
top_genres = ['Action', 'Adventure', 'Sci-Fi', 'Fantasy']

# Narrow the dataframe to movies that mention a relevant actor and a relevant genre
actdf = tmdbsql.dropna(subset = ['director', 'actors', 'numvotes'])
actdf = actdf[['genres', 'numvotes', 'actors']]
actdf = actdf[actdf['genres'].map(lambda x: any(substr in x for substr in top_genres))]
actdf = actdf[actdf['actors'].map(lambda x: any(substr in x for substr in best_actors))]

# One row per (movie, actor): split the comma-separated names and explode them.
# This replaces appending rows one at a time with .loc, which is slow and leaves
# 'numvotes' as an object column.
splitactor = (
    actdf.rename(columns={'actors': 'actor'})
    .assign(actor=lambda frame: frame['actor'].str.split(','))
    .explode('actor')
    .reset_index(drop=True)
)

# One row per (movie, actor, genre): the same treatment for the genre list
actor_df = (
    splitactor.assign(genres=splitactor['genres'].str.split(','))
    .explode('genres')
    .reset_index(drop=True)
)

# Splitting brings along co-stars and genres we are not looking at; keep exact matches only
wanted = actor_df['genres'].isin(top_genres) & actor_df['actor'].isin(best_actors)
actor_df = actor_df[wanted].reset_index(drop=True)

In [None]:
# The same graph we did for directors, now for actors.

# Bars are drawn in the order actors first appear in actor_df; pin that order explicitly so
# the tick labels below are built from the very same sequence instead of a hard-coded list.
actor_order = actor_df['actor'].drop_duplicates().tolist()

# The movie each actor is best known for, keyed by the actor's name as it appears in the data.
# An actor missing from this table is labelled with the bare name.
actor_known_for = {
    'Leonardo DiCaprio': 'Inception', 'Joseph Gordon-Levitt': 'The Dark Knight Rises',
    'Ken Watanabe': 'Godzilla', 'Christian Bale': 'Dark Knight', 'Tom Hardy': 'Venom',
    'Oscar Isaac': 'Ex Machina', 'Andy Serkis': 'Black Panther', 'Michael Gambon': 'Harry Potter',
}

def _actor_votes_label(value, _position):
    """Format a y tick value as '0 reviews', '200,000', '1.2 million', ..."""
    if value == 0:
        return '0 reviews'
    if abs(value) >= 1000000:
        return '{:g} million'.format(value / 1000000)
    return '{:,.0f}'.format(value)

fig, ax = plt.subplots(figsize=(20,10))
t = sns.barplot(data=actor_df, x='actor', y='numvotes', hue='genres', order=actor_order,
            ax=ax, ci=None, palette=['red','limegreen','royalblue', 'gold'])
ax.set_xlabel('Actor\n(and their best movie)', fontsize=30)
ax.set_ylabel('Average number of reviews', fontsize = 30)
ax.set_title('Top Movie Actors by number of reviews', fontsize=30)
# Label the y ticks from their actual values, so the text is right wherever the ticks fall
ax.yaxis.set_major_formatter(plt.FuncFormatter(_actor_votes_label))
ax.tick_params(axis='y', labelsize=15)
# the actors and the movies they are known for
ax.set_xticklabels(['{}\n({})'.format(name, actor_known_for[name]) if name in actor_known_for
                    else name for name in actor_order], fontsize=15)
plt.legend(loc='upper right')
plt.setp(t.get_legend().get_texts(), fontsize='25') 


plt.xticks(rotation=30, ha='center')

plt.show()