In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('bom.movie_gross.csv.gz')

In [34]:
# function for adding foreign gross and domestic gross totals for movies
def money(df):
    for index, row in df.iterrows():
            print('{} is the total for {}'.format(int(row['domestic_gross']) + int(row['foreign_gross']), row['title']))
    

In [35]:
money(df.head())

1067000000 is the total for Toy Story 3
1025500000 is the total for Alice in Wonderland (2010)
960300000 is the total for Harry Potter and the Deathly Hallows Part 1
828300000 is the total for Inception
752600000 is the total for Shrek Forever After


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
title             3387 non-null object
studio            3382 non-null object
domestic_gross    3359 non-null float64
foreign_gross     2037 non-null object
year              3387 non-null int64
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [6]:
values = {'studio': 'NA', 'domestic_gross': 0, 'foreign_gross': 0}

In [17]:
df = df.fillna(value=values) # this is to fill NA values so I can add gross columns together for each movie

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
title             3387 non-null object
studio            3387 non-null object
domestic_gross    3387 non-null float64
foreign_gross     3387 non-null object
year              3387 non-null int64
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [19]:
df.loc[df['title'] == 'The Quake']

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
3382,The Quake,Magn.,6200.0,0,2018


In [71]:
df['foreign_gross'] = pd.to_numeric(df['foreign_gross'], errors = 'coerce')

In [72]:
df['foreign_gross']

0       652000000.0
1       691300000.0
2       664300000.0
3       535700000.0
4       513900000.0
           ...     
3382            0.0
3383            0.0
3384            0.0
3385            0.0
3386            0.0
Name: foreign_gross, Length: 3387, dtype: float64

In [81]:
index = df['foreign_gross'].index[df['foreign_gross'].apply(np.isnan)]

In [82]:
index

Int64Index([1872, 1873, 1874, 2760, 3079], dtype='int64')

In [98]:
dfobj = pd.DataFrame(df, columns = ['foreign_gross', 'title'], index = [1872, 1873, 1874, 2760, 3079])

In [99]:
dfobj

Unnamed: 0,foreign_gross,title
1872,1131.6,Star Wars: The Force Awakens
1873,1019.4,Jurassic World
1874,1163.0,Furious 7
2760,1010.0,The Fate of the Furious
3079,1369.5,Avengers: Infinity War


In [22]:
df.at[1872, 'foreign_gross']

'1,131.6'

In [23]:
df.at[1872, 'foreign_gross'] = 1131600000

In [24]:
df.at[1872, 'foreign_gross']

1131600000

In [25]:
df.loc[df['title'] == 'Star Wars: The Force Awakens']

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
1872,Star Wars: The Force Awakens,BV,936700000.0,1131600000,2015


In [26]:
df.at[1873, 'foreign_gross'] = 1019400000

In [27]:
df.at[1874, 'foreign_gross'] = 1163000000

In [28]:
df.at[2760, 'foreign_gross'] = 1010000000

In [29]:
df.at[3079, 'foreign_gross'] = 1369500000

In [49]:
money(df.head())

1067000000 is the total for Toy Story 3
1025500000 is the total for Alice in Wonderland (2010)
960300000 is the total for Harry Potter and the Deathly Hallows Part 1
828300000 is the total for Inception
752600000 is the total for Shrek Forever After


In [59]:
df['foreign_gross'] = df['foreign_gross'].astype(int)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
title             3387 non-null object
studio            3387 non-null object
domestic_gross    3387 non-null int64
foreign_gross     3387 non-null int64
year              3387 non-null int64
dtypes: int64(3), object(2)
memory usage: 132.4+ KB


In [61]:
df['total_gross'] = df['foreign_gross'] + df['domestic_gross']

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 6 columns):
title             3387 non-null object
studio            3387 non-null object
domestic_gross    3387 non-null int64
foreign_gross     3387 non-null int64
year              3387 non-null int64
total_gross       3387 non-null int64
dtypes: int64(4), object(2)
memory usage: 158.9+ KB


In [154]:
largest = [df['total_gross'].nlargest(10)]

[1872    2068300000
 3079    2048300000
 1873    1671700000
 727     1518900000
 1874    1516000000
 1875    1405400000
 3080    1347000000
 328     1341500000
 2758    1332600000
 3081    1309500000
 Name: total_gross, dtype: int64]

In [103]:
df.loc[[1872, 3079, 1873, 727, 1874, 1875, 3080, 328, 2758, 3081]]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,total_gross
1872,Star Wars: The Force Awakens,BV,936700000,1131600000,2015,2068300000
3079,Avengers: Infinity War,BV,678800000,1369500000,2018,2048300000
1873,Jurassic World,Uni.,652300000,1019400000,2015,1671700000
727,Marvel's The Avengers,BV,623400000,895500000,2012,1518900000
1874,Furious 7,Uni.,353000000,1163000000,2015,1516000000
1875,Avengers: Age of Ultron,BV,459000000,946400000,2015,1405400000
3080,Black Panther,BV,700100000,646900000,2018,1347000000
328,Harry Potter and the Deathly Hallows Part 2,WB,381000000,960500000,2011,1341500000
2758,Star Wars: The Last Jedi,BV,620200000,712400000,2017,1332600000
3081,Jurassic World: Fallen Kingdom,Uni.,417700000,891800000,2018,1309500000


In [237]:
#drop all columns but title and total_gross and retrieve the first 100 most grossed movies
new_df = df.drop(['studio', 'domestic_gross', 'foreign_gross', 'year'], axis = 1)

In [249]:
first_50_most_profittable = new_df.nlargest(50, columns = 'total_gross')
first_50_most_profittable.head()

Unnamed: 0,title,total_gross
1872,Star Wars: The Force Awakens,2068300000
3079,Avengers: Infinity War,2048300000
1873,Jurassic World,1671700000
727,Marvel's The Avengers,1518900000
1874,Furious 7,1516000000


In [470]:
first_1000_most_profittable = new_df.nlargest(1000, columns = 'total_gross')
first_1000_most_profittable

Unnamed: 0,title,total_gross
1872,Star Wars: The Force Awakens,2068300000
3079,Avengers: Infinity War,2048300000
1873,Jurassic World,1671700000
727,Marvel's The Avengers,1518900000
1874,Furious 7,1516000000
...,...,...
1587,Oculus,44000000
1982,Max (2015),44000000
1983,Still Alice,43900000
2439,Cafe Society,43800000


In [464]:
movies = pd.read_csv('imdb.title.basics.csv.gz')
new_movies = movies
new_movies = new_movies.drop(columns = ['original_title', 'runtime_minutes', 'start_year', 'tconst'])
new_movies.rename(columns = {'primary_title':'title'}, inplace = True)

In [475]:
two_tables = pd.merge(first_100_most_profittable, new_movies, on = 'title', how = 'inner')
# drop unnecessary columns
# combined_table = combined_table.drop(columns = ['Unnamed: 0', 'id', 'original_language', 'release_date', 'vote_average', 'vote_count'])
two_tables

Unnamed: 0,title,total_gross,genres
0,Avengers: Infinity War,2048300000,"Action,Adventure,Sci-Fi"
1,Jurassic World,1671700000,"Action,Adventure,Sci-Fi"
2,Furious 7,1516000000,"Action,Crime,Thriller"
3,Avengers: Age of Ultron,1405400000,"Action,Adventure,Sci-Fi"
4,Black Panther,1347000000,"Action,Adventure,Sci-Fi"
...,...,...,...
1023,Boyhood,44500000,Drama
1024,47 Meters Down,44300000,"Adventure,Drama,Horror"
1025,Machete,44100000,"Action,Crime,Thriller"
1026,Oculus,44000000,"Horror,Mystery"


In [469]:
two_tables.loc[two_tables['title'] == 'John Wick']

Unnamed: 0,title,total_gross,genres
713,John Wick,88700000,"Action,Crime,Thriller"


In [422]:
first_100_most_profittable.loc[first_100_most_profittable['title'] == 'John Wick']

Unnamed: 0,title,total_gross
1552,John Wick,88700000


In [467]:
popularity = pd.read_csv('tmdb.movies.csv.gz')
popularity = popularity.drop(columns = ['Unnamed: 0','genre_ids','id','original_language','original_title','release_date','vote_average','vote_count'])
popularity

Unnamed: 0,popularity,title
0,33.533,Harry Potter and the Deathly Hallows: Part 1
1,28.734,How to Train Your Dragon
2,28.515,Iron Man 2
3,28.005,Toy Story
4,27.920,Inception
...,...,...
26512,0.600,Laboratory Conditions
26513,0.600,_EXHIBIT_84xxx_
26514,0.600,The Last One
26515,0.600,Trailer Made


In [865]:
combined_data = pd.merge(two_tables, popularity, on = 'title', how = 'inner')
combined_data.nlargest(10, 'popularity')

Unnamed: 0,title,total_gross,genres,popularity
0,Avengers: Infinity War,2048300000,"Action,Adventure,Sci-Fi",80.773
876,John Wick,88700000,"Action,Crime,Thriller",78.123
37,The Hobbit: The Battle of the Five Armies,956000000,"Adventure,Fantasy",53.783
83,Guardians of the Galaxy,773400000,"Action,Adventure,Comedy",49.606
353,Blade Runner 2049,259300000,"Drama,Mystery,Sci-Fi",48.571
354,Blade Runner 2049,259300000,"Drama,Mystery,Sci-Fi",48.571
111,Fantastic Beasts: The Crimes of Grindelwald,653700000,"Adventure,Family,Fantasy",48.508
154,Ralph Breaks the Internet,529200000,"Adventure,Animation,Comedy",48.057
41,Spider-Man: Homecoming,880200000,"Action,Adventure,Sci-Fi",46.775
124,Ant-Man and the Wasp,622600000,"Action,Adventure,Comedy",44.729


In [866]:
test_data = combined_data

In [867]:
test_data.loc[test_data['title'] == 'Black Panther']

Unnamed: 0,title,total_gross,genres,popularity
4,Black Panther,1347000000,"Action,Adventure,Sci-Fi",2.058
5,Black Panther,1347000000,"Action,Adventure,Sci-Fi",44.14


In [853]:
test_data.nlargest(50, columns = 'popularity')

Unnamed: 0,title,total_gross,genres,popularity
0,Avengers: Infinity War,2048300000,"Action,Adventure,Sci-Fi",80.773
876,John Wick,88700000,"Action,Crime,Thriller",78.123
37,The Hobbit: The Battle of the Five Armies,956000000,"Adventure,Fantasy",53.783
83,Guardians of the Galaxy,773400000,"Action,Adventure,Comedy",49.606
353,Blade Runner 2049,259300000,"Drama,Mystery,Sci-Fi",48.571
354,Blade Runner 2049,259300000,"Drama,Mystery,Sci-Fi",48.571
111,Fantastic Beasts: The Crimes of Grindelwald,653700000,"Adventure,Family,Fantasy",48.508
154,Ralph Breaks the Internet,529200000,"Adventure,Animation,Comedy",48.057
41,Spider-Man: Homecoming,880200000,"Action,Adventure,Sci-Fi",46.775
124,Ant-Man and the Wasp,622600000,"Action,Adventure,Comedy",44.729


In [868]:
test_data['title'] = test_data.title.drop_duplicates(keep = 'last')

In [869]:
test_data.nlargest(20, columns = 'popularity')

Unnamed: 0,title,total_gross,genres,popularity
0,Avengers: Infinity War,2048300000,"Action,Adventure,Sci-Fi",80.773
876,John Wick,88700000,"Action,Crime,Thriller",78.123
37,The Hobbit: The Battle of the Five Armies,956000000,"Adventure,Fantasy",53.783
83,Guardians of the Galaxy,773400000,"Action,Adventure,Comedy",49.606
353,,259300000,"Drama,Mystery,Sci-Fi",48.571
354,Blade Runner 2049,259300000,"Drama,Mystery,Sci-Fi",48.571
111,Fantastic Beasts: The Crimes of Grindelwald,653700000,"Adventure,Family,Fantasy",48.508
154,Ralph Breaks the Internet,529200000,"Adventure,Animation,Comedy",48.057
41,Spider-Man: Homecoming,880200000,"Action,Adventure,Sci-Fi",46.775
124,Ant-Man and the Wasp,622600000,"Action,Adventure,Comedy",44.729


In [870]:
test_data = test_data.dropna(subset = ['title'], axis = 0, how = 'any')

In [871]:
test_data.isna().sum()

title          0
total_gross    0
genres         6
popularity     0
dtype: int64

In [872]:
test_data.head(30)

Unnamed: 0,title,total_gross,genres,popularity
0,Avengers: Infinity War,2048300000,"Action,Adventure,Sci-Fi",80.773
1,Jurassic World,1671700000,"Action,Adventure,Sci-Fi",20.709
2,Furious 7,1516000000,"Action,Crime,Thriller",20.396
3,Avengers: Age of Ultron,1405400000,"Action,Adventure,Sci-Fi",44.383
5,Black Panther,1347000000,"Action,Adventure,Sci-Fi",44.14
7,Star Wars: The Last Jedi,1332600000,"Action,Adventure,Fantasy",34.293
8,Jurassic World: Fallen Kingdom,1309500000,"Action,Adventure,Sci-Fi",34.958
14,Frozen,1276400000,"Adventure,Animation,Comedy",26.183
15,Incredibles 2,1242800000,"Action,Adventure,Animation",36.286
16,The Fate of the Furious,1236000000,"Action,Crime,Thriller",28.668


In [879]:
for x in combined_data.nlargest(100, 'popularity'):
    