In [3]:
import pandas as pd

In [4]:
## Load the BOM movie gross table from its gzip-compressed CSV.
data_movie_gross = pd.read_csv('zippedData/bom.movie_gross.csv.gz', compression='gzip')

In [5]:
## Load the IMDB title ratings table from its gzip-compressed CSV.
data_title_ratings = pd.read_csv('zippedData/imdb.title.ratings.csv.gz', compression='gzip')

In [6]:
## Preview the first five rows of the ratings table.
data_title_ratings.head(n=5)

Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


In [7]:
## Load the IMDB title basics table from its gzip-compressed CSV.
data_title_basics = pd.read_csv('zippedData/imdb.title.basics.csv.gz', compression='gzip')

In [8]:
## Preview the first five rows of the title basics table.
data_title_basics.head(n=5)

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


In [9]:
## Combine the IMDB title basics and title ratings tables on their shared 'tconst' key.
## The inner join keeps only the titles that appear in both tables.
data_title_overall = data_title_basics.merge(data_title_ratings, how='inner', on='tconst')

In [10]:
## Rename 'primary_title' to 'title' so this frame shares a join column with the
## BOM movie gross data.
data_title_overall = data_title_overall.rename({'primary_title': 'title'}, axis='columns')

In [11]:
## Join the BOM gross data with the combined IMDB data on 'title'; the inner join
## keeps only titles present in both frames.
## NOTE(review): matching on title alone may pair different films that share a name;
## confirm whether 'year' / 'start_year' should also be part of the join key.
movie_titles = data_movie_gross.merge(data_title_overall, how='inner', on='title')

In [12]:
## Inspect column dtypes and non-null counts of the merged frame before cleaning.
movie_titles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3027 entries, 0 to 3026
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            3027 non-null   object 
 1   studio           3024 non-null   object 
 2   domestic_gross   3005 non-null   float64
 3   foreign_gross    1832 non-null   object 
 4   year             3027 non-null   int64  
 5   tconst           3027 non-null   object 
 6   original_title   3027 non-null   object 
 7   start_year       3027 non-null   int64  
 8   runtime_minutes  2980 non-null   float64
 9   genres           3020 non-null   object 
 10  averagerating    3027 non-null   float64
 11  numvotes         3027 non-null   int64  
dtypes: float64(3), int64(3), object(6)
memory usage: 307.4+ KB


In [13]:
## Restrict the merged frame to the title, studio, gross, year, genre and rating columns.
movie_titles = movie_titles.loc[
    :,
    ['title', 'studio', 'domestic_gross', 'foreign_gross', 'year', 'genres', 'averagerating'],
]

In [14]:
## Drop every row that has a missing value in any of the remaining columns.
## The result is rebound to the name instead of using inplace=True, so the cell
## produces a new frame rather than mutating the existing one.
movie_titles = movie_titles.dropna(axis=0, how='any')

In [15]:
## 'foreign_gross' is read as text (object dtype); remove the commas and convert to float.
## Casting to str first keeps this cell safe to re-run: once the column is already
## float, the .str accessor would otherwise raise AttributeError.
movie_titles['foreign_gross'] = (
    movie_titles['foreign_gross']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [16]:
## Total gross is the domestic figure plus the foreign figure.
movie_titles['total_gross'] = movie_titles['domestic_gross'].add(movie_titles['foreign_gross'])

In [17]:
## Preview the first five rows of the cleaned frame, including the new 'total_gross' column.
movie_titles.head(n=5)

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,genres,averagerating,total_gross
0,Toy Story 3,BV,415000000.0,652000000.0,2010,"Adventure,Animation,Comedy",8.3,1067000000.0
1,Inception,WB,292600000.0,535700000.0,2010,"Action,Adventure,Sci-Fi",8.8,828300000.0
2,Shrek Forever After,P/DW,238700000.0,513900000.0,2010,"Adventure,Animation,Comedy",6.3,752600000.0
3,The Twilight Saga: Eclipse,Sum.,300500000.0,398000000.0,2010,"Adventure,Drama,Fantasy",5.0,698500000.0
4,Iron Man 2,Par.,312400000.0,311500000.0,2010,"Action,Adventure,Sci-Fi",7.0,623900000.0


In [18]:
## Build the genre-focused view: one row per title with its studio, genres, total gross and year.
movie_titles_genres = movie_titles.loc[:, ['title', 'studio', 'genres', 'total_gross', 'year']]

In [19]:
## Confirm the genre view has no missing values and check its row count and dtypes.
movie_titles_genres.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1803 entries, 0 to 3001
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        1803 non-null   object 
 1   studio       1803 non-null   object 
 2   genres       1803 non-null   object 
 3   total_gross  1803 non-null   float64
 4   year         1803 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 84.5+ KB


In [20]:
## Preview the first five rows of the genre view.
movie_titles_genres.head(n=5)

Unnamed: 0,title,studio,genres,total_gross,year
0,Toy Story 3,BV,"Adventure,Animation,Comedy",1067000000.0,2010
1,Inception,WB,"Action,Adventure,Sci-Fi",828300000.0,2010
2,Shrek Forever After,P/DW,"Adventure,Animation,Comedy",752600000.0,2010
3,The Twilight Saga: Eclipse,Sum.,"Adventure,Drama,Fantasy",698500000.0,2010
4,Iron Man 2,Par.,"Action,Adventure,Sci-Fi",623900000.0,2010


In [21]:
## Collect each distinct value of the 'genres' column; a value may be a
## comma-separated combination such as 'Action,Adventure,Sci-Fi'.
unique_genres = movie_titles_genres['genres'].unique()

In [23]:
## Show the distinct genre combinations found in the data.
print(unique_genres)

['Adventure,Animation,Comedy' 'Action,Adventure,Sci-Fi'
 'Adventure,Drama,Fantasy' 'Animation,Comedy,Family'
 'Action,Adventure,Animation' 'Adventure,Family,Fantasy'
 'Biography,Drama,History' 'Action,Drama,Family'
 'Action,Adventure,Fantasy' 'Drama,Thriller' 'Action,Animation,Comedy'
 'Action,Adventure,Drama' 'Action,Drama' 'Action,Adventure,Thriller'
 'Action,Adventure,Family' 'Comedy,Romance' 'Action,Adventure,Horror'
 'Mystery,Thriller' 'Action,Mystery,Thriller' 'Comedy,Drama,Romance'
 'Comedy' 'Adventure,Drama,Western' 'Adventure,Comedy,Family'
 'Biography,Drama' 'Adventure,Comedy' 'Drama,Romance' 'Thriller' 'Drama'
 'Horror' 'Action,Comedy,Crime' 'Action,Thriller' 'Documentary'
 'Action,Crime' 'Crime,Drama,Thriller' 'Comedy,Crime,Romance'
 'Drama,Fantasy,Horror' 'Action,Comedy,Romance' 'Biography,Drama,Sport'
 'Drama,Romance,War' 'Action,Comedy,Family' 'Comedy,Family,Fantasy'
 'Drama,Fantasy,Romance' 'Crime,Drama,Mystery' 'Action,Crime,Drama'
 'Action,Comedy' 'Action,Drama,Thrill

In [24]:
## Split every comma-separated genre combination into its individual genres and
## gather them in one flat list (duplicates are still present at this point).
movie_genres_list = [
    genre
    for combination in unique_genres
    for genre in combination.split(',')
]

In [25]:
## Remove duplicate genres by passing the flat list through a set.
movie_genres = [*set(movie_genres_list)]

In [26]:
## Put the distinct genres in alphabetical order (sorting a fresh copy of the list).
movie_genres = list(movie_genres)
movie_genres.sort()

In [27]:
## Display the sorted list of distinct individual genres.
movie_genres

['Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Romance',
 'Sci-Fi',
 'Sport',
 'Thriller',
 'War',
 'Western']