In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [30]:
# Read the three raw tables that the cells below inspect.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/title.basics.csv',
        'data/title.ratings.csv',
        'data/bom.movie_gross.csv',
    )
)

# Cell output: the (rows, columns) shape of each table, in load order.
tuple(frame.shape for frame in (df1, df2, df3))

((146144, 6), (73856, 3), (3387, 5))

In [36]:
# Shell escape: list the files in the notebook's working directory.
!ls

README.md           [34mdata[m[m                google-slide-link   jeff_notebook.ipynb


In [4]:
# Summarize df1 (title.basics.csv): column dtypes and non-null counts.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   tconst           146144 non-null  object 
 1   primary_title    146144 non-null  object 
 2   original_title   146123 non-null  object 
 3   start_year       146144 non-null  int64  
 4   runtime_minutes  114405 non-null  float64
 5   genres           140736 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


In [6]:
# Summarize df2 (title.ratings.csv): column dtypes and non-null counts.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73856 entries, 0 to 73855
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         73856 non-null  object 
 1   averagerating  73856 non-null  float64
 2   numvotes       73856 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 1.7+ MB


In [8]:
# Summarize df3 (bom.movie_gross.csv): column dtypes and non-null counts.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


### DF1

This dataset contains each film's primary and original titles, premiere year, runtime (in minutes), and genres, along with a unique title ID (`tconst`) that serves as an index.

In [9]:
# Preview the first rows of df1 to see what each column holds.
df1.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


In [10]:
# Number of titles per start_year, most frequent first.
# NOTE(review): the output includes years after 2019 (as late as 2115);
# presumably unreleased titles or data-entry errors — confirm before using
# start_year in any analysis.
df1['start_year'].value_counts()

2017    17504
2016    17272
2018    16849
2015    16243
2014    15589
2013    14709
2012    13787
2011    12900
2010    11849
2019     8379
2020      937
2021       83
2022       32
2023        5
2024        2
2027        1
2026        1
2025        1
2115        1
Name: start_year, dtype: int64

In [11]:
# Frequency of each distinct `genres` string. Titles with several genres
# appear as comma-joined combinations (e.g. "Comedy,Drama"), so this counts
# combinations rather than individual genres.
df1['genres'].value_counts()

Documentary                 32185
Drama                       21486
Comedy                       9177
Horror                       4372
Comedy,Drama                 3519
                            ...  
Biography,Reality-TV            1
Comedy,Sci-Fi,Sport             1
Comedy,Romance,Western          1
Biography,Fantasy,Horror        1
Biography,Family,News           1
Name: genres, Length: 1085, dtype: int64

In [12]:
# Count the distinct tconst values (dropna=False keeps a missing value counted
# as one entry, matching len(unique())). The result equals the number of rows
# in df1, so every tconst is unique.
df1['tconst'].nunique(dropna=False)

146144

### DF2

This dataset contains each title's average fan rating and number of votes, along with a unique title ID (`tconst`) that serves as an index.

In [14]:
# Preview the first rows of df2 to see what each column holds.
df2.head()

Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


In [15]:
# Distribution of average ratings, grouped into 10 bins and listed by
# descending count.
df2['averagerating'].value_counts(bins=10)

(6.4, 7.3]     19088
(5.5, 6.4]     16563
(7.3, 8.2]     12473
(4.6, 5.5]      9861
(3.7, 4.6]      5452
(8.2, 9.1]      5354
(2.8, 3.7]      2819
(1.9, 2.8]      1112
(9.1, 10.0]      715
(0.99, 1.9]      419
Name: averagerating, dtype: int64

In [16]:
# Distribution of vote counts, grouped into 12 bins and listed by descending
# count.
# NOTE(review): the output shows almost every title in the first bin, so these
# bins say little about the bulk of the data — consider quantile or
# log-scaled bins if the shape of the distribution matters.
df2['numvotes'].value_counts(bins=12)

(-1836.062, 153426.75]     73443
(153426.75, 306848.5]        243
(306848.5, 460270.25]         91
(460270.25, 613692.0]         45
(613692.0, 767113.75]         22
(767113.75, 920535.5]          4
(920535.5, 1073957.25]         3
(1073957.25, 1227379.0]        2
(1687644.25, 1841066.0]        1
(1380800.75, 1534222.5]        1
(1227379.0, 1380800.75]        1
(1534222.5, 1687644.25]        0
Name: numvotes, dtype: int64

In [17]:
# Count the distinct tconst values (dropna=False keeps a missing value counted
# as one entry, matching len(unique())). The result equals the number of rows
# in df2, so every tconst is unique.
df2['tconst'].nunique(dropna=False)

73856

### DF3

This dataset contains each film's title, production studio, year of release (2010–2018), and domestic and foreign gross receipts.

In [27]:
# Preview the first rows of df3 to see what each column holds.
df3.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [28]:
# Number of titles per studio code, most frequent first.
df3['studio'].value_counts()

IFC           166
Uni.          147
WB            140
Magn.         136
Fox           136
             ... 
SEA             1
CFilms          1
NFC             1
FInd.           1
Grindstone      1
Name: studio, Length: 257, dtype: int64

In [29]:
# Number of titles per release year, most frequent first.
df3['year'].value_counts()

2015    450
2016    436
2012    400
2011    399
2014    395
2013    350
2010    328
2017    321
2018    308
Name: year, dtype: int64

In [37]:
# Load every raw source file into its own descriptively named DataFrame.
# Column summaries marked "per original notes" are carried over from the
# author's comments and are not confirmed by any output in this notebook.

# Columns: title, studio, domestic_gross, foreign_gross, year
# (same file that was read into df3 earlier)
movie_gross_df = pd.read_csv('data/bom.movie_gross.csv')

# Per original notes: name of crew/principal, primary profession, known-for titles
people_info_df = pd.read_csv('data/name.basics.csv')

# Tab-separated. Per original notes: ID, rating, genre, director, writer,
# theater date, box office, runtime, studio
rt_movie_info_df = pd.read_csv('data/rt.movie_info.tsv', sep='\t')

# Tab-separated and read as latin1. Per original notes: ID, rating, fresh, top critic
rt_movie_reviews_df = pd.read_csv('data/rt.reviews.tsv', sep='\t', encoding='latin1')

# Per original notes: title ID, title, region, language
title_akas_df = pd.read_csv('data/title.akas.csv')

# Columns: tconst, primary_title, original_title, start_year, runtime_minutes, genres
# (same file that was read into df1 earlier)
title_basics_df = pd.read_csv('data/title.basics.csv')

# Per original notes: title ID (tconst), directors (nconst), writers (nconst)
title_crew_df = pd.read_csv('data/title.crew.csv')

# Per original notes: title ID (tconst), people (nconst), category (primary profession)
title_principals_df = pd.read_csv('data/title.principals.csv')

# Columns: tconst, averagerating, numvotes
# (same file that was read into df2 earlier)
title_ratings_df = pd.read_csv('data/title.ratings.csv')

# Per original notes: genre, language, original title, popularity, date, title,
# average rating, number of ratings
tmdb_movies_df = pd.read_csv('data/tmdb.movies.csv')

# Columns: id, release_date, movie, production_budget, domestic_gross, worldwide_gross
movie_budgets_df = pd.read_csv('data/tn.movie_budgets.csv')

In [41]:
# Summarize movie_gross_df: column dtypes and non-null counts.
# NOTE(review): foreign_gross comes back as object rather than a numeric
# dtype, and both gross columns have missing values — it will presumably need
# conversion before any arithmetic.
movie_gross_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           3387 non-null   object 
 1   studio          3382 non-null   object 
 2   domestic_gross  3359 non-null   float64
 3   foreign_gross   2037 non-null   object 
 4   year            3387 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [58]:
# Display the grosses table ordered by title. This is a plain string sort, so
# titles starting with punctuation or digits come first and lowercase titles
# last. sort_values returns a sorted copy; movie_gross_df itself is unchanged.
movie_gross_df.sort_values(by='title')

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
2093,'71,RAtt.,1300000.0,355000,2015
1830,"1,000 Times Good Night",FM,53900.0,,2014
2395,10 Cloverfield Lane,Par.,72100000.0,38100000,2016
1040,10 Years,Anch.,203000.0,,2012
2283,1001 Grams,KL,11000.0,,2015
...,...,...,...,...,...
2426,Zoolander 2,Par.,28800000.0,27900000,2016
2325,Zootopia,BV,341300000.0,682500000,2016
148,[Rec] 2,Magn.,27800.0,18500000,2010
2865,mother!,Par.,17800000.0,26700000,2017


In [40]:
# Summarize movie_budgets_df: column dtypes and non-null counts.
# NOTE(review): production_budget, domestic_gross and worldwide_gross are all
# object dtype, and release_date is object too — these will presumably need
# parsing before numeric or date operations.
movie_budgets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [59]:
# Display the budgets table ordered by movie title. This is a plain string
# sort, so titles starting with punctuation or digits come first and lowercase
# titles last. sort_values returns a sorted copy; movie_budgets_df itself is
# unchanged.
movie_budgets_df.sort_values(by='movie')

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
5115,16,"Nov 20, 2015",#Horror,"$1,500,000",$0,$0
3954,55,"Jul 17, 2009",(500) Days of Summer,"$7,500,000","$32,425,665","$34,439,060"
4253,54,"Mar 11, 2016",10 Cloverfield Lane,"$5,000,000","$72,082,999","$108,286,422"
3447,48,"Nov 11, 2015",10 Days in a Madhouse,"$12,000,000","$14,616","$14,616"
3262,63,"Mar 31, 1999",10 Things I Hate About You,"$13,000,000","$38,177,966","$60,413,950"
...,...,...,...,...,...,...
1958,59,"Sep 15, 2017",mother!,"$30,000,000","$17,800,004","$42,531,076"
697,98,"Aug 9, 2002",xXx,"$70,000,000","$141,930,000","$267,200,000"
514,15,"Jan 20, 2017",xXx: Return of Xander Cage,"$85,000,000","$44,898,413","$345,033,359"
4756,57,"Apr 15, 2008",Ã l\'intÃ©rieur,"$3,000,000",$0,"$895,932"
