In [1]:
import pandas as pd
import numpy as np

data_info = pd.read_csv('data/zippedData/rotten_tomatoes_movies.csv.gz')
data_ratings = pd.read_csv('data/zippedData/imdb.title.ratings.csv.gz')
data_budgets = pd.read_csv('data/zippedData/tn.movie_budgets.csv.gz') 
data_reviews = pd.read_csv('data/zippedData/rotten_tomatoes_critic_reviews.csv.gz')
data_movies = pd.read_csv('data/zippedData/tmdb.movies.csv.gz')


# Observing the different tables. 

**Considering movie information, budget, and reviews to understand what their data represents.**

In [2]:
data_movies.head(2)

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610


In [3]:
data_budgets.head(2)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"


In [4]:
new_movie = data_info.loc[:,('movie_title','critics_consensus','actors','production_company','tomatometer_status','tomatometer_rating','audience_rating','tomatometer_rotten_critics_count')]

new_movie.head(1)

Unnamed: 0,movie_title,critics_consensus,actors,production_company,tomatometer_status,tomatometer_rating,audience_rating,tomatometer_rotten_critics_count
0,Percy Jackson & the Olympians: The Lightning T...,Though it may seem like just another Harry Pot...,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",20th Century Fox,Rotten,49.0,53.0,76


In [5]:
data_ratings.head(2)

Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559


In [6]:
data_reviews.head(2)

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."


# Organizing & Cleaning Data

## Merging multiple tables to get more informed data

In [7]:
movie_overview= pd.merge(new_movie,data_budgets,how='inner',left_on='movie_title',right_on='movie')

In [8]:
movie_overview2 = pd.merge(movie_overview,data_movies, how='inner', left_on='movie_title',right_on='title')
movie_overview_columnDrop = movie_overview2.drop(['movie','genre_ids','original_title','title','original_language','release_date_x','id_x'], axis = 1)
movie_overview_columnDrop


Unnamed: 0,movie_title,critics_consensus,actors,production_company,tomatometer_status,tomatometer_rating,audience_rating,tomatometer_rotten_critics_count,production_budget,domestic_gross,worldwide_gross,id_y,popularity,release_date_y,vote_average,vote_count
0,Percy Jackson & the Olympians: The Lightning T...,Though it may seem like just another Harry Pot...,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",20th Century Fox,Rotten,49.0,53.0,76,"$95,000,000","$88,768,303","$223,050,874",32657,26.691,2010-02-11,6.1,4229
1,Please Give,Nicole Holofcener's newest might seem slight i...,"Catherine Keener, Amanda Peet, Oliver Platt, R...",Sony Pictures Classics,Certified-Fresh,87.0,64.0,19,"$3,000,000","$4,033,574","$4,570,178",40247,5.945,2010-01-22,6.3,81
2,Criminal,"If you saw Nine Queens, it may feel redundant,...","John C. Reilly, Diego Luna, Maggie Gyllenhaal,...",Warner Bros. Pictures,Fresh,69.0,57.0,39,"$31,500,000","$14,708,696","$38,771,262",302156,13.651,2016-04-15,5.9,897
3,Criminal,Despite the valiant efforts of a game and tale...,"Kevin Costner, Gary Oldman, Tommy Lee Jones, R...",Summit Entertainment,Rotten,30.0,47.0,94,"$31,500,000","$14,708,696","$38,771,262",302156,13.651,2016-04-15,5.9,897
4,Going the Distance,,"Christopher Jacot, Shawn Roberts, Ryan Bellevi...",Séville Pictures,Rotten,0.0,61.0,5,"$32,000,000","$17,804,299","$43,603,990",38073,8.281,2010-09-03,6.0,394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,Zodiac,"A quiet, dialogue-driven thriller that deliver...","Jake Gyllenhaal, Mark Ruffalo, Robert Downey J...",Paramount Pictures,Certified-Fresh,89.0,77.0,28,"$85,000,000","$33,080,084","$83,080,084",285135,4.180,2014-08-16,3.8,49
2496,Zookeeper,Zookeeper smothers Kevin James's with a sodden...,"Kevin James, Rosario Dawson, Ken Jeong, Leslie...",Columbia Pictures,Rotten,14.0,41.0,114,"$80,000,000","$80,360,866","$170,805,525",38317,10.764,2011-07-08,5.3,886
2497,Zoolander 2,Zoolander No. 2 has more celebrity cameos than...,"Ben Stiller, Owen Wilson, Will Ferrell, Penelo...",Paramount Pictures,Rotten,22.0,20.0,181,"$50,000,000","$28,848,693","$55,348,693",329833,12.997,2016-02-12,4.7,1374
2498,Zoom,Lacking the punch and good cheer of The Incred...,"Tim Allen, Courteney Cox, Chevy Chase, Spencer...",Sony Pictures Entertainment,Rotten,4.0,33.0,65,"$35,000,000","$11,989,328","$12,506,188",351065,3.434,2016-09-02,5.5,43


# Find thost frequently appearing actors in top performing films

# Find the correlation between actors and revenue

In [9]:
example = movie_overview_columnDrop.copy()
example.actors = example.actors.str.split(',')

In [10]:
example = example.explode('actors')

In [11]:
example.columns

Index(['movie_title', 'critics_consensus', 'actors', 'production_company',
       'tomatometer_status', 'tomatometer_rating', 'audience_rating',
       'tomatometer_rotten_critics_count', 'production_budget',
       'domestic_gross', 'worldwide_gross', 'id_y', 'popularity',
       'release_date_y', 'vote_average', 'vote_count'],
      dtype='object')

In [12]:
example = example[example.actors.isin(example.actors.value_counts()[:100].index.tolist())]

In [13]:
example = pd.get_dummies(example, columns = ['actors'])

In [14]:
example[example.columns[15:].tolist() + ['worldwide_gross']].corr()

Unnamed: 0,actors_ Adrian Martinez,actors_ Alan Tudyk,actors_ Alexander Flores,actors_ Allison Janney,actors_ Amy Adams,actors_ Ananais J. Dixon,actors_ Anthony Mackie,actors_ April M. Lawrence,actors_ April Winchell,actors_ Beau Knapp,...,actors_ Woody Harrelson,actors_Gbenga Akinnagbe,actors_Jim Parsons,actors_Johnny Depp,actors_Liam Neeson,actors_Marcia Gay Harden,actors_Mary Rigby-Abernathy,actors_Tom Cruise,actors_Tom Hardy,actors_Yann Arthus-Bertrand
actors_ Adrian Martinez,1.000000,-0.012130,-0.012675,-0.010650,-0.010650,-0.011848,-0.011848,-0.011848,-0.011848,-0.011848,...,-0.012405,-0.011848,-0.011848,-0.010650,-0.010961,-0.011848,-0.011848,-0.011264,-0.010961,-0.011848
actors_ Alan Tudyk,-0.012130,1.000000,-0.010834,-0.009103,-0.009103,-0.010127,-0.010127,-0.010127,-0.010127,-0.010127,...,-0.010603,-0.010127,-0.010127,-0.009103,-0.009369,-0.010127,-0.010127,-0.009628,-0.009369,-0.010127
actors_ Alexander Flores,-0.012675,-0.010834,1.000000,-0.009512,-0.009512,-0.010582,-0.010582,-0.010582,-0.010582,-0.010582,...,-0.011080,-0.010582,-0.010582,-0.009512,-0.009790,-0.010582,-0.010582,-0.010061,-0.009790,-0.010582
actors_ Allison Janney,-0.010650,-0.009103,-0.009512,1.000000,-0.007992,-0.008892,-0.008892,-0.008892,-0.008892,-0.008892,...,-0.009310,-0.008892,-0.008892,-0.007992,-0.008226,-0.008892,-0.008892,-0.008454,-0.008226,-0.008892
actors_ Amy Adams,-0.010650,-0.009103,-0.009512,-0.007992,1.000000,-0.008892,-0.008892,-0.008892,-0.008892,-0.008892,...,-0.009310,-0.008892,-0.008892,-0.007992,-0.008226,-0.008892,-0.008892,-0.008454,-0.008226,-0.008892
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
actors_Marcia Gay Harden,-0.011848,-0.010127,-0.010582,-0.008892,-0.008892,-0.009892,-0.009892,-0.009892,-0.009892,-0.009892,...,-0.010357,-0.009892,-0.009892,-0.008892,-0.009151,1.000000,-0.009892,-0.009404,-0.009151,-0.009892
actors_Mary Rigby-Abernathy,-0.011848,-0.010127,-0.010582,-0.008892,-0.008892,-0.009892,-0.009892,-0.009892,-0.009892,-0.009892,...,-0.010357,-0.009892,-0.009892,-0.008892,-0.009151,-0.009892,1.000000,-0.009404,-0.009151,-0.009892
actors_Tom Cruise,-0.011264,-0.009628,-0.010061,-0.008454,-0.008454,-0.009404,-0.009404,-0.009404,-0.009404,-0.009404,...,-0.009847,-0.009404,-0.009404,-0.008454,-0.008701,-0.009404,-0.009404,1.000000,-0.008701,-0.009404
actors_Tom Hardy,-0.010961,-0.009369,-0.009790,-0.008226,-0.008226,-0.009151,-0.009151,-0.009151,-0.009151,-0.009151,...,-0.009582,-0.009151,-0.009151,-0.008226,-0.008467,-0.009151,-0.009151,-0.008701,1.000000,-0.009151


## Drop Null Values  - if present 

1. **Observation:**

            Checking how many null values are present.

In [15]:
movie_overview_columnDrop.isnull().sum()

movie_title                           0
critics_consensus                   429
actors                               10
production_company                   18
tomatometer_status                    0
tomatometer_rating                    0
audience_rating                       5
tomatometer_rotten_critics_count      0
production_budget                     0
domestic_gross                        0
worldwide_gross                       0
id_y                                  0
popularity                            0
release_date_y                        0
vote_average                          0
vote_count                            0
dtype: int64

2. **Sanity Check:**

           Double checking to ensure that we are not removing more that roughly 1% of our tables when removing 
           duplicates.

In [16]:
cellTotal = np.product(movie_overview_columnDrop.shape)

missing_values_count = movie_overview_columnDrop.isnull().sum()
total_missing = missing_values_count.sum()


percent_missing = (total_missing/cellTotal) * 100
print(percent_missing)

1.155


3. **Remove Rows With Empty Values:**

In [17]:
movie_overview_columnDrop

Unnamed: 0,movie_title,critics_consensus,actors,production_company,tomatometer_status,tomatometer_rating,audience_rating,tomatometer_rotten_critics_count,production_budget,domestic_gross,worldwide_gross,id_y,popularity,release_date_y,vote_average,vote_count
0,Percy Jackson & the Olympians: The Lightning T...,Though it may seem like just another Harry Pot...,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",20th Century Fox,Rotten,49.0,53.0,76,"$95,000,000","$88,768,303","$223,050,874",32657,26.691,2010-02-11,6.1,4229
1,Please Give,Nicole Holofcener's newest might seem slight i...,"Catherine Keener, Amanda Peet, Oliver Platt, R...",Sony Pictures Classics,Certified-Fresh,87.0,64.0,19,"$3,000,000","$4,033,574","$4,570,178",40247,5.945,2010-01-22,6.3,81
2,Criminal,"If you saw Nine Queens, it may feel redundant,...","John C. Reilly, Diego Luna, Maggie Gyllenhaal,...",Warner Bros. Pictures,Fresh,69.0,57.0,39,"$31,500,000","$14,708,696","$38,771,262",302156,13.651,2016-04-15,5.9,897
3,Criminal,Despite the valiant efforts of a game and tale...,"Kevin Costner, Gary Oldman, Tommy Lee Jones, R...",Summit Entertainment,Rotten,30.0,47.0,94,"$31,500,000","$14,708,696","$38,771,262",302156,13.651,2016-04-15,5.9,897
4,Going the Distance,,"Christopher Jacot, Shawn Roberts, Ryan Bellevi...",Séville Pictures,Rotten,0.0,61.0,5,"$32,000,000","$17,804,299","$43,603,990",38073,8.281,2010-09-03,6.0,394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,Zodiac,"A quiet, dialogue-driven thriller that deliver...","Jake Gyllenhaal, Mark Ruffalo, Robert Downey J...",Paramount Pictures,Certified-Fresh,89.0,77.0,28,"$85,000,000","$33,080,084","$83,080,084",285135,4.180,2014-08-16,3.8,49
2496,Zookeeper,Zookeeper smothers Kevin James's with a sodden...,"Kevin James, Rosario Dawson, Ken Jeong, Leslie...",Columbia Pictures,Rotten,14.0,41.0,114,"$80,000,000","$80,360,866","$170,805,525",38317,10.764,2011-07-08,5.3,886
2497,Zoolander 2,Zoolander No. 2 has more celebrity cameos than...,"Ben Stiller, Owen Wilson, Will Ferrell, Penelo...",Paramount Pictures,Rotten,22.0,20.0,181,"$50,000,000","$28,848,693","$55,348,693",329833,12.997,2016-02-12,4.7,1374
2498,Zoom,Lacking the punch and good cheer of The Incred...,"Tim Allen, Courteney Cox, Chevy Chase, Spencer...",Sony Pictures Entertainment,Rotten,4.0,33.0,65,"$35,000,000","$11,989,328","$12,506,188",351065,3.434,2016-09-02,5.5,43


In [18]:
movie_overview_dropNull = movie_overview_columnDrop.dropna()
movie_overview_dropNull

Unnamed: 0,movie_title,critics_consensus,actors,production_company,tomatometer_status,tomatometer_rating,audience_rating,tomatometer_rotten_critics_count,production_budget,domestic_gross,worldwide_gross,id_y,popularity,release_date_y,vote_average,vote_count
0,Percy Jackson & the Olympians: The Lightning T...,Though it may seem like just another Harry Pot...,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",20th Century Fox,Rotten,49.0,53.0,76,"$95,000,000","$88,768,303","$223,050,874",32657,26.691,2010-02-11,6.1,4229
1,Please Give,Nicole Holofcener's newest might seem slight i...,"Catherine Keener, Amanda Peet, Oliver Platt, R...",Sony Pictures Classics,Certified-Fresh,87.0,64.0,19,"$3,000,000","$4,033,574","$4,570,178",40247,5.945,2010-01-22,6.3,81
2,Criminal,"If you saw Nine Queens, it may feel redundant,...","John C. Reilly, Diego Luna, Maggie Gyllenhaal,...",Warner Bros. Pictures,Fresh,69.0,57.0,39,"$31,500,000","$14,708,696","$38,771,262",302156,13.651,2016-04-15,5.9,897
3,Criminal,Despite the valiant efforts of a game and tale...,"Kevin Costner, Gary Oldman, Tommy Lee Jones, R...",Summit Entertainment,Rotten,30.0,47.0,94,"$31,500,000","$14,708,696","$38,771,262",302156,13.651,2016-04-15,5.9,897
5,Going the Distance,It's timelier and a little more honest than mo...,"Drew Barrymore, Justin Long, Charlie Day, Jaso...",New Line Cinema,Rotten,54.0,52.0,77,"$32,000,000","$17,804,299","$43,603,990",38073,8.281,2010-09-03,6.0,394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,Zodiac,"A quiet, dialogue-driven thriller that deliver...","Jake Gyllenhaal, Mark Ruffalo, Robert Downey J...",Paramount Pictures,Certified-Fresh,89.0,77.0,28,"$85,000,000","$33,080,084","$83,080,084",285135,4.180,2014-08-16,3.8,49
2496,Zookeeper,Zookeeper smothers Kevin James's with a sodden...,"Kevin James, Rosario Dawson, Ken Jeong, Leslie...",Columbia Pictures,Rotten,14.0,41.0,114,"$80,000,000","$80,360,866","$170,805,525",38317,10.764,2011-07-08,5.3,886
2497,Zoolander 2,Zoolander No. 2 has more celebrity cameos than...,"Ben Stiller, Owen Wilson, Will Ferrell, Penelo...",Paramount Pictures,Rotten,22.0,20.0,181,"$50,000,000","$28,848,693","$55,348,693",329833,12.997,2016-02-12,4.7,1374
2498,Zoom,Lacking the punch and good cheer of The Incred...,"Tim Allen, Courteney Cox, Chevy Chase, Spencer...",Sony Pictures Entertainment,Rotten,4.0,33.0,65,"$35,000,000","$11,989,328","$12,506,188",351065,3.434,2016-09-02,5.5,43


## Drop duplicate values - if present 

In [None]:
movie_overview_dropDuplicates = movie_overview_dropNull.drop_duplicates(subset=['movie_title'], keep='last')

Again ensuring that there are no rows without information present

In [None]:
movie_overview_dropDuplicates.isnull().sum()

# Checking company and budget

In [24]:
movie_overview_dropDuplicates.head(3)

Unnamed: 0,movie_title,critics_consensus,actors,production_company,tomatometer_status,tomatometer_rating,audience_rating,tomatometer_rotten_critics_count,production_budget,domestic_gross,worldwide_gross,id_y,popularity,release_date_y,vote_average,vote_count
0,Percy Jackson & the Olympians: The Lightning T...,Though it may seem like just another Harry Pot...,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",20th Century Fox,Rotten,49.0,53.0,76,"$95,000,000","$88,768,303","$223,050,874",32657,26.691,2010-02-11,6.1,4229
1,Please Give,Nicole Holofcener's newest might seem slight i...,"Catherine Keener, Amanda Peet, Oliver Platt, R...",Sony Pictures Classics,Certified-Fresh,87.0,64.0,19,"$3,000,000","$4,033,574","$4,570,178",40247,5.945,2010-01-22,6.3,81
3,Criminal,Despite the valiant efforts of a game and tale...,"Kevin Costner, Gary Oldman, Tommy Lee Jones, R...",Summit Entertainment,Rotten,30.0,47.0,94,"$31,500,000","$14,708,696","$38,771,262",302156,13.651,2016-04-15,5.9,897


Taking dollar signs and commas out of the wide gross and production columns to ensure that they can be divided as integers and or floats.


In [26]:
movie_overview_dropDuplicates['domestic_gross']= movie_overview_dropDuplicates['domestic_gross'].str.replace('$','').str.replace(',','')
movie_overview_dropDuplicates

  movie_overview_dropDuplicates['domestic_gross']= movie_overview_dropDuplicates['domestic_gross'].str.replace('$','').str.replace(',','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_overview_dropDuplicates['domestic_gross']= movie_overview_dropDuplicates['domestic_gross'].str.replace('$','').str.replace(',','')


Unnamed: 0,movie_title,critics_consensus,actors,production_company,tomatometer_status,tomatometer_rating,audience_rating,tomatometer_rotten_critics_count,production_budget,domestic_gross,worldwide_gross,id_y,popularity,release_date_y,vote_average,vote_count
0,Percy Jackson & the Olympians: The Lightning T...,Though it may seem like just another Harry Pot...,"Logan Lerman, Brandon T. Jackson, Alexandra Da...",20th Century Fox,Rotten,49.0,53.0,76,"$95,000,000",88768303,"$223,050,874",32657,26.691,2010-02-11,6.1,4229
1,Please Give,Nicole Holofcener's newest might seem slight i...,"Catherine Keener, Amanda Peet, Oliver Platt, R...",Sony Pictures Classics,Certified-Fresh,87.0,64.0,19,"$3,000,000",4033574,"$4,570,178",40247,5.945,2010-01-22,6.3,81
3,Criminal,Despite the valiant efforts of a game and tale...,"Kevin Costner, Gary Oldman, Tommy Lee Jones, R...",Summit Entertainment,Rotten,30.0,47.0,94,"$31,500,000",14708696,"$38,771,262",302156,13.651,2016-04-15,5.9,897
5,Going the Distance,It's timelier and a little more honest than mo...,"Drew Barrymore, Justin Long, Charlie Day, Jaso...",New Line Cinema,Rotten,54.0,52.0,77,"$32,000,000",17804299,"$43,603,990",38073,8.281,2010-09-03,6.0,394
9,Moonlight,Moonlight uses one man's story to offer a rema...,"Naomie Harris, André Holland, Mahershala Ali, ...",A24 Films,Certified-Fresh,98.0,79.0,7,"$1,500,000",27854931,"$65,245,512",376867,15.948,2016-10-21,7.4,3893
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,Zodiac,"A quiet, dialogue-driven thriller that deliver...","Jake Gyllenhaal, Mark Ruffalo, Robert Downey J...",Paramount Pictures,Certified-Fresh,89.0,77.0,28,"$85,000,000",33080084,"$83,080,084",285135,4.180,2014-08-16,3.8,49
2496,Zookeeper,Zookeeper smothers Kevin James's with a sodden...,"Kevin James, Rosario Dawson, Ken Jeong, Leslie...",Columbia Pictures,Rotten,14.0,41.0,114,"$80,000,000",80360866,"$170,805,525",38317,10.764,2011-07-08,5.3,886
2497,Zoolander 2,Zoolander No. 2 has more celebrity cameos than...,"Ben Stiller, Owen Wilson, Will Ferrell, Penelo...",Paramount Pictures,Rotten,22.0,20.0,181,"$50,000,000",28848693,"$55,348,693",329833,12.997,2016-02-12,4.7,1374
2498,Zoom,Lacking the punch and good cheer of The Incred...,"Tim Allen, Courteney Cox, Chevy Chase, Spencer...",Sony Pictures Entertainment,Rotten,4.0,33.0,65,"$35,000,000",11989328,"$12,506,188",351065,3.434,2016-09-02,5.5,43


In [23]:
movie_overview_toString

NameError: name 'movie_overview_toString' is not defined

# Further investigate critic comments

In [None]:
movie_overview_dropNull['critics_consensus'].value_counts