In [1]:
import numpy as np
import pandas as pd
import json

## Merging Data Scraped From Multiple Sources Over Multiple Iterations

With two rounds of data each scraped from IMDB, Rotten Tomatoes, Box Office Mojo, and The Numbers, merging the dataframes required some attention and care.  


The dataframes were joined on the lowercased underscored title string. However, due to issues with multiple movies sharing titles, or remakes of the same movie over a forty year period, it was also necessary to carefully weed out duplicate records, and to ensure that accurate release dates and budget information were appended to the correct film records.

At this point, I also did some string cleaning and typecasting for data standardization.


In [2]:
imdb_df1 = pd.read_json('imdb_data_cleaned.json', lines=True)
imdb_df2 = pd.read_json('imdb_data_cleaned_2.json', lines=True)

In [3]:
imdb_df1.head()

Unnamed: 0,title,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,genre_02,genre_03,imdb_rating,votes,metascore
0,Tenet,tt6723592,/title/tt6723592/?ref_=adv_li_tt,2020.0,150.0,PG-13,Christopher Nolan,nm0634240,/name/nm0634240/?ref_=adv_li_dr_0,Action,Sci-Fi,Thriller,7.5,268688.0,69.0
1,The Midnight Sky,tt10539608,/title/tt10539608/?ref_=adv_li_tt,2020.0,118.0,PG-13,George Clooney,nm0000123,/name/nm0000123/?ref_=adv_li_dr_0,Drama,Fantasy,Sci-Fi,5.6,58011.0,58.0
2,Wonder Woman,tt0451279,/title/tt0451279/?ref_=adv_li_tt,2017.0,141.0,PG-13,Patty Jenkins,nm0420941,/name/nm0420941/?ref_=adv_li_dr_0,Action,Adventure,Fantasy,7.4,564054.0,76.0
3,Avengers: Endgame,tt4154796,/title/tt4154796/?ref_=adv_li_tt,2019.0,181.0,PG-13,,,,Action,Adventure,Drama,8.4,802218.0,78.0
4,Outside the Wire,tt10451914,/title/tt10451914/?ref_=adv_li_tt,2021.0,114.0,R,Mikael Håfström,nm0405632,/name/nm0405632/?ref_=adv_li_dr_0,Action,Adventure,Fantasy,5.4,9333.0,47.0


In [4]:
imdb_df2.head()

Unnamed: 0,title,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,genre_02,genre_03,imdb_rating,votes,metascore
0,Beneath,tt2325518,/title/tt2325518/?ref_=adv_li_tt,2013,90,Not Rated,Larry Fessenden,nm0275244,/name/nm0275244/?ref_=adv_li_dr_0,Horror,Sci-Fi,Thriller,3.7,2981,40
1,The Million Dollar Duck,tt0066728,/title/tt0066728/?ref_=adv_li_tt,1971,89,G,Vincent McEveety,nm0568546,/name/nm0568546/?ref_=adv_li_dr_0,Comedy,Family,Sci-Fi,5.9,2021,45
2,Cities of Last Things,tt4397342,/title/tt4397342/?ref_=adv_li_tt,2018,106,TV-MA,Wi Ding Ho,nm0387399,/name/nm0387399/?ref_=adv_li_dr_0,Crime,Drama,Sci-Fi,6.2,1227,66
3,Lazer Team,tt3864024,/title/tt3864024/?ref_=adv_li_tt,2015,102,PG-13,Matt Hullum,nm0401502,/name/nm0401502/?ref_=adv_li_dr_0,Action,Comedy,Sci-Fi,5.6,9128,42
4,The Powerpuff Girls Movie,tt0289408,/title/tt0289408/?ref_=adv_li_tt,2002,73,PG,Craig McCracken,nm0566833,/name/nm0566833/?ref_=adv_li_dr_0,Animation,Action,Adventure,6.6,9200,65


In [5]:
imdb_df1.shape

(2000, 15)

In [6]:
imdb_df2.shape

(83, 15)

In [13]:
imdb_df_full = pd.concat([imdb_df1, imdb_df2], axis=0)

In [14]:
imdb_df_full.head()

Unnamed: 0,title,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,genre_02,genre_03,imdb_rating,votes,metascore
0,Tenet,tt6723592,/title/tt6723592/?ref_=adv_li_tt,2020.0,150.0,PG-13,Christopher Nolan,nm0634240,/name/nm0634240/?ref_=adv_li_dr_0,Action,Sci-Fi,Thriller,7.5,268688.0,69.0
1,The Midnight Sky,tt10539608,/title/tt10539608/?ref_=adv_li_tt,2020.0,118.0,PG-13,George Clooney,nm0000123,/name/nm0000123/?ref_=adv_li_dr_0,Drama,Fantasy,Sci-Fi,5.6,58011.0,58.0
2,Wonder Woman,tt0451279,/title/tt0451279/?ref_=adv_li_tt,2017.0,141.0,PG-13,Patty Jenkins,nm0420941,/name/nm0420941/?ref_=adv_li_dr_0,Action,Adventure,Fantasy,7.4,564054.0,76.0
3,Avengers: Endgame,tt4154796,/title/tt4154796/?ref_=adv_li_tt,2019.0,181.0,PG-13,,,,Action,Adventure,Drama,8.4,802218.0,78.0
4,Outside the Wire,tt10451914,/title/tt10451914/?ref_=adv_li_tt,2021.0,114.0,R,Mikael Håfström,nm0405632,/name/nm0405632/?ref_=adv_li_dr_0,Action,Adventure,Fantasy,5.4,9333.0,47.0


In [15]:
imdb_df_full.shape

(2083, 15)

In [16]:
mojo_df1 = pd.read_json('mojo_data.json', lines=True)
mojo_df2 = pd.read_json('mojo_data2.json', lines=True)

In [17]:
mojo_df1.shape, mojo_df2.shape

((2000, 4), (83, 4))

In [18]:
mojo_df_full = pd.concat([mojo_df1, mojo_df2], axis =0)

In [20]:
mojo_df_full.shape

(2083, 4)

In [21]:
tomato_df1 = pd.read_json('tomato_data1.json', lines=True)
tomato_df2 = pd.read_json('tomato_data2.json', lines=True)

In [22]:
tomato_df1.shape, tomato_df2.shape

((2000, 5), (2083, 5))

In [23]:
tomato_df_full = tomato_df2

In [24]:
imdb_df_full.head()

Unnamed: 0,title,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,genre_02,genre_03,imdb_rating,votes,metascore
0,Tenet,tt6723592,/title/tt6723592/?ref_=adv_li_tt,2020.0,150.0,PG-13,Christopher Nolan,nm0634240,/name/nm0634240/?ref_=adv_li_dr_0,Action,Sci-Fi,Thriller,7.5,268688.0,69.0
1,The Midnight Sky,tt10539608,/title/tt10539608/?ref_=adv_li_tt,2020.0,118.0,PG-13,George Clooney,nm0000123,/name/nm0000123/?ref_=adv_li_dr_0,Drama,Fantasy,Sci-Fi,5.6,58011.0,58.0
2,Wonder Woman,tt0451279,/title/tt0451279/?ref_=adv_li_tt,2017.0,141.0,PG-13,Patty Jenkins,nm0420941,/name/nm0420941/?ref_=adv_li_dr_0,Action,Adventure,Fantasy,7.4,564054.0,76.0
3,Avengers: Endgame,tt4154796,/title/tt4154796/?ref_=adv_li_tt,2019.0,181.0,PG-13,,,,Action,Adventure,Drama,8.4,802218.0,78.0
4,Outside the Wire,tt10451914,/title/tt10451914/?ref_=adv_li_tt,2021.0,114.0,R,Mikael Håfström,nm0405632,/name/nm0405632/?ref_=adv_li_dr_0,Action,Adventure,Fantasy,5.4,9333.0,47.0


In [25]:
tomato_df_full.head()

Unnamed: 0,title,tomatometer,tomato_ct,aud_score,aud_score_ct
0,tenet,70.0,324.0,76.0,"Verified Ratings: 5,837"
1,the_midnight_sky,51.0,226.0,26.0,2122
2,wonder_woman,83.0,12.0,78.0,1910
3,avengers_endgame,94.0,532.0,90.0,70830
4,outside_the_wire,36.0,58.0,33.0,295


In [26]:
imdb_df_full['title_cc'] = imdb_df_full['title'].str.lower() 
imdb_df_full['title_cc'] = imdb_df_full['title_cc'].str.replace('&','and',regex=False)
imdb_df_full['title_cc'] = imdb_df_full['title_cc'].str.replace('\\','_',regex=True)
imdb_df_full['title_cc'] = imdb_df_full['title_cc'].str.replace('episode\s([ivx]*)\s-\s','',regex=True)
imdb_df_full['title_cc'] = imdb_df_full['title_cc'].str.replace("\s",'_',regex=True)
imdb_df_full['title_cc'] = imdb_df_full['title_cc'].str.replace('_+','_',regex=True)

In [27]:
imdb_df_full.head()

Unnamed: 0,title,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,genre_02,genre_03,imdb_rating,votes,metascore,title_cc
0,Tenet,tt6723592,/title/tt6723592/?ref_=adv_li_tt,2020.0,150.0,PG-13,Christopher Nolan,nm0634240,/name/nm0634240/?ref_=adv_li_dr_0,Action,Sci-Fi,Thriller,7.5,268688.0,69.0,tenet
1,The Midnight Sky,tt10539608,/title/tt10539608/?ref_=adv_li_tt,2020.0,118.0,PG-13,George Clooney,nm0000123,/name/nm0000123/?ref_=adv_li_dr_0,Drama,Fantasy,Sci-Fi,5.6,58011.0,58.0,the_midnight_sky
2,Wonder Woman,tt0451279,/title/tt0451279/?ref_=adv_li_tt,2017.0,141.0,PG-13,Patty Jenkins,nm0420941,/name/nm0420941/?ref_=adv_li_dr_0,Action,Adventure,Fantasy,7.4,564054.0,76.0,wonder_woman
3,Avengers: Endgame,tt4154796,/title/tt4154796/?ref_=adv_li_tt,2019.0,181.0,PG-13,,,,Action,Adventure,Drama,8.4,802218.0,78.0,avengers:_endgame
4,Outside the Wire,tt10451914,/title/tt10451914/?ref_=adv_li_tt,2021.0,114.0,R,Mikael Håfström,nm0405632,/name/nm0405632/?ref_=adv_li_dr_0,Action,Adventure,Fantasy,5.4,9333.0,47.0,outside_the_wire


In [52]:
mojo_df_full.head()

Unnamed: 0,id,title_string,dom_gross,release_date
0,tt6723592,Tenet - Box Office Mojo,"$57,929,000","August 26, 2020\n (EMEA, APAC)"
1,tt10539608,The Midnight Sky - Box Office Mojo,"$62,557","December 9, 2020\n (South Korea)"
2,tt0451279,Wonder Woman - Box Office Mojo,"$412,815,408","May 30, 2017\n (APAC)"
3,tt4154796,Avengers: Endgame - Box Office Mojo,"$858,373,000","April 24, 2019\n (21 markets)"
4,tt10451914,Outside the Wire - Box Office Mojo,,


In [32]:
imdb_df_full.shape

(2083, 16)

In [33]:
tomato_df_full.shape

(2083, 5)

In [50]:
mojo_df_full.shape

(2083, 4)

In [35]:
imdb_df_full = imdb_df_full.sort_values(by='title_cc', ascending=True)

In [36]:
tomato_df_full = tomato_df_full.sort_values(by='title', ascending=True)

In [54]:
imdb_df_full = imdb_df_full.sort_values(by='imdb_id', ascending=True)

In [55]:
mojo_df_full = mojo_df_full.sort_values(by='id', ascending=True)

In [48]:
imdb_tomato = pd.merge(imdb_df_full, tomato_df_full, how='right', left_on='title_cc', right_on='title').fillna(np.nan)

In [70]:
imdb_tomato['title_cc'].value_counts()

fantastic_four            9
frankenstein              9
godzilla                  9
the_lost_world            9
cargo                     9
                         ..
the_door                  1
trancers                  1
flesh_for_frankenstein    1
gandahar                  1
the_cabbage_soup          1
Name: title_cc, Length: 1613, dtype: int64

In [87]:
mojo_df_full.shape

(2083, 4)

In [88]:
imdb_df_full.shape

(2083, 16)

In [93]:
imdb_mojo = pd.merge(imdb_df_full, mojo_df_full, how='left', left_on='imdb_id', right_on='imdb_id').fillna(np.nan)

In [81]:
mojo_df_full.rename(columns = {'id':'imdb_id'}, inplace = True)

In [94]:
imdb_mojo.shape

(2093, 19)

In [95]:
imdb_mojo.head()

Unnamed: 0,title,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,genre_02,genre_03,imdb_rating,votes,metascore,title_cc,title_string,dom_gross,release_date
0,The Lost World,tt0016039,/title/tt0016039/?ref_=adv_li_tt,1925.0,110.0,Passed,Harry O. Hoyt,nm0398464,/name/nm0398464/?ref_=adv_li_dr_0,Adventure,Fantasy,Sci-Fi,7.0,4624.0,,the_lost_world,The Lost World - Box Office Mojo,,
1,Metropolis,tt0017136,/title/tt0017136/?ref_=adv_li_tt,1927.0,153.0,Not Rated,Fritz Lang,nm0000485,/name/nm0000485/?ref_=adv_li_dr_0,Drama,Sci-Fi,,8.3,159589.0,98.0,metropolis,Metropolis - Box Office Mojo,"$1,236,166","May 6, 1927\n (Domestic)"
2,Woman in the Moon,tt0019901,/title/tt0019901/?ref_=adv_li_tt,1929.0,95.0,Not Rated,Fritz Lang,nm0000485,/name/nm0000485/?ref_=adv_li_dr_0,Adventure,Comedy,Drama,7.3,3028.0,,woman_in_the_moon,Woman in the Moon - Box Office Mojo,,
3,Frankenstein,tt0021884,/title/tt0021884/?ref_=adv_li_tt,1931.0,70.0,Passed,James Whale,nm0001843,/name/nm0001843/?ref_=adv_li_dr_0,Drama,Horror,Sci-Fi,7.8,65146.0,91.0,frankenstein,Frankenstein - Box Office Mojo,"$1,626","November 21, 1931\n (Domestic)"
4,Svengali,tt0022454,/title/tt0022454/?ref_=adv_li_tt,1931.0,81.0,Approved,Archie Mayo,nm0562845,/name/nm0562845/?ref_=adv_li_dr_0,Drama,Horror,Romance,6.8,1793.0,,svengali,Svengali - Box Office Mojo,,


In [109]:
pd.set_option('display.max_rows', 75)

In [110]:
imdb_mojo['imdb_id'].value_counts()[0:25]

tt9264728    1
tt1483013    1
tt1355630    1
tt5862242    1
tt0074559    1
tt1098327    1
tt0080391    1
tt8633542    1
tt0081182    1
tt2924392    1
tt2580382    1
tt0097100    1
tt6544220    1
tt0109836    1
tt3289728    1
tt0441796    1
tt1139592    1
tt0102975    1
tt0775552    1
tt6644200    1
tt0067065    1
tt2051879    1
tt0016039    1
tt6604188    1
tt1023500    1
Name: imdb_id, dtype: int64

In [75]:
imdb_mojo[imdb_mojo['id'].isin(['tt0102687', 'tt3864024', 'tt0455326', 'tt4397342','tt4720596'])]

Unnamed: 0,title,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,genre_02,genre_03,imdb_rating,votes,metascore,title_cc,id,title_string,dom_gross,release_date
515,Poison,tt0102687,/title/tt0102687/?ref_=adv_li_tt,1991.0,85.0,R,Todd Haynes,nm0001331,/name/nm0001331/?ref_=adv_li_dr_0,Drama,Horror,Romance,6.5,3945.0,67.0,poison,tt0102687,Poison - Box Office Mojo,"$787,280","April 5, 1991\n (Domestic)"
516,Poison,tt0102687,/title/tt0102687/?ref_=adv_li_tt,1991.0,85.0,R,Todd Haynes,nm0001331,/name/nm0001331/?ref_=adv_li_dr_0,Drama,Horror,Romance,6.5,3945.0,67.0,poison,tt0102687,Poison - Box Office Mojo,"$787,280","April 5, 1991\n (Domestic)"
517,Poison,tt0102687,/title/tt0102687/?ref_=adv_li_tt,1991.0,85.0,R,Todd Haynes,nm0001331,/name/nm0001331/?ref_=adv_li_dr_0,Drama,Horror,Romance,6.5,3945.0,67.0,poison,tt0102687,Poison - Box Office Mojo,"$787,280","April 5, 1991\n (Domestic)"
518,Poison,tt0102687,/title/tt0102687/?ref_=adv_li_tt,1991.0,85.0,R,Todd Haynes,nm0001331,/name/nm0001331/?ref_=adv_li_dr_0,Drama,Horror,Romance,6.5,3945.0,67.0,poison,tt0102687,Poison - Box Office Mojo,"$787,280","April 5, 1991\n (Domestic)"
949,Aqua Teen Hunger Force Colon Movie Film for Th...,tt0455326,/title/tt0455326/?ref_=adv_li_tt,2007.0,86.0,R,,,,Animation,Action,Adventure,6.7,13324.0,54.0,aqua_teen_hunger_force_colon_movie_film_for_th...,tt0455326,Aqua Teen Hunger Force Colon Movie Film for Th...,"$5,520,368","April 13, 2007\n (Domestic)"
950,Aqua Teen Hunger Force Colon Movie Film for Th...,tt0455326,/title/tt0455326/?ref_=adv_li_tt,2007.0,86.0,R,,,,Animation,Action,Adventure,6.7,13323.0,54.0,aqua_teen_hunger_force_colon_movie_film_for_th...,tt0455326,Aqua Teen Hunger Force Colon Movie Film for Th...,"$5,520,368","April 13, 2007\n (Domestic)"
951,Aqua Teen Hunger Force Colon Movie Film for Th...,tt0455326,/title/tt0455326/?ref_=adv_li_tt,2007.0,86.0,R,,,,Animation,Action,Adventure,6.7,13324.0,54.0,aqua_teen_hunger_force_colon_movie_film_for_th...,tt0455326,Aqua Teen Hunger Force Colon Movie Film for Th...,"$5,520,368","April 13, 2007\n (Domestic)"
952,Aqua Teen Hunger Force Colon Movie Film for Th...,tt0455326,/title/tt0455326/?ref_=adv_li_tt,2007.0,86.0,R,,,,Animation,Action,Adventure,6.7,13323.0,54.0,aqua_teen_hunger_force_colon_movie_film_for_th...,tt0455326,Aqua Teen Hunger Force Colon Movie Film for Th...,"$5,520,368","April 13, 2007\n (Domestic)"
1640,Lazer Team,tt3864024,/title/tt3864024/?ref_=adv_li_tt,2015.0,102.0,PG-13,Matt Hullum,nm0401502,/name/nm0401502/?ref_=adv_li_dr_0,Action,Comedy,Sci-Fi,5.6,9128.0,42.0,lazer_team,tt3864024,Lazer Team - Box Office Mojo,"$1,186,426","January 27, 2016\n (Domestic)"
1641,Lazer Team,tt3864024,/title/tt3864024/?ref_=adv_li_tt,2015.0,102.0,PG-13,Matt Hullum,nm0401502,/name/nm0401502/?ref_=adv_li_dr_0,Action,Comedy,Sci-Fi,5.6,9127.0,42.0,lazer_team,tt3864024,Lazer Team - Box Office Mojo,"$1,186,426","January 27, 2016\n (Domestic)"


In [97]:
imdb_mojo = imdb_mojo.drop([516,517,518,950, 951,952, 1641,1642,1643,1694,1692,1693, 1733,1734,1735])

In [99]:
imdb_mojo[imdb_mojo['imdb_id'].isin(['tt0102687', 'tt3864024', 'tt0455326', 'tt4397342','tt4720596'])]

Unnamed: 0,title,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,genre_02,genre_03,imdb_rating,votes,metascore,title_cc,title_string,dom_gross,release_date
515,Poison,tt0102687,/title/tt0102687/?ref_=adv_li_tt,1991.0,85.0,R,Todd Haynes,nm0001331,/name/nm0001331/?ref_=adv_li_dr_0,Drama,Horror,Romance,6.5,3945.0,67.0,poison,Poison - Box Office Mojo,"$787,280","April 5, 1991\n (Domestic)"
949,Aqua Teen Hunger Force Colon Movie Film for Th...,tt0455326,/title/tt0455326/?ref_=adv_li_tt,2007.0,86.0,R,,,,Animation,Action,Adventure,6.7,13324.0,54.0,aqua_teen_hunger_force_colon_movie_film_for_th...,Aqua Teen Hunger Force Colon Movie Film for Th...,"$5,520,368","April 13, 2007\n (Domestic)"
1640,Lazer Team,tt3864024,/title/tt3864024/?ref_=adv_li_tt,2015.0,102.0,PG-13,Matt Hullum,nm0401502,/name/nm0401502/?ref_=adv_li_dr_0,Action,Comedy,Sci-Fi,5.6,9128.0,42.0,lazer_team,Lazer Team - Box Office Mojo,"$1,186,426","January 27, 2016\n (Domestic)"
1691,Cities of Last Things,tt4397342,/title/tt4397342/?ref_=adv_li_tt,2018.0,106.0,TV-MA,Wi Ding Ho,nm0387399,/name/nm0387399/?ref_=adv_li_dr_0,Crime,Drama,Sci-Fi,6.2,1227.0,66.0,cities_of_last_things,Cities of Last Things - Box Office Mojo,,
1732,Curvature,tt4720596,/title/tt4720596/?ref_=adv_li_tt,2017.0,90.0,,Diego Hallivis,nm3611504,/name/nm3611504/?ref_=adv_li_dr_0,Mystery,Sci-Fi,Thriller,4.8,1736.0,38.0,curvature,Curvature - Box Office Mojo,,"February 23, 2018\n (Domestic)"


In [100]:
imdb_mojo.shape

(2078, 19)

In [101]:
tomato_df_full.shape

(2083, 5)

In [103]:
tomato_df_full.head()

Unnamed: 0,title,tomatometer,tomato_ct,aud_score,aud_score_ct
1621,1,,,,
1598,100_degrees_below_zero,8.0,,,114.0
1194,100_earthquake,14.0,,,106.0
265,10_cloverfield_lane,90.0,312.0,79.0,60968.0
60,12_monkeys,89.0,70.0,88.0,391142.0


In [104]:
imdb_mojo_tomato = pd.merge(imdb_mojo, tomato_df_full, how='left', left_on='title_cc', right_on='title').fillna(np.nan)

In [105]:
imdb_mojo_tomato.shape

(2208, 24)

In [115]:
title_counts = imdb_mojo_tomato['title_cc'].value_counts()

In [123]:
type(title_counts)

pandas.core.series.Series

In [124]:
title_counts.head()

fantastic_four    9
godzilla          9
frankenstein      9
the_lost_world    9
cargo             9
Name: title_cc, dtype: int64

In [127]:
title_counts = title_counts[title_counts > 1].reset_index()

In [130]:
title_counts.head()

Unnamed: 0,index,title_cc
0,fantastic_four,9
1,godzilla,9
2,frankenstein,9
3,the_lost_world,9
4,cargo,9


In [134]:
title_counts.index

RangeIndex(start=0, stop=60, step=1)

In [135]:
imdb_mojo_tomato[imdb_mojo_tomato['title_cc'].isin(title_counts['index'])]

Unnamed: 0,title_x,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,...,metascore,title_cc,title_string,dom_gross,release_date,title_y,tomatometer,tomato_ct,aud_score,aud_score_ct
0,The Lost World,tt0016039,/title/tt0016039/?ref_=adv_li_tt,1925.0,110.0,Passed,Harry O. Hoyt,nm0398464,/name/nm0398464/?ref_=adv_li_dr_0,Adventure,...,,the_lost_world,The Lost World - Box Office Mojo,,,the_lost_world,100.0,15.0,69.0,3249
1,The Lost World,tt0016039,/title/tt0016039/?ref_=adv_li_tt,1925.0,110.0,Passed,Harry O. Hoyt,nm0398464,/name/nm0398464/?ref_=adv_li_dr_0,Adventure,...,,the_lost_world,The Lost World - Box Office Mojo,,,the_lost_world,100.0,15.0,69.0,3249
2,The Lost World,tt0016039,/title/tt0016039/?ref_=adv_li_tt,1925.0,110.0,Passed,Harry O. Hoyt,nm0398464,/name/nm0398464/?ref_=adv_li_dr_0,Adventure,...,,the_lost_world,The Lost World - Box Office Mojo,,,the_lost_world,100.0,15.0,69.0,3249
3,Metropolis,tt0017136,/title/tt0017136/?ref_=adv_li_tt,1927.0,153.0,Not Rated,Fritz Lang,nm0000485,/name/nm0000485/?ref_=adv_li_dr_0,Drama,...,98.0,metropolis,Metropolis - Box Office Mojo,"$1,236,166","May 6, 1927\n (Domestic)",metropolis,,,,
4,Metropolis,tt0017136,/title/tt0017136/?ref_=adv_li_tt,1927.0,153.0,Not Rated,Fritz Lang,nm0000485,/name/nm0000485/?ref_=adv_li_dr_0,Drama,...,98.0,metropolis,Metropolis - Box Office Mojo,"$1,236,166","May 6, 1927\n (Domestic)",metropolis,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Invasion,tt8060328,/title/tt8060328/?ref_=adv_li_tt,2020.0,134.0,Not Rated,Fedor Bondarchuk,nm0094080,/name/nm0094080/?ref_=adv_li_dr_0,Action,...,,invasion,Invasion - Box Office Mojo,"$16,262,755","November 29, 2019\n (Germany)",invasion,19.0,164.0,40.0,236143
2107,Invasion,tt8060328,/title/tt8060328/?ref_=adv_li_tt,2020.0,134.0,Not Rated,Fedor Bondarchuk,nm0094080,/name/nm0094080/?ref_=adv_li_dr_0,Action,...,,invasion,Invasion - Box Office Mojo,"$16,262,755","November 29, 2019\n (Germany)",invasion,19.0,164.0,40.0,236143
2157,Cargo,tt8992946,/title/tt8992946/?ref_=adv_li_tt,2019.0,119.0,,Arati Kadav,nm6814179,/name/nm6814179/?ref_=adv_li_dr_0,Drama,...,,cargo,Cargo - Box Office Mojo,,,cargo,19.0,,,1070
2158,Cargo,tt8992946,/title/tt8992946/?ref_=adv_li_tt,2019.0,119.0,,Arati Kadav,nm6814179,/name/nm6814179/?ref_=adv_li_dr_0,Drama,...,,cargo,Cargo - Box Office Mojo,,,cargo,19.0,,,1070


In [136]:
imdb_mojo_tomato.drop_duplicates()

Unnamed: 0,title_x,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,...,metascore,title_cc,title_string,dom_gross,release_date,title_y,tomatometer,tomato_ct,aud_score,aud_score_ct
0,The Lost World,tt0016039,/title/tt0016039/?ref_=adv_li_tt,1925.0,110.0,Passed,Harry O. Hoyt,nm0398464,/name/nm0398464/?ref_=adv_li_dr_0,Adventure,...,,the_lost_world,The Lost World - Box Office Mojo,,,the_lost_world,100.0,15.0,69.0,3249
3,Metropolis,tt0017136,/title/tt0017136/?ref_=adv_li_tt,1927.0,153.0,Not Rated,Fritz Lang,nm0000485,/name/nm0000485/?ref_=adv_li_dr_0,Drama,...,98.0,metropolis,Metropolis - Box Office Mojo,"$1,236,166","May 6, 1927\n (Domestic)",metropolis,,,,
5,Woman in the Moon,tt0019901,/title/tt0019901/?ref_=adv_li_tt,1929.0,95.0,Not Rated,Fritz Lang,nm0000485,/name/nm0000485/?ref_=adv_li_dr_0,Adventure,...,,woman_in_the_moon,Woman in the Moon - Box Office Mojo,,,woman_in_the_moon,75.0,8.0,72.0,643
6,Frankenstein,tt0021884,/title/tt0021884/?ref_=adv_li_tt,1931.0,70.0,Passed,James Whale,nm0001843,/name/nm0001843/?ref_=adv_li_dr_0,Drama,...,91.0,frankenstein,Frankenstein - Box Office Mojo,"$1,626","November 21, 1931\n (Domestic)",frankenstein,0.0,,,113
9,Svengali,tt0022454,/title/tt0022454/?ref_=adv_li_tt,1931.0,81.0,Approved,Archie Mayo,nm0562845,/name/nm0562845/?ref_=adv_li_dr_0,Drama,...,,svengali,Svengali - Box Office Mojo,,,svengali,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2203,Breach,tt9820556,/title/tt9820556/?ref_=adv_li_tt,2020.0,92.0,R,John Suits,nm2986811,/name/nm2986811/?ref_=adv_li_dr_0,Action,...,,breach,Breach - Box Office Mojo,"$36,721","December 17, 2020\n (Russia/CIS)",breach,84.0,177.0,67.0,377372
2204,The Banana Splits Movie,tt9831136,/title/tt9831136/?ref_=adv_li_tt,2019.0,89.0,R,Danishka Esterhazy,nm0261629,/name/nm0261629/?ref_=adv_li_dr_0,Comedy,...,,the_banana_splits_movie,The Banana Splits Movie - Box Office Mojo,,,the_banana_splits_movie,63.0,19.0,50.0,183
2205,Escape Room 2,tt9844522,/title/tt9844522/?ref_=adv_li_tt,2021.0,,,Adam Robitel,nm0733263,/name/nm0733263/?ref_=adv_li_dr_0,Action,...,,escape_room_2,Escape Room 2 - Box Office Mojo,,,escape_room_2,,,,
2206,Untitled: The Walking Dead Movie,tt9859436,/title/tt9859436/?ref_=adv_li_tt,,,,Greg Nicotero,nm0630524,/name/nm0630524/?ref_=adv_li_dr_0,Drama,...,,untitled:_the_walking_dead_movie,Untitled: The Walking Dead Movie - Box Office ...,,,,,,,


In [137]:
imdb_mojo_tomato = imdb_mojo_tomato.drop_duplicates()

In [138]:
imdb_mojo_tomato.drop('title_y', axis=1)

Unnamed: 0,title_x,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,...,votes,metascore,title_cc,title_string,dom_gross,release_date,tomatometer,tomato_ct,aud_score,aud_score_ct
0,The Lost World,tt0016039,/title/tt0016039/?ref_=adv_li_tt,1925.0,110.0,Passed,Harry O. Hoyt,nm0398464,/name/nm0398464/?ref_=adv_li_dr_0,Adventure,...,4624.0,,the_lost_world,The Lost World - Box Office Mojo,,,100.0,15.0,69.0,3249
3,Metropolis,tt0017136,/title/tt0017136/?ref_=adv_li_tt,1927.0,153.0,Not Rated,Fritz Lang,nm0000485,/name/nm0000485/?ref_=adv_li_dr_0,Drama,...,159589.0,98.0,metropolis,Metropolis - Box Office Mojo,"$1,236,166","May 6, 1927\n (Domestic)",,,,
5,Woman in the Moon,tt0019901,/title/tt0019901/?ref_=adv_li_tt,1929.0,95.0,Not Rated,Fritz Lang,nm0000485,/name/nm0000485/?ref_=adv_li_dr_0,Adventure,...,3028.0,,woman_in_the_moon,Woman in the Moon - Box Office Mojo,,,75.0,8.0,72.0,643
6,Frankenstein,tt0021884,/title/tt0021884/?ref_=adv_li_tt,1931.0,70.0,Passed,James Whale,nm0001843,/name/nm0001843/?ref_=adv_li_dr_0,Drama,...,65146.0,91.0,frankenstein,Frankenstein - Box Office Mojo,"$1,626","November 21, 1931\n (Domestic)",0.0,,,113
9,Svengali,tt0022454,/title/tt0022454/?ref_=adv_li_tt,1931.0,81.0,Approved,Archie Mayo,nm0562845,/name/nm0562845/?ref_=adv_li_dr_0,Drama,...,1793.0,,svengali,Svengali - Box Office Mojo,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2203,Breach,tt9820556,/title/tt9820556/?ref_=adv_li_tt,2020.0,92.0,R,John Suits,nm2986811,/name/nm2986811/?ref_=adv_li_dr_0,Action,...,4304.0,,breach,Breach - Box Office Mojo,"$36,721","December 17, 2020\n (Russia/CIS)",84.0,177.0,67.0,377372
2204,The Banana Splits Movie,tt9831136,/title/tt9831136/?ref_=adv_li_tt,2019.0,89.0,R,Danishka Esterhazy,nm0261629,/name/nm0261629/?ref_=adv_li_dr_0,Comedy,...,2632.0,,the_banana_splits_movie,The Banana Splits Movie - Box Office Mojo,,,63.0,19.0,50.0,183
2205,Escape Room 2,tt9844522,/title/tt9844522/?ref_=adv_li_tt,2021.0,,,Adam Robitel,nm0733263,/name/nm0733263/?ref_=adv_li_dr_0,Action,...,,,escape_room_2,Escape Room 2 - Box Office Mojo,,,,,,
2206,Untitled: The Walking Dead Movie,tt9859436,/title/tt9859436/?ref_=adv_li_tt,,,,Greg Nicotero,nm0630524,/name/nm0630524/?ref_=adv_li_dr_0,Drama,...,,,untitled:_the_walking_dead_movie,Untitled: The Walking Dead Movie - Box Office ...,,,,,,


In [139]:
imdb_mojo_tomato.to_json('imdb_mojo_tomato.json', orient='records', lines=True)

In [141]:
budget_df_clean = pd.read_json('budget_df_cleaned.json', lines=True)

In [143]:
budget_df_clean.shape

(5049, 7)

In [144]:
imdb_mojo_tomato.shape

(2080, 24)

In [150]:
budget_df_clean = budget_df_clean.sort_values(by='title_cc', ascending=True)

In [185]:
imdb_mojo_tomato.columns

Index(['title_x', 'imdb_id', 'title_link', 'year', 'runtime', 'mpaa_rating',
       'director_name', 'director_id', 'director_link', 'genre_01', 'genre_02',
       'genre_03', 'imdb_rating', 'votes', 'metascore', 'title_cc',
       'title_string', 'dom_gross', 'release_date', 'title_y', 'tomatometer',
       'tomato_ct', 'aud_score', 'aud_score_ct'],
      dtype='object')

In [183]:
budget_df_clean['title']

418                    10,000 B.C.
584                 102 Dalmatians
3192           10 Cloverfield Lane
3652         10 Days in a Madhouse
3457    10 Things I Hate About You
                   ...            
220                       Zootopia
3161                          Zulu
2598                     Zwartboek
4458          Ãpouse-moi mon pote
2865            é·æ±ä¸è (CJ7)
Name: title, Length: 5049, dtype: object

In [151]:
imdb_mojo_tomato = imdb_mojo_tomato.sort_values(by='title_cc', ascending=True)

In [237]:
ITMB = pd.merge(imdb_mojo_tomato, budget_df_clean, how='left', left_on='title_cc', right_on='title_cc').fillna(np.nan)

In [194]:
imdb_mojo_tomato['dom_gross'] = imdb_mojo_tomato['dom_gross'].str.strip('$')

In [196]:
imdb_mojo_tomato['dom_gross'] = imdb_mojo_tomato['dom_gross'].str.replace(',', '',regex=True)

In [200]:
imdb_mojo_tomato['dom_gross'] = imdb_mojo_tomato['dom_gross'].fillna('0')

In [201]:
imdb_mojo_tomato['dom_gross'] = imdb_mojo_tomato['dom_gross'].astype('int64')

In [205]:
imdb_mojo_tomato['dom_gross'] = imdb_mojo_tomato['dom_gross'].astype('str')

In [204]:
budget_df_clean['dom_gross']=budget_df_clean['dom_gross'].astype('str')

In [202]:
imdb_mojo_tomato['dom_gross'].describe()

count    2.080000e+03
mean     3.017913e+07
std      8.077319e+07
min      0.000000e+00
25%      0.000000e+00
50%      1.462130e+05
75%      1.979536e+07
max      9.366622e+08
Name: dom_gross, dtype: float64

In [244]:
ITMB.shape

(2113, 30)

In [219]:
ITMB[['title_x', 'title_cc_x', 'title_cc_y']]

Unnamed: 0,title_x,title_cc_x,title_cc_y
0,*batteries not included,*batteries_not_included,
1,+1,+1,1114
2,+1,+1,1612
3,+1,+1,1776
4,+1,+1,213
...,...,...,...
241878,Zoombies,zoombies,zulu
241879,Zoombies,zoombies,ãpousemoi_mon_pote
241880,Æon Flux,æon_flux,
241881,Évolution,évolution,


In [246]:
ITMB_minimal = ITMB[(ITMB['dom_gross_x'].notna()) & (ITMB['dom_gross_y'].notna())]

In [210]:
tomato_titles = []
budget_titles = []

In [247]:
ITMB_minimal.shape

(554, 30)

In [249]:
imdb_mojo_tomato.shape

(2080, 24)

In [250]:
budget_df_clean.shape

(5049, 7)

In [252]:
ITMB_minimal.columns

Index(['title_x', 'imdb_id', 'title_link', 'year', 'runtime', 'mpaa_rating',
       'director_name', 'director_id', 'director_link', 'genre_01', 'genre_02',
       'genre_03', 'imdb_rating', 'votes', 'metascore', 'title_cc',
       'title_string', 'dom_gross_x', 'release_date_x', 'title_y',
       'tomatometer', 'tomato_ct', 'aud_score', 'aud_score_ct', 'title',
       'release_date_y', 'budget', 'dom_gross_y', 'ww_gross', 'release_date2'],
      dtype='object')

In [254]:
ITMB_minimal.to_json('ITMB_data.json', orient='records', lines=True)

In [212]:
tomato_str = imdb_mojo_tomato['title_cc'].str[0:4]

In [213]:
tomato_gross = imdb_mojo_tomato['dom_gross'].str[0:4]

In [214]:
budget_str = budget_df_clean['title_cc'][0:4]

In [215]:
budget_gross = budget_df_clean['dom_gross'].str[0:4]

In [226]:
ITMB.shape

(2080, 34)

In [228]:
ITMB.head()

Unnamed: 0,best_match_score,__id_left,__id_right,title_x,imdb_id,title_link,year,runtime,mpaa_rating,director_name,...,tomato_ct,aud_score,aud_score_ct,title,release_date_right,budget,dom_gross_right,ww_gross,title_cc_right,release_date2
0,,0_left,,*batteries not included,tt0092494,/title/tt0092494/?ref_=adv_li_tt,1987.0,106.0,PG,Matthew Robbins,...,,,,,,,,,,
1,,1_left,,+1,tt2395385,/title/tt2395385/?ref_=adv_li_tt,2013.0,96.0,Not Rated,Dennis Iliadis,...,,,,,,,,,,
2,,2_left,,10.0 Earthquake,tt3488056,/title/tt3488056/?ref_=adv_li_tt,2014.0,88.0,PG,David Gidali,...,,,,,,,,,,
3,,3_left,,100 Degrees Below Zero,tt2538128,/title/tt2538128/?ref_=adv_li_tt,2013.0,89.0,Not Rated,R.D. Braunstein,...,,,114.0,,,,,,,
4,0.252118,4_left,2_right,10 Cloverfield Lane,tt1179933,/title/tt1179933/?ref_=adv_li_tt,2016.0,103.0,PG-13,Dan Trachtenberg,...,312.0,79.0,60968.0,10 Cloverfield Lane,1451866000000.0,15000000.0,72082999.0,72082999.0,10_cloverfield_lane,"Jan 4, 2016"


In [242]:
ITMB.columns

Index(['title_x', 'imdb_id', 'title_link', 'year', 'runtime', 'mpaa_rating',
       'director_name', 'director_id', 'director_link', 'genre_01', 'genre_02',
       'genre_03', 'imdb_rating', 'votes', 'metascore', 'title_cc',
       'title_string', 'dom_gross_x', 'release_date_x', 'title_y',
       'tomatometer', 'tomato_ct', 'aud_score', 'aud_score_ct', 'title',
       'release_date_y', 'budget', 'dom_gross_y', 'ww_gross', 'release_date2'],
      dtype='object')