In [1]:
import numpy as np
import pandas as pd
import json

### Cleaning the IMDB Data

Initially, I had planned to iterate through and scrape the individual pages for film and director.  To this end, I selected the unique IMDB ids for director and title from the urls present on the title search page.  I also separated out the three genre keywords, intending to use all three columns as part of the analysis. Ultimately, I used only the first of the three keywords, and I used the aggregate information for each director within the dataset for the film counts for each.

Rather than cleaning and formatting each string or data type as it came through the for loops in the data scraping, I cleaned and formatted them here using the Pandas series.

In [116]:
imdb_df2 = pd.read_json('imdb_data2.json', lines=True)
# lines=True : read file as a json object per line

In [117]:
imdb_df2.head()

Unnamed: 0,title,title_link,mpaa_rating,year,runtime,imdb_rating,metascore,votes,director_name,director_link,imdb_genre_list
0,Trancers II,/title/tt0103116/?ref_=adv_li_tt,R,(1991),88 min,5.4,,1847.0,Charles Band,/name/nm0023929/?ref_=adv_li_dr_0,"\nAction, Horror, Sci-Fi"
1,Drive,/title/tt0116147/?ref_=adv_li_tt,R,(1997),100 min,6.8,,5451.0,Steve Wang,/name/nm0911036/?ref_=adv_li_dr_0,"\nAction, Adventure, Sci-Fi"
2,Timerider: The Adventure of Lyle Swann,/title/tt0086443/?ref_=adv_li_tt,PG,(1982),94 min,5.4,,2005.0,William Dear,/name/nm0213100/?ref_=adv_li_dr_0,"\nAction, Adventure, Sci-Fi"
3,Thirst,/title/tt3955808/?ref_=adv_li_tt,,(III) (2015),87 min,4.4,,1045.0,Greg Kiefer,/name/nm2279788/?ref_=adv_li_dr_0,"\nAction, Adventure, Horror"
4,I Married a Strange Person!,/title/tt0119346/?ref_=adv_li_tt,R,(1997),75 min,7.0,,1781.0,Bill Plympton,/name/nm0687739/?ref_=adv_li_dr_0,"\nAnimation, Comedy, Drama"


#### Segmenting Out Unique IDs for Title and Director

The second segment of the title url (`title_link`) is the unique `imdb_id` for the film, facilitating specific title searching on both IMDB and Box Office Mojo.  The unique link url for each director had a similar ID.

In [118]:
imdb_df2['imdb_id'] = (imdb_df2['title_link'].str.split("/", n=3, expand=True))[2]                               
imdb_df2['imdb_id']

0       tt0103116
1       tt0116147
2       tt0086443
3       tt3955808
4       tt0119346
          ...    
1995    tt4235644
1996    tt0047930
1997    tt0102562
1998    tt3118442
1999    tt4920098
Name: imdb_id, Length: 2000, dtype: object

In [119]:
imdb_df2['director_id'] = (imdb_df2['director_link'].str.split("/", n=3, expand=True))[2]                               
imdb_df2['director_id']

0       nm0023929
1       nm0911036
2       nm0213100
3       nm2279788
4       nm0687739
          ...    
1995    nm2606944
1996    nm0954724
1997    nm0525475
1998    nm2204106
1999    nm4407169
Name: director_id, Length: 2000, dtype: object

In [120]:
imdb_df2.head()

Unnamed: 0,title,title_link,mpaa_rating,year,runtime,imdb_rating,metascore,votes,director_name,director_link,imdb_genre_list,imdb_id,director_id
0,Trancers II,/title/tt0103116/?ref_=adv_li_tt,R,(1991),88 min,5.4,,1847.0,Charles Band,/name/nm0023929/?ref_=adv_li_dr_0,"\nAction, Horror, Sci-Fi",tt0103116,nm0023929
1,Drive,/title/tt0116147/?ref_=adv_li_tt,R,(1997),100 min,6.8,,5451.0,Steve Wang,/name/nm0911036/?ref_=adv_li_dr_0,"\nAction, Adventure, Sci-Fi",tt0116147,nm0911036
2,Timerider: The Adventure of Lyle Swann,/title/tt0086443/?ref_=adv_li_tt,PG,(1982),94 min,5.4,,2005.0,William Dear,/name/nm0213100/?ref_=adv_li_dr_0,"\nAction, Adventure, Sci-Fi",tt0086443,nm0213100
3,Thirst,/title/tt3955808/?ref_=adv_li_tt,,(III) (2015),87 min,4.4,,1045.0,Greg Kiefer,/name/nm2279788/?ref_=adv_li_dr_0,"\nAction, Adventure, Horror",tt3955808,nm2279788
4,I Married a Strange Person!,/title/tt0119346/?ref_=adv_li_tt,R,(1997),75 min,7.0,,1781.0,Bill Plympton,/name/nm0687739/?ref_=adv_li_dr_0,"\nAnimation, Comedy, Drama",tt0119346,nm0687739


#### Splitting Up Genre Keywords Into Individual Features

In [122]:
imdb_df2[['genre_01', 'genre_02', 'genre_03']] = imdb_df2['imdb_genre_list'].str.split(", ", n=2, expand=True)

In [123]:
imdb_df2[['genre_01', 'genre_02', 'genre_03']].head()

Unnamed: 0,genre_01,genre_02,genre_03
0,\nAction,Horror,Sci-Fi
1,\nAction,Adventure,Sci-Fi
2,\nAction,Adventure,Sci-Fi
3,\nAction,Adventure,Horror
4,\nAnimation,Comedy,Drama


In [124]:
imdb_df2['genre_01'] = imdb_df2['genre_01'].str.strip("\n")

#### String Cleaning, Type Standardization, Dropping of Null Values

In [121]:
imdb_df2['runtime'] = imdb_df2['runtime'].str.strip(' min')
imdb_df2['runtime']= imdb_df2['runtime'].replace('', np.NaN)
imdb_df2['runtime'] = pd.to_numeric(imdb_df2['runtime'], errors='coerce')

In [125]:
imdb_df2.head()

Unnamed: 0,title,title_link,mpaa_rating,year,runtime,imdb_rating,metascore,votes,director_name,director_link,imdb_genre_list,imdb_id,director_id,genre_01,genre_02,genre_03
0,Trancers II,/title/tt0103116/?ref_=adv_li_tt,R,(1991),88.0,5.4,,1847.0,Charles Band,/name/nm0023929/?ref_=adv_li_dr_0,"\nAction, Horror, Sci-Fi",tt0103116,nm0023929,Action,Horror,Sci-Fi
1,Drive,/title/tt0116147/?ref_=adv_li_tt,R,(1997),100.0,6.8,,5451.0,Steve Wang,/name/nm0911036/?ref_=adv_li_dr_0,"\nAction, Adventure, Sci-Fi",tt0116147,nm0911036,Action,Adventure,Sci-Fi
2,Timerider: The Adventure of Lyle Swann,/title/tt0086443/?ref_=adv_li_tt,PG,(1982),94.0,5.4,,2005.0,William Dear,/name/nm0213100/?ref_=adv_li_dr_0,"\nAction, Adventure, Sci-Fi",tt0086443,nm0213100,Action,Adventure,Sci-Fi
3,Thirst,/title/tt3955808/?ref_=adv_li_tt,,(III) (2015),87.0,4.4,,1045.0,Greg Kiefer,/name/nm2279788/?ref_=adv_li_dr_0,"\nAction, Adventure, Horror",tt3955808,nm2279788,Action,Adventure,Horror
4,I Married a Strange Person!,/title/tt0119346/?ref_=adv_li_tt,R,(1997),75.0,7.0,,1781.0,Bill Plympton,/name/nm0687739/?ref_=adv_li_dr_0,"\nAnimation, Comedy, Drama",tt0119346,nm0687739,Animation,Comedy,Drama


####  Formatting Column Labels

In [36]:
imdb_df.columns

Index(['title', 'title_link', 'mpaa_rating', 'year', 'runtime', 'imdb_rating',
       'metascore', 'votes', 'director_name', 'director_link',
       'imdb_genre_list', 'imdb_id', 'director_id', 'genre_01', 'genre_02',
       'genre_03'],
      dtype='object')

In [126]:
imdb_df2 = imdb_df2[['title', 'imdb_id', 'title_link', 'year', 'runtime', 'mpaa_rating', 'director_name', 'director_id', 'director_link', 'genre_01', 'genre_02', 'genre_03','imdb_rating', 'votes', 'metascore']]

In [127]:
imdb_df2['runtime'].unique()

array([ 88., 100.,  94.,  87.,  75.,  95., 179.,  86.,  90.,  83., 117.,
        84., 216.,  nan,  85., 114.,  89., 113., 126., 111., 106.,  71.,
        97.,  92., 127.,  74.,  99., 102.,  73.,  98.,  45., 174.,  72.,
       110., 151.,  78., 101., 115., 116.,  62.,  81.,  96.,  91., 103.,
       125., 129.,  66.,  93.,  49.,  80.,  82.,  77.,  76.,  79.,  67.,
       105., 107., 141., 164., 142.,  60., 118.,  69., 354.,  65.,  63.,
        54.,  68., 108., 149., 104., 120., 135.,  59., 140., 119.,  70.,
        53., 134.,  58.,  61., 136.,  47., 112.,  64., 237.,  55., 154.,
       124., 138., 167., 121.,  51., 123.,  50., 299.,  57., 220.,  46.,
       180., 252.,  52.,  48., 109., 258., 160., 152., 130., 132., 165.,
       133., 137., 128., 122., 146., 173., 226., 265., 195.])

In [128]:
imdb_df2['votes']= imdb_df2['votes'].replace('', np.NaN)
imdb_df2['votes'] = pd.to_numeric(imdb_df2['votes'], errors='coerce')

In [129]:
imdb_df2['votes'].unique()

array([1847., 5451., 2005., ...,  592., 1051., 1651.])

In [130]:
imdb_df2.head()

Unnamed: 0,title,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,genre_02,genre_03,imdb_rating,votes,metascore
0,Trancers II,tt0103116,/title/tt0103116/?ref_=adv_li_tt,(1991),88.0,R,Charles Band,nm0023929,/name/nm0023929/?ref_=adv_li_dr_0,Action,Horror,Sci-Fi,5.4,1847.0,
1,Drive,tt0116147,/title/tt0116147/?ref_=adv_li_tt,(1997),100.0,R,Steve Wang,nm0911036,/name/nm0911036/?ref_=adv_li_dr_0,Action,Adventure,Sci-Fi,6.8,5451.0,
2,Timerider: The Adventure of Lyle Swann,tt0086443,/title/tt0086443/?ref_=adv_li_tt,(1982),94.0,PG,William Dear,nm0213100,/name/nm0213100/?ref_=adv_li_dr_0,Action,Adventure,Sci-Fi,5.4,2005.0,
3,Thirst,tt3955808,/title/tt3955808/?ref_=adv_li_tt,(III) (2015),87.0,,Greg Kiefer,nm2279788,/name/nm2279788/?ref_=adv_li_dr_0,Action,Adventure,Horror,4.4,1045.0,
4,I Married a Strange Person!,tt0119346,/title/tt0119346/?ref_=adv_li_tt,(1997),75.0,R,Bill Plympton,nm0687739,/name/nm0687739/?ref_=adv_li_dr_0,Animation,Comedy,Drama,7.0,1781.0,


In [131]:
imdb_df2['year'] = imdb_df2['year'].str.strip('[)XIV( ]')

In [132]:
imdb_df2['year'].unique()

array(['1991', '1997', '1982', '2015', '2005', '1987', '1983', '2013',
       '1962', '2018', '1979', '1966', '1941', '2017', '2021', '2019',
       '1965', '1971', '2020', '', '1956', '2016', '1968', '1999', '1994',
       '1993', '1992', '1939', '1977', '1954', '1972', '1967', '1958',
       '2002', '1975', '1998', '1985', '2010', '1984', '1944', '1974',
       '1981', '1995', '2003', '2012', '1986', '1957', '2009', '2007',
       '2006', '1940', '1953', '1996', '1970', '1988', '1943', '1969',
       '1936', '2014', '1932', '1990', '1964', '1955', '1942', '2008',
       '2001', '1989', '1973', '2011', '1961', '2000', '2004', '1959',
       '1963', '1960', '1980', '1976', '1920', '1945', '1951', '1950',
       '1931', '1935', '1952', '1924', '1978', '1938', '1946', '1916',
       '2022', '1928', '1933', '1947', '2025', '1925', '1948'],
      dtype=object)

In [133]:
imdb_df2['year']= imdb_df2['year'].replace('', np.NaN)

In [134]:
imdb_df2['year'].unique()

array(['1991', '1997', '1982', '2015', '2005', '1987', '1983', '2013',
       '1962', '2018', '1979', '1966', '1941', '2017', '2021', '2019',
       '1965', '1971', '2020', nan, '1956', '2016', '1968', '1999',
       '1994', '1993', '1992', '1939', '1977', '1954', '1972', '1967',
       '1958', '2002', '1975', '1998', '1985', '2010', '1984', '1944',
       '1974', '1981', '1995', '2003', '2012', '1986', '1957', '2009',
       '2007', '2006', '1940', '1953', '1996', '1970', '1988', '1943',
       '1969', '1936', '2014', '1932', '1990', '1964', '1955', '1942',
       '2008', '2001', '1989', '1973', '2011', '1961', '2000', '2004',
       '1959', '1963', '1960', '1980', '1976', '1920', '1945', '1951',
       '1950', '1931', '1935', '1952', '1924', '1978', '1938', '1946',
       '1916', '2022', '1928', '1933', '1947', '2025', '1925', '1948'],
      dtype=object)

In [135]:
imdb_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          2000 non-null   object 
 1   imdb_id        2000 non-null   object 
 2   title_link     2000 non-null   object 
 3   year           1844 non-null   object 
 4   runtime        1808 non-null   float64
 5   mpaa_rating    2000 non-null   object 
 6   director_name  2000 non-null   object 
 7   director_id    1732 non-null   object 
 8   director_link  2000 non-null   object 
 9   genre_01       2000 non-null   object 
 10  genre_02       1811 non-null   object 
 11  genre_03       1290 non-null   object 
 12  imdb_rating    1800 non-null   float64
 13  votes          1800 non-null   float64
 14  metascore      83 non-null     float64
dtypes: float64(4), object(11)
memory usage: 234.5+ KB


In [136]:
imdb_df2.sample(10, random_state=42)

Unnamed: 0,title,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,genre_02,genre_03,imdb_rating,votes,metascore
1860,Chronesthesia,tt5729870,/title/tt5729870/?ref_=adv_li_tt,2016,93.0,,Hayden J. Weal,nm3289442,/name/nm3289442/?ref_=adv_li_dr_0,Comedy,Drama,Romance,6.4,407.0,
353,The Sender,tt0084658,/title/tt0084658/?ref_=adv_li_tt,1982,91.0,R,Roger Christian,nm0002337,/name/nm0002337/?ref_=adv_li_dr_0,Drama,Horror,Sci-Fi,6.0,1692.0,
1333,Medusa,tt3237826,/title/tt3237826/?ref_=adv_li_tt,2015,109.0,,Jorge Ameer,nm0024521,/name/nm0024521/?ref_=adv_li_dr_0,Horror,Sci-Fi,Thriller,5.1,125.0,
905,"Invasion, U.S.A.",tt0044750,/title/tt0044750/?ref_=adv_li_tt,1952,73.0,Approved,Alfred E. Green,nm0337586,/name/nm0337586/?ref_=adv_li_dr_0,Drama,Sci-Fi,War,3.2,1258.0,
1289,Watch the Sky,tt5083366,/title/tt5083366/?ref_=adv_li_tt,2017,87.0,,Alexander Murillo,nm7167050,/name/nm7167050/?ref_=adv_li_dr_0,Drama,Sci-Fi,,3.6,171.0,
1273,Future Cops,tt0106545,/title/tt0106545/?ref_=adv_li_tt,1993,95.0,Not Rated,Jing Wong,nm0939147,/name/nm0939147/?ref_=adv_li_dr_0,Action,Comedy,Sci-Fi,6.0,1063.0,
938,Assignment: Outer Space,tt0054330,/title/tt0054330/?ref_=adv_li_tt,1960,73.0,Unrated,Antonio Margheriti,nm0546672,/name/nm0546672/?ref_=adv_li_dr_0,Sci-Fi,,,3.6,798.0,
1731,Doomsday,tt2805976,/title/tt2805976/?ref_=adv_li_tt,2015,95.0,Not Rated,Neil Johnson,nm1484928,/name/nm1484928/?ref_=adv_li_dr_0,Action,Sci-Fi,,3.5,200.0,
65,Quantum Quest: A Cassini Space Odyssey,tt0312305,/title/tt0312305/?ref_=adv_li_tt,2010,45.0,,,,,Animation,Adventure,Sci-Fi,5.1,319.0,
1323,Alien Showdown: The Day the Old West Stood Still,tt2349142,/title/tt2349142/?ref_=adv_li_tt,2018,80.0,,Rene Perez,nm3163561,/name/nm3163561/?ref_=adv_li_dr_0,Action,Adventure,Fantasy,2.0,354.0,


In [137]:
imdb_df2.describe()

Unnamed: 0,runtime,imdb_rating,votes,metascore
count,1808.0,1800.0,1800.0,83.0
mean,91.853982,4.995,1547.03,50.253012
std,20.237295,1.46109,2006.358975,15.902787
min,45.0,1.1,5.0,8.0
25%,83.0,3.9,392.0,40.0
50%,90.0,5.1,903.0,49.0
75%,97.0,6.1,1917.75,63.0
max,354.0,9.5,24911.0,84.0


In [138]:
imdb_df2['year'].value_counts()

2017    106
2018     98
2016     94
2019     78
2015     70
       ... 
1931      1
2025      1
1925      1
1945      1
1928      1
Name: year, Length: 95, dtype: int64

#### Dropping Less Common MPAA Ratings Values

In [139]:
imdb_df2['mpaa_rating'].value_counts()
# let's drop some of these ratings, esp TV and X

NaN          722
Not Rated    358
R            338
Approved     109
PG           108
Unrated       84
PG-13         81
TV-14         49
TV-MA         43
G             39
Passed        29
TV-PG         19
GP             7
X              4
TV-Y7          3
TV-Y7-FV       2
M              2
TV-G           2
TV-Y           1
Name: mpaa_rating, dtype: int64

In [140]:
imdb_df2['mpaa_rating'].unique()

array(['R', 'PG', 'NaN', 'PG-13', 'Not Rated', 'TV-MA', 'Passed',
       'Approved', 'G', 'TV-PG', 'Unrated', 'TV-14', 'X', 'GP', 'M',
       'TV-G', 'TV-Y7', 'TV-Y7-FV', 'TV-Y'], dtype=object)

In [141]:
imdb_df2['genre_01'].value_counts()

Action                561
Horror                301
Comedy                221
Drama                 202
Sci-Fi                189
Animation             169
Adventure             143
Sci-Fi                 79
Fantasy                32
Crime                  32
Mystery                28
Family                 13
Documentary            12
Romance                 9
Musical                 3
Music                   3
Biography               2
History                 1
Name: genre_01, dtype: int64

In [142]:
imdb_df2['genre_02'].value_counts()

Sci-Fi                   445
Adventure                265
Sci-Fi                   221
Horror                   209
Drama                    127
Action                    98
Comedy                    97
Fantasy                   79
Mystery                   72
Thriller                  56
Romance                   38
Family                    35
Crime                     29
Horror                    11
History                    5
Musical                    5
Music                      3
Biography                  3
Comedy                     2
Drama                      2
Action                     2
Thriller                   2
Adventure                  1
Music                      1
Western                    1
Adult                      1
Animation                  1
Name: genre_02, dtype: int64

In [143]:
imdb_df2['genre_03'].value_counts()

Sci-Fi                   547
Thriller                 209
Comedy                    88
Fantasy                   81
Horror                    77
Drama                     66
Adventure                 56
Mystery                   46
Family                    39
Romance                   37
Crime                     18
Musical                    9
Music                      5
War                        4
History                    4
Sport                      2
Western                    1
Action                     1
Name: genre_03, dtype: int64

In [144]:
imdb_df2['genre_02']= imdb_df2['genre_02'].str.strip()

In [145]:
imdb_df2['genre_02'].value_counts()

Sci-Fi       666
Adventure    266
Horror       220
Drama        129
Action       100
Comedy        99
Fantasy       79
Mystery       72
Thriller      58
Romance       38
Family        35
Crime         29
History        5
Musical        5
Music          4
Biography      3
Adult          1
Animation      1
Western        1
Name: genre_02, dtype: int64

In [146]:
imdb_df2['genre_03']= imdb_df2['genre_03'].str.strip()

In [147]:
imdb_df2['genre_03'].value_counts()

Sci-Fi       547
Thriller     209
Comedy        88
Fantasy       81
Horror        77
Drama         66
Adventure     56
Mystery       46
Family        39
Romance       37
Crime         18
Musical        9
Music          5
History        4
War            4
Sport          2
Action         1
Western        1
Name: genre_03, dtype: int64

In [148]:
imdb_df2['genre_01']= imdb_df2['genre_01'].str.strip()

In [149]:
imdb_df2['genre_01'].value_counts()

Action         561
Horror         301
Sci-Fi         268
Comedy         221
Drama          202
Animation      169
Adventure      143
Fantasy         32
Crime           32
Mystery         28
Family          13
Documentary     12
Romance          9
Musical          3
Music            3
Biography        2
History          1
Name: genre_01, dtype: int64

In [150]:
imdb_df2.columns

Index(['title', 'imdb_id', 'title_link', 'year', 'runtime', 'mpaa_rating',
       'director_name', 'director_id', 'director_link', 'genre_01', 'genre_02',
       'genre_03', 'imdb_rating', 'votes', 'metascore'],
      dtype='object')

In [177]:
imdb_lookup_df3 = imdb_df2_ms[['title', 'imdb_id','title_link', 'director_name', 'director_id', 'director_link']]

####  Creating A Title String for URL Searching Rotten Tomatoes

In [178]:
imdb_lookup_df3['title_cc'] = imdb_df2_ms['title'].str.lower() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_lookup_df3['title_cc'] = imdb_df2_ms['title'].str.lower()


In [179]:
imdb_lookup_df3['title_cc'].head()

8                       beneath
19      the million dollar duck
25        cities of last things
37                   lazer team
44    the powerpuff girls movie
Name: title_cc, dtype: object

In [180]:
imdb_lookup_df3['title_cc'] = imdb_lookup_df3['title_cc'].str.replace('&','and',regex=False)
imdb_lookup_df3['title_cc'] = imdb_lookup_df3['title_cc'].str.replace('\\','_',regex=True)
imdb_lookup_df3['title_cc'] = imdb_lookup_df3['title_cc'].str.replace('episode\s([ivx]*)\s-\s','',regex=True)
imdb_lookup_df3['title_cc'] = imdb_lookup_df3['title_cc'].str.replace("\s",'_',regex=True)
imdb_lookup_df3['title_cc'] = imdb_lookup_df3['title_cc'].str.replace("\W",'',regex=True)
imdb_lookup_df3['title_cc'] = imdb_lookup_df3['title_cc'].str.replace('_+','_',regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_lookup_df3['title_cc'] = imdb_lookup_df3['title_cc'].str.replace('&','and',regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_lookup_df3['title_cc'] = imdb_lookup_df3['title_cc'].str.replace('\\','_',regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_lookup_df3['titl

In [181]:
print(imdb_lookup_df3.shape)
imdb_lookup_df3.head()

(83, 7)


Unnamed: 0,title,imdb_id,title_link,director_name,director_id,director_link,title_cc
8,Beneath,tt2325518,/title/tt2325518/?ref_=adv_li_tt,Larry Fessenden,nm0275244,/name/nm0275244/?ref_=adv_li_dr_0,beneath
19,The Million Dollar Duck,tt0066728,/title/tt0066728/?ref_=adv_li_tt,Vincent McEveety,nm0568546,/name/nm0568546/?ref_=adv_li_dr_0,the_million_dollar_duck
25,Cities of Last Things,tt4397342,/title/tt4397342/?ref_=adv_li_tt,Wi Ding Ho,nm0387399,/name/nm0387399/?ref_=adv_li_dr_0,cities_of_last_things
37,Lazer Team,tt3864024,/title/tt3864024/?ref_=adv_li_tt,Matt Hullum,nm0401502,/name/nm0401502/?ref_=adv_li_dr_0,lazer_team
44,The Powerpuff Girls Movie,tt0289408,/title/tt0289408/?ref_=adv_li_tt,Craig McCracken,nm0566833,/name/nm0566833/?ref_=adv_li_dr_0,the_powerpuff_girls_movie


In [156]:
imdb_lookup_df2.shape

(2000, 7)

In [158]:
imdb_df2['director_name']= imdb_df2['director_name'].replace('NaN', np.NaN)

In [159]:
imdb_df2['director_id']= imdb_df2['director_id'].replace('NaN', np.NaN)

In [160]:
imdb_df2['director_id']= imdb_df2['director_id'].replace('None', np.NaN)

In [162]:
imdb_df2[imdb_df2.director_id.isna()]

Unnamed: 0,title,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,genre_02,genre_03,imdb_rating,votes,metascore
14,Adventures of Captain Marvel,tt0033317,/title/tt0033317/?ref_=adv_li_tt,1941,216.0,Passed,,,,Action,Adventure,Fantasy,7.2,1112.0,
23,Eiga Doraemon: Nobita no getsumen tansaki,tt9735672,/title/tt9735672/?ref_=adv_li_tt,2019,111.0,,,,,Animation,Action,Adventure,6.6,459.0,
24,The Last Days,tt1935914,/title/tt1935914/?ref_=adv_li_tt,2013,100.0,Unrated,,,,Horror,Sci-Fi,Thriller,6.1,7942.0,
26,Robodog,tt3501062,/title/tt3501062/?ref_=adv_li_tt,,,,,,,Animation,Adventure,Comedy,,,
32,Carnosaur,tt0106521,/title/tt0106521/?ref_=adv_li_tt,1993,83.0,R,,,,Horror,Sci-Fi,,3.6,3664.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1931,Gamera vs. Barugon,tt0060446,/title/tt0060446/?ref_=adv_li_tt,1966,106.0,Not Rated,,,,Action,Adventure,Fantasy,5.0,2030.0,
1948,Axanar,tt3302086,/title/tt3302086/?ref_=adv_li_tt,,,,,,,Action,Adventure,Sci-Fi,,,
1971,GoBots: Battle of the Rock Lords,tt0091123,/title/tt0091123/?ref_=adv_li_tt,1986,75.0,G,,,,Animation,Action,Adventure,5.8,463.0,
1981,Kekkai Sensen,tt10436054,/title/tt10436054/?ref_=adv_li_tt,2019,,,,,,Action,Adventure,Fantasy,6.6,5.0,


In [163]:
imdb_df2['director_link']= imdb_df2['director_link'].replace('NaN', np.NaN)

In [164]:
imdb_df2['director_link']= imdb_df2['director_link'].replace('None', np.NaN)

In [165]:
imdb_df[imdb_df.director_link.isna()]

Unnamed: 0,title,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,genre_02,genre_03,imdb_rating,votes,metascore
3,Avengers: Endgame,tt4154796,/title/tt4154796/?ref_=adv_li_tt,2019,181.0,PG-13,,,,Action,Adventure,Drama,8.4,802218.0,78.0
10,Avengers: Infinity War,tt4154756,/title/tt4154756/?ref_=adv_li_tt,2018,149.0,PG-13,,,,Action,Adventure,Sci-Fi,8.4,829607.0,68.0
20,The Matrix,tt0133093,/title/tt0133093/?ref_=adv_li_tt,1999,136.0,R,,,,Action,Sci-Fi,,8.7,1671469.0,73.0
35,Captain Marvel,tt4154664,/title/tt4154664/?ref_=adv_li_tt,2019,123.0,PG-13,,,,Action,Adventure,Sci-Fi,6.9,438384.0,64.0
48,Spider-Man: Into the Spider-Verse,tt4633694,/title/tt4633694/?ref_=adv_li_tt,2018,117.0,PG,,,,Animation,Action,Adventure,8.4,371835.0,87.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1969,The Shadow Effect,tt5044656,/title/tt5044656/?ref_=adv_li_tt,2017,93.0,Not Rated,,,,Action,Mystery,Sci-Fi,4.2,1610.0,
1975,The War of the Worlds: The Musical Drama,tt12106008,/title/tt12106008/?ref_=adv_li_tt,2018,,,,,,Adventure,Musical,Sci-Fi,7.1,12.0,
1980,Godzilla vs. Megalon,tt0070122,/title/tt0070122/?ref_=adv_li_tt,1973,78.0,,,,,Action,Adventure,Family,4.8,5049.0,
1991,Aqua Teen Hunger Force Colon Movie Film for Th...,tt0455326,/title/tt0455326/?ref_=adv_li_tt,2007,86.0,R,,,,Animation,Action,Adventure,6.7,13323.0,54.0


In [166]:
imdb_df2 = imdb_df2.replace('NaN', np.NaN)

In [167]:
imdb_df2 = imdb_df2.replace('None', np.NaN)

In [168]:
imdb_df2.isna().sum()

title               0
imdb_id             0
title_link          0
year              156
runtime           192
mpaa_rating       722
director_name     268
director_id       268
director_link     268
genre_01            0
genre_02          189
genre_03          710
imdb_rating       200
votes             200
metascore        1917
dtype: int64

In [170]:
imdb_df2_ms = imdb_df2[imdb_df2['metascore'].notna()]

In [171]:
imdb_df2_ms.shape

(83, 15)

In [173]:
imdb_df2_ms.describe()

Unnamed: 0,runtime,imdb_rating,votes,metascore
count,83.0,83.0,83.0,83.0
mean,94.096386,5.69759,3423.650602,50.253012
std,12.405363,0.947265,3221.284123,15.902787
min,71.0,3.2,37.0,8.0
25%,86.0,5.2,1348.0,40.0
50%,91.0,5.7,2478.0,49.0
75%,99.5,6.4,3900.0,63.0
max,129.0,7.3,17644.0,84.0


In [174]:
imdb_df2_ms.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83 entries, 8 to 1768
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          83 non-null     object 
 1   imdb_id        83 non-null     object 
 2   title_link     83 non-null     object 
 3   year           83 non-null     object 
 4   runtime        83 non-null     float64
 5   mpaa_rating    73 non-null     object 
 6   director_name  73 non-null     object 
 7   director_id    73 non-null     object 
 8   director_link  73 non-null     object 
 9   genre_01       83 non-null     object 
 10  genre_02       80 non-null     object 
 11  genre_03       69 non-null     object 
 12  imdb_rating    83 non-null     float64
 13  votes          83 non-null     float64
 14  metascore      83 non-null     float64
dtypes: float64(4), object(11)
memory usage: 10.4+ KB


In [175]:
imdb_df2_ms.head()

Unnamed: 0,title,imdb_id,title_link,year,runtime,mpaa_rating,director_name,director_id,director_link,genre_01,genre_02,genre_03,imdb_rating,votes,metascore
8,Beneath,tt2325518,/title/tt2325518/?ref_=adv_li_tt,2013,90.0,Not Rated,Larry Fessenden,nm0275244,/name/nm0275244/?ref_=adv_li_dr_0,Horror,Sci-Fi,Thriller,3.7,2981.0,40.0
19,The Million Dollar Duck,tt0066728,/title/tt0066728/?ref_=adv_li_tt,1971,89.0,G,Vincent McEveety,nm0568546,/name/nm0568546/?ref_=adv_li_dr_0,Comedy,Family,Sci-Fi,5.9,2021.0,45.0
25,Cities of Last Things,tt4397342,/title/tt4397342/?ref_=adv_li_tt,2018,106.0,TV-MA,Wi Ding Ho,nm0387399,/name/nm0387399/?ref_=adv_li_dr_0,Crime,Drama,Sci-Fi,6.2,1227.0,66.0
37,Lazer Team,tt3864024,/title/tt3864024/?ref_=adv_li_tt,2015,102.0,PG-13,Matt Hullum,nm0401502,/name/nm0401502/?ref_=adv_li_dr_0,Action,Comedy,Sci-Fi,5.6,9128.0,42.0
44,The Powerpuff Girls Movie,tt0289408,/title/tt0289408/?ref_=adv_li_tt,2002,73.0,PG,Craig McCracken,nm0566833,/name/nm0566833/?ref_=adv_li_dr_0,Animation,Action,Adventure,6.6,9200.0,65.0


In [176]:
imdb_df2_ms.to_json('imdb_data_cleaned_2.json', orient='records', lines=True)

In [182]:
imdb_lookup_df3.to_json('imdb_lookup_df3.json', orient='records', lines=True)