In [1]:
import pandas as pd
import requests
import numpy as np
import re
import csv
from bs4 import BeautifulSoup
from sklearn.metrics import roc_curve, auc, precision_recall_curve

In [3]:
# Load the two IMDb tables; both carry a `movie_id` column, which is the key used to join them later.
df1 = pd.read_csv('imdb_1.csv')
df2 = pd.read_csv('imdb_2.csv')

In [4]:
df1.head()

Unnamed: 0.1,Unnamed: 0,movie_id,title,runtime,genre,certificate,imdb_rating,gross,year,votes,director,actors,metascore,summary
0,0,tt7286456,Joker,122.0,"Crime, Drama, Thriller",R,8.5,335.45,(2019),804730,Todd Phillips,"['Joaquin Phoenix', 'Robert De Niro', 'Zazie B...",59.0,"In Gotham City, mentally troubled comedian Art..."
1,1,tt6751668,Parasite,132.0,"Comedy, Drama, Thriller",R,8.6,53.37,(2019),423230,Bong Joon Ho,"['Kang-ho Song', 'Sun-kyun Lee', 'Yeo-jeong Jo...",96.0,Greed and class discrimination threaten the ne...
2,2,tt8579674,1917,119.0,"Drama, War",R,8.3,159.23,(2019),324071,Sam Mendes,"['Dean-Charles Chapman', 'George MacKay', 'Dan...",78.0,"April 6th, 1917. As a regiment assembles to wa..."
3,3,tt4154796,Avengers: Endgame,181.0,"Action, Adventure, Drama",PG-13,8.4,858.37,(2019),726032,Anthony Russo,"['Joe Russo', 'Robert Downey Jr.', 'Chris Evan...",78.0,After the devastating events of Avengers: Infi...
4,4,tt0068646,The Godfather,175.0,"Crime, Drama",R,9.2,134.97,(1972),1551490,Francis Ford Coppola,"['Marlon Brando', 'Al Pacino', 'James Caan', '...",100.0,The aging patriarch of an organized crime dyna...


In [5]:
df2.head()

Unnamed: 0.1,Unnamed: 0,movie_id,user_review,critic_review,writer,language,country,budget,gross_1,opening_week,oscar_win,oscar_nom,other_win,other_nom,meta_score
0,0,tt7286456,"10,188 user",682 critic,"['Todd Phillips', 'Scott Silver']",English,"['USA', 'Canada']",$55000000,$335451311,$96202337,['2'],,92.0,199.0,59.0
1,1,tt6751668,"2,507 user",515 critic,"['Bong Joon Ho', 'Bong Joon Ho']",Korean,['South Korea'],$11400000,$53369749,$393216,['4'],,262.0,227.0,96.0
2,2,tt8579674,"2,674 user",461 critic,"['Sam Mendes', 'Krysty Wilson-Cairns']",English,"['USA', 'UK', 'India', 'Spain', 'Canada']",$95000000,$159227644,$576216,['3'],,110.0,161.0,78.0
3,3,tt4154796,"8,764 user",550 critic,"['Christopher Markus', 'Stephen McFeely']",English,['USA'],$356000000,$858373000,$357115007,,['1'],65.0,103.0,78.0
4,4,tt1454029,551 user,393 critic,"['Tate Taylor', 'Kathryn Stockett']",English,['USA'],$25000000,$169708112,$26044590,['1'],,79.0,120.0,62.0


In [6]:
# Drop the leftover "Unnamed: *" index columns from both frames.
# Selecting by name instead of by position 0 keeps this cell safe to re-run:
# a second run finds nothing to drop rather than removing `movie_id`.
df1 = df1.loc[:, ~df1.columns.str.startswith('Unnamed')]
df2 = df2.loc[:, ~df2.columns.str.startswith('Unnamed')]

In [7]:
# Combine both tables into one frame, keeping only movies present in both (inner join on movie_id).
df = pd.merge(df1, df2, on='movie_id', how='inner')

In [8]:
df.head()

Unnamed: 0,movie_id,title,runtime,genre,certificate,imdb_rating,gross,year,votes,director,...,language,country,budget,gross_1,opening_week,oscar_win,oscar_nom,other_win,other_nom,meta_score
0,tt7286456,Joker,122.0,"Crime, Drama, Thriller",R,8.5,335.45,(2019),804730,Todd Phillips,...,English,"['USA', 'Canada']",$55000000,$335451311,$96202337,['2'],,92.0,199.0,59.0
1,tt6751668,Parasite,132.0,"Comedy, Drama, Thriller",R,8.6,53.37,(2019),423230,Bong Joon Ho,...,Korean,['South Korea'],$11400000,$53369749,$393216,['4'],,262.0,227.0,96.0
2,tt8579674,1917,119.0,"Drama, War",R,8.3,159.23,(2019),324071,Sam Mendes,...,English,"['USA', 'UK', 'India', 'Spain', 'Canada']",$95000000,$159227644,$576216,['3'],,110.0,161.0,78.0
3,tt4154796,Avengers: Endgame,181.0,"Action, Adventure, Drama",PG-13,8.4,858.37,(2019),726032,Anthony Russo,...,English,['USA'],$356000000,$858373000,$357115007,,['1'],65.0,103.0,78.0
4,tt0068646,The Godfather,175.0,"Crime, Drama",R,9.2,134.97,(1972),1551490,Francis Ford Coppola,...,English,['USA'],$6000000,$134966411,$302393,['3'],,26.0,30.0,100.0


In [9]:
# Drop redundant columns: `gross` and `metascore` from the first table appear to
# repeat what `gross_1` and `meta_score` from the second table already provide.
df = df.drop(columns=['gross', 'metascore'])

In [10]:
# Replace the literal string 'N/A' with NaN so pandas counts those cells as missing.
df = df.replace('N/A',np.nan)

In [11]:
# `gross_1` is the only gross column left after the drop above, so give it the plain name `gross`.
df = df.rename(columns={'gross_1': 'gross'})

In [12]:
# Remove rows that are exact duplicates across every column.
df = df.drop_duplicates()

In [13]:
df.head()

Unnamed: 0,movie_id,title,runtime,genre,certificate,imdb_rating,year,votes,director,actors,...,language,country,budget,gross,opening_week,oscar_win,oscar_nom,other_win,other_nom,meta_score
0,tt7286456,Joker,122.0,"Crime, Drama, Thriller",R,8.5,(2019),804730,Todd Phillips,"['Joaquin Phoenix', 'Robert De Niro', 'Zazie B...",...,English,"['USA', 'Canada']",$55000000,$335451311,$96202337,['2'],,92.0,199.0,59.0
1,tt6751668,Parasite,132.0,"Comedy, Drama, Thriller",R,8.6,(2019),423230,Bong Joon Ho,"['Kang-ho Song', 'Sun-kyun Lee', 'Yeo-jeong Jo...",...,Korean,['South Korea'],$11400000,$53369749,$393216,['4'],,262.0,227.0,96.0
2,tt8579674,1917,119.0,"Drama, War",R,8.3,(2019),324071,Sam Mendes,"['Dean-Charles Chapman', 'George MacKay', 'Dan...",...,English,"['USA', 'UK', 'India', 'Spain', 'Canada']",$95000000,$159227644,$576216,['3'],,110.0,161.0,78.0
3,tt4154796,Avengers: Endgame,181.0,"Action, Adventure, Drama",PG-13,8.4,(2019),726032,Anthony Russo,"['Joe Russo', 'Robert Downey Jr.', 'Chris Evan...",...,English,['USA'],$356000000,$858373000,$357115007,,['1'],65.0,103.0,78.0
4,tt0068646,The Godfather,175.0,"Crime, Drama",R,9.2,(1972),1551490,Francis Ford Coppola,"['Marlon Brando', 'Al Pacino', 'James Caan', '...",...,English,['USA'],$6000000,$134966411,$302393,['3'],,26.0,30.0,100.0


In [14]:
df.isnull().sum()

movie_id            0
title               0
runtime             4
genre               0
certificate       494
imdb_rating         0
year                0
votes               0
director            0
actors              0
summary             0
user_review         1
critic_review     102
writer              4
language           19
country             1
budget           1483
gross            1629
opening_week     1747
oscar_win        2608
oscar_nom        2611
other_win        1317
other_nom        1184
meta_score       1664
dtype: int64

## Year

In [15]:
df.year

0       (2019)
1       (2019)
2       (2019)
3       (2019)
4       (1972)
         ...  
2829    (2014)
2830    (2009)
2831    (2016)
2832    (1980)
2833    (1998)
Name: year, Length: 2834, dtype: object

In [16]:
# Pull the first four-digit run out of strings such as "(2019)" and store it as an integer.
# The pattern is a raw string: the original "\d{4}" literal relied on an invalid escape
# sequence, which Python reports with a warning.
df['year'] = df['year'].str.extract(r'(\d{4})', expand=False).astype('int64')

In [17]:
df.year

0       2019
1       2019
2       2019
3       2019
4       1972
        ... 
2829    2014
2830    2009
2831    2016
2832    1980
2833    1998
Name: year, Length: 2834, dtype: int64

## Runtime

In [18]:
# Fill the 4 missing runtimes with the column mean.
# The result is assigned back explicitly: calling fillna(inplace=True) through attribute
# access is chained assignment, which newer pandas warns about and may not write to `df`.
df['runtime'] = df['runtime'].fillna(df['runtime'].mean())

## Awards

In [19]:
df.oscar_win

0       ['2']
1       ['4']
2       ['3']
3         NaN
4       ['3']
        ...  
2829      NaN
2830      NaN
2831      NaN
2832      NaN
2833      NaN
Name: oscar_win, Length: 2834, dtype: object

In [20]:
# Keep only the first run of digits from values such as "['2']"; missing values stay NaN.
df['oscar_win'] = df['oscar_win'].str.extract(r'(\d+)', expand=False)

In [21]:
df.oscar_win

0         2
1         4
2         3
3       NaN
4         3
       ... 
2829    NaN
2830    NaN
2831    NaN
2832    NaN
2833    NaN
Name: oscar_win, Length: 2834, dtype: object

In [22]:
# A missing value means no Oscar wins were recorded, so store 0.
# Assigned back explicitly because an inplace fillna through attribute access is chained
# assignment and may not update `df` on newer pandas.
df['oscar_win'] = df['oscar_win'].fillna(0)

In [23]:
df.oscar_win

0       2
1       4
2       3
3       0
4       3
       ..
2829    0
2830    0
2831    0
2832    0
2833    0
Name: oscar_win, Length: 2834, dtype: object

In [24]:
# Keep only the first run of digits from values such as "['1']"; missing values stay NaN.
df['oscar_nom'] = df['oscar_nom'].str.extract(r'(\d+)', expand=False)


In [25]:
# A missing value means no Oscar nominations were recorded, so store 0.
# Assigned back explicitly because an inplace fillna through attribute access is chained
# assignment and may not update `df` on newer pandas.
df['oscar_nom'] = df['oscar_nom'].fillna(0)

In [26]:
# Missing non-Oscar award counts are treated as 0.
# Assigned back explicitly because an inplace fillna through attribute access is chained
# assignment and may not update `df` on newer pandas.
df['other_win'] = df['other_win'].fillna(0)
df['other_nom'] = df['other_nom'].fillna(0)

In [28]:
# Cast all four award-count columns to integers now that they contain no missing values.
for award_col in ('oscar_win', 'oscar_nom', 'other_win', 'other_nom'):
    df[award_col] = df[award_col].astype(int)

## Country 

In [29]:
df.country.isnull().sum()

1

In [37]:
df['country'].value_counts()

['USA']                           1199
['India']                          386
['UK']                              94
['Japan']                           78
['UK', 'USA']                       51
                                  ... 
['UK', 'India', 'France']            1
['USA', 'Hong Kong', 'Canada']       1
['Soviet Union', 'Japan']            1
['Canada', 'France', 'Japan']        1
['Russia', 'China', 'USA']           1
Name: country, Length: 398, dtype: int64

In [38]:
# Fill the single missing country with 'USA', the most frequent value in the column.
# Assigned back explicitly because an inplace fillna through attribute access is chained
# assignment and may not update `df` on newer pandas.
df['country'] = df['country'].fillna('USA')

In [39]:
def cleaner(df, function):
    """Strip list punctuation from a column of stringified lists, in place.

    Every string in the column has its "[", "]" and "'" characters removed, so
    "['USA', 'Canada']" becomes "USA, Canada". The column is overwritten on
    ``df`` and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    function : str
        Label of the column to clean. Despite its name this is a column label,
        not a callable; the name is kept so existing keyword callers still work.

    Returns
    -------
    pandas.Series
        The cleaned column of ``df``.

    Notes
    -----
    Non-string cells (for example NaN) are passed through unchanged instead of
    raising ``AttributeError``. Every apostrophe is removed, including ones that
    are part of a value.
    """
    def _strip_list_chars(value):
        # Only strings carry the list punctuation; leave missing values untouched.
        if not isinstance(value, str):
            return value
        return value.replace("[", "").replace("]", "").replace("'", "")

    df[function] = df[function].map(_strip_list_chars)
    return df[function]

In [40]:
# Turn values like "['USA', 'Canada']" into "USA, Canada".
df['country'] = cleaner(df, 'country')

In [41]:
df.country

0                         USA, Canada
1                         South Korea
2       USA, UK, India, Spain, Canada
3                                 USA
4                                 USA
                    ...              
2829                           Canada
2830                            Italy
2831                     South Africa
2832                              USA
2833                              USA
Name: country, Length: 2834, dtype: object

## Meta Score

In [42]:
df.meta_score.isnull().sum()

1664

In [43]:
# Fill missing Metascores with the column mean.
# NOTE(review): 1664 of the 2834 rows are imputed here, so most values in this column
# end up being the mean — confirm that is acceptable for later modelling.
# Assigned back explicitly because an inplace fillna through attribute access is chained
# assignment and may not update `df` on newer pandas.
df['meta_score'] = df['meta_score'].fillna(df['meta_score'].mean())

## Languages

In [45]:
df.language.isnull().sum()

19

In [46]:
df['language'].value_counts()

English           1824
Hindi              151
Japanese            82
Tamil               81
French              71
                  ... 
Latvian              1
Sign Languages       1
Filipino             1
Belarusian           1
Hebrew               1
Name: language, Length: 62, dtype: int64

In [47]:
# Fill the 19 missing languages with "English", the most frequent value in the column.
# Assigned back explicitly because an inplace fillna through attribute access is chained
# assignment and may not update `df` on newer pandas.
df['language'] = df['language'].fillna("English")


## User_review

In [48]:
df.user_review.isnull().sum()

1

In [49]:
# The single missing review count becomes the string '0' so the parsing step below
# can treat every cell as text.
# Assigned back explicitly because an inplace fillna through attribute access is chained
# assignment and may not update `df` on newer pandas.
df['user_review'] = df['user_review'].fillna('0')

In [50]:
# "10,188 user" -> 10188: drop the thousands separators, keep the first token, parse as int.
df['user_review'] = [
    int(raw_count.replace(",", "").split(" ")[0].strip())
    for raw_count in df['user_review']
]


## Critics review

In [51]:
df.critic_review.isnull().sum()

102

In [52]:
# Keep only the numeric count from strings such as "682 critic"; missing values become 0.
# Thousands separators are removed first so a value like "1,024 critic" would not be
# truncated to 1, and the column is cast to int so it no longer mixes digit strings
# with the integer fill value.
df['critic_review'] = (
    df['critic_review']
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+)', expand=False)
    .fillna('0')
    .astype(int)
)

## Budget

In [53]:
df.budget.isna().sum()

1483

In [54]:
df.budget

0        $55000000
1        $11400000
2        $95000000
3       $356000000
4         $6000000
           ...    
2829           NaN
2830     $14000000
2831           NaN
2832      $2000000
2833      $6000000
Name: budget, Length: 2834, dtype: object

In [55]:
# Keep the first run of digits from values such as "$55000000"; missing values stay NaN.
# NOTE(review): any currency prefix is discarded here — confirm all budgets share one currency.
df['budget'] = df['budget'].str.extract(r'(\d+)', expand=False)

In [56]:
df.budget

0        55000000
1        11400000
2        95000000
3       356000000
4         6000000
          ...    
2829          NaN
2830     14000000
2831          NaN
2832      2000000
2833      6000000
Name: budget, Length: 2834, dtype: object

In [57]:
# Mark unknown budgets with the string '0' so the column can be parsed as integers next.
# Assigned back explicitly because an inplace fillna through attribute access is chained
# assignment and may not update `df` on newer pandas.
df['budget'] = df['budget'].fillna('0')

In [58]:
# Parse the digit strings into integers.
df['budget'] = df['budget'].map(int)

In [60]:
df.budget.value_counts()

0           1485
20000000      54
5000000       51
10000000      44
3000000       38
            ... 
829000         1
72000000       1
1453000        1
140000         1
46667761       1
Name: budget, Length: 332, dtype: int64

In [61]:
# Replace the 0 placeholders with the median of the known (non-zero) budgets.
known_budget_median = df.loc[df['budget'] != 0, 'budget'].median()
df['budget'] = df['budget'].replace(0, known_budget_median)

In [62]:
df.budget

0        55000000
1        11400000
2        95000000
3       356000000
4         6000000
          ...    
2829     10000000
2830     14000000
2831     10000000
2832      2000000
2833      6000000
Name: budget, Length: 2834, dtype: int64

## gross

In [63]:
df.gross.isnull().sum()

1629

In [64]:
# Keep the first run of digits from values such as "$335451311"; missing values stay NaN.
df['gross'] = df['gross'].str.extract(r'(\d+)', expand=False)

In [65]:
# Mark unknown gross figures with the string '0' so the column can be parsed as integers next.
# Assigned back explicitly because an inplace fillna through attribute access is chained
# assignment and may not update `df` on newer pandas.
df['gross'] = df['gross'].fillna('0')

In [66]:
# Parse the digit strings into integers.
df['gross'] = df['gross'].map(int)

In [67]:
# Replace the 0 placeholders with the median of the known (non-zero) gross figures.
known_gross_median = df.loc[df['gross'] != 0, 'gross'].median()
df['gross'] = df['gross'].replace(0, known_gross_median)

## Opening week

In [68]:
# Keep only the digits of the opening-week figure; rows without a value are parsed as 0.
df['opening_week'] = (
    df['opening_week']
    .str.extract(r'(\d+)', expand=False)
    .fillna('0')
    .astype(int)
)

# Replace the 0 placeholders with the median of the known (non-zero) opening-week values.
# The mask has to test opening_week itself. The original filtered on `gross != 0`, which
# does not exclude the opening_week placeholders, so the zeros were part of the median;
# with 1747 of 2834 rows missing, that median is itself 0 and nothing was imputed.
known_opening_week = df.loc[df['opening_week'] != 0, 'opening_week']
df['opening_week'] = df['opening_week'].replace(0, known_opening_week.median())

## Certificate

In [69]:
df.certificate.isnull().sum()


494

In [70]:
# Label films without a certificate as "Unrated" and fold the existing "Not Rated"
# label into the same category.
# Assigned back explicitly because inplace fillna/replace through attribute access is
# chained assignment and may not update `df` on newer pandas.
df['certificate'] = df['certificate'].fillna('Unrated').replace('Not Rated', 'Unrated')

In [71]:
df.head()

Unnamed: 0,movie_id,title,runtime,genre,certificate,imdb_rating,year,votes,director,actors,...,language,country,budget,gross,opening_week,oscar_win,oscar_nom,other_win,other_nom,meta_score
0,tt7286456,Joker,122.0,"Crime, Drama, Thriller",R,8.5,2019,804730,Todd Phillips,"['Joaquin Phoenix', 'Robert De Niro', 'Zazie B...",...,English,"USA, Canada",55000000,335451311,96202337,2,0,92,199,59.0
1,tt6751668,Parasite,132.0,"Comedy, Drama, Thriller",R,8.6,2019,423230,Bong Joon Ho,"['Kang-ho Song', 'Sun-kyun Lee', 'Yeo-jeong Jo...",...,Korean,South Korea,11400000,53369749,393216,4,0,262,227,96.0
2,tt8579674,1917,119.0,"Drama, War",R,8.3,2019,324071,Sam Mendes,"['Dean-Charles Chapman', 'George MacKay', 'Dan...",...,English,"USA, UK, India, Spain, Canada",95000000,159227644,576216,3,0,110,161,78.0
3,tt4154796,Avengers: Endgame,181.0,"Action, Adventure, Drama",PG-13,8.4,2019,726032,Anthony Russo,"['Joe Russo', 'Robert Downey Jr.', 'Chris Evan...",...,English,USA,356000000,858373000,357115007,0,1,65,103,78.0
4,tt0068646,The Godfather,175.0,"Crime, Drama",R,9.2,1972,1551490,Francis Ford Coppola,"['Marlon Brando', 'Al Pacino', 'James Caan', '...",...,English,USA,6000000,134966411,302393,3,0,26,30,100.0


## Genre

In [73]:
def custom_encoder(df, feature):
    """One-hot encode a comma-separated multi-label column, in place.

    Each distinct label found in ``df[feature]`` becomes a new 0/1 column named
    after the label, added in order of first appearance. The original column is
    then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is modified in place.
    feature : str
        Label of the column holding strings such as "Crime, Drama, Thriller".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the indicator columns added and
        ``feature`` removed.

    Notes
    -----
    Non-string cells (for example NaN) are treated as having no labels instead
    of raising ``AttributeError``. Empty tokens, as in "Drama, ", are ignored so
    no empty-named column is created.
    """
    # Split every row once; surrounding whitespace is not part of a label.
    labels_per_row = [
        [token.strip() for token in value.split(",") if token.strip()]
        if isinstance(value, str) else []
        for value in df[feature]
    ]

    # dict.fromkeys de-duplicates while preserving first-appearance order.
    unique_labels = list(
        dict.fromkeys(label for labels in labels_per_row for label in labels)
    )

    for label in unique_labels:
        df[label] = [1 if label in labels else 0 for labels in labels_per_row]

    df.drop(columns=[feature], inplace=True)
    return df
    
            
# Expand the comma-separated `genre` column into one 0/1 indicator column per genre.
df = custom_encoder(df, "genre")


In [77]:
df.head()

Unnamed: 0,movie_id,title,runtime,certificate,imdb_rating,year,votes,director,actors,summary,...,Family,History,Music,Animation,Sport,Musical,Film-Noir,Documentary,News,Short
0,tt7286456,Joker,122.0,R,8.5,2019,804730,Todd Phillips,"['Joaquin Phoenix', 'Robert De Niro', 'Zazie B...","In Gotham City, mentally troubled comedian Art...",...,0,0,0,0,0,0,0,0,0,0
1,tt6751668,Parasite,132.0,R,8.6,2019,423230,Bong Joon Ho,"['Kang-ho Song', 'Sun-kyun Lee', 'Yeo-jeong Jo...",Greed and class discrimination threaten the ne...,...,0,0,0,0,0,0,0,0,0,0
2,tt8579674,1917,119.0,R,8.3,2019,324071,Sam Mendes,"['Dean-Charles Chapman', 'George MacKay', 'Dan...","April 6th, 1917. As a regiment assembles to wa...",...,0,0,0,0,0,0,0,0,0,0
3,tt4154796,Avengers: Endgame,181.0,PG-13,8.4,2019,726032,Anthony Russo,"['Joe Russo', 'Robert Downey Jr.', 'Chris Evan...",After the devastating events of Avengers: Infi...,...,0,0,0,0,0,0,0,0,0,0
4,tt0068646,The Godfather,175.0,R,9.2,1972,1551490,Francis Ford Coppola,"['Marlon Brando', 'Al Pacino', 'James Caan', '...",The aging patriarch of an organized crime dyna...,...,0,0,0,0,0,0,0,0,0,0


## Actor

In [78]:
df.actors.isnull().sum()

0

In [79]:
df.actors

0       ['Joaquin Phoenix', 'Robert De Niro', 'Zazie B...
1       ['Kang-ho Song', 'Sun-kyun Lee', 'Yeo-jeong Jo...
2       ['Dean-Charles Chapman', 'George MacKay', 'Dan...
3       ['Joe Russo', 'Robert Downey Jr.', 'Chris Evan...
4       ['Marlon Brando', 'Al Pacino', 'James Caan', '...
                              ...                        
2829    ['Alexander Mendeluk', 'Kate Nauta', 'Benjamin...
2830    ['Adrien Brody', 'Emmanuelle Seigner', 'Elsa P...
2831    ['Sharni Vinson', 'Carlyn Burchell', 'Steven J...
2832    ['David Huffman', 'Marianna Hill', 'Burt Young...
2833    ['Rosanna Arquette', 'William Forsythe', 'Ange...
Name: actors, Length: 2834, dtype: object

In [80]:
# Turn the stringified cast lists into plain comma-separated names.
df['actors'] = cleaner(df, "actors")

In [81]:
df.actors

0       Joaquin Phoenix, Robert De Niro, Zazie Beetz, ...
1       Kang-ho Song, Sun-kyun Lee, Yeo-jeong Jo, Woo-...
2       Dean-Charles Chapman, George MacKay, Daniel Ma...
3       Joe Russo, Robert Downey Jr., Chris Evans, Mar...
4       Marlon Brando, Al Pacino, James Caan, Diane Ke...
                              ...                        
2829    Alexander Mendeluk, Kate Nauta, Benjamin Easte...
2830    Adrien Brody, Emmanuelle Seigner, Elsa Pataky,...
2831    Sharni Vinson, Carlyn Burchell, Steven John Wa...
2832    David Huffman, Marianna Hill, Burt Young, Otis...
2833    Rosanna Arquette, William Forsythe, Angelina J...
Name: actors, Length: 2834, dtype: object

In [83]:
df.actors.nunique()

2816

In [84]:
# Too many unique values (2816 distinct cast strings in 2834 rows), so drop the column.
df = df.drop(columns="actors")

df.shape

(2834, 46)

## Writer


In [85]:
df.writer.isnull().sum()

4

In [86]:
df.writer

0               ['Todd Phillips', 'Scott Silver']
1                ['Bong Joon Ho', 'Bong Joon Ho']
2          ['Sam Mendes', 'Krysty Wilson-Cairns']
3       ['Christopher Markus', 'Stephen McFeely']
4          ['Mario Puzo', 'Francis Ford Coppola']
                          ...                    
2829                               ['Keith Shaw']
2830                 ['Jim Agnew', 'Sean Keller']
2831         ['Jonathan Jordaan', 'Alastair Orr']
2832           ['Jeffrey Bloom', 'Jeffrey Bloom']
2833         ['Tony Cinciripini', 'Antone Pagán']
Name: writer, Length: 2834, dtype: object

In [87]:
# Normalise each writer entry: keep the text before the first "(", trim surrounding
# whitespace and commas, and join words with underscores.
# Non-string entries (the missing values) are mapped to None explicitly; the original
# bare `except:` did the same but would also have hidden any unrelated error.
# NOTE(review): the column is dropped in the next cell, so this cleanup does not reach
# the saved output — confirm whether it is still needed.
writer = [
    entry.split("(")[0].strip().strip(",").replace(" ", "_")
    if isinstance(entry, str) else None
    for entry in df['writer']
]
df['writer'] = writer
        

In [89]:
# Remove the writer column from the frame.
df = df.drop(columns="writer")

In [90]:
# Remove the director column from the frame.
df = df.drop(columns="director")

In [91]:
df.head()

Unnamed: 0,movie_id,title,runtime,certificate,imdb_rating,year,votes,summary,user_review,critic_review,...,Family,History,Music,Animation,Sport,Musical,Film-Noir,Documentary,News,Short
0,tt7286456,Joker,122.0,R,8.5,2019,804730,"In Gotham City, mentally troubled comedian Art...",10188,682,...,0,0,0,0,0,0,0,0,0,0
1,tt6751668,Parasite,132.0,R,8.6,2019,423230,Greed and class discrimination threaten the ne...,2507,515,...,0,0,0,0,0,0,0,0,0,0
2,tt8579674,1917,119.0,R,8.3,2019,324071,"April 6th, 1917. As a regiment assembles to wa...",2674,461,...,0,0,0,0,0,0,0,0,0,0
3,tt4154796,Avengers: Endgame,181.0,PG-13,8.4,2019,726032,After the devastating events of Avengers: Infi...,8764,550,...,0,0,0,0,0,0,0,0,0,0
4,tt0068646,The Godfather,175.0,R,9.2,1972,1551490,The aging patriarch of an organized crime dyna...,3824,249,...,0,0,0,0,0,0,0,0,0,0


In [92]:
# Write the cleaned frame to 'cleaned_imdb.csv' in the working directory.
# NOTE(review): to_csv writes the index by default, so the file gains an unnamed leading
# column when read back — confirm downstream readers expect that, otherwise pass index=False.
df.to_csv('cleaned_imdb.csv')