In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re

In [2]:
#First of all I want to import the dataset with the 1000 best rated movies by the audience on IMDB.
df = pd.read_csv("../data/imdb_top_1000.csv", encoding = "unicode_escape")

### Initial Dataframe

In [3]:
df.isna().sum()

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

In [4]:
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [6]:
#There are a few columns that won't provide any insight to the project so I will drop them.
#The columns are: ["Poster_Link", "Certificate," "Meta_score"]
#Poster Link is useless and we already have the IMDB Rating to rate the movies so we don't need neither Certificate nor 
#Meta score, and furthermore the two latter have many null values that we will not have.

In [6]:
df = df.drop(["Poster_Link", "Certificate", "Meta_score"], axis = 1)

### Cast Columns Fix

In [8]:
#Another thing that I believe that would be useful is to convert the 4 strings in columns Star 1,2,3,4 into a one column list.
#It will take less space and the usability will be the same.

In [9]:
df["Cast"] = (df["Star1"] + "-" + df["Star2"] + "-" + df["Star3"] + "-" + df["Star4"])

In [11]:
df['Cast'] = df.Cast.apply(lambda x: x[0:].split('-'))

In [13]:
#Now we can delete the columns with the actors
df = df.drop(["Star1", "Star2", "Star3", "Star4"], axis = 1)

### Released Year Column Fixed

In [20]:
#Now I will change the numerical values into integers
df["Released_Year"].value_counts()


2014    32
2004    31
2009    29
2013    28
2016    28
        ..
1926     1
1936     1
1924     1
1921     1
PG       1
Name: Released_Year, Length: 100, dtype: int64

In [21]:
#I found that there is a row that has "PG" for a year. I will take a closer look.

In [22]:
#There must be a mistake in the Dataset and the year of publication of Apollo 13 was 1995 so I will change it manually

In [23]:
df.loc[df['Released_Year'] == "PG"] 

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Overview,Director,No_of_Votes,Gross,Cast
966,Apollo 13,PG,140 min,"Adventure, Drama, History",7.6,NASA must devise a strategy to return Apollo 1...,Ron Howard,269197,173837933,"[Tom Hanks, Bill Paxton, Kevin Bacon, Gary Sin..."


In [24]:
df["Released_Year"] = df["Released_Year"].replace('[^0-9.]','1995', regex=True)

In [25]:
df["Released_Year"] = df["Released_Year"].replace('19951995','1995', regex=True)

In [26]:
df.loc[df['Series_Title'] == "Apollo 13"] 

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Overview,Director,No_of_Votes,Gross,Cast
966,Apollo 13,1995,140 min,"Adventure, Drama, History",7.6,NASA must devise a strategy to return Apollo 1...,Ron Howard,269197,173837933,"[Tom Hanks, Bill Paxton, Kevin Bacon, Gary Sin..."


In [27]:
df["Released_Year"] = df["Released_Year"].astype(int)

In [28]:
df.dtypes

Series_Title      object
Released_Year      int32
Runtime           object
Genre             object
IMDB_Rating      float64
Overview          object
Director          object
No_of_Votes        int64
Gross              Int32
Cast              object
dtype: object

### Genre Column Fix

In [29]:
#I want to convert the column Genre from a comma separated string into a list of strings

In [30]:
df['Genre'] = df.Genre.apply(lambda x: x[0:].split(','))

### Runtime Column Fix

In [33]:
#I want to make Runtime numeric, to do that I will strip all non-numeric values and when there is just numbers make it an int

In [34]:
df["Runtime"] = df["Runtime"].replace('[^0-9]','', regex=True)

In [36]:
df["Runtime"] = df["Runtime"].astype(int)

### Web Scrapping

In [5]:
#I will scrap a website with all Oscar winners and create a list with all of their names to create a column in the Dataframe
#that tells us if the movie won the Oscar or not

In [38]:
oscars = "https://www.imdb.com/list/ls009480135/?sort=list_order,asc&st_dt=&mode=simple&page=1&ref_=ttls_vw_smp"

In [39]:
headers = {"Accept-Language": "en-US,en;q=0.5"}

In [40]:
res_oscars = requests.get(oscars, headers=headers)

In [41]:
html_oscars = res_oscars.content

In [42]:
soup = BeautifulSoup(html_oscars, "html.parser")

In [43]:
titles = soup.find_all("div", attrs={"class":"col-title"})
titles[0]

<div class="col-title">
<span class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<span title="Chloé Zhao (dir.), Frances McDormand, David Strathairn">
<a href="/title/tt9770150/">Nomadland</a>
<span class="lister-item-year text-muted unbold">(2020)</span>
</span>
</span>
</div>

In [44]:
titles[0].get("href")

In [45]:
print(titles[0].get("href"))

None


In [46]:
titles[6].getText().replace("\n", "")[2:].split("(")[0].strip()

'Birdman or'

In [47]:
titles_all = [titles[i].getText().replace("\n", "")[2:].split("(")[0].strip() for i in range(len(titles))]

In [48]:
titles_all

['Nomadland',
 'Parasite',
 'Green Book',
 'The Shape of Water',
 'Moonlight',
 'Spotlight',
 'Birdman or',
 '12 Years a Slave',
 'Argo',
 '.The Artist',
 ".The King's Speech",
 '.The Hurt Locker',
 '.Slumdog Millionaire',
 '.No Country for Old Men',
 '.The Departed',
 '.Crash',
 '.Million Dollar Baby',
 '.The Lord of the Rings: The Return of the King',
 '.Chicago',
 '.A Beautiful Mind',
 '.Gladiator',
 '.American Beauty',
 '.Shakespeare in Love',
 '.Titanic',
 '.The English Patient',
 '.Braveheart',
 '.Forrest Gump',
 ".Schindler's List",
 '.Unforgiven',
 '.The Silence of the Lambs',
 '.Dances with Wolves',
 '.Driving Miss Daisy',
 '.Rain Man',
 '.The Last Emperor',
 '.Platoon',
 '.Out of Africa',
 '.Amadeus',
 '.Terms of Endearment',
 '.Gandhi',
 '.Chariots of Fire',
 '.Ordinary People',
 '.Kramer vs. Kramer',
 '.The Deer Hunter',
 '.Annie Hall',
 '.Rocky',
 ".One Flew Over the Cuckoo's Nest",
 '.The Godfather Part II',
 '.The Sting',
 '.The Godfather',
 '.The French Connection',
 '.

In [49]:
titles_fixed = []
for i in titles_all:
    if i[0] == ".":
        titles_fixed.append(i[1:])
    else:
        titles_fixed.append(i)

In [50]:
titles_fixed

['Nomadland',
 'Parasite',
 'Green Book',
 'The Shape of Water',
 'Moonlight',
 'Spotlight',
 'Birdman or',
 '12 Years a Slave',
 'Argo',
 'The Artist',
 "The King's Speech",
 'The Hurt Locker',
 'Slumdog Millionaire',
 'No Country for Old Men',
 'The Departed',
 'Crash',
 'Million Dollar Baby',
 'The Lord of the Rings: The Return of the King',
 'Chicago',
 'A Beautiful Mind',
 'Gladiator',
 'American Beauty',
 'Shakespeare in Love',
 'Titanic',
 'The English Patient',
 'Braveheart',
 'Forrest Gump',
 "Schindler's List",
 'Unforgiven',
 'The Silence of the Lambs',
 'Dances with Wolves',
 'Driving Miss Daisy',
 'Rain Man',
 'The Last Emperor',
 'Platoon',
 'Out of Africa',
 'Amadeus',
 'Terms of Endearment',
 'Gandhi',
 'Chariots of Fire',
 'Ordinary People',
 'Kramer vs. Kramer',
 'The Deer Hunter',
 'Annie Hall',
 'Rocky',
 "One Flew Over the Cuckoo's Nest",
 'The Godfather Part II',
 'The Sting',
 'The Godfather',
 'The French Connection',
 'Patton',
 'Midnight Cowboy',
 'Oliver!',
 

In [51]:
titles_fixed.index('Birdman or')

6

In [52]:
titles_fixed[6]= "Birdman"

In [53]:
titles_fixed

['Nomadland',
 'Parasite',
 'Green Book',
 'The Shape of Water',
 'Moonlight',
 'Spotlight',
 'Birdman',
 '12 Years a Slave',
 'Argo',
 'The Artist',
 "The King's Speech",
 'The Hurt Locker',
 'Slumdog Millionaire',
 'No Country for Old Men',
 'The Departed',
 'Crash',
 'Million Dollar Baby',
 'The Lord of the Rings: The Return of the King',
 'Chicago',
 'A Beautiful Mind',
 'Gladiator',
 'American Beauty',
 'Shakespeare in Love',
 'Titanic',
 'The English Patient',
 'Braveheart',
 'Forrest Gump',
 "Schindler's List",
 'Unforgiven',
 'The Silence of the Lambs',
 'Dances with Wolves',
 'Driving Miss Daisy',
 'Rain Man',
 'The Last Emperor',
 'Platoon',
 'Out of Africa',
 'Amadeus',
 'Terms of Endearment',
 'Gandhi',
 'Chariots of Fire',
 'Ordinary People',
 'Kramer vs. Kramer',
 'The Deer Hunter',
 'Annie Hall',
 'Rocky',
 "One Flew Over the Cuckoo's Nest",
 'The Godfather Part II',
 'The Sting',
 'The Godfather',
 'The French Connection',
 'Patton',
 'Midnight Cowboy',
 'Oliver!',
 'In

In [54]:
df.loc[df["Series_Title"].isin(titles_fixed)]


Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Overview,Director,No_of_Votes,Gross,Cast
1,The Godfather,1972,175,"[Crime, Drama]",9.2,An organized crime dynasty's aging patriarch t...,Francis Ford Coppola,1620367,134966411,"[Marlon Brando, Al Pacino, James Caan, Diane K..."
5,The Lord of the Rings: The Return of the King,2003,201,"[Action, Adventure, Drama]",8.9,Gandalf and Aragorn lead the World of Men agai...,Peter Jackson,1642758,377845905,"[Elijah Wood, Viggo Mortensen, Ian McKellen, O..."
7,Schindler's List,1993,195,"[Biography, Drama, History]",8.9,"In German-occupied Poland during World War II,...",Steven Spielberg,1213505,96898818,"[Liam Neeson, Ralph Fiennes, Ben Kingsley, Car..."
11,Forrest Gump,1994,142,"[Drama, Romance]",8.8,"The presidencies of Kennedy and Johnson, the e...",Robert Zemeckis,1809221,330252182,"[Tom Hanks, Robin Wright, Gary Sinise, Sally F..."
17,One Flew Over the Cuckoo's Nest,1975,133,[Drama],8.7,A criminal pleads insanity and is admitted to ...,Milos Forman,918088,112000000,"[Jack Nicholson, Louise Fletcher, Michael Berr..."
28,The Silence of the Lambs,1991,118,"[Crime, Drama, Thriller]",8.6,A young F.B.I. cadet must receive the help of ...,Jonathan Demme,1270197,130742922,"[Jodie Foster, Anthony Hopkins, Lawrence A. Bo..."
37,The Departed,2006,151,"[Crime, Drama, Thriller]",8.5,An undercover cop and a mole in the police att...,Martin Scorsese,1189773,132384315,"[Leonardo DiCaprio, Matt Damon, Jack Nicholson..."
39,Gladiator,2000,155,"[Action, Adventure, Drama]",8.5,A former Roman General sets out to exact venge...,Ridley Scott,1341460,187705427,"[Russell Crowe, Joaquin Phoenix, Connie Nielse..."
50,Casablanca,1942,102,"[Drama, Romance, War]",8.5,A cynical expatriate American cafe owner strug...,Michael Curtiz,522093,1024560,"[Humphrey Bogart, Ingrid Bergman, Paul Henreid..."
98,American Beauty,1999,122,[Drama],8.3,A sexually frustrated suburban father has a mi...,Sam Mendes,1069738,130096601,"[Kevin Spacey, Annette Bening, Thora Birch, We..."


In [55]:
df.loc[df['Series_Title'].isin(titles_fixed), 'Oscar_Winner'] = True

In [57]:
df["Oscar_Winner"].fillna(False,inplace=True)

In [1]:
#Now I have the column telling me if the film won the Oscar for Best Picture or not.

### Gross Column Fix

In [16]:
#To get the Gross amount in numeric we have to first delete the commas and then convert into integers.
df["Gross"] = df["Gross"].replace('[\&,)]','', regex=True)


In [17]:
df["Gross"] = df["Gross"].astype("Int32")

In [2]:
#Now I will first divide the Gross amount by 1M for readibility and the adjust them for inflation with a fixed rate of 3.8%.

In [62]:
df["Gross_(M)"] = df["Gross"] / 1000000

In [64]:
df.drop(["Gross"], axis=1)

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Overview,Director,No_of_Votes,Cast,Oscar_Winner,Gross_(M)
0,The Shawshank Redemption,1994,142,[Drama],9.3,Two imprisoned men bond over a number of years...,Frank Darabont,2343110,"[Tim Robbins, Morgan Freeman, Bob Gunton, Will...",False,28.341469
1,The Godfather,1972,175,"[Crime, Drama]",9.2,An organized crime dynasty's aging patriarch t...,Francis Ford Coppola,1620367,"[Marlon Brando, Al Pacino, James Caan, Diane K...",True,134.966411
2,The Dark Knight,2008,152,"[Action, Crime, Drama]",9.0,When the menace known as the Joker wreaks havo...,Christopher Nolan,2303232,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",False,534.858444
3,The Godfather: Part II,1974,202,"[Crime, Drama]",9.0,The early life and career of Vito Corleone in ...,Francis Ford Coppola,1129952,"[Al Pacino, Robert De Niro, Robert Duvall, Dia...",False,57.3
4,12 Angry Men,1957,96,"[Crime, Drama]",9.0,A jury holdout attempts to prevent a miscarria...,Sidney Lumet,689845,"[Henry Fonda, Lee J. Cobb, Martin Balsam, John...",False,4.36
...,...,...,...,...,...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,115,"[Comedy, Drama, Romance]",7.6,A young New York socialite becomes interested ...,Blake Edwards,166544,"[Audrey Hepburn, George Peppard, Patricia Neal...",False,
996,Giant,1956,201,"[Drama, Western]",7.6,Sprawling epic covering the life of a Texas ca...,George Stevens,34075,"[Elizabeth Taylor, Rock Hudson, James Dean, Ca...",False,
997,From Here to Eternity,1953,118,"[Drama, Romance, War]",7.6,"In Hawaii in 1941, a private is cruelly punish...",Fred Zinnemann,43374,"[Burt Lancaster, Montgomery Clift, Deborah Ker...",True,30.5
998,Lifeboat,1944,97,"[Drama, War]",7.6,Several survivors of a torpedoed merchant ship...,Alfred Hitchcock,26471,"[Tallulah Bankhead, John Hodiak, Walter Slezak...",False,


In [3]:
#Here I will create the new column with the PV of the Gross Amount made.

In [65]:
df["Gross_Adj_(M)"] = df["Gross_(M)"] * (1.038 ** (2022-df["Released_Year"]))

In [4]:
#Now I will delete the original Gross Column.

In [71]:
df.drop(["Gross"], axis = 1)

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Overview,Director,No_of_Votes,Cast,Oscar_Winner,Gross_(M),Gross_Adj_(M)
0,The Shawshank Redemption,1994,142,[Drama],9.3,Two imprisoned men bond over a number of years...,Frank Darabont,2343110,"[Tim Robbins, Morgan Freeman, Bob Gunton, Will...",False,28.341469,80.528249
1,The Godfather,1972,175,"[Crime, Drama]",9.2,An organized crime dynasty's aging patriarch t...,Francis Ford Coppola,1620367,"[Marlon Brando, Al Pacino, James Caan, Diane K...",True,134.966411,871.150871
2,The Dark Knight,2008,152,"[Action, Crime, Drama]",9.0,When the menace known as the Joker wreaks havo...,Christopher Nolan,2303232,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",False,534.858444,901.574886
3,The Godfather: Part II,1974,202,"[Crime, Drama]",9.0,The early life and career of Vito Corleone in ...,Francis Ford Coppola,1129952,"[Al Pacino, Robert De Niro, Robert Duvall, Dia...",False,57.3,343.263472
4,12 Angry Men,1957,96,"[Crime, Drama]",9.0,A jury holdout attempts to prevent a miscarria...,Sidney Lumet,689845,"[Henry Fonda, Lee J. Cobb, Martin Balsam, John...",False,4.36,49.239594
...,...,...,...,...,...,...,...,...,...,...,...,...
995,Breakfast at Tiffany's,1961,115,"[Comedy, Drama, Romance]",7.6,A young New York socialite becomes interested ...,Blake Edwards,166544,"[Audrey Hepburn, George Peppard, Patricia Neal...",False,,
996,Giant,1956,201,"[Drama, Western]",7.6,Sprawling epic covering the life of a Texas ca...,George Stevens,34075,"[Elizabeth Taylor, Rock Hudson, James Dean, Ca...",False,,
997,From Here to Eternity,1953,118,"[Drama, Romance, War]",7.6,"In Hawaii in 1941, a private is cruelly punish...",Fred Zinnemann,43374,"[Burt Lancaster, Montgomery Clift, Deborah Ker...",True,30.5,399.868535
998,Lifeboat,1944,97,"[Drama, War]",7.6,Several survivors of a torpedoed merchant ship...,Alfred Hitchcock,26471,"[Tallulah Bankhead, John Hodiak, Walter Slezak...",False,,


### Final Result

In [72]:
df.isna().sum()

Series_Title       0
Released_Year      0
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Director           0
No_of_Votes        0
Gross            169
Cast               0
Oscar_Winner       0
Gross_(M)        169
Gross_Adj_(M)    169
dtype: int64

In [73]:
df.dtypes

Series_Title      object
Released_Year      int32
Runtime            int32
Genre             object
IMDB_Rating      float64
Overview          object
Director          object
No_of_Votes        int64
Gross              Int32
Cast              object
Oscar_Winner        bool
Gross_(M)        Float64
Gross_Adj_(M)    Float64
dtype: object

In [76]:
df.head()

Unnamed: 0,Series_Title,Released_Year,Runtime,Genre,IMDB_Rating,Overview,Director,No_of_Votes,Gross,Cast,Oscar_Winner,Gross_(M),Gross_Adj_(M)
0,The Shawshank Redemption,1994,142,[Drama],9.3,Two imprisoned men bond over a number of years...,Frank Darabont,2343110,28341469,"[Tim Robbins, Morgan Freeman, Bob Gunton, Will...",False,28.341469,80.528249
1,The Godfather,1972,175,"[Crime, Drama]",9.2,An organized crime dynasty's aging patriarch t...,Francis Ford Coppola,1620367,134966411,"[Marlon Brando, Al Pacino, James Caan, Diane K...",True,134.966411,871.150871
2,The Dark Knight,2008,152,"[Action, Crime, Drama]",9.0,When the menace known as the Joker wreaks havo...,Christopher Nolan,2303232,534858444,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",False,534.858444,901.574886
3,The Godfather: Part II,1974,202,"[Crime, Drama]",9.0,The early life and career of Vito Corleone in ...,Francis Ford Coppola,1129952,57300000,"[Al Pacino, Robert De Niro, Robert Duvall, Dia...",False,57.3,343.263472
4,12 Angry Men,1957,96,"[Crime, Drama]",9.0,A jury holdout attempts to prevent a miscarria...,Sidney Lumet,689845,4360000,"[Henry Fonda, Lee J. Cobb, Martin Balsam, John...",False,4.36,49.239594


In [77]:
df.to_csv('tableclean.csv', encoding='utf-8')