<a href="https://colab.research.google.com/github/iadeiza/GitHub-Test-Project/blob/main/IMDB_Movies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd


# IMDb URL with a placeholder {} for page number
base_url = "https://www.imdb.com/search/title/?groups=top_1000&count=100&start="
movie_list = []

# Loop through pages 1 to 3
for page_number in range(10):
    start_index = page_number * 100 + 1
    url = f"{base_url}{start_index}"
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")

    # Find all containers for each movie on the page
    movies = soup.find_all("div", class_="lister-item-content")

    for movie in movies:
        # Extract title of the movie
        title = movie.find("a").text.strip()

        # Extract year of the movie and remove unnecessary characters
        year = movie.find("span", class_="lister-item-year").text.strip("()")
        year = re.findall(r'\d{4}', year)
        year = year[0]

        # Extract genre of the movie
        genre = movie.find("span", class_="genre").text.strip()

        # Extract runtime of the movie
        runtime = movie.find("span", class_="runtime").text.strip()

        # Extract IMDb rating of the movie
        imdb_rating = movie.find("strong").text.strip()

        # Extract Metascore if available, else default to "N/A"
        #metascore_tag = movie.find("span", class_="metascore  favorable")
        metascore_tag = movie.find("span", class_="metascore favorable")
        metascore = metascore_tag.text.strip() if metascore_tag else "N/A"

        # Extract summary of the movie
        summary = movie.find_all("p")[1].text.strip()

        # Extract director and stars
        director_and_stars = movie.find_all("p")[2].find_all("a")
        director = director_and_stars[0].text.strip()
        stars = [star.text.strip() for star in director_and_stars[1:]]

        # Extract votes and gross revenue if available, else default to "N/A"
        votes_and_gross = movie.find_all("span", attrs={"name": "nv"})
        votes = votes_and_gross[0].text.strip()
        gross_revenue = votes_and_gross[1].text.strip() if len(votes_and_gross) > 1 else "N/A"

        movie_dict = {
            "TITLE": title,
            "YEAR": year,
            "GENRE": genre,
            "RUNTIME": runtime,
            "RATINGS": imdb_rating,
            "METASCORE": metascore,
            "SUMMARY": summary,
            "DIRECTOR": director,
            "STARS": stars,
            "VOTES": votes,
            "GROSS REVENUE": gross_revenue
        }
        movie_list.append(movie_dict)

# Convert movie_list dictinary to Pandas DataFrame
movies_df = pd.DataFrame(movie_list)
# Check the count of the dataframe
movies_df.count()

# Save the dataframe to csv as top_1000
movies_df.to_csv("movies_top_1000.csv")

In [None]:
# IMDb URL with a placeholder {} for page number
base_url = "https://www.imdb.com/search/title/?groups=bottom_1000&count=100&start="
movie_list = []

# Loop through pages 1 to 10
for page_number in range(10):
    count = page_number * 100 + 1
    url = f"{base_url}{count}"
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")

    # Find all containers for each movie on the page
    movies = soup.find_all("div", class_="lister-item-content")

    for movie in movies:
        # Extract title of the movie
        title = movie.find("a").text.strip()

        # Extract year of the movie and remove unnecessary characters
        year = movie.find("span", class_="lister-item-year").text.strip("()")
        year = re.findall(r'\d{4}', year)
        year = year[0]

        # Extract genre of the movie
        genre = movie.find("span", class_="genre").text.strip()

        # Extract runtime of the movie
        runtime = movie.find("span", class_="runtime").text.strip()

        # Extract IMDb rating of the movie
        imdb_rating = movie.find("strong").text.strip()

        # Extract Metascore if available, else default to "N/A"
        #metascore_tag = movie.find("span", class_="metascore  favorable")
        metascore_tag = movie.find("span", class_="metascore unfavorable")
        metascore = metascore_tag.text.strip() if metascore_tag else "N/A"

        # Extract summary of the movie
        summary = movie.find_all("p")[1].text.strip()

        # Extract director and stars
        director_and_stars = movie.find_all("p")[2].find_all("a")
        director = director_and_stars[0].text.strip()
        stars = [star.text.strip() for star in director_and_stars[1:]]

        # Extract votes and gross revenue if available, else default to "N/A"
        votes_and_gross = movie.find_all("span", attrs={"name": "nv"})
        votes = votes_and_gross[0].text.strip()
        gross_revenue = votes_and_gross[1].text.strip() if len(votes_and_gross) > 1 else "N/A"

        movie_dict = {
            "TITLE": title,
            "YEAR": year,
            "GENRE": genre,
            "RUNTIME": runtime,
            "RATINGS": imdb_rating,
            "METASCORE": metascore,
            "SUMMARY": summary,
            "DIRECTOR": director,
            "STARS": stars,
            "VOTES": votes,
            "GROSS REVENUE": gross_revenue
        }
        movie_list.append(movie_dict)

# Convert movie_list dictinary to Pandas DataFrame
movies_df = pd.DataFrame(movie_list)
# Check the count of the dataframe
movies_df.count()

# Save the dataframe to csv as bottom_1000
movies_df.to_csv("movies_bottom_1000.csv")

In [None]:
top_df = pd.read_csv("movies_top_1000.csv", encoding='utf-8')
top_df.head()

Unnamed: 0.1,Unnamed: 0,TITLE,YEAR,GENRE,RUNTIME,RATINGS,METASCORE,SUMMARY,DIRECTOR,STARS,VOTES,GROSS REVENUE
0,0,Killers of the Flower Moon,2023,"Crime, Drama, History",206 min,8.0,89.0,When oil is discovered in 1920s Oklahoma under...,Martin Scorsese,"['Leonardo DiCaprio', 'Robert De Niro', 'Lily ...",73875,
1,1,Spider-Man: Across the Spider-Verse,2023,"Animation, Action, Adventure",140 min,8.7,86.0,"Miles Morales catapults across the Multiverse,...",Joaquim Dos Santos,"['Kemp Powers', 'Justin K. Thompson', 'Shameik...",288543,#25
2,2,Halloween,1978,"Horror, Thriller",91 min,7.7,87.0,Fifteen years after murdering his sister on Ha...,John Carpenter,"['Donald Pleasence', 'Jamie Lee Curtis', 'Tony...",300322,$47.00M
3,3,The Nightmare Before Christmas,1993,"Animation, Family, Fantasy",76 min,7.9,82.0,"Jack Skellington, king of Halloween Town, disc...",Henry Selick,"['Danny Elfman', 'Chris Sarandon', ""Catherine ...",364318,$75.08M
4,4,Oppenheimer,2023,"Biography, Drama, History",180 min,8.5,88.0,"The story of American scientist, J. Robert Opp...",Christopher Nolan,"['Cillian Murphy', 'Emily Blunt', 'Matt Damon'...",474153,#47


In [None]:
bottom_df = pd.read_csv("movies_bottom_1000.csv")
bottom_df.head()

Unnamed: 0.1,Unnamed: 0,TITLE,YEAR,GENRE,RUNTIME,RATINGS,METASCORE,SUMMARY,DIRECTOR,STARS,VOTES,GROSS REVENUE
0,0,The Exorcist: Believer,2023,Horror,111 min,4.9,39.0,When two girls disappear into the woods and re...,David Gordon Green,"['Lafortune Joseph', 'Leslie Odom Jr.', 'Gastn...",24717,
1,1,The Nun,2018,"Horror, Mystery, Thriller",96 min,5.3,,A priest with a haunted past and a novice on t...,Corin Hardy,"['Demián Bichir', 'Taissa Farmiga', 'Jonas Blo...",164327,$117.45M
2,2,Expend4bles,2023,"Action, Adventure, Thriller",103 min,4.8,30.0,Armed with every weapon they can get their han...,Scott Waugh,"['Jason Statham', '50 Cent', 'Megan Fox', 'Dol...",22308,
3,3,Winnie the Pooh: Blood and Honey,2023,Horror,84 min,2.9,16.0,After Christopher Robin abandons them for coll...,Rhys Frake-Waterfield,"['Nikolai Leon', 'Maria Taylor', 'Natasha Rose...",23722,
4,4,Halloween Ends,2022,"Horror, Thriller",111 min,5.0,,The saga of Michael Myers and Laurie Strode co...,David Gordon Green,"['Jamie Lee Curtis', 'Andi Matichak', 'James J...",68215,


In [None]:
print(top_df.shape)
print(bottom_df.shape)

(1000, 12)
(1000, 12)


In [None]:
# Concatenate the DataFrames
merged_df = pd.concat([top_df, bottom_df], ignore_index=True)

# Drop the redundant 'Unnamed: 0' column
merged_df.drop(columns=['Unnamed: 0'], inplace=True)

merged_df.shape

(2000, 11)

In [None]:
merged_df.head()

Unnamed: 0,TITLE,YEAR,GENRE,RUNTIME,RATINGS,METASCORE,SUMMARY,DIRECTOR,STARS,VOTES,GROSS REVENUE
0,Killers of the Flower Moon,2023,"Crime, Drama, History",206 min,8.0,89.0,When oil is discovered in 1920s Oklahoma under...,Martin Scorsese,"['Leonardo DiCaprio', 'Robert De Niro', 'Lily ...",73875,
1,Spider-Man: Across the Spider-Verse,2023,"Animation, Action, Adventure",140 min,8.7,86.0,"Miles Morales catapults across the Multiverse,...",Joaquim Dos Santos,"['Kemp Powers', 'Justin K. Thompson', 'Shameik...",288543,#25
2,Halloween,1978,"Horror, Thriller",91 min,7.7,87.0,Fifteen years after murdering his sister on Ha...,John Carpenter,"['Donald Pleasence', 'Jamie Lee Curtis', 'Tony...",300322,$47.00M
3,The Nightmare Before Christmas,1993,"Animation, Family, Fantasy",76 min,7.9,82.0,"Jack Skellington, king of Halloween Town, disc...",Henry Selick,"['Danny Elfman', 'Chris Sarandon', ""Catherine ...",364318,$75.08M
4,Oppenheimer,2023,"Biography, Drama, History",180 min,8.5,88.0,"The story of American scientist, J. Robert Opp...",Christopher Nolan,"['Cillian Murphy', 'Emily Blunt', 'Matt Damon'...",474153,#47


In [None]:
tom_cruise = merged_df[merged_df['STARS'].str.contains('Tom Cruise')]
tom_cruise

Unnamed: 0,TITLE,YEAR,GENRE,RUNTIME,RATINGS,METASCORE,SUMMARY,DIRECTOR,STARS,VOTES,GROSS REVENUE
5,Mission: Impossible - Dead Reckoning Part One,2023,"Action, Adventure, Thriller",163 min,7.8,81.0,Ethan Hunt and his IMF team must track down a ...,Christopher McQuarrie,"['Tom Cruise', 'Hayley Atwell', 'Ving Rhames',...",178956,
33,Top Gun: Maverick,2022,"Action, Drama",130 min,8.3,78.0,"After thirty years, Maverick is still pushing ...",Joseph Kosinski,"['Tom Cruise', 'Jennifer Connelly', 'Miles Tel...",638969,$718.73M
133,Mission: Impossible - Fallout,2018,"Action, Adventure, Thriller",147 min,7.7,86.0,"Ethan Hunt and his IMF team, along with some f...",Christopher McQuarrie,"['Tom Cruise', 'Henry Cavill', 'Ving Rhames', ...",367592,$220.16M
199,Edge of Tomorrow,2014,"Action, Adventure, Sci-Fi",113 min,7.9,71.0,A soldier fighting aliens gets to relive the s...,Doug Liman,"['Tom Cruise', 'Emily Blunt', 'Bill Paxton', '...",717601,$100.21M
317,A Few Good Men,1992,"Drama, Thriller",138 min,7.7,62.0,Military lawyer Lieutenant Daniel Kaffee defen...,Rob Reiner,"['Tom Cruise', 'Jack Nicholson', 'Demi Moore',...",280242,$141.34M
348,The Last Samurai,2003,"Action, Drama",154 min,7.8,,An American military advisor embraces the Samu...,Edward Zwick,"['Tom Cruise', 'Ken Watanabe', 'Billy Connolly...",460905,$111.11M
355,Minority Report,2002,"Action, Crime, Mystery",145 min,7.6,80.0,In a future where a special police unit is abl...,Steven Spielberg,"['Tom Cruise', 'Colin Farrell', 'Samantha Mort...",574127,$132.07M
379,Magnolia,1999,Drama,188 min,8.0,78.0,An epic mosaic of interrelated characters in s...,Paul Thomas Anderson,"['Tom Cruise', 'Jason Robards', 'Julianne Moor...",323858,$22.46M
385,Rain Man,1988,Drama,133 min,8.0,65.0,After a selfish L.A. yuppie learns his estrang...,Barry Levinson,"['Dustin Hoffman', 'Tom Cruise', 'Valeria Goli...",536004,$178.80M
1005,The Mummy,2017,"Action, Adventure, Fantasy",110 min,5.4,34.0,An ancient Egyptian princess is awakened from ...,Alex Kurtzman,"['Tom Cruise', 'Sofia Boutella', 'Annabelle Wa...",201090,$80.10M


In [None]:
# Check the data info
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TITLE          2000 non-null   object 
 1   YEAR           2000 non-null   int64  
 2   GENRE          2000 non-null   object 
 3   RUNTIME        2000 non-null   object 
 4   RATINGS        2000 non-null   float64
 5   METASCORE      1343 non-null   float64
 6   SUMMARY        2000 non-null   object 
 7   DIRECTOR       2000 non-null   object 
 8   STARS          2000 non-null   object 
 9   VOTES          2000 non-null   object 
 10  GROSS REVENUE  1543 non-null   object 
dtypes: float64(2), int64(1), object(8)
memory usage: 172.0+ KB


In [None]:
# Check the total number of missing values in "GROSS REVENUE" column
merged_df["GROSS REVENUE"].isna().sum()

457

In [None]:
# Check the summary statistics of the dataset
merged_df.describe()

Unnamed: 0,YEAR,RATINGS,METASCORE
count,2000.0,2000.0,1343.0
mean,1999.8695,6.31705,58.966493
std,20.313759,1.7551,27.844462
min,1920.0,1.2,1.0
25%,1993.0,4.9,31.0
50%,2006.0,6.85,70.0
75%,2014.0,7.9,84.0
max,2023.0,9.3,100.0


In [None]:
# Check the datatypes of the each columns
merged_df.dtypes

TITLE             object
YEAR               int64
GENRE             object
RUNTIME           object
RATINGS          float64
METASCORE        float64
SUMMARY           object
DIRECTOR          object
STARS             object
VOTES             object
GROSS REVENUE     object
dtype: object

In [19]:
non_revenue = merged_df[merged_df['GROSS REVENUE'].str.contains('#', na=False)]
len(non_revenue)

30