In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
movie_names = []
years = []
ratings = []
metascores = []
gross_incomes = []
votes = []
runtimes = []
genres = []
certificates = []
descriptions = []
directors = []
stars = []


In [3]:
url = 'https://www.imdb.com/search/title/?num_votes=10000,&sort=user_rating,desc&title_type=feature'


In [4]:
for page_number in range(1, 201):  
    page_url = f'{url}&start={50 * (page_number - 1) + 1}'
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    movie_data = soup.findAll('div', attrs={'class': 'lister-item-content'})
    
    for store in movie_data:
        # Extract movie name
        name = store.h3.a.text
        movie_names.append(name)

        # Extract year of release
        year_of_release = store.h3.find('span', class_='lister-item-year').text.strip('()')
        years.append(year_of_release)

        # Extract runtime
        time = store.p.find('span', class_='runtime').text.replace(' min', '')
        runtimes.append(time)

        # Extract IMDb rating
        rate = store.find('div', class_='inline-block ratings-imdb-rating').strong.text
        ratings.append(rate)

        # Extract Metascore (if available)
        meta = store.find('span', class_='metascore')
        metascores.append(meta.text.strip() if meta else 'NAN')

        # Extract number of votes
        vote = store.find('span', {'name': 'nv'})['data-value']
        votes.append(vote)

        # Extract gross income 
        gross = store.find_all('span', {'name': 'nv'})[-1]['data-value']
        gross_incomes.append(gross if gross else 'NAN')

        # Extract genre
        genre = store.find('span', class_='genre').text.strip()
        genres.append(genre)

        # Extract certificate
        certificate = store.find('span', class_='certificate').text if store.find('span', class_='certificate') else 'N/A'
        certificates.append(certificate)

        # Extract description
        description = store.find_all('p', class_='text-muted')[1].text.strip() if len(store.find_all('p', class_='text-muted')) > 1 else 'N/A'
        descriptions.append(description)


        # Extracting directors and stars
        credits = store.find_all('p')[2].text.split('|')
        director = credits[0].strip().replace('Director:', '')
        directors.append(director)

        stars_info = credits[1].strip().replace('Stars:', '') if len(credits) > 1 else 'N/A'
        stars_list = stars_info.split(',')
        stars.append(', '.join([star.strip() for star in stars_list]))



In [5]:
MOVIE_data = pd.DataFrame({'Name of movie': movie_names, 'YEAR': years, 'RATES': ratings, 'METASCORE': metascores,
                            'gross_collection': gross_incomes, 'Votes': votes, 'Watch Time': runtimes,
                            'Genre': genres, 'Certificate': certificates, 'Description': descriptions,
                            'Director': directors, 'Stars': stars})

In [6]:
MOVIE_data

Unnamed: 0,Name of movie,YEAR,RATES,METASCORE,gross_collection,Votes,Watch Time,Genre,Certificate,Description,Director,Stars
0,The Shawshank Redemption,1994,9.3,82,28341469,2806836,142,Drama,A,"Over the course of several years, two convicts...",\nFrank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi..."
1,The Godfather,1972,9.2,100,134966411,1955954,175,"Crime, Drama",A,"Don Vito Corleone, head of a mafia family, dec...",\nFrancis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Diane Ke..."
2,Ramayana: The Legend of Prince Rama,1993,9.2,NAN,13031,13031,135,"Animation, Action, Adventure",U,An anime adaptation of the Hindu epic the Rama...,"Directors:\nRam Mohan, \nYûgô Sakô, \nKoichi S...","Arun Govil, Nikhil Kapoor, Edie Mirman, Rael P..."
3,Hababam Sinifi,1975,9.2,NAN,42248,42248,87,"Comedy, Drama",,"Lazy, uneducated students share a very close b...",\nErtem Egilmez,"Kemal Sunal, Münir Özkul, Halit Akçatepe, Tari..."
4,The Dark Knight,2008,9.0,84,534858444,2788568,152,"Action, Crime, Drama",UA,When the menace known as the Joker wreaks havo...,\nChristopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Madea's Witness Protection,2012,4.9,42,65653242,11208,114,"Comedy, Crime, Drama",PG-13,A Wall Street investment banker who has been s...,\nTyler Perry,"Tyler Perry, Eugene Levy, Denise Richards, Tom..."
9996,Rise,2007,4.9,NAN,106398,12404,97,"Action, Crime, Horror",R,A reporter wakes up in a morgue and finds hers...,\nSebastian Gutierrez,"Lucy Liu, Michael Chiklis, Carla Gugino, Rober..."
9997,Le divorce,2003,4.9,51,9074550,11901,117,"Comedy, Drama, Romance",PG-13,French vs. American social customs and behavio...,\nJames Ivory,"Kate Hudson, Naomi Watts, Stockard Channing, J..."
9998,The Ten,2007,4.9,50,769726,16889,96,"Comedy, Romance",R,"10 stories, each inspired by one of the 10 Com...",\nDavid Wain,"Paul Rudd, Jessica Alba, Winona Ryder, Adam Brody"


In [8]:
MOVIE_data.to_csv("imdb_10000_movies.csv", index=False)