In [1]:
from bs4 import BeautifulSoup
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

url = 'https://www.imdb.com/chart/top'
response = requests.get(url, headers=headers)

soup = BeautifulSoup(response.text, 'html.parser')

In [2]:
import pandas as pd

# Find the list of movie containers
movie_containers = soup.find_all('li', class_='ipc-metadata-list-summary-item')

# Create an empty list to store individual DataFrames
dfs = []

# Loop through each container to extract and append data to the list
for movie_container in movie_containers:
    # Extract data for each movie
    title_element = movie_container.select_one('.ipc-title__text')
    movie_title = title_element.text if title_element else 'N/A'

    release_year_element = movie_container.select_one('.cli-title-metadata-item:nth-of-type(1)')
    release_year = release_year_element.text if release_year_element else 'N/A'

    duration_element = movie_container.select_one('.cli-title-metadata-item:nth-of-type(2)')
    duration = duration_element.text if duration_element else 'N/A'

    rating_element = movie_container.select_one('.cli-title-metadata-item:nth-of-type(3)')
    rating = rating_element.text if rating_element else 'N/A'

    imdb_rating_element = movie_container.find('span', {'class': 'ipc-rating-star--imdb'})
    imdb_rating = imdb_rating_element.text.strip() if imdb_rating_element else 'N/A'

    imdb_votes_element = movie_container.select_one('.ipc-rating-star--voteCount')
    imdb_votes = imdb_votes_element.text if imdb_votes_element else 'N/A'

    # Create a DataFrame for each movie
    movie_df = pd.DataFrame({
        'Movie Title': [movie_title],
        'Release Year': [release_year],
        'Duration': [duration],
        'Rating': [rating],
        'IMDb Rating': [imdb_rating],
        'IMDb Votes': [imdb_votes]
    })

    # Append the DataFrame to the list
    dfs.append(movie_df)

# Concatenate all DataFrames in the list into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

# Display the DataFrame
print(df.head())

# Save the DataFrame to a CSV file
df.to_csv('data/imdb_data.csv', index=False)

                   Movie Title Release Year Duration    Rating IMDb Rating  \
0  1. The Shawshank Redemption         1994   2h 22m         R  9.3 (2.8M)   
1             2. The Godfather         1972   2h 55m         R    9.2 (2M)   
2           3. The Dark Knight         2008   2h 32m     PG-13  9.0 (2.8M)   
3     4. The Godfather Part II         1974   3h 22m         R  9.0 (1.3M)   
4              5. 12 Angry Men         1957   1h 36m  Approved  9.0 (847K)   

  IMDb Votes  
0     (2.8M)  
1       (2M)  
2     (2.8M)  
3     (1.3M)  
4     (847K)  
