In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:
# Load the list of movies to enrich; later cells read its 'id' column to
# build the IMDb title URL for each row.
data = pd.read_csv('movies.csv')

# read_csv already returns a DataFrame, so re-wrapping it in pd.DataFrame()
# is redundant. Take an explicit copy instead: `data` stays as the untouched
# raw input while `df` is the working frame that later cells add columns to.
df = data.copy()

In [None]:
def _find_path(element, *tag_names):
    """Follow a chain of first-descendant lookups, returning None if any step is missing."""
    for tag_name in tag_names:
        if element is None:
            return None
        element = element.find(tag_name)
    return element


def _text_or_na(element, strip=False):
    """Return the element's text (optionally stripped), or 'N/A' when the element is None."""
    if element is None:
        return 'N/A'
    return element.text.strip() if strip else element.text


def get_movie_details(movie_id, timeout=30):
    """Scrape selected details for one title from its IMDb page.

    Fetches ``https://www.imdb.com/title/{movie_id}/`` and pulls values out of
    the returned HTML. Every field falls back to ``'N/A'`` (or an empty list
    for the actors) when the corresponding element is not found, so a page
    with a partially different layout never raises from the parsing step.

    Args:
        movie_id: Title identifier interpolated into the IMDb URL.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the caller indefinitely.

    Returns:
        dict with the keys 'Release Date', 'Director', 'Star Actors',
        'Budget', 'Gross US & Canada', 'Gross Worldwide',
        'Number of User Reviews', 'Star Rating' and 'Vote Count'.
        'Star Actors' is a list of strings; all other values are strings.

    Raises:
        Whatever ``requests.get`` raises (connection errors, timeouts, ...).
        The HTTP status code is not checked; an error page is simply parsed
        like any other page.
    """
    url = f"https://www.imdb.com/title/{movie_id}/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Release date: first <a> inside the first <li> of the first <ul> of the
    # release-date item. Any missing link in that chain yields 'N/A'.
    details_section = soup.find("div", {"data-testid": "title-details-section"})
    release_date_element = None
    if details_section is not None:
        release_date_element = details_section.find("li", {"data-testid": "title-details-releasedate"})
    release_date = _text_or_na(_find_path(release_date_element, 'ul', 'li', 'a'), strip=True)

    # Director: first principal-credit item carrying exactly these classes.
    director_element = soup.find('li', {
        'class': 'ipc-metadata-list__item ipc-metadata-list__item--align-end',
        'data-testid': 'title-pc-principal-credit'
    })
    director_link = None
    if director_element is not None:
        director_link = director_element.find('a', class_='ipc-metadata-list-item__list-content-item')
    director_name = _text_or_na(director_link, strip=True)

    # Star actors: the principal-credit item that additionally has the
    # "--link" class; every matching anchor inside it is one actor.
    stars_element = soup.find("li", {
        "class": "ipc-metadata-list__item ipc-metadata-list__item--align-end ipc-metadata-list-item--link",
        "data-testid": "title-pc-principal-credit"
    })
    star_actors = []
    if stars_element is not None:
        star_links = stars_element.find_all('a', class_='ipc-metadata-list-item__list-content-item--link')
        star_actors = [star_link.text.strip() for star_link in star_links]

    # Box office figures: each value lives in a <span> inside its own <li>.
    box_office_section = soup.find('div', {'data-testid': 'title-boxoffice-section'})

    def box_office_value(testid):
        """Text of one box-office entry, or 'N/A' if the section, item or span is absent."""
        if box_office_section is None:
            return 'N/A'
        item = box_office_section.find('li', {'data-testid': testid})
        if item is None:
            return 'N/A'
        return _text_or_na(item.find('span', class_='ipc-metadata-list-item__list-content-item'))

    budget_value = box_office_value('title-boxoffice-budget')
    gross_us_canada_value = box_office_value('title-boxoffice-grossdomestic')
    gross_worldwide_value = box_office_value('title-boxoffice-cumulativeworldwidegross')

    # Number of user reviews: first <span class="score"> on the page.
    user_reviews = _text_or_na(soup.find('span', class_='score'))

    # Star rating.
    star_rating = _text_or_na(soup.find('span', class_='ipc-rating-star--rating'))

    # Vote count.
    # NOTE(review): this class name looks build-generated and may change
    # between site deployments -- confirm it still matches the live page.
    vote_count = _text_or_na(soup.find('div', class_='sc-d541859f-3 dwhNqC'))

    return {
        'Release Date': release_date,
        'Director': director_name,
        'Star Actors': star_actors,
        'Budget': budget_value,
        'Gross US & Canada': gross_us_canada_value,
        'Gross Worldwide': gross_worldwide_value,
        'Number of User Reviews': user_reviews,
        'Star Rating': star_rating,
        'Vote Count': vote_count
    }

In [None]:
# Columns filled in from get_movie_details(); the names match the keys of the
# dict it returns.
DETAIL_COLUMNS = [
    'Release Date',
    'Director',
    'Star Actors',
    'Budget',
    'Gross US & Canada',
    'Gross Worldwide',
    'Number of User Reviews',
    'Star Rating',
    'Vote Count',
]

# Pause between consecutive page requests so the site is not hit in a tight
# loop; tune as needed.
REQUEST_DELAY_SECONDS = 0.5

# Start every detail column at 'N/A' so rows whose request fails keep a
# well-defined placeholder.
for column in DETAIL_COLUMNS:
    df[column] = 'N/A'

total_movies = len(df)

# enumerate() supplies the 1-based progress counter, so the message stays
# correct even when the DataFrame index is not 0..n-1.
for position, (index, movie_id) in enumerate(df['id'].items(), start=1):
    try:
        movie_details = get_movie_details(movie_id)
    except requests.RequestException as error:
        # A single network failure should not throw away the whole run;
        # report it and leave this row's placeholders in place.
        print(f"Failed {position}/{total_movies}: {movie_id} ({error})")
    else:
        for column in DETAIL_COLUMNS:
            # .at writes one cell at a time, which lets the 'Star Actors'
            # list be stored as a single value.
            df.at[index, column] = movie_details[column]
        print(f"Processed {position}/{total_movies}: {movie_id}")

    time.sleep(REQUEST_DELAY_SECONDS)

In [None]:
# Write the enriched table to disk; index=False leaves the row index out of
# the file so only the data columns are saved.
df.to_csv(path_or_buf='movie_data.csv', index=False)