In [19]:
import requests
from bs4 import BeautifulSoup

def get_movie_details(url):
    """
    Scrapes movie details from the provided IMDb URL.

    Two pages are fetched: the movie page at ``url`` and the "full credits"
    page linked from it. Individual fields that cannot be found or parsed
    are reported as ``None`` (or an empty list/string) instead of raising.

    Args:
        url (str): The URL of the IMDb movie page.

    Returns:
        dict: An empty dictionary if either page cannot be retrieved or the
        full-credits link is missing. Otherwise a dictionary containing the
        following movie details:
            - Title (str | None): The page title with the '- IMDb' suffix
              and surrounding whitespace removed.
            - Year (int | None): The release year of the movie.
            - Genre (str): The genre(s) of the movie, joined with ', '.
            - Rating (float | None): The IMDb rating of the movie.
            - Runtime (int | None): The runtime of the movie in minutes.
            - Certificate (str | None): The certificate/rating of the movie.
            - Directors (list[str]): The movie's directors.
            - Cast (list[str]): The movie's cast members.
            - Description (str | None): The description of the movie.
    """
    movie_details = {}

    # The same browser-like User-Agent and timeout are used for both requests.
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    request_timeout = 30  # seconds; prevents one stalled request from hanging a batch

    try:
        response = requests.get(url, headers=request_headers, timeout=request_timeout)
    except requests.RequestException as error:
        print("Failed to retrieve movie details:", error)
        return movie_details
    if response.status_code != 200:
        print("Failed to retrieve movie details:", response.status_code)
        return movie_details

    soup = BeautifulSoup(response.content, 'html.parser')

    # Title: text of <title> up to the '- IMDb' suffix, without the space
    # that precedes the suffix.
    title_tag = soup.title
    title = title_tag.text.split('- IMDb')[0].strip() if title_tag is not None else None

    # Year: text of the first link pointing at the release-info page.
    year = None
    year_link = soup.find(lambda tag: tag.name == 'a' and 'releaseinfo' in tag.get('href', ''))
    if year_link is not None:
        try:
            year = int(year_link.text.strip())
        except ValueError:
            # Link text was not a plain integer; leave the year unknown.
            year = None

    # Genre: every genre chip link, joined into one comma-separated string.
    genre = ', '.join(
        chip.text.strip()
        for chip in soup.find_all('a', class_="ipc-chip ipc-chip--on-baseAlt")
    )

    # Rating: located through a generated CSS class name.
    # NOTE(review): this class looks build-specific and may change when the
    # site is redeployed -- confirm it still matches.
    rating = None
    rating_span = soup.find('span', class_='sc-bde20123-1 cMEQkK')
    if rating_span is not None:
        try:
            rating = float(rating_span.text)
        except ValueError:
            rating = None

    # Runtime and certificate: parsed from the og:description meta tag when
    # its content has the form '<H>h <M>m | <certificate>'. Both default to
    # None so they are always defined, even when the tag is absent.
    runtime = None
    certificate = None
    meta_tag = soup.find('meta', property='og:description')
    if meta_tag is not None:
        content = meta_tag.get('content')
        if content:
            parts = content.split('|')
            if len(parts) == 2:
                runtime_str, certificate = (part.strip() for part in parts)
                time_parts = runtime_str.split()
                if len(time_parts) == 2:
                    hours, minutes = time_parts
                    try:
                        # Drop the trailing unit letter ('h' / 'm') from each token.
                        runtime = int(hours[:-1]) * 60 + int(minutes[:-1])
                    except ValueError:
                        runtime = None

    # Full credits page: the link on the movie page is site-relative.
    credits_link = soup.find(lambda tag: tag.name == 'a' and 'fullcredits' in tag.get('href', ''))
    if credits_link is None:
        print("Failed to find full credits URL segment")
        return movie_details
    credits_url_segment = credits_link.get('href')

    base_url = "https://www.imdb.com"
    try:
        credits_response = requests.get(
            f'{base_url}{credits_url_segment}',
            headers=request_headers,
            timeout=request_timeout,
        )
    except requests.RequestException as error:
        print("Failed to retrieve full credits page:", error)
        return movie_details
    if credits_response.status_code != 200:
        print("Failed to retrieve full credits page:", credits_response.status_code)
        return movie_details

    credits_soup = BeautifulSoup(credits_response.content, 'html.parser')

    # Directors: names from the first simple credits table on the page.
    # NOTE(review): assumes the first such table is the directors section -- confirm.
    directors = []
    table = credits_soup.find('table', class_='simpleTable simpleCreditsTable')
    if table is not None:
        for row in table.find_all('tr'):
            name_cell = row.find('td', class_='name')
            # Skip rows whose name cell has no link instead of crashing on them.
            if name_cell is not None and name_cell.a is not None:
                directors.append(name_cell.a.text.strip())

    # Cast: linked names from every non-"character" cell of the cast table.
    cast_names = []
    cast_table = credits_soup.find("table", class_="cast_list")
    if cast_table is not None:
        for cell in cast_table.find_all("td", class_=lambda value: value != "character"):
            link = cell.find("a")
            if link is None:
                continue
            name = link.text.strip()
            # Cells such as photo links carry no text; ignore them.
            if name:
                cast_names.append(name)

    # Description: content of <meta name="description">, if present.
    description_tag = soup.find('meta', {'name': 'description'})
    description = description_tag.get('content') if description_tag is not None else None

    # Construct the movie_details dictionary
    movie_details = {
        'Title': title,
        'Year': year,
        'Genre': genre,
        'Rating': rating,
        'Runtime': runtime,
        'Certificate': certificate,
        'Directors': directors,
        'Cast': cast_names,
        'Description': description
    }

    return movie_details


In [36]:
# Read the IMDb URLs collected for this release date, one URL per line.
with open('../movie_links/2006/11/2006-11-10.txt', 'r') as file:
    # Strip whitespace and drop blank lines so an empty row in the file is
    # never passed to the scraper as a URL.
    urls = [line.strip() for line in file if line.strip()]

# Holds the most recently scraped result; stays {} when the file has no URLs.
movie_details = {}
for url in urls:
    movie_details = get_movie_details(url)
    # An empty dict means the scrape failed (the scraper already printed why).
    if movie_details:
        print(f"Processed {movie_details['Title']}")

Processed Beautiful Ohio (2006) 
Processed Hiding Victoria (2006) 
Processed Coffee Date (2006) 
Processed Raising Flagg (2006) 
Processed The Utah Murder Project (2006) 
Processed Maple Palm (2006) 
Processed Indiscretion (2006) 
Processed Nowhere Street (2006) 
Processed Ghost Hunters: Point of Contact (2006) 
Failed to find full credits URL segment
Processed The Gutter Diaries (2006) 


{'Title': 'The Gutter Diaries (2006) ',
 'Year': 2006,
 'Genre': 'Drama',
 'Rating': 6.9,
 'Runtime': None,
 'Certificate': None,
 'Directors': ['Josh Whittall'],
 'Cast': ['Reese Alexander',
  'Pat Alguire',
  'Coltin Argue',
  'Kristina Barr',
  'Sharron Bertchilde',
  'Jenette Caradonna',
  'Claire Carreras',
  'Caroline Chojnacki',
  'Debbie Cragg',
  'John Dadey',
  'Nathan Dashwood',
  'Cheryl Denay',
  'Shade Louis Dietz',
  'Rachel Eaves',
  'Elena Esovolova',
  'Tomoko Hanawa',
  'David Haus',
  'Bryan Jones',
  'Erin Kenning',
  'Arleigh Mainwaring',
  'Duncan Mao',
  'Christina McInulty',
  'Robert Munn',
  'Elena Peradenic',
  'Jane Purcell',
  'Elaine Rathie',
  'Cyril Redillas',
  'Kayja Rethel',
  'Jeff Sarsfield',
  'Kjirsten Sigmund',
  'Ben Smith',
  'Daysi Tattersall',
  'Jody Vaillant',
  'Nigel Vonas',
  'Eric White',
  'Fagin Woodcock'],
 'Description': 'The Gutter Diaries: Directed by Josh Whittall. With Reese Alexander, Pat Alguire, Coltin Argue, Kristina Barr. 