In [76]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import random

# Pick one browser User-Agent string at random for the request headers, to
# reduce the chance of the website blocking the scraper.
# NOTE: random.choice runs once, when this statement executes, so every request
# that passes `headers` sends the same User-Agent.
headers = {
    'User-Agent': random.choice([
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
    ])
}

# Step 1: 從 Wikipedia 獲取電影名稱與 Rotten Tomatoes 連結
def get_movie_urls_from_wikipedia(url, timeout=10):
    """Collect Rotten Tomatoes movie links from a Wikipedia page.

    Args:
        url: Address of the Wikipedia page to scan.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on a stalled connection.

    Returns:
        A ``(movie_titles, movie_urls)`` tuple of equally long lists holding
        the stripped anchor text and the ``href`` of every link whose target
        contains ``rottentomatoes.com/m/``. Both lists are empty when the
        page cannot be fetched.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Network failures are reported the same way as a bad status code.
        print(f"無法連接到 Wikipedia 頁面, 錯誤: {exc}")
        return [], []

    if response.status_code != 200:
        print(f"無法連接到 Wikipedia 頁面, 狀態碼: {response.status_code}")
        return [], []

    soup = BeautifulSoup(response.text, 'html.parser')
    movie_titles = []
    movie_urls = []

    for link in soup.find_all('a', href=True):
        href = link['href']
        if "rottentomatoes.com/m/" in href:
            movie_urls.append(href)
            movie_titles.append(link.text.strip())

    return movie_titles, movie_urls

# Step 2: 將結果存入 CSV 文件
def save_to_csv(movie_titles, movie_urls, output_file):
    """Write movie titles and their Rotten Tomatoes URLs to a CSV file.

    Args:
        movie_titles: Values for the ``Movie Title`` column.
        movie_urls: Values for the ``Rotten Tomatoes URL`` column.
        output_file: Path of the CSV file to create.
    """
    columns = {
        'Movie Title': movie_titles,
        'Rotten Tomatoes URL': movie_urls,
    }

    # utf-8-sig writes a byte-order mark at the start of the file
    # (presumably so spreadsheet tools detect the encoding).
    pd.DataFrame(columns).to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"資料已保存到 {output_file}")

# Run the pipeline
wikipedia_url = "https://en.wikipedia.org/wiki/List_of_films_with_a_0%25_rating_on_Rotten_Tomatoes"
output_directory = r"C:\Users\User\Desktop\學\大學\python"  # change this to your own output folder
output_file = os.path.join(output_directory, "movie_titles_and_urls.csv")

# Fetch the movie titles and links
movie_titles, movie_urls = get_movie_urls_from_wikipedia(wikipedia_url)

# Write the CSV only when both lists are non-empty
if not movie_titles or not movie_urls:
    print("未能提取到任何電影資料")
else:
    save_to_csv(movie_titles, movie_urls, output_file)


資料已保存到 C:\Users\User\Desktop\學\大學\python\movie_titles_and_urls.csv


In [80]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import json
import random
from datetime import datetime

# Pick one browser User-Agent string at random for the request headers, to
# reduce the chance of the website blocking the scraper.
# NOTE: random.choice runs once, when this statement executes, so every request
# that passes `headers` sends the same User-Agent.
headers = {
    'User-Agent': random.choice([
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
    ])
}

# Step 1: 爬取導演名稱與導演專業連結
def get_director_info_from_rottentomatoes(movie_url, timeout=10):
    """Fetch a movie page and describe each director listed in its JSON-LD.

    Args:
        movie_url: Address of the Rotten Tomatoes movie page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one dict per director (keys ``Movie Title``,
        ``Director Name``, ``Director URL``, ``Birthday``, ``Birthplace``,
        ``Highest Rated``, ``Lowest Rated``, ``Age``), or ``None`` when the
        page cannot be fetched or carries no usable JSON-LD block.
    """
    try:
        response = requests.get(movie_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"無法連接到電影頁面: {movie_url}, 錯誤: {exc}")
        return None

    if response.status_code != 200:
        print(f"無法連接到電影頁面: {movie_url}, 狀態碼: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Locate the JSON-LD metadata block
    script_tag = soup.find('script', type='application/ld+json')
    if not script_tag:
        print(f"未找到 JSON-LD 數據: {movie_url}")
        return None

    # .string is None when the tag has no single text child; json.loads would
    # raise TypeError on it, so fall back to an empty string (invalid JSON).
    try:
        movie_data = json.loads(script_tag.string or '')
    except json.JSONDecodeError as exc:
        print(f"無法解析 JSON-LD 數據: {movie_url}, 錯誤: {exc}")
        return None
    if not isinstance(movie_data, dict):
        print(f"未找到 JSON-LD 數據: {movie_url}")
        return None

    # JSON-LD may give a single object where a list is expected; iterating a
    # dict directly would yield its keys instead of director entries.
    directors = movie_data.get('director') or []
    if isinstance(directors, dict):
        directors = [directors]

    movie_title = movie_data.get('name', 'Unknown Movie')
    directors_info = []

    for director in directors:
        if not isinstance(director, dict):
            continue
        director_name = director.get('name', 'Unknown Director')
        director_url = director.get('sameAs', 'Unknown URL')

        # Only fetch the biography for a real URL: passing the placeholder
        # 'Unknown URL' to requests raises an exception.
        has_url = isinstance(director_url, str) and director_url.startswith(('http://', 'https://'))
        director_bio_info = get_director_bio_info(director_url) if has_url else None
        if not director_bio_info:
            director_bio_info = ('N/A',) * 5
        director_birthday, director_birthplace, highest_rated, lowest_rated, age = director_bio_info

        directors_info.append({
            'Movie Title': movie_title,
            'Director Name': director_name,
            'Director URL': director_url,
            'Birthday': director_birthday,
            'Birthplace': director_birthplace,
            'Highest Rated': highest_rated,
            'Lowest Rated': lowest_rated,
            'Age': age
        })

    return directors_info

# Step 2: 爬取導演的生日、出生地、最高與最低分作品
def get_director_bio_info(director_url, timeout=10):
    """Scrape birthday, birthplace and best/worst rated work from a director page.

    Args:
        director_url: Address of the director's page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(birthday, birthplace, highest_rated, lowest_rated, age)`` tuple in
        which every missing value is the string ``'N/A'``, or ``None`` when
        the page cannot be fetched (network error, invalid URL or non-200
        status).
    """
    try:
        response = requests.get(director_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Covers invalid URLs as well as connection failures, so one bad
        # director link does not abort the whole run.
        print(f"無法連接到導演頁面: {director_url}, 錯誤: {exc}")
        return None

    if response.status_code != 200:
        print(f"無法連接到導演頁面: {director_url}, 狀態碼: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    def labelled_text(data_qa, label):
        # Text of the <p data-qa=...> element with its leading label removed.
        tag = soup.find('p', {'data-qa': data_qa})
        if not tag:
            return 'N/A'
        return tag.text.strip().replace(label, "").strip()

    def linked_title(data_qa):
        # Text of the first link inside the <p data-qa=...> element.
        tag = soup.find('p', {'data-qa': data_qa})
        link = tag.find('a') if tag else None
        return link.text.strip() if link else 'N/A'

    birthday = labelled_text('celebrity-bio-bday', "Birthday:")
    # Age is derived from the birthday text when one was found
    age = calculate_age(birthday) if birthday != 'N/A' else 'N/A'
    birthplace = labelled_text('celebrity-bio-birthplace', "Birthplace:")
    highest_rated = linked_title('celebrity-bio-highest-rated')
    lowest_rated = linked_title('celebrity-bio-lowest-rated')

    return birthday, birthplace, highest_rated, lowest_rated, age

# 計算年齡
def calculate_age(birthday):
    """Return the age in whole years for a birthday such as ``"Jul 6, 1946"``.

    Args:
        birthday: Date text in ``"%b %d, %Y"`` format.

    Returns:
        The number of completed years as of today, or the string ``'N/A'``
        when the text does not match the expected format.
    """
    try:
        born = datetime.strptime(birthday, "%b %d, %Y")
    except ValueError:
        return 'N/A'

    now = datetime.now()
    years = now.year - born.year
    # One year less while this year's birthday is still ahead.
    if (now.month, now.day) < (born.month, born.day):
        years -= 1
    return years

# Step 3: 從 CSV 中讀取電影 URL，並獲取導演資訊
def extract_director_info_from_csv(input_file, output_file):
    """Look up the directors of every movie in a CSV and save them to a new CSV.

    Args:
        input_file: CSV with ``Movie Title`` and ``Rotten Tomatoes URL`` columns.
        output_file: Path of the CSV that receives one row per director.
    """
    movies = pd.read_csv(input_file)
    collected = []

    for _, record in movies.iterrows():
        title = record['Movie Title']
        url = record['Rotten Tomatoes URL']
        print(f"正在處理電影: {title}")

        found = get_director_info_from_rottentomatoes(url)
        # Movies whose lookup yielded nothing contribute no rows.
        if not found:
            continue
        collected.extend(found)

    # Save the collected director rows to the output CSV
    pd.DataFrame(collected).to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"導演資料已保存到 {output_file}")

# Run the pipeline
input_csv_file = r"C:\Users\User\Desktop\學\大學\python\movie_titles_and_urls.csv"  # change this to your input path
output_csv_file = r"C:\Users\User\Desktop\學\大學\python\directors_info.csv"  # change this to your output path

# Extract the director information and save it
extract_director_info_from_csv(input_csv_file, output_csv_file)


正在處理電影: "Staying Alive"
正在處理電影: "Bolero (1984)"
正在處理電影: "Police Academy 4: Citizens on Patrol"
正在處理電影: "Problem Child (1990)"
正在處理電影: "Highlander 2: The Quickening (1991)"
正在處理電影: "Return to the Blue Lagoon (1991)"
正在處理電影: "Folks! (1992)"
正在處理電影: "Look Who's Talking Now (1993)"
正在處理電影: "Wagons East! (1994)"
正在處理電影: "Simon Sez (1999)"
正在處理電影: "3 Strikes (2000)"
正在處理電影: "Ballistic: Ecks vs. Sever (2002)"
正在處理電影: "Killing Me Softly"
正在處理電影: "Merci Docteur Rey (2002)"
正在處理電影: "Pinocchio (2002)"
正在處理電影: "Derailed"
正在處理電影: "National Lampoon's Gold Diggers (2004)"
正在處理電影: "Superbabies: Baby Geniuses 2 (2004)"
正在處理電影: "Constellation"
正在處理電影: "Redline (2007)"
正在處理電影: "Scar (2007)"
正在處理電影: "One Missed Call"
正在處理電影: "Homecoming"
正在處理電影: "Stolen"
正在處理電影: "Transylmania (2009)"
正在處理電影: "The Nutcracker in 3D (2010)"
正在處理電影: "Beneath the Darkness"
正在處理電影: "Dark Tide (2012)"
正在處理電影: "A Thousand Words"
正在處理電影: "Left Behind"
正在處理電影: "The Ridiculous 6 (2015)"
正在處理電影: "Cabin Fever"
正在處理電影: "Dark Crimes (20

In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import random

# Pick one browser User-Agent string at random for the request headers, to
# reduce the chance of being blocked.
# NOTE: random.choice runs once, when this statement executes, so every request
# that passes `headers` sends the same User-Agent.
headers = {
    'User-Agent': random.choice([
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
    ])
}

# 解析 <script> JSON 中的數據
def parse_media_scorecard_json(soup):
    """Extract audience and critics figures from the page's scorecard JSON.

    Args:
        soup: Parsed page; it is searched for
            ``<script id="media-scorecard-json" type="application/json">``.

    Returns:
        A dict with the keys ``averageRating``, ``bandedRatingCount``,
        ``likedCount``, ``notLikedCount``, ``reviewCount``, ``score`` (all
        from ``audienceScore``) and ``criticsAverageRating``,
        ``criticsReviewCount``, ``criticsScore`` (from ``criticsScore``); each
        value is ``"NA"`` when absent. An empty dict is returned when the
        script tag is missing or empty, or its content is not a JSON object.
    """
    script_tag = soup.find("script", id="media-scorecard-json", type="application/json")

    # A missing tag, or one whose .string is None/empty, has nothing to parse;
    # json.loads(None) would raise TypeError.
    raw = script_tag.string if script_tag else None
    if not raw:
        return {}

    try:
        json_data = json.loads(raw)
    except json.JSONDecodeError:
        print("Failed to decode JSON data")
        return {}
    if not isinstance(json_data, dict):
        return {}

    def section(key):
        # Treat a missing, null or non-object section as empty.
        value = json_data.get(key)
        return value if isinstance(value, dict) else {}

    audience_score = section("audienceScore")
    critics_score = section("criticsScore")
    return {
        "averageRating": audience_score.get("averageRating", "NA"),
        "bandedRatingCount": audience_score.get("bandedRatingCount", "NA"),
        "likedCount": audience_score.get("likedCount", "NA"),
        "notLikedCount": audience_score.get("notLikedCount", "NA"),
        "reviewCount": audience_score.get("reviewCount", "NA"),
        "score": audience_score.get("score", "NA"),
        "criticsAverageRating": critics_score.get("averageRating", "NA"),
        "criticsReviewCount": critics_score.get("reviewCount", "NA"),
        "criticsScore": critics_score.get("score", "NA")
    }

# 取得導演的作品列表（包含電影與 TV 劇集）
def get_director_filmography(director_url, timeout=10):
    """Scrape the movie and TV filmography tables from a director page.

    Args:
        director_url: Address of the director's page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one dict per table row (keys ``title``, ``year``,
        ``tomatometer``, ``audience_score``, ``box_office``, ``credits``,
        ``category`` plus whatever ``parse_media_scorecard_json`` returned
        for this page). The list is empty when the page cannot be fetched.
    """
    try:
        response = requests.get(director_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Unable to connect to the page: {director_url}, error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Unable to connect to the page: {director_url}, status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): the scorecard is parsed once from this page and the same
    # values are copied into every row below - confirm that is intended.
    scorecard_data = parse_media_scorecard_json(soup)
    filmography = []

    for category, label in [('movies', 'Movie'), ('tv', 'TV Show')]:
        works = soup.select(f'table[data-qa="celebrity-filmography-{category}"] tbody > tr')

        for work in works:
            title = work.get('data-title', 'Title not found')

            tomatometer = work.get('data-tomatometer', '')
            audience_score = work.get('data-audiencescore', '')

            # A raw "0" is ambiguous: the row shows "No Score Yet" when it
            # contains a no-score span, otherwise it is a genuine 0%.
            if "0" in (tomatometer, audience_score):
                has_no_score = work.select_one('span.celebrity-filmography__no-score')
                zero_label = "No Score Yet" if has_no_score else "0%"
                if tomatometer == "0":
                    tomatometer = zero_label
                if audience_score == "0":
                    audience_score = zero_label

            # Box office and year (the appearance year is preferred when present)
            box_office = work.get('data-boxoffice', 'Box office not found')
            year_raw = work.get('data-appearance-year', work.get('data-year', 'Year not found'))
            year = process_year(year_raw)

            # The director's credited role for this work
            credits_cell = work.find('td', class_='celebrity-filmography__credits')
            credits = credits_cell.text.strip() if credits_cell else 'Role not found'

            filmography.append({
                'title': title,
                'year': year,
                'tomatometer': tomatometer,
                'audience_score': audience_score,
                'box_office': box_office,
                'credits': credits,
                'category': label,
                **scorecard_data  # merge in the page-level scorecard values
            })

    return filmography

def process_year(year_raw):
    """Normalise a year value that may be a list, a bracketed range or plain text.

    A list is joined with ``", "``. A string wrapped in square brackets is
    split on ``-`` and returned as ``"start-end"``, where ``end`` repeats
    ``start`` if there is only one part. Any other string is returned as is.
    """
    if isinstance(year_raw, list):
        return ", ".join(year_raw)

    bracketed = year_raw.startswith("[") and year_raw.endswith("]")
    if not bracketed:
        return year_raw

    first, *rest = year_raw.strip("[]").split('-')
    last = rest[0] if rest else first
    return f"{first}-{last}"

# 提取數據並儲存到 CSV
def extract_movies_from_directors(input_file, output_file):
    """Collect the filmography of every director in a CSV and save it to a new CSV.

    Args:
        input_file: CSV with ``Director Name`` and ``Director URL`` columns.
        output_file: Path of the CSV that receives one row per work.
    """
    directors = pd.read_csv(input_file)
    rows = []

    for _, director in directors.iterrows():
        name = director['Director Name']
        url = director['Director URL']
        print(f"Processing director: {name}")

        for work in get_director_filmography(url):
            # Tag each work with the director it was scraped for.
            work['Director Name'] = name
            rows.append(work)

    pd.DataFrame(rows).to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"Film and TV data saved to {output_file}")

# Run the program: read the directors CSV and write their filmographies
# to a new CSV (change both paths to match your machine).
input_csv_file = r"C:\Users\User\Desktop\學\大學\python\directors_info.csv"
output_csv_file = r"C:\Users\User\Desktop\學\大學\python\movies_from_directors.csv"
extract_movies_from_directors(input_csv_file, output_csv_file)


Processing director: Sylvester Stallone
Processing director: John Derek
Processing director: Jim Drake
Processing director: Dennis Dugan
Processing director: Russell Mulcahy
Processing director: William Graham
Processing director: Ted Kotcheff
Processing director: Tom Ropelewski
Processing director: Peter Markle
Processing director: Kevin Alyn Elders
Processing director: DJ Pooh
Processing director: Wych Kaosayananda
Processing director: Chen Kaige
Processing director: Andrew Litvack
Processing director: Roberto Benigni
Processing director: Bob Misiorowski
Processing director: Gary Preisler
Processing director: Bob Clark
Processing director: Jordan Walker-Pearlman
Processing director: Andy Cheng
Processing director: Jed Weintrob
Processing director: Éric Valette
Processing director: Morgan J. Freeman
Processing director: Anders Anderson
Processing director: David Hillenbrand
Processing director: Scott Hillenbrand
Processing director: Andrey Konchalovskiy
Processing director: Martin Gui

In [24]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import random

# Request headers carrying one browser User-Agent string picked at random, to
# reduce the chance of being blocked.
# NOTE: random.choice runs once, when this statement executes, so every request
# that passes `headers` sends the same User-Agent.
headers = {
    'User-Agent': random.choice([
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
    ])
}

def parse_movie_page(url, timeout=10):
    """Scrape scores, synopsis and labelled details from one movie page.

    Args:
        url: Address of the movie page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with ``audience_score``, ``critics_score`` and ``synopsis``
        plus one entry per labelled ``div.category-wrap`` item found on the
        page. An empty dict is returned when the page cannot be fetched or
        parsed.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to fetch movie page {url}: {e}")
        return {}
    if response.status_code != 200:
        print(f"Failed to access {url}, status code: {response.status_code}")
        return {}

    soup = BeautifulSoup(response.text, 'html.parser')

    try:
        # Scores come from the JSON embedded in a <script> tag. A missing tag,
        # or one whose .string is None, is treated as "no scores".
        scorecard_script = soup.find("script", id="media-scorecard-json", type="application/json")
        raw = scorecard_script.string if scorecard_script else None
        scorecard_data = json.loads(raw) if raw else {}

        # Synopsis text
        synopsis_tag = soup.select_one('[data-qa="synopsis-value"]')
        synopsis = synopsis_tag.get_text(strip=True) if synopsis_tag else "Synopsis not found"

        # Labelled details; an item without a label is skipped so that a
        # single odd element does not discard the whole page.
        movie_info = {}
        for item in soup.select('div.category-wrap'):
            label_tag = item.select_one('[data-qa="item-label"]')
            if label_tag is None:
                continue
            values = [i.get_text(strip=True) for i in item.select('[data-qa="item-value"]')]
            movie_info[label_tag.get_text(strip=True)] = ", ".join(values)

        # `or {}` also covers a score section that is an explicit JSON null.
        return {
            'audience_score': (scorecard_data.get("audienceScore") or {}).get("score", "NA"),
            'critics_score': (scorecard_data.get("criticsScore") or {}).get("score", "NA"),
            'synopsis': synopsis,
            **movie_info
        }

    except (AttributeError, KeyError, TypeError, ValueError) as e:
        # Best effort: unexpected page structure or invalid JSON yields no data.
        print(f"Failed to parse movie page {url}: {e}")
        return {}

def get_director_movies(director_url, timeout=10):
    """Retrieve the movies listed in a director's filmography, with page details.

    Args:
        director_url: Address of the director's page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one dict per ``celebrity-filmography-movies-trow`` row,
        holding ``title``, ``year``, ``movie_url`` and whatever
        ``parse_movie_page`` returned for that movie. Rows without a link get
        ``movie_url`` set to ``'Unknown URL'`` and no page details. The list
        is empty when the director page cannot be fetched.
    """
    # Local import so the block does not depend on the cell's import list.
    from urllib.parse import urljoin

    try:
        response = requests.get(director_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to access {director_url}: {exc}")
        return []
    if response.status_code != 200:
        print(f"Failed to access {director_url}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    movies = []

    for work in soup.select('[data-qa="celebrity-filmography-movies-trow"]'):
        title = work.get('data-title', 'Unknown title')
        year = work.get('data-year', 'Unknown year')

        link = work.select_one('a[href]')
        if link is None:
            # No link in this row: keep the listing but skip the page scrape.
            movies.append({'title': title, 'year': year, 'movie_url': 'Unknown URL'})
            continue

        # urljoin handles root-relative, relative and already-absolute hrefs.
        movie_url = urljoin("https://www.rottentomatoes.com", link['href'])

        movie_data = parse_movie_page(movie_url)
        movies.append({'title': title, 'year': year, 'movie_url': movie_url, **movie_data})

    return movies

def extract_all_director_data(input_file, output_file):
    """Gather movie details for every director in a CSV and write them to a new CSV.

    Args:
        input_file: CSV with ``Director Name`` and ``Director URL`` columns.
        output_file: Path of the CSV that receives one row per movie.
    """
    directors = pd.read_csv(input_file)
    collected = []

    for _, director in directors.iterrows():
        url = director['Director URL']
        name = director['Director Name']
        print(f"Scraping data for director: {name}")

        for entry in get_director_movies(url):
            # Tag each movie with the director it was scraped for.
            entry['Director Name'] = name
            collected.append(entry)

    pd.DataFrame(collected).to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"Data saved to {output_file}")

# Usage: read the directors CSV and write the per-movie details to a new CSV
# (change both paths to match your machine).
input_csv_file = r"C:\Users\User\Desktop\學\大學\python\directors_info.csv"
output_csv_file = r"C:\Users\User\Desktop\學\大學\python\movies_details.csv"
extract_all_director_data(input_csv_file, output_csv_file)


Scraping data for director: Sylvester Stallone
Scraping data for director: John Derek
Scraping data for director: Jim Drake
Scraping data for director: Dennis Dugan
Scraping data for director: Russell Mulcahy
Scraping data for director: William Graham
Scraping data for director: Ted Kotcheff
Scraping data for director: Tom Ropelewski
Scraping data for director: Peter Markle
Scraping data for director: Kevin Alyn Elders
Scraping data for director: DJ Pooh
Scraping data for director: Wych Kaosayananda
Scraping data for director: Chen Kaige
Scraping data for director: Andrew Litvack
Scraping data for director: Roberto Benigni
Scraping data for director: Bob Misiorowski
Scraping data for director: Gary Preisler
Scraping data for director: Bob Clark
Scraping data for director: Jordan Walker-Pearlman
Scraping data for director: Andy Cheng
Scraping data for director: Jed Weintrob
Scraping data for director: Éric Valette
Scraping data for director: Morgan J. Freeman
Scraping data for director: