In [110]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib.parse

In [111]:
def souper(one_link, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    Args:
        one_link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would freeze the whole notebook run.

    Returns:
        BeautifulSoup: The response body parsed with the ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    page = requests.get(one_link, timeout=timeout)
    # The HTTP status is intentionally not checked: callers parse whatever
    # body came back and fall back to defaults when elements are missing.
    return BeautifulSoup(page.content, "lxml")

In [112]:
def get_person_page(person_query_name):
    """Find a person's profile URL on themoviedb.org via the site search.

    The search results page is fetched for the given name and only the first
    ``<p class="name">`` entry is considered. It is accepted when its link
    text equals the query, ignoring case.

    Args:
        person_query_name: Name to search for, e.g. ``"Leonardo DiCaprio"``.

    Returns:
        str | None: Absolute URL of the matching person page, or ``None``
        (after printing ``"No Page Found"``) when the search has no result
        entry, the entry has no link, or the name does not match.
    """
    encoded_person_name = urllib.parse.quote(person_query_name)
    search_page = f"https://www.themoviedb.org/search?query={encoded_person_name}"
    search_soup = souper(search_page)

    name = search_soup.find('p', {'class': 'name'})
    # `find` yields None when nothing matches, and `.a` is None when the
    # entry has no anchor; treat both as "not found" instead of crashing.
    link = name.a if name is not None else None
    if link is not None and link.text.lower() == person_query_name.lower():
        return f"https://www.themoviedb.org{link['href']}"

    print("No Page Found")
    return None

In [114]:
def find_works (person_page):
    """Collect the movie links credited to a person, grouped by role.

    The person page is expected to contain a ``<div class="credits_list">``
    holding one ``<h3>`` heading per role and one
    ``<table class="card credits">`` per role; headings and tables are paired
    by position.

    Args:
        person_page: URL of the person's page.

    Returns:
        dict[str, list[str]]: Maps each role heading text to the absolute URLs
        of its works. Links whose path starts with ``/tv`` are skipped, and
        every URL is reported only once, under the first role in which it
        appears. An empty dict is returned when the page has no credits list.
    """
    per_soup = souper(person_page)
    credits_list = per_soup.find("div", {'class': 'credits_list'})
    if credits_list is None:
        return {}

    roles = [heading.text for heading in credits_list.find_all('h3')]
    parts = credits_list.find_all("table", {'class': 'card credits'})

    unique_works_links = set()
    roles_links = {}
    for role, table in zip(roles, parts):
        group_links = []
        for work in table.find_all('a', {'class': 'tooltip'}):
            work_href = work.get('href')
            # Skip TV entries (and anchors without a target) outright. The
            # previous code fell through and re-checked a stale `url`, which
            # was unbound when the very first link was a TV link.
            if not work_href or work_href.startswith('/tv'):
                continue
            url = f"https://www.themoviedb.org{work_href}"
            if url not in unique_works_links:
                unique_works_links.add(url)
                group_links.append(url)
        roles_links[role] = group_links
    return roles_links

In [115]:
def _text_or_default(extract, default):
    """Return ``extract()``, or ``default`` when a page element is missing.

    A lookup that finds nothing yields ``None``, so the chained attribute
    access inside ``extract`` raises ``AttributeError``; that is the only
    error translated into the default value.
    """
    try:
        return extract()
    except AttributeError:
        return default


def movie_datar(movie_page):
    """Scrape the summary details of one movie page into a one-row DataFrame.

    Args:
        movie_page: URL of the movie's page.

    Returns:
        pandas.DataFrame: A single row (index 0) with the columns ``Name``,
        ``Release Year``, ``Runtime``, ``Age Rating``, ``Director``, ``Score``,
        ``Genres``, ``Status``, ``Language``, ``Budget`` and ``Revenue``.
        Header fields that cannot be found fall back to ``'-'`` (``''`` for
        the name); fact fields that cannot be found fall back to ``''``.
    """
    movie_src = souper(movie_page)
    movie_header = movie_src.find("section", {'class': 'header'})

    movie_name = _text_or_default(lambda: movie_header.a.text, '')
    release_year = _text_or_default(
        lambda: movie_header.find("span", {'class': 'release_date'}).text.strip('()'),
        '-',
    )
    movie_runtime = _text_or_default(
        lambda: movie_header.find("span", {'class': 'runtime'}).text.strip(),
        '-',
    )
    movie_age = _text_or_default(
        lambda: movie_header.find("span", {'class': 'certification'}).text.strip(),
        '-',
    )
    # NOTE(review): this takes the name next to the FIRST `p.character` entry
    # in the header; presumably that is the director, but the page may list a
    # writer or creator first -- confirm against the role text if it matters.
    director = _text_or_default(
        lambda: movie_header.find('p', {'class': 'character'}).parent.p.a.text,
        '-',
    )
    movie_score = _text_or_default(
        lambda: movie_header.find("div", {'class': 'user_score_chart'}).get('data-percent'),
        '-',
    )
    movie_genres = _text_or_default(
        lambda: ', '.join(
            genre.text
            for genre in movie_header.find("span", {'class': 'genres'}).find_all('a')
        ),
        '-',
    )

    # Facts are read by position: status, language, budget, revenue. Fewer
    # than four entries simply leave the remaining fields blank instead of
    # raising IndexError.
    facts = ['', '', '', '']
    facts_columns = movie_src.find("section", {'class': 'facts left_column'})
    if facts_columns is not None:
        for position, fact in enumerate(facts_columns.find_all('p')[:4]):
            # Drop the <bdi> label so only the value text remains; a fact
            # without a label is kept as-is rather than discarding all facts.
            if fact.bdi is not None:
                fact.bdi.decompose()
            facts[position] = fact.text.strip()
    status, language, budget, revenue = facts

    data_temp = {
        'Name': movie_name,
        'Release Year': release_year,
        'Runtime': movie_runtime,
        'Age Rating': movie_age,
        'Director': director,
        'Score': movie_score,
        'Genres': movie_genres,
        'Status': status,
        'Language': language,
        'Budget': budget,
        'Revenue': revenue
    }
    movie_data = pd.DataFrame(data_temp, index=[0])
    return movie_data

In [116]:
def main (person_name):
    """Build one table with the details of every movie credited to a person.

    Args:
        person_name: Name of the person to look up.

    Returns:
        pandas.DataFrame | None: One row per scraped movie, with the columns
        produced by ``movie_datar`` plus a ``Role`` column holding the credit
        group the movie was listed under. ``None`` when the person page could
        not be found or no movie links were collected.
    """
    person_page = get_person_page(person_name)
    # get_person_page signals "not found" with None; stop here rather than
    # handing None to find_works, which would fail trying to fetch it.
    if person_page is None:
        return None

    movies = find_works(person_page)
    all_movie_data = []
    for role, links in movies.items():
        for link in links:
            movie_data = movie_datar(link)
            movie_data['Role'] = role
            all_movie_data.append(movie_data)

    if not all_movie_data:
        return None
    # Stack the one-row frames into a single table with a fresh 0..n-1 index.
    return pd.concat(all_movie_data, ignore_index=True)


In [119]:
# Example run: scrape every movie credited to one person. This issues one
# HTTP request per movie link, so it can take a while.
person_name = "Leonardo DiCaprio"
query = main(person_name)
# Bare last expression so the notebook renders the result.
query

Unnamed: 0,Name,Release Year,Runtime,Age Rating,Director,Score,Genres,Status,Language,Budget,Revenue,Role
0,Devil in the White City,-,-,-,Martin Scorsese,0,"Crime, Thriller",مخطط,الإنجليزية,-,-,التمثيل
1,Roosevelt,-,-,-,Martin Scorsese,0,"Drama, History",في الانتاج,الإنجليزية,-,-,التمثيل
2,The Black Hand,-,-,-,Stephan Talty,0,Drama,يُشاع,الإنجليزية,-,-,التمثيل
3,Sinatra,-,-,-,Martin Scorsese,0,"Music, Drama",مخطط,الإنجليزية,-,-,التمثيل
4,The Wager,-,-,-,Martin Scorsese,0,"Drama, History",مخطط,الإنجليزية,-,-,التمثيل
...,...,...,...,...,...,...,...,...,...,...,...,...
109,The Ides of March,2011,1h 41m,R,George Clooney,67,Drama,تم عرضه,الإنجليزية,"$12,500,000.00","$76,338,111.00",إنتاج
110,Red Riding Hood,2011,1h 40m,PG-13,Catherine Hardwicke,60,"Thriller, Drama, Fantasy, Mystery, Horror",تم عرضه,الإنجليزية,"$42,000,000.00","$89,162,162.00",إنتاج
111,Orphan,2009,2h 2m,12,Jaume Collet-Serra,70,"Horror, Thriller, Mystery",تم عرضه,الإنجليزية,"$20,000,000.00","$77,912,251.00",إنتاج
112,Gardener of Eden,2007,1h 28m,R,Kevin Connolly,54,"Comedy, Drama",تم عرضه,الإنجليزية,-,-,إنتاج


In [132]:
def cast_datar (cast_link,movie_name):
    """Scrape a cast page into a table of actor/character pairs.

    Args:
        cast_link: URL of the page holding the ``<ol class="people">`` list.
        movie_name: Title written into the ``movie`` column of every row.

    Returns:
        pandas.DataFrame: One row per ``<li>`` in the list, with the columns
        ``actor``, ``character`` and ``movie``.

    Raises:
        AttributeError: If the list, or an expected element inside one of its
            entries, is missing from the page.
    """
    people_list = souper(cast_link).find("ol", {'class': 'people'})
    rows = [
        {
            'actor': entry.find("div", {'class': 'info'}).p.a.text,
            'character': entry.find("p", {'class': 'character'}).text.strip(),
            'movie': movie_name,
        }
        for entry in people_list.find_all("li")
    ]
    return pd.DataFrame(rows)

In [133]:
# Example run: build the cast table from a movie's "/cast" page; the title
# passed as the second argument fills the `movie` column.
movie_page = 'https://www.themoviedb.org/movie/634649-spider-man-no-way-home/cast'
NWH_cast = cast_datar(movie_page,"Spider-Man: No Way Home")

In [134]:
# Bare expression so the notebook renders the cast table built in the previous cell.
NWH_cast

Unnamed: 0,actor,character,movie
0,Tom Holland,Peter Parker / Spider-Man,Spider-Man: No Way Home
1,Zendaya,MJ,Spider-Man: No Way Home
2,Benedict Cumberbatch,Doctor Strange,Spider-Man: No Way Home
3,Jacob Batalon,Ned Leeds,Spider-Man: No Way Home
4,Jamie Foxx,Max Dillon / Electro,Spider-Man: No Way Home
...,...,...,...
57,Jay Karales,Bodega Customer (uncredited),Spider-Man: No Way Home
58,Gina Aponte,Media Reporter (uncredited),Spider-Man: No Way Home
59,John Barnes,School Protester (uncredited),Spider-Man: No Way Home
60,Harry Holland,Shaker Kid (uncredited),Spider-Man: No Way Home
