In [2]:
# ISSUES TO TROUBLESHOOT

# 1: not returning all links -- only parent ones in this case:
# wiki_scrape("Alba Flores")
# may be because there is a parent listed with no link so then skips the relatives

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import wikipediaapi as wiki

In [4]:
# Parent and Relative strings to search for in wikipedia infobox
parent_pattern = re.compile('Parent|Relative')

# url base
wiki_url_base = "https://en.wikipedia.org" # wikipedia
imdb_base_url = "https://www.imdb.com"

# wikipedia language setting
wiki_wiki = wiki.Wikipedia('en')

# imdb links
# Most Popular TV Shows as determined by IMDb Users
url_pop_tv = "https://www.imdb.com/chart/tvmeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=DK1W37ZH61RXZP184X95&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=toptv&ref_=chttvtp_ql_5"

# Most Popular Movies as determined by IMDb Users
url_pop_mov = "https://www.imdb.com/chart/moviemeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=408FM1WEANBKDGP51PET&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_ql_2"

# Top Box Office (US) - updated weekly
url_box_office = "https://www.imdb.com/chart/boxoffice?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=KK2KJN75YN0PGF8Q79QK&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=moviemeter&ref_=chtmvm_ql_1"

# IMDb Top 250 TV Shows - IMDb Top 250 as rated by regular IMDb voters
url_top_tv = "https://www.imdb.com/chart/toptv?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=075QRGJ32Y0624A92Y2E&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_ql_6"

# IMDb Top 250 Movies - IMDb Top 250 as rated by regular IMDb voters
url_top_mov = "https://www.imdb.com/chart/top?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=YSCMHSWYXMFCVH1D74MH&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=tvmeter&ref_=chttvm_ql_3"


In [29]:
# given name of person, outputs parent links or false if none
def wiki_scrape(name):
    subject = name.replace(" ", "_")
    url = wiki_url_base + "/wiki/" + subject

    # check if wiki page exists
    wiki_page = wiki_wiki.page(subject)
    does_page_exist = wiki_page.exists()

    if does_page_exist is False :
        return False # no wiki page -> not famous enough, not a nepo baby
    
    elif does_page_exist is True : 
        data = requests.get(url).text
        soup = BeautifulSoup(data,'html.parser') # full page
        
        # check if infobox exists
        infobox = soup.find("table",{"class":"infobox biography vcard"}) # infobox
        
        if infobox is None :
            return False # no infobox on wiki page -> not a nepo baby
        else :
            does_parent_field_exist = bool(re.search("Parent|Relative", infobox.text)) # see if Parent or Relative field is listed in infobox
        
            if does_parent_field_exist is False :
                return False
            elif does_parent_field_exist is True :
                parent_field = soup.find('th', string=parent_pattern).parent
                parent_a_tags = parent_field.find_all('a')
                if len(parent_a_tags) == 0 :
                    return False # parents listed in infobox but not linked -> not a nepo baby
                else :
                    parent_wiki_list = []
                    for link in parent_field.find_all('a'):
                        parent_wiki = link.get('href')
                        parent_wiki_link = wiki_url_base + parent_wiki
                        parent_wiki_list.append(parent_wiki_link)
                        
                        parent_wiki_list[:] = [x for x in parent_wiki_list if "cite_note" not in x] # cited entries are in <a href> tags so remove those links here
    
                return parent_wiki_list


In [6]:
def get_imdb_list_links(url):
    page_data = requests.get(url).text
    soup = BeautifulSoup(page_data,'html.parser')
    
    media_table = soup.find("table", {"class":"chart full-width"}) 
    
    all_links = []
    
    for tag in media_table.select("td:nth-of-type(2) a"):
        all_links.append(imdb_base_url + tag["href"])
    
    df = pd.DataFrame(columns=["link"], data = all_links)
    return df

In [43]:
def top_cast(url):
    page_data =  requests.get(url).text
    soup = BeautifulSoup(page_data,'html.parser')
    
    title = soup.find("h1")
    title = title.text
    
    # grab top cast section of imdb page
    top_cast = soup.find_all("a", {"class": "sc-36c36dd0-1 QSQgP"})
    
    cast_list = []
    
    # get cast names
    for a in top_cast:
        cast_list.append(str(a.string))
        
    return[title, cast_list]

In [54]:
def imdb_whole_shebang(url):
    df = get_imdb_list_links(url)
    df['imdb_info'] = df.apply(lambda row : top_cast(row['link']), axis=1)
    df = pd.DataFrame(df["imdb_info"].to_list(), columns=['title', 'cast'])
    df = df.explode('cast').reset_index(drop=True)
    #df['nepo'] = df.apply(lambda row : wiki_scrape(row['cast']), axis=1)
    return df

In [None]:
pop_df = imdb_whole_shebang(url_pop_tv)

In [8]:
pop_tv_df = get_imdb_list_links(url_pop_tv)
pop_mov_df = get_imdb_list_links(url_pop_mov)
top_tv_df = get_imdb_list_links(url_top_tv)
top_mov_df = get_imdb_list_links(url_top_mov)
box_office_df = get_imdb_list_links(url_box_office)

In [18]:
pop_tv_df["imdb_info"] = pop_tv_df.apply(lambda row : top_cast(row['link']), axis=1)
pop_mov_df["imdb_info"] = pop_mov_df.apply(lambda row : top_cast(row['link']), axis=1)
top_tv_df["imdb_info"] = top_tv_df.apply(lambda row : top_cast(row['link']), axis=1)
top_mov_df["imdb_info"] = top_mov_df.apply(lambda row : top_cast(row['link']), axis=1)
box_office_df["imdb_info"] = box_office_df.apply(lambda row : top_cast(row['link']), axis=1)

KeyError: 'link'

In [11]:
pop_tv_df = pd.DataFrame(pop_tv_df["imdb_info"].to_list(), columns=['title', 'cast'])
pop_mov_df = pd.DataFrame(pop_mov_df["imdb_info"].to_list(), columns=['title', 'cast'])
top_tv_df = pd.DataFrame(top_tv_df["imdb_info"].to_list(), columns=['title', 'cast'])
top_mov_df = pd.DataFrame(top_mov_df["imdb_info"].to_list(), columns=['title', 'cast'])
box_office_df = pd.DataFrame(box_office_df["imdb_info"].to_list(), columns=['title', 'cast'])

In [22]:
pop_tv_df = pop_tv_df.explode('cast').reset_index(drop=True)
pop_mov_df = pop_mov_df.explode('cast').reset_index(drop=True)
top_tv_df = top_tv_df.explode('cast').reset_index(drop=True)
top_mov_df = top_mov_df.explode('cast').reset_index(drop=True)
box_office_df = box_office_df.explode('cast').reset_index(drop=True)

KeyError: 'cast'

In [None]:
url = "https://en.wikipedia.org/wiki/Alba_Flores"

data = requests.get(url).text
soup = BeautifulSoup(data,'html.parser') # full page
        
# check if infobox exists
infobox = soup.find("table",{"class":"infobox biography vcard"}) # infobox

parent_field = soup.find('th', string=parent_pattern).parent
parent_a_tags = parent_field.find_all('a')

print(parent_field)

In [None]:
wiki_scrape("Hailey Bieber")

In [42]:
type(pop_df['cast'][1])

bs4.element.NavigableString

In [45]:
test = top_cast("https://www.imdb.com/title/tt4574334/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=332cb927-0342-42b3-815c-f9124e84021d&pf_rd_r=Q8WYBTJ5QDZDMW9Z3R8E&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=tvmeter&ref_=chttvm_tt_1")