In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import wikipediaapi as wiki

In [2]:
# Regex for the infobox row labels of interest. It is unanchored, so any label
# containing "Parent" or "Relative" (e.g. "Parents", "Relatives") matches.
parent_pattern = re.compile('Parent|Relative')

# Site roots; the relative hrefs scraped from each site are appended to these
# to build absolute URLs.
wiki_url_base = "https://en.wikipedia.org" # wikipedia
imdb_base_url = "https://www.imdb.com"

# wikipediaapi client, used by wiki_scrape to test whether a page exists.
# NOTE(review): 'en' is presumably the language code; newer Wikipedia-API
# releases reportedly take a user_agent as the first argument -- confirm
# against the installed version.
wiki_wiki = wiki.Wikipedia('en')

# IMDb chart pages to scrape.
# NOTE(review): the pf_rd_* / ref_ query parameters look like click-tracking
# values copied from a browser session; presumably the bare /chart/<name> path
# serves the same page -- confirm before trimming.

# Most Popular TV Shows as determined by IMDb Users
url_pop_tv = "https://www.imdb.com/chart/tvmeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=DK1W37ZH61RXZP184X95&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=toptv&ref_=chttvtp_ql_5"

# Most Popular Movies as determined by IMDb Users
url_pop_mov = "https://www.imdb.com/chart/moviemeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=408FM1WEANBKDGP51PET&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_ql_2"

# Top Box Office (US) - updated weekly
url_box_office = "https://www.imdb.com/chart/boxoffice?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=KK2KJN75YN0PGF8Q79QK&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=moviemeter&ref_=chtmvm_ql_1"

In [3]:
def _relative_links_from_infobox(infobox):
    """Collect the links found in the Parent/Relative rows of an infobox.

    Args:
        infobox: BeautifulSoup tag for the infobox table.

    Returns:
        list[str]: absolute Wikipedia URLs in document order. Empty when no
        row label matches ``parent_pattern`` or the matching rows hold no
        usable links.
    """
    links = []
    for header in infobox.find_all('th'):
        # Match on the header's full text so a label wrapped in extra markup
        # still counts.
        if not parent_pattern.search(header.get_text()):
            continue
        # header.parent is the table row holding both the label and its value.
        for anchor in header.parent.find_all('a'):
            href = anchor.get('href')
            # Skip anchors without a target, and footnote markers: cited
            # entries are <a href="#cite_note-..."> tags, not relatives.
            if not href or "cite_note" in href:
                continue
            links.append(wiki_url_base + href)
    return links


# given name of person, outputs parent/relative wiki links, or True if there are none
def wiki_scrape(name):
    """Look up a person on English Wikipedia and return their linked relatives.

    Args:
        name (str): the person's name; spaces are replaced with underscores
            to form the article title.

    Returns:
        list[str] | bool: absolute Wikipedia URLs linked from the infobox's
        Parent/Relative rows when at least one exists (a "nepo baby").
        Otherwise the sentinel ``True`` -- note: ``True``, not ``False`` --
        meaning no page, no biography infobox, no Parent/Relative row, or no
        linked relatives. ``percent_nepo`` relies on this sentinel through
        its ``nepos != True`` filter.

    Raises:
        requests.exceptions.RequestException: if the page download fails or
            times out.
    """
    subject = name.replace(" ", "_")

    # check if wiki page exists
    if not wiki_wiki.page(subject).exists():
        return True # no wiki page -> not famous enough, not a nepo baby

    url = wiki_url_base + "/wiki/" + subject
    # A timeout keeps one stalled connection from hanging the whole scrape.
    data = requests.get(url, timeout=30).text
    soup = BeautifulSoup(data, 'html.parser') # full page

    # CSS class selectors match regardless of class order or extra classes,
    # unlike an exact match on the whole class attribute string.
    infobox = soup.select_one("table.infobox.biography.vcard")
    if infobox is None:
        return True # no infobox on wiki page -> not a nepo baby

    parent_wiki_list = _relative_links_from_infobox(infobox)
    if not parent_wiki_list:
        # no Parent/Relative row, or relatives listed but not linked
        # (citation-only rows included) -> not a nepo baby
        return True

    return parent_wiki_list # nepo baby!


In [24]:
def percent_nepo(df):
    """Compute, per title, the share of cast members flagged as nepo babies.

    Args:
        df (pd.DataFrame): must have a ``title`` column and a ``nepos`` column
            in which ``True`` means "not a nepo baby" and any other value
            (a list of links) means "nepo baby".

    Returns:
        pd.DataFrame: indexed by ``title`` with a single ``nepos`` column
        holding a fraction between 0 and 1 (not a 0-100 percentage). Titles
        with no nepo babies get 0.0.
    """
    df = df[['title', 'nepos']]
    totals = df.groupby('title').count()
    # Titles with no nepo rows are absent from this groupby, so reindex against
    # the full title list and fill with 0; otherwise the division yields NaN
    # for them instead of 0.0.
    nepo_counts = (
        df[df.nepos != True]
        .groupby('title')
        .count()
        .reindex(totals.index, fill_value=0)
    )
    return nepo_counts / totals

In [4]:
def get_imdb_list_links(url):
    """Collect the links listed in an IMDb chart table.

    Args:
        url (str): address of an IMDb chart page.

    Returns:
        pd.DataFrame: one ``link`` column of absolute IMDb URLs with
        duplicates dropped. The index is not reset, so it keeps the positions
        of the first occurrences.

    Raises:
        ValueError: if the page has no ``chart full-width`` table.
        requests.exceptions.RequestException: if the download fails or times
            out.
    """
    # A timeout keeps a stalled connection from hanging the notebook.
    page_data = requests.get(url, timeout=30).text
    soup = BeautifulSoup(page_data, 'html.parser')

    media_table = soup.find("table", {"class": "chart full-width"})
    if media_table is None:
        # Fail with the URL in hand rather than an AttributeError on None.
        raise ValueError("no 'chart full-width' table found at {}".format(url))

    # a[href] skips anchors that carry no link target.
    all_links = [imdb_base_url + tag["href"] for tag in media_table.select("a[href]")]

    # media is linked twice in each row, so repeats are dropped here
    # tried select("td:nth-of-type(2) a") instead but that didn't work for all imdb links
    df = pd.DataFrame(columns=["link"], data=all_links).drop_duplicates()

    return df

In [5]:
def top_cast(url):
    """Scrape an IMDb title page for its heading and cast names.

    Args:
        url (str): address of an IMDb title page.

    Returns:
        list: ``[title, cast_list]`` where ``title`` is the text of the page's
        first ``<h1>`` and ``cast_list`` is a list of name strings (possibly
        empty).

    Raises:
        ValueError: if the page has no ``<h1>`` element.
        requests.exceptions.RequestException: if the download fails or times
            out.
    """
    # A timeout keeps a stalled connection from hanging the notebook.
    page_data = requests.get(url, timeout=30).text
    soup = BeautifulSoup(page_data, 'html.parser')

    heading = soup.find("h1")
    if heading is None:
        # Fail with the URL in hand rather than an AttributeError on None.
        raise ValueError("no <h1> title found on IMDb page: {}".format(url))
    title = heading.text

    # grab top cast section of imdb page
    # NOTE(review): this class string looks auto-generated and may change
    # between IMDb deployments -- confirm it still matches.
    cast_links = soup.find_all("a", {"class": "sc-36c36dd0-1 QSQgP"})

    cast_list = []

    # get cast names
    for a in cast_links:
        # get_text() handles anchors with nested markup, where .string is None
        # and str() would produce the literal name "None".
        cast_name = a.get_text(strip=True)
        if cast_name:
            cast_list.append(cast_name)

    return [title, cast_list]

In [6]:
def imdb_whole_shebang(url):
    """Run the full pipeline for one IMDb chart: links -> cast -> Wikipedia check.

    Args:
        url (str): address of an IMDb chart page.

    Returns:
        pd.DataFrame: one row per (title, cast member) with columns ``title``,
        ``cast`` and ``nepos``, where ``nepos`` holds the result of
        ``wiki_scrape`` for that cast member. Titles for which no cast names
        were found contribute no rows.
    """
    links = get_imdb_list_links(url)

    # One [title, cast_list] pair per linked page.
    imdb_info = [top_cast(link) for link in links['link']]
    df = pd.DataFrame(imdb_info, columns=['title', 'cast'])

    # explode() turns an empty cast list into a NaN row; drop those so
    # wiki_scrape is never called on a non-string.
    df = df.explode('cast').dropna(subset=['cast']).reset_index(drop=True)

    # Scrape each distinct name once, even when it appears under several titles.
    nepos_by_name = {cast_name: wiki_scrape(cast_name) for cast_name in df['cast'].unique()}
    df['nepos'] = df['cast'].map(lambda cast_name: nepos_by_name[cast_name])
    return df

In [8]:
# Most Popular TV Shows chart -> one row per (title, cast member) with a `nepos` column.
# Network-bound: fetches the chart, each linked page, and Wikipedia for each cast member.
pop_tv_df = imdb_whole_shebang(url_pop_tv)

In [9]:
# Most Popular Movies chart -> one row per (title, cast member) with a `nepos` column.
# Network-bound: fetches the chart, each linked page, and Wikipedia for each cast member.
pop_mov_df = imdb_whole_shebang(url_pop_mov)

In [None]:
# Top Box Office (US) chart -> one row per (title, cast member) with a `nepos` column.
# Network-bound: fetches the chart, each linked page, and Wikipedia for each cast member.
box_office_df = imdb_whole_shebang(url_box_office)