In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import wikipediaapi as wiki

In [2]:
# met gala attendees
df = pd.read_csv(r'/Users/jessicacarr/Documents/J/nepos/nepos_met gala.csv')

# Parent and Relative strings to search for in wikipedia infobox
parent_pattern = re.compile('Parent|Relative')

# wikipedia url base
url_base = "https://en.wikipedia.org"

# wikipedia language setting
wiki_wiki = wiki.Wikipedia('en')

In [3]:
# given name of person, outputs parent links or false if none
def wiki_scrape(name):
    subject = name.replace(" ", "_")
    url = url_base + "/wiki/" + subject

    # check if wiki page exists
    wiki_page = wiki_wiki.page(subject)
    does_page_exist = wiki_page.exists()

    if does_page_exist is False :
        return [name, False] # no wiki page -> not famous enough, not a nepo baby
    
    elif does_page_exist is True : 
        data = requests.get(url).text
        soup = BeautifulSoup(data,'html.parser') # full page
        
        # check if infobox exists
        infobox = soup.find("table",{"class":"infobox biography vcard"}) # infobox
        
        if infobox is None :
            return [name, False] # no infobox on wiki page -> not a nepo baby
        else :
            does_parent_field_exist = bool(re.search("Parent|Relative", infobox.text)) # see if Parent or Relative field is listed in infobox
        
            if does_parent_field_exist is False :
                return [name, False]
            elif does_parent_field_exist is True :
                parent_field = soup.find('th', string=parent_pattern).parent
                parent_a_tags = parent_field.find_all('a')
                if len(parent_a_tags) == 0 :
                    return [name, False] # parents listed in infobox but not linked -> not a nepo baby
                else :
                    parent_wiki_list = []
                    for link in parent_field.find_all('a'):
                        parent_wiki = link.get('href')
                        parent_wiki_link = url_base + parent_wiki
                        parent_wiki_list.append(parent_wiki_link)
                        
                        parent_wiki_list[:] = [x for x in parent_wiki_list if "cite_note" not in x] # cited entries are in <a href> tags so remove those links here
    
                return [name, parent_wiki_list]


In [4]:
# link to Top 100 Popular Shows
url_top_tv = "https://www.imdb.com/chart/tvmeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=DK1W37ZH61RXZP184X95&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=toptv&ref_=chttvtp_ql_5"
imdb_base_url = "https://www.imdb.com"

# html parser
data_top_tv = requests.get(url_top_tv).text
soup_top_tv = BeautifulSoup(data_top_tv,'html.parser')

top_tv_table = soup_top_tv.find("table", {"class":"chart full-width"})
top_tv_links = []

for link in top_tv_table.find_all('a'):
    tv_link = link.get('href')
    tv_link = imdb_base_url + tv_link
    top_tv_links.append(tv_link)

In [5]:
url_imdb = top_tv_links[98] # tester for one show

# get specific show imdb page
data_imdb = requests.get(url_imdb).text
soup_imdb = BeautifulSoup(data_imdb,'html.parser')

In [6]:
# grab top cast section of imdb page
top_cast = soup_imdb.find_all("a", {"class": "sc-36c36dd0-1 QSQgP"})

# initialise list
cast_names = []

# get cast names
for a in top_cast:
    cast_names.append(a.string)

In [7]:
# find the nepo babies
list(map(wiki_scrape, cast_names))

[['Úrsula Corberó', False],
 ['Álvaro Morte', False],
 ['Itziar Ituño', False],
 ['Pedro Alonso', False],
 ['Miguel Herrán', False],
 ['Jaime Lorente', False],
 ['Esther Acebo', False],
 ['Darko Peric', False],
 ['Enrique Arce', False],
 ['Alba Flores', ['https://en.wikipedia.org/wiki/Antonio_Flores']],
 ['Fernando Soto', False],
 ['Mario de la Rosa', False],
 ['Hovik Keuchkerian', False],
 ['Rodrigo de la Serna', False],
 ['Najwa Nimri', False],
 ['Luka Peros', False],
 ['Fernando Cayo', False],
 ['Rocco Narva', False]]

In [9]:
# ISSUES TO TROUBLESHOOT

# 1: not returning all links -- only parent ones in this case:
wiki_scrape("Alba Flores")
# may be because there is a parent listed with no link so then skips the relatives

# 2: top_tv_links has duplicate links (len = 200 rather than 100) because there is a link in titleColumn and posterColumn

['Alba Flores', ['https://en.wikipedia.org/wiki/Antonio_Flores']]

In [8]:
# apply wiki_scrape function
#df["nepo"] = df.apply(lambda row : wiki_scrape(row['attendee']), axis=1)