In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
import wikipediaapi as wiki
import spotipy
import billboard
from datetime import datetime
import imdb
import random

In [None]:
# Regex used by wiki_scrape to locate an infobox header cell whose text contains "Parent" or "Relative"
parent_pattern = re.compile('Parent|Relative')

# Base URLs that scraped hrefs and IMDb title ids get appended to
wiki_url_base = "https://en.wikipedia.org" # prefix for "/wiki/<title>" article paths
imdb_base_url = "https://www.imdb.com" # prefix for the hrefs collected from IMDb chart pages
imdb_search_base_url = 'https://www.imdb.com/title/tt' # an IMDb movieID is appended to form a title page URL

# Wikipedia API client (wiki_scrape uses it to check that an article exists)
# NOTE(review): newer wikipedia-api releases appear to expect a user agent as the
# first argument instead of the language code -- confirm against the installed version
wiki_wiki = wiki.Wikipedia('en')

# IMDb chart pages that get scraped for titles
# NOTE(review): the pf_rd_* / ref_ query parameters look like tracking values copied
# from a browser session; presumably the charts load without them -- verify before trimming
# Most Popular TV Shows as determined by IMDb Users
url_pop_tv = "https://www.imdb.com/chart/tvmeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=DK1W37ZH61RXZP184X95&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=toptv&ref_=chttvtp_ql_5"

# Most Popular Movies as determined by IMDb Users
url_pop_mov = "https://www.imdb.com/chart/moviemeter?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=408FM1WEANBKDGP51PET&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_ql_2"

# Top Box Office (US) - updated weekly
url_box_office = "https://www.imdb.com/chart/boxoffice?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=KK2KJN75YN0PGF8Q79QK&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=moviemeter&ref_=chtmvm_ql_1"

# Cinemagoer client used for the title search and movie lookups in the interactive cell
ia = imdb.Cinemagoer()

In [None]:
def wiki_scrape(name):
    """Collect the parent/relative links from a person's English Wikipedia infobox.

    Args:
        name: The person's name as used in the Wikipedia article title; spaces
            are replaced with underscores to build the page title.

    Returns:
        A list of absolute Wikipedia URLs taken from the first infobox row whose
        header text contains "Parent" or "Relative", or ``False`` when ``name``
        is not a non-empty string, the article does not exist, the page has no
        biography infobox, no such row exists, or the row links to nothing
        other than citation footnotes.
    """
    # exploded cast columns hold NaN/None for titles without a cast list
    if not isinstance(name, str) or not name.strip():
        return False

    subject = name.replace(" ", "_")
    url = wiki_url_base + "/wiki/" + subject

    # no wiki page -> not famous enough, not a nepo baby
    if not wiki_wiki.page(subject).exists():
        return False

    data = requests.get(url).text
    soup = BeautifulSoup(data, 'html.parser')  # full page

    # no infobox on wiki page -> not a nepo baby
    infobox = soup.find("table", {"class": "infobox biography vcard"})
    if infobox is None:
        return False

    # Look at header cells inside the infobox only, matching on their full text.
    # Matching the infobox text first and then calling find('th', string=...) on
    # the page can come back empty (header with nested markup, or the word only
    # appearing in a value cell), which used to crash on `.parent`.
    parent_header = next(
        (th for th in infobox.find_all('th') if parent_pattern.search(th.get_text())),
        None,
    )
    if parent_header is None:
        return False  # parent field not listed in infobox -> not a nepo baby

    parent_field = parent_header.parent  # the table row holding header + value

    # cited entries are also <a href> tags, so drop the footnote links;
    # anchors without an href cannot be turned into a URL and are skipped
    parent_wiki_list = [
        wiki_url_base + link['href']
        for link in parent_field.find_all('a', href=True)
        if "cite_note" not in link['href']
    ]

    # parents listed but not linked (or only footnoted) -> not a nepo baby;
    # an empty list must not be returned because callers test `!= False`
    if not parent_wiki_list:
        return False

    return parent_wiki_list


In [None]:
def pct_nepo(df, group_col):
    """Compute the percentage of rows per group whose ``nepos`` value is not False.

    Args:
        df: DataFrame with a ``nepos`` column (``False`` or a list of links)
            and the column named by ``group_col``.
        group_col: Name of the column to group by.

    Returns:
        DataFrame indexed by ``group_col`` with one float column ``pct_nepo``.
        Groups without any flagged row get 0.0 (previously NaN, because such
        groups were missing from the numerator before the index-aligned
        division). A group whose ``nepos`` values are all null still yields NaN.
    """
    has_value = df['nepos'].notna()
    flags = pd.DataFrame({
        # null entries are excluded from both counts, matching DataFrame.count()
        'flagged': (df['nepos'] != False) & has_value,
        'total': has_value,
    })
    # summing booleans per group keeps every group present in both columns
    sums = flags.groupby(df[group_col]).sum()
    pct = sums['flagged'] / sums['total'] * 100
    return pct.to_frame('pct_nepo')

In [None]:
def get_imdb_list_links(url):
    """Collect the unique links found in the chart table of an IMDb chart page.

    Args:
        url: Address of the IMDb chart page to download.

    Returns:
        DataFrame with a single ``link`` column holding absolute URLs
        (``imdb_base_url`` + href), with duplicates removed.

    Raises:
        ValueError: If the page contains no table with class
            "chart full-width" (previously an AttributeError on ``None``).
    """
    page_data = requests.get(url).text
    soup = BeautifulSoup(page_data, 'html.parser')

    media_table = soup.find("table", {"class": "chart full-width"})
    if media_table is None:
        raise ValueError(f'no "chart full-width" table found at {url}')

    # only anchors that actually carry an href can be turned into a link
    all_links = [imdb_base_url + tag["href"] for tag in media_table.select("a[href]")]

    # media is linked twice in each row, so de-duplicate afterwards
    # (select("td:nth-of-type(2) a") was tried but didn't work for all imdb links)
    return pd.DataFrame(columns=["link"], data=all_links).drop_duplicates()

In [None]:
def title_specs(url):
    """Scrape title, top cast names and rating from an IMDb title page.

    Args:
        url: Address of the IMDb title page to download.

    Returns:
        A three-item list ``[title, cast_list, rating]``: the text of the
        page's first ``<h1>``, a list of cast-name strings, and the rating text
        as a plain ``str`` (``np.nan`` when no rating element is found).

    Raises:
        AttributeError: If the page has no ``<h1>`` element.
    """
    page_data = requests.get(url).text
    soup = BeautifulSoup(page_data, 'html.parser')

    # TITLE
    title = soup.find("h1").text

    # CAST
    # grab top cast section of imdb page
    # NOTE(review): these class names look auto-generated; verify they still match the live page
    top_cast = soup.find_all("a", {"class": "sc-36c36dd0-1 QSQgP"})

    # get_text() instead of str(a.string): .string is None for anchors with
    # nested markup, which used to add the literal name "None" to the cast
    cast_names = (a.get_text(strip=True) for a in top_cast)
    cast_list = [cast_name for cast_name in cast_names if cast_name]

    # IMDB RATING
    rating_tag = soup.find("span", {"class": "sc-7ab21ed2-1 jGRxWM"})
    # some shows are in the popular list but haven't actually been released yet so no rating
    rating_text = rating_tag.get_text(strip=True) if rating_tag is not None else ""
    # plain str rather than a NavigableString, which keeps the parsed page alive
    rating = rating_text if rating_text else np.nan

    return [title, cast_list, rating]

In [None]:
def imdb_whole_shebang(url):
    """Scrape an IMDb chart and flag each listed cast member via Wikipedia.

    Args:
        url: Address of the IMDb chart page passed to ``get_imdb_list_links``.

    Returns:
        DataFrame with one row per (title, cast member) and the columns
        ``title``, ``cast``, ``imdb_rating`` and ``nepos``; ``nepos`` holds the
        result of ``wiki_scrape`` for that cast member (``False`` or a list of
        links). Titles without any scraped cast keep a single row with a
        missing ``cast`` value and ``nepos`` set to ``False``.
    """
    links = get_imdb_list_links(url)

    # one [title, cast_list, rating] triple per chart link
    specs = links['link'].map(title_specs).to_list()
    df = pd.DataFrame(specs, columns=['title', 'cast', 'imdb_rating'])
    df = df.explode('cast').reset_index(drop=True)

    # explode() turns an empty cast list into NaN; only real names are looked up
    df["nepos"] = df['cast'].map(
        lambda cast_name: wiki_scrape(cast_name) if isinstance(cast_name, str) else False
    )
    return df

In [None]:
# Scrape the "Most Popular TV Shows" chart: one row per (title, cast member) with a nepos column.
# Downloads every title page and looks up every cast member on Wikipedia.
pop_tv_df = imdb_whole_shebang(url_pop_tv)

In [None]:
# percentage of flagged cast members per title (frame indexed by title)
pop_tv_pct = pct_nepo(pop_tv_df, 'title')
# left merge attaches each title's pct_nepo to all of its cast rows
pop_tv_df_1 = pop_tv_df.merge(pop_tv_pct, on=['title'], how='left')
# saved under a relative path
pop_tv_df_1.to_csv("pop_tv.csv")

In [None]:
# Same scrape for the "Most Popular Movies" chart (one row per title and cast member)
pop_mov_df = imdb_whole_shebang(url_pop_mov)

In [None]:
# Same scrape for the "Top Box Office (US)" chart (one row per title and cast member)
box_office_df = imdb_whole_shebang(url_box_office)

In [None]:
# charts start in 2016 and go up to most recent full year
# current_year was never defined anywhere in the notebook, so derive it here
current_year = datetime.now().year
# range() excludes its end value, so the last year fetched is current_year - 1
years_for_billboard_charts = list(range(2016, current_year))
categories_for_billboard_charts = ['top-artists', 'top-rock-artists', 'top-r&b-artists',
                                   'top-dance-electronic-artists', 'top-latin-artists']

# one record per chart entry, for every (year, category) combination
artists = [
    {'year': yr, 'chart': category, 'artist': entry.artist}
    for yr in years_for_billboard_charts
    for category in categories_for_billboard_charts
    for entry in billboard.ChartData(category, year=yr)
]

# explicit columns keep the frame's schema stable even when nothing was fetched
artist_df = pd.DataFrame(artists, columns=['year', 'chart', 'artist'])

In [None]:
# Look up every charting artist on Wikipedia; nepos is False or a list of parent/relative links.
# Runs wiki_scrape once per row, so repeated artists are fetched again.
artist_df["nepos"] = artist_df.apply(lambda row : wiki_scrape(row['artist']), axis=1)

In [None]:
# percentage of flagged artists per chart category
# (the column is named 'chart' in artist_df; 'Chart' raised a KeyError)
artist_pct = pct_nepo(artist_df, 'chart')

In [None]:
# Interactive check for a single person: prompt for a name, scrape Wikipedia,
# and report either the connection links or that none were found.
person_input = input("name: ")
person_cnxn = wiki_scrape(person_input)

if person_cnxn is not False:
    verdict = f'{person_input} is a nepo baby. Here are the connection links: {person_cnxn}'
else:
    verdict = f"{person_input} is not a nepo baby"

print(verdict)

In [None]:
# Interactive check for a single title: search IMDb, let the user pick one of the
# matches, then report how much of its scraped cast has linked parents/relatives.
movie_input = input("movie or tv show: ")
movie_choices = ia.search_movie(movie_input)

if not movie_choices:
    raise ValueError(f"no IMDb results found for {movie_input!r}")

# only showing 5 possible options (a search can return fewer than 5)
shown_choices = movie_choices[:5]

for rank, choice in enumerate(shown_choices, start=1):
    movie_title = choice['title']
    movie = ia.get_movie(choice.movieID)

    # the "starring" blurb is only shown when three cast names are available
    try:
        cast = movie['cast']
        short_cast_list = [cast[position]['name'] for position in range(3)]
        formatted_cast_list = f"starring {short_cast_list[0]}, {short_cast_list[1]}, and {short_cast_list[2]}"
    except (KeyError, IndexError):
        formatted_cast_list = ""

    try:
        year = f" ({movie['year']})"
    except KeyError:
        year = ""
    print(f"{rank}: {movie_title}{year} {formatted_cast_list}")

movie_choice_input = input('people are not very creative with titles so there are a few titles - which number is the correct one?: ')

# reject numbers outside the printed list; 0 used to wrap around to the last search result
choice_number = int(movie_choice_input)
if not 1 <= choice_number <= len(shown_choices):
    raise ValueError(f"choice must be a number between 1 and {len(shown_choices)}")

movie_id = shown_choices[choice_number - 1].movieID
movie_url = imdb_search_base_url + movie_id

movie_full_cast = title_specs(movie_url)

df = pd.DataFrame([movie_full_cast], columns=['title', 'cast', 'imdb_rating'])
df = df.explode('cast').reset_index(drop=True)
# explode() yields NaN when no cast was scraped; only real names are looked up
df["nepos"] = df['cast'].map(
    lambda cast_name: wiki_scrape(cast_name) if isinstance(cast_name, str) else False
)
title_pct_nepo = pct_nepo(df, 'title')
# the result is indexed by title, so take the single row by position
pct_nepo_str = title_pct_nepo['pct_nepo'].iloc[0]
if pd.isna(pct_nepo_str):
    pct_nepo_str = 0.0  # nobody flagged -> report 0 rather than nan

nepo_rows = df[df['nepos'] != False]

print(f'{movie_input} is {pct_nepo_str}% full of nepotism babies. The following people are the culprits: {nepo_rows}')


In [None]:
# name of the first flagged cast member (None when nobody was flagged)
nepo_rows = df[df['nepos'] != False]
# the filtered frame keeps df's original labels, so label 0 only exists when the
# very first cast member was flagged -- select by position instead
nepo_rows["cast"].iloc[0] if not nepo_rows.empty else None

In [None]:
# title_pct_nepo is indexed by title, so take the first row by position
# (integer keys on a non-integer index are deprecated as positional lookups)
title_pct_nepo['pct_nepo'].iloc[0]