In [2]:
import math
import os
import random
import re
from datetime import datetime
from urllib.parse import quote

import billboard
import imdb
import numpy as np
import pandas as pd
import requests
import spotipy
import wikipediaapi as wiki
from bs4 import BeautifulSoup

In [3]:
##########################
###     CONSTANTS      ###
##########################

In [4]:
# IMDb API key.
# Read from the IMDB_API_KEY environment variable when it is set, so the
# credential does not have to live in the notebook. The literal is kept only
# as a fallback so existing runs keep working.
# NOTE(review): a key committed in a notebook should be treated as leaked -
# rotate it and drop the fallback once the environment variable is in place.
api_key = os.environ.get("IMDB_API_KEY", "k_11m5rq35")

# Parent and Relative strings to search for in wikipedia infobox
parent_pattern = re.compile('Parent')
relative_pattern = re.compile('Relative')

# Regex alternation of relationship words; wiki_scrape skips any "Relatives"
# entry whose description matches one of these (descendants / in-laws of the
# subject cannot be the source of the subject's connections).
not_nepo_relationships = 'son|daughter|nephew|niece|grandson|grand-daughter|grandnephew|grandniece|in-law|stepson|stepdaughter'

# url base
wiki_url_base = "https://en.wikipedia.org" # wikipedia
imdb_base_url = 'https://www.imdb.com/title/'

# wikipedia language setting
wiki_wiki = wiki.Wikipedia('en')

# IMDb client from the `imdb` package, used by the interactive title-search cell
ia = imdb.Cinemagoer()

In [5]:
# imdb-api.com list endpoints. Each ends in "/" because get_imdb_lists appends
# the API key directly, and imdb_whole_shebang names its CSV after the path
# segment that follows "API/".
_imdb_api_root = "https://imdb-api.com/en/API"

top_250_mov_url = f"{_imdb_api_root}/Top250Movies/"
top_250_tv_url = f"{_imdb_api_root}/Top250TVs/"
pop_mov_url = f"{_imdb_api_root}/MostPopularMovies/"
pop_tv_url = f"{_imdb_api_root}/MostPopularTVs/"
box_office_all_time_url = f"{_imdb_api_root}/BoxOfficeAllTime/"

In [6]:
##########################
###     FUNCTIONS      ###
##########################

In [7]:
def get_imdb_lists(url, imdb_api_key):
    """Download one imdb-api.com list and return it as a DataFrame.

    Parameters
    ----------
    url : str
        List endpoint; the key is appended to it verbatim, so it must end
        with "/".
    imdb_api_key : str
        API key appended to ``url``.

    Returns
    -------
    pandas.DataFrame
        One row per element of the response's ``items`` array.

    Raises
    ------
    requests.HTTPError
        The server answered with a 4xx/5xx status.
    ValueError
        The response carried an ``errorMessage`` and no items.
    KeyError
        The response has no ``items`` key at all.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url + imdb_api_key, timeout=30)
    response.raise_for_status()
    data = response.json()

    # Fail here with the service's own message instead of handing an empty
    # frame to the scraping steps downstream.
    if not data.get('items') and data.get('errorMessage'):
        raise ValueError(f"imdb-api request failed: {data['errorMessage']}")

    return pd.DataFrame(data['items'])

In [8]:
def title_specs(title_id):
    """Scrape the top-cast names from an IMDb title page.

    Parameters
    ----------
    title_id : str
        IMDb title id including the ``tt`` prefix; it is appended to
        ``imdb_base_url``.

    Returns
    -------
    list of str
        Text of every matching cast link, in page order. Empty when no link
        matches.
    """
    # A timeout keeps a stalled connection from hanging the whole scrape.
    page_data = requests.get(imdb_base_url + title_id, timeout=30).text
    soup = BeautifulSoup(page_data, 'html.parser')

    # CAST
    # grab top cast section of imdb page
    # NOTE(review): this class string looks machine-generated; if IMDb
    # changes its markup the lookup will match nothing and return [].
    top_cast = soup.find_all("a", {"class": "sc-36c36dd0-1 QSQgP"})

    # get cast names
    # get_text() also handles anchors whose text sits in nested tags, where
    # `.string` is None and str() of it would yield the bogus name "None".
    cast_list = []
    for a in top_cast:
        cast_name = a.get_text(strip=True)
        if cast_name:
            cast_list.append(cast_name)

    return cast_list

In [9]:
def _infobox_row_links(row):
    """Return absolute Wikipedia URLs for every non-citation link inside ``row``."""
    links = []
    for anchor in row.find_all('a'):
        href = anchor.get('href')
        # cited entries are in <a href> tags too, so drop those links here
        if href and "cite_note" not in href:
            links.append(wiki_url_base + href)
    return links


def _relative_links(row):
    """Return one link per "Relatives" entry that is not a descendant or in-law."""
    # Whole-word match on the visible text only. A plain substring search on
    # the raw HTML would reject every relative whose name, href or title
    # merely contains "son" (Johnson, Jackson, Danson, ...).
    excluded = re.compile(
        r'\b(?:great-?|grand-?|step-?)*(?:' + not_nepo_relationships + r')s?\b'
    )

    links = []
    for cell in row.find_all('td'):
        # entries are separated either as list items or by line breaks
        for fragment in re.split('</li>|<br/>', str(cell)):
            entry = BeautifulSoup(fragment, 'html.parser')
            if excluded.search(entry.get_text()):
                continue
            entry_links = _infobox_row_links(entry)
            if entry_links:
                links.append(entry_links[0])
    return links


# given name of person, outputs parent links or false if none
def wiki_scrape(name):
    """Look up a person's Wikipedia infobox and return links to famous family.

    Parameters
    ----------
    name : str
        Person's name; spaces are turned into underscores to form the page
        title. Anything that is not a non-blank string (for example the NaN
        that ``DataFrame.explode`` produces for an empty cast list) is
        treated as "no page".

    Returns
    -------
    list of str or False
        URLs of the linked entries in the infobox "Parent" row, or - only
        when the infobox has no "Parent" row - of the linked "Relative"
        entries that are not descendants or in-laws. ``False`` when the page,
        the infobox, the rows or the links are missing. Never ``None`` and
        never an empty list, so callers can rely on ``is False`` /
        ``!= False``.
    """
    if not isinstance(name, str) or not name.strip():
        return False

    subject = name.replace(" ", "_")

    # check if wiki page exists
    if not wiki_wiki.page(subject).exists():
        return False # no wiki page -> not famous enough, not a nepo baby

    url = wiki_url_base + "/wiki/" + subject
    data = requests.get(url, timeout=30).text
    soup = BeautifulSoup(data, 'html.parser') # full page

    # check if infobox exists
    infobox = soup.find("table", {"class": "infobox biography vcard"})
    if infobox is None:
        return False # no infobox on wiki page -> not a nepo baby

    # Search the header cells of the infobox itself, not the whole page, so a
    # stray "Parent"/"Relative" elsewhere in the article cannot be picked up.
    parent_header = infobox.find('th', string=parent_pattern)
    if parent_header is not None:
        # A Parent row takes precedence: if it exists but nothing is linked,
        # the person is not counted, whatever the Relative row says.
        parent_links = _infobox_row_links(parent_header.parent)
        return parent_links if parent_links else False

    relative_header = infobox.find('th', string=relative_pattern)
    if relative_header is not None:
        relative_links = _relative_links(relative_header.parent)
        return relative_links if relative_links else False

    return False # neither field listed in infobox -> not a nepo baby


In [10]:
def pct_nepo(df, group_col):
    """Percentage of rows per group whose ``nepos`` value is not ``False``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group_col`` and a ``nepos`` column holding either
        ``False`` or a list of links (as produced by ``wiki_scrape``).
    group_col : str
        Column to group by.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``group_col`` with a single ``pct_nepo`` column (0-100).
        Rows with a missing ``nepos`` value are ignored in both numerator and
        denominator, because ``count()`` skips nulls.
    """
    scored = df[[group_col, 'nepos']]
    totals = scored.groupby(group_col).count()
    hits = scored[scored['nepos'] != False].groupby(group_col).count()

    # A group without a single nepo row is absent from `hits`; without the
    # reindex the division would yield NaN for it instead of 0.
    pct = hits.reindex(totals.index, fill_value=0) / totals * 100
    return pct.rename(columns={'nepos': 'pct_nepo'})

In [11]:
def imdb_whole_shebang(url):
    """Run the full IMDb pipeline for one list endpoint and save it as CSV.

    Steps: fetch the list, scrape each title's top cast, expand to one row per
    cast member, look each person up on Wikipedia, attach the per-title nepo
    percentage, and write ``<name>.csv`` where ``<name>`` is the path segment
    after ``API/`` in ``url``.

    Parameters
    ----------
    url : str
        imdb-api.com list endpoint of the form ``.../API/<name>/``.

    Returns
    -------
    pandas.DataFrame
        One row per (title, cast member) with ``cast``, ``nepos`` and
        ``pct_nepo`` columns added to the list's own columns.

    Raises
    ------
    ValueError
        ``url`` does not contain an ``API/<name>/`` segment.
    """
    # Work out the output name first: a malformed URL should fail before the
    # long scrape, not after it.
    csv_name_match = re.search('API/(.*)/', url)
    if csv_name_match is None:
        raise ValueError(f"cannot derive a CSV name from {url!r}; expected '.../API/<name>/'")
    csv_name = csv_name_match.group(1)

    df = get_imdb_lists(url, api_key) # get list of titles from imdb
    df['cast'] = df['id'].map(title_specs) # get top cast from imdb
    df = df.explode('cast').reset_index(drop=True) # expand list of cast into rows

    # check if each cast member is a nepo
    # Each distinct person is looked up once, however many titles they appear
    # in. A title with an empty cast list explodes to NaN; that row is marked
    # False rather than passed to wiki_scrape.
    nepos_by_name = {cast_name: wiki_scrape(cast_name) for cast_name in df['cast'].dropna().unique()}
    df['nepos'] = df['cast'].map(lambda cast_name: nepos_by_name.get(cast_name, False))

    df_pct_nepo = pct_nepo(df, "fullTitle") # calculate pct nepo by title
    df = df.merge(df_pct_nepo, on=['fullTitle'], how="left") # merge data frames

    df.to_csv(csv_name + ".csv")

    return df

In [None]:
##########################
###   GET IMDB DATA    ###
##########################

In [12]:
# IMDb Top 250 movies: scrape cast + Wikipedia family links; also writes Top250Movies.csv
top_250_mov = imdb_whole_shebang(top_250_mov_url)

In [14]:
# IMDb Top 250 TV shows: scrape cast + Wikipedia family links; also writes Top250TVs.csv
top_250_tv = imdb_whole_shebang(top_250_tv_url)

In [16]:
# IMDb most popular movies: scrape cast + Wikipedia family links; also writes MostPopularMovies.csv
pop_mov = imdb_whole_shebang(pop_mov_url)

In [17]:
# IMDb most popular TV shows: scrape cast + Wikipedia family links; also writes MostPopularTVs.csv
pop_tv = imdb_whole_shebang(pop_tv_url)

In [None]:
##########################
### GET BILLBOARD DATA ###
##########################

In [None]:
# charts start in 2016 and go up to most recent full year
# `current_year` was never defined anywhere in the notebook; derive it here.
current_year = datetime.now().year
# range() excludes its end value, so stopping at current_year makes the last
# chart year current_year - 1, i.e. the most recent full year.
years_for_billboard_charts = list(range(2016, current_year))
categories_for_billboard_charts = ['top-artists', 'top-rock-artists', 'top-r&b-artists',
                                   'top-dance-electronic-artists', 'top-latin-artists']

# one record per (year, chart, chart entry)
artists = [
    {'year': yr, 'chart': category, 'artist': entry.artist}
    for yr in years_for_billboard_charts
    for category in categories_for_billboard_charts
    for entry in billboard.ChartData(category, year=yr)
]

artist_df = pd.DataFrame(artists)

In [None]:
# The same artist shows up in several years and charts; look each one up on
# Wikipedia once and reuse the result for all of their rows.
nepos_by_artist = {artist: wiki_scrape(artist) for artist in artist_df['artist'].unique()}
artist_df["nepos"] = artist_df['artist'].map(nepos_by_artist.get)

In [None]:
# percentage of nepo artists per Billboard chart; the column is named 'chart'
# (lower case) where artist_df is built, so 'Chart' raised a KeyError
artist_pct = pct_nepo(artist_df, 'chart')

In [None]:
##########################
###     USER INPUT     ###
##########################

In [None]:
# Ask for one name and report whether Wikipedia links them to famous family.
person_input = input("name: ")
person_cnxn = wiki_scrape(person_input)

# Truthiness rather than `is False`: an empty or missing result must not be
# reported as a nepo baby with no links to show.
if not person_cnxn :
    print(f"{person_input} is not a nepo baby")
else :
    print(f'{person_input} is a nepo baby. Here are the connection links: {person_cnxn}')

In [None]:
# Title search through imdb-api.com; reuse the shared api_key instead of a
# second hardcoded copy of the credential.
url_search_base = f"https://imdb-api.com/en/API/SearchTitle/{api_key}/"
movie_input = input("movie or tv show: ")

# The title becomes a URL path segment, so percent-encode it: characters such
# as '/', '?' or '#' in user input would otherwise change the request.
data = requests.get(url_search_base + quote(movie_input, safe=""), timeout=30).json()
data['results']


In [None]:
# Interactive lookup: search IMDb for a title, let the user pick one of the
# first few matches, then report what share of its top cast is a nepo baby.
movie_input = input("movie or tv show: ")
movie_choices = ia.search_movie(movie_input)

movie_choices_list = []

# only showing 5 possible options (fewer when the search returns fewer)
max_choices = min(5, len(movie_choices))
if max_choices == 0 :
    raise LookupError(f"no IMDb results for {movie_input!r}")

for rank, choice in enumerate(movie_choices[:max_choices], start=1) :

    movie_title = choice['title']
    movie_id = choice.movieID

    movie = ia.get_movie(movie_id)

    # first three billed cast members, or nothing if there are fewer than three
    try :
        cast = movie['cast']
        short_cast_list = [cast[position]['name'] for position in range(3)]
        formatted_cast_list = f"starring {short_cast_list[0]}, {short_cast_list[1]}, and {short_cast_list[2]}"
    except (KeyError, IndexError) :
        formatted_cast_list = ""

    try :
        year = movie['year']
        year = f" ({year})"
    except KeyError:
        year = ""
    print(f"{rank}: {movie_title}{year} {formatted_cast_list}")

movie_choice_input = input(f'people are not very creative with titles so there are a few titles - which number is the correct one?: ')

# Only the numbers that were actually printed are valid; 0 or a negative
# number would otherwise silently index from the end of the result list.
choice_index = int(movie_choice_input) - 1
if not 0 <= choice_index < max_choices :
    raise ValueError(f"please pick a number between 1 and {max_choices}")

# Re-read title and id from the chosen entry: after the loop above these
# variables still describe the last listed option.
chosen_movie = movie_choices[choice_index]
movie_title = chosen_movie['title']
movie_id = chosen_movie.movieID
movie_url = imdb_base_url + "tt" + movie_id

movie_full_cast = title_specs("tt"+movie_id)
if not movie_full_cast :
    raise LookupError(f"could not find a cast list for {movie_title}")

# title_specs returns just the list of cast names, so build the one-row frame
# from the title and that list before exploding to one row per person.
df = pd.DataFrame({'title': [movie_title], 'cast': [movie_full_cast]})
df = df.explode('cast').reset_index(drop=True)
df["nepos"] = df['cast'].map(wiki_scrape)
title_pct_nepo = pct_nepo(df, 'title')
# positional access (the frame is indexed by title); a missing value means no
# nepo rows were found for the title
pct_nepo_str = title_pct_nepo['pct_nepo'].fillna(0).iloc[0]

# keep only rows that actually carry links (not False, None or an empty list)
nepo_rows = df[df['nepos'].map(bool)]

print(f'{movie_input} is {pct_nepo_str}% full of nepotism babies. The following people are the culprits: {nepo_rows}')


In [None]:
##########################
###  IMDB RAND SAMPLE  ###
##########################

In [None]:
# Page size used for both the IMDb search page and the imdb-api.com query;
# the string form is derived so the two can never disagree.
number_of_titles_per_pg_int = 250
number_of_titles_per_pg_str = str(number_of_titles_per_pg_int)
number_of_titles_to_select = 250

url_all = f"https://www.imdb.com/search/title/?title_type=feature,tv_movie,tv_series,tv_miniseries&countries=us&count={number_of_titles_per_pg_str}&view=simple&sort=alpha,asc"
imdb_search_url = f"https://imdb-api.com/API/AdvancedSearch/{api_key}/?title_type=feature,tv_movie,tv_series,tv_miniseries&countries=us&count={number_of_titles_per_pg_str}&sort=alpha,asc&start="

data = requests.get(url_all, timeout=30).text
soup = BeautifulSoup(data, 'html.parser')

# get total number of titles
# The count is read from the "... of N titles" text in the first
# <div class="desc">. Each lookup is checked so that a layout change gives a
# clear error instead of an IndexError/AttributeError.
desc_div = soup.find("div", {"class" : "desc"})
desc_span = desc_div.find('span') if desc_div is not None else None
totals_tag = re.search('of (.*) titles', desc_span.text) if desc_span is not None else None
if totals_tag is None :
    raise RuntimeError("could not find the 'of N titles' count on the IMDb search page; the page layout may have changed")
total_ct_titles = int(totals_tag.group(1).replace(',', ''))


In [None]:
# randomly generate 250 titles (same numbers as in pop/top mov/tv lists)
# randrange(total) draws 0 .. total-1, exactly one value per title;
# randint(0, total) included both ends and so had one value too many.
random_title_numbers = [random.randrange(total_ct_titles) for _ in range(number_of_titles_to_select)]

In [None]:
# Collect the sampled titles as plain records and build the frame once.
title_rows = []
pages_by_start = {}  # results of pages already downloaded, keyed by their `start` value

for title_number in random_title_numbers :

    # each api request only pulls one page of data (250 titles)
    # determine which page title is on and the index of that title on that page
    # title_number is treated as a 0-based offset: offsets 0-249 are on the
    # page starting at 1, 250-499 on the page starting at 251, and so on.
    pg_start = (title_number // number_of_titles_per_pg_int) * number_of_titles_per_pg_int + 1 # plus one because pages start on 1-, 251-, 501-, etc titles
    num_title_on_pg = title_number % number_of_titles_per_pg_int

    # several draws can land on the same page; download each page only once
    if pg_start not in pages_by_start :
        url_for_pg_num = imdb_search_url + str(pg_start)
        data = requests.get(url_for_pg_num, timeout=30).json()
        pages_by_start[pg_start] = data['results'] or []
    page_results = pages_by_start[pg_start]

    # a draw past the last available title (short final page) is skipped
    if num_title_on_pg >= len(page_results) :
        continue

    title_rows.append(page_results[num_title_on_pg])

titles = pd.DataFrame(title_rows)
    

In [None]:
##########################
###       TESTING      ###
##########################

In [None]:
# Scratch check of the "Relatives" branch of wiki_scrape on a single page,
# inlined so the intermediate values can be inspected.
data = requests.get("https://en.wikipedia.org/wiki/Ted_Danson").text
soup = BeautifulSoup(data,'html.parser')

infobox = soup.find("table",{"class":"infobox biography vcard"})
does_relative_field_exist = bool(re.search("Relative", infobox.text))

relative_field = soup.find('th', string=relative_pattern).parent
relative_td_tags = relative_field.find_all('td')

# one string per relative entry (list items or <br/>-separated lines)
list_of_relatives = re.split('</li>|<br/>', str(relative_td_tags))

parent_wiki_list = []

if len(relative_field.find_all('a')) == 0 :
    pass
else :
    for i in range(len(list_of_relatives)):
        if bool(re.search(not_nepo_relationships, list_of_relatives[i])) is True:
            pass

        else :
            try :
                href_match = re.search(r'href=\"(.*)\" title=', list_of_relatives[i])
                parent_wiki = href_match.group(1)
                parent_wiki_link = wiki_url_base + parent_wiki
                parent_wiki_list.append(parent_wiki_link)

            except AttributeError :
                pass

# This code was lifted out of a function, but `return` is a SyntaxError at
# cell level. Compute the value the function would have returned instead.
relative_result = parent_wiki_list if len(parent_wiki_list) > 0 else False

relative_result