In [758]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
import wikipediaapi as wiki
from datetime import datetime
import random
import math
import matplotlib as mpl
import matplotlib.pyplot as plt
import wikipedia
import time
import urllib
import urllib.request
import json

In [None]:
##########################
###     CONSTANTS      ###
##########################

In [10]:
# IMdb API key
api_key = "k_11m5rq35"

# headers
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive',
        "referer":"referer: https://www.google.com/",
        "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
        }

cookies = {"CONSENT":"YES+shp.gws-20210330-0-RC1.de+FX+412"}

options = Options()
options.headless = True

# Parent and Relative strings to search for in wikipedia infobox
parent_pattern = re.compile('Parent')
relative_pattern = re.compile('Relative')
starring_pattern = re.compile('Starring')
voice_pattern = re.compile('Voices of')
wikipedia_pattern = re.compile("wiki")

not_nepo_relationships = 'son|daughter|nephew|niece|grandson|grand-daughter|grandnephew|grandniece|in-law|stepson|stepdaughter|brother|sister|cousin'

# url base
wiki_url_base = "https://en.wikipedia.org" # wikipedia
imdb_base_url = 'https://www.imdb.com/title/'

# wikipedia language setting
wiki_wiki = wiki.Wikipedia('en')

ia = imdb.Cinemagoer()

In [472]:
top_250_mov_url = "https://imdb-api.com/en/API/Top250Movies/"
top_250_tv_url = "https://imdb-api.com/en/API/Top250TVs/"
pop_mov_url = "https://imdb-api.com/en/API/MostPopularMovies/"
pop_tv_url = "https://imdb-api.com/en/API/MostPopularTVs/"
box_office_all_time_url = "https://imdb-api.com/en/API/BoxOfficeAllTime/"

In [4]:
##########################
###     FUNCTIONS      ###
##########################

In [129]:
def get_imdb_lists(url, imdb_api_key):
    data = requests.get(url + imdb_api_key).json()
    list_of_titles = data['items']
    df = pd.DataFrame(list_of_titles)
    return df

In [130]:
df = get_imdb_lists(pop_mov_url, api_key)

In [131]:
def remove_ws(words):
    no_ws = words.replace(" ", "_")
    return no_ws

In [132]:
def get_title_link(search_term):
    wiki_search = wikipedia.search(search_term)[0]
    wiki_search_no_ws = remove_ws(wiki_search)
    wiki_url = wiki_url_base + "/wiki/" + wiki_search_no_ws
    
    return wiki_url

In [133]:
def get_soup(url):
    data = requests.get(url, headers=headers).text
    soup = BeautifulSoup(data,'html.parser') # full page
    
    return soup

In [134]:
def find_child_in_soup(soup, parent, child):
    child = soup.find(f"{parent}",{"class":f"{child}"})
    
    return child

In [711]:
def do_infobox_fields_exist(soup, infobox):
    infobox_fields = infobox.find_all('th', {'class' : 'infobox-label'})
    does_starring_field_exist = bool(re.search("Starring", str(infobox_fields)))
    does_voices_field_exist = bool(re.search("Voices of", str(infobox_fields)))
    
    if does_voices_field_exist is False and does_starring_field_exist is False:
        return [False, False]
    
    else :
        return field_results(soup, infobox, does_starring_field_exist, does_voices_field_exist)

In [712]:
def field_results(soup, infobox, does_starring_field_exist, does_voices_field_exist):
    if does_voices_field_exist is True:
        th_pattern = voice_pattern
    elif does_starring_field_exist is True:
        th_pattern = starring_pattern
        
    th = soup.find('th', string=th_pattern).parent
    li_list = th.find_all('li')
    
    cast_list = []
    cast_links = []
    
    for li in li_list:
        a = li.find('a')
        if a == None :
            cast_list.append(li.text)
            cast_links.append("")
        else :
            cast_list.append(a.text)
            cast_links.append(a['href'])
            
    if len(cast_list) == 0 :
        return [False, False]
    else :
        return [cast_list, cast_links]

In [713]:
def get_cast_from_wiki(title, year, tv_or_film): 
    
    search_term = f"{title} {year} {tv_or_film}"
    wiki_url = get_title_link(search_term)
    
    soup = get_soup(wiki_url)
    infobox = find_child_in_soup(soup, "table", "infobox vevent")
    
    if infobox is None :
        return [[],[]]
        
    else :        
        cast_list = []
        cast_list = do_infobox_fields_exist(soup, infobox)
        
        cast_names = cast_list[0]
        cast_links = cast_list[1]
        
        return cast_list

In [714]:
def title_specs(title_id):
    
    url = imdb_base_url + title_id
    
    soup = get_soup(url)
    
    # CAST
    # grab top cast section of imdb page
    t = soup.find_all("div", {"class": "sc-bfec09a1-7 dpBDvu"})
    s = soup.find_all("a", {"class": "sc-bfec09a1-1 fUguci"})
    
    cast_list = []
    cast_links = []
    
    # get cast names
    for a in s:
        cast_list.append(str(a.string))
        cast_links.append("")
        
    return [cast_list, cast_links]

In [715]:
def does_wiki_exist(name) :
    # check if wiki page exists
    wiki_page = wiki_wiki.page(name)
    does_page_exist = wiki_page.exists()
    
    return does_page_exist

In [716]:
def get_infobox_image(url):
    soup = get_soup(url)
    
    image = find_child_in_soup(soup, "a", "image")
    image_str = str(image)
    
    try:
        found = re.search('src="//(.+?)"', image_str).group(1)
    except AttributeError:
        found = ''
        
    return found

In [717]:
def extract_infobox_data(url):
    soup = get_soup(url)
    
    bio_vcard = find_child_in_soup(soup, "table", "infobox biography vcard")
    vcard = find_child_in_soup(soup, "table", "infobox vcard")
    
    if bio_vcard is None and vcard is None : 
        return '', []
    else :
        if bio_vcard is not None :
            infobox = bio_vcard
        else :
            infobox = vcard
        
        blue_links = parse_infobox_parent_fields(soup, infobox)
        subj_img = get_infobox_image(url)
        
        parent_data = []
        
        if len(blue_links) != 0:
            for link in blue_links:
                name = link.partition('/wiki/')[2]
                name_wws = name.replace("_", " ")
                
                wiki_link = wiki_url_base + "/wiki/" + name
                image = get_infobox_image(wiki_link)
                
                parent = {
                    'name': name_wws,
                    'image': image,
                    'link': wiki_link
                }
                
                parent_data.append(parent)
        
        return subj_img, parent_data
    

In [718]:
def parse_infobox_parent_fields(soup, infobox):
    
    infobox_fields = infobox.find_all('th', {'class' : 'infobox-label'})
    
    try :
        does_parent_field_exist = bool(re.search("Parent", str(infobox_fields)))
        does_relative_field_exist = bool(re.search("Relative", str(infobox_fields)))

        if does_parent_field_exist is False and does_relative_field_exist is False:
            return [] # parent field not listed in infobox -> not a nepo baby 

        else :
            if does_parent_field_exist is True : 
                return get_parent_links(soup)
            else :
                return get_rel_links(soup)
            
    except AttributeError:
        pass


In [795]:
def get_parent_links(soup) : 
    parent_field = soup.find('th', string=parent_pattern).parent
    parent_a_tags = parent_field.find_all('a')
                        
    if len(parent_a_tags) == 0 :
        return [] # parents listed in infobox but not linked -> not a nepo baby

    else : # nepo baby!
        parent_wiki_list = []

        for link in parent_field.find_all('a'):
            parent_wiki = link.get('href')
            parent_wiki_link = wiki_url_base + parent_wiki
            parent_wiki_list.append(parent_wiki_link)

            parent_wiki_list[:] = [x for x in parent_wiki_list if "cite_note" not in x] # cited entries are in <a href> tags so remove those links here

    return parent_wiki_list

In [720]:
def get_rel_links(soup) :
    relative_field = soup.find('th', string=relative_pattern).parent
    relative_td_tags = relative_field.find_all('td')

    list_of_relatives = re.split('</li>|<br/>', str(relative_td_tags))

    parent_wiki_list = []

    if len(relative_field.find_all('a')) == 0 :
        return []
    else :
        for i in range(len(list_of_relatives)):
            if bool(re.search(not_nepo_relationships, list_of_relatives[i])) is True:
                pass

            else :
                try :
                    href_match = re.search(r'href=\"(.*)\" title=', list_of_relatives[i])
                    parent_wiki = href_match.group(1)
                    parent_wiki_link = wiki_url_base + parent_wiki
                    parent_wiki_list.append(parent_wiki_link)

                except AttributeError :
                    pass  

        if len(parent_wiki_list) == 0 :
            return []
        else :
            return parent_wiki_list


In [732]:
def are_they_a_nepo(name, wiki_name_link = ""):
    
    poi = {'name': name,
          'image': '',
          'link': wiki_name_link,
          'nepo': False
          }
    
    if name is False or name == 'nan' :
        return poi
    
    if wiki_name_link != "" :
        url = wiki_url_base + wiki_name_link
        does_page_exist = True
    else :
        subject = remove_ws(name)
        url = wiki_url_base + "/wiki/" + subject
        does_page_exist = does_wiki_exist(name)
        
    if does_page_exist is False :
        poi['parents'] = []

    else : 
        subj_image, parent_data = extract_infobox_data(url)
        poi['link'] = url
        poi['image'] = subj_image
        poi['parents'] = parent_data
        
        if len(poi['parents']) != 0 :
            poi['nepo'] = True

    return poi

In [200]:
def pct_nepo(df, group_col):
    df = df[[group_col, 'nepos']]
    df = df[df.nepos != False].groupby(group_col).count() / df.groupby(group_col).count()
    df = df.rename(columns={'nepos': 'pct_nepo'})
    return df

In [802]:
def imdb_whole_shebang(url, tv_or_film):
    
    # get list of titles from imdb
    df = get_imdb_lists(url, api_key) 
    
    # get cast list and cast wiki links from wikipedia
    df.loc[:,'cast'] = df.apply(lambda row : get_cast_from_wiki(row['title'], row['year'], tv_or_film), axis=1)
    df[['cast_names','cast_links']] = pd.DataFrame(df.cast.tolist(), index= df.index)
    df = df.drop('cast', axis=1)
    
    df['cast_len'] = df['cast_names'].str.len()
    wiki_df = df[(df['cast_len'] != 0) & (df['cast_len'].notnull())]
    imdb_df = df[(df['cast_len'] == 0) | (df['cast_len'].isnull())]
    
    # if wiki didn't list the cast then get it from imdb
    imdb_df.loc[:, "cast"] = imdb_df.apply(lambda row : title_specs(row['id']), axis=1) # get top cast from imdb
    imdb_df[['cast_names','cast_links']] = pd.DataFrame(imdb_df.cast.tolist(), index= imdb_df.index)
    imdb_df = imdb_df.drop('cast', axis=1)
    
    df = pd.concat([wiki_df, imdb_df]).reset_index(drop=True)
    # drop duplicates incase they were not removed from correct_df. take last dupe as its the imdb list
    df.drop_duplicates(subset=['id'], keep='last') 
    
    df = df.explode(['cast_names', 'cast_links']).reset_index(drop=True) # expand list of cast into rows
    
    df = df.dropna(subset=['cast_names'])
    
    df.loc[:,"nepo_links"] = df.apply(lambda row : are_they_a_nepo(row['cast_names'], row['cast_links']), axis=1)
    df.loc[:, 'nepos'] =  df['nepo_links'].apply(lambda x: x.get('nepo')) 
        
    # calc percent nepo by title
    df_pct_nepo = pct_nepo(df, "id")
    df = df.merge(df_pct_nepo, on=['id'], how="left") # merge data frames
    df['pct_nepo'].fillna(0, inplace=True)
    
    df = df.drop(['fullTitle', 'crew', 'cast_names', 'cast_links', 'nepos'], axis=1)
    df.rename(columns={'cast_len': 'cast_length', 'nepo_links': 'cast'}, inplace=True)
     
    # export csv
    csv_name_match = re.search('API/(.*)/', url)
    csv_name = csv_name_match.group(1)
    
    df.to_csv(csv_name + ".csv", index=False)
    
    # convert to dict
#     dictionary = df.groupby(['id', 'title', 'year', 'image', 'cast_length', 'pct_nepo']).agg({'cast':lambda x: list(x)})
    
#     with open(csv_name + ".json", "w") as fp:
#         json.dump(dictionary , fp) 
    
    return df


In [805]:
df = imdb_whole_shebang(pop_tv_url, "film")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_df.loc[:, "cast"] = imdb_df.apply(lambda row : title_specs(row['id']), axis=1) # get top cast from imdb
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_df[['cast_names','cast_links']] = pd.DataFrame(imdb_df.cast.tolist(), index= imdb_df.index)


In [792]:
nepo_dict

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,cast
id,title,year,image,imdb-rating,imdb-rating-count,cast_length,pct_nepo,Unnamed: 8_level_1
tt0068646,The Godfather,1972,https://m.media-amazon.com/images/M/MV5BM2MyNjYxNmUtYTAwNi00MTYxLWJmNWYtYzZlODY3ZTk3OTFlXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_Ratio0.7015_AL_.jpg,9.2,1918724,9.0,0.000000,"[{'name': 'Marlon Brando', 'image': 'upload.wi..."
tt0073195,Jaws,1975,https://m.media-amazon.com/images/M/MV5BMmVmODY1MzEtYTMwZC00MzNhLWFkNDMtZjAwM2EwODUxZTA5XkEyXkFqcGdeQXVyNTAyODkwOQ@@._V1_Ratio0.6716_AL_.jpg,8.1,627082,5.0,0.000000,"[{'name': 'Roy Scheider', 'image': 'upload.wik..."
tt0082971,Indiana Jones and the Raiders of the Lost Ark,1981,https://m.media-amazon.com/images/M/MV5BNTU2ODkyY2MtMjU1NC00NjE1LWEzYjgtMWQ3MzRhMTE0NDc0XkEyXkFqcGdeQXVyMjM4MzQ4OTQ@._V1_Ratio0.6716_AL_.jpg,8.4,993564,6.0,0.000000,"[{'name': 'Harrison Ford', 'image': 'upload.wi..."
tt0087469,Indiana Jones and the Temple of Doom,1984,https://m.media-amazon.com/images/M/MV5BYzgzMTIzNzctNmNiZC00ZDYyLWJjNzktMmQ2MDM2ZDkwZGVhXkEyXkFqcGdeQXVyMjM4MzQ4OTQ@._V1_Ratio0.6716_AL_.jpg,7.5,508082,6.0,0.000000,"[{'name': 'Harrison Ford', 'image': 'upload.wi..."
tt0096895,Batman,1989,https://m.media-amazon.com/images/M/MV5BMTYwNjAyODIyMF5BMl5BanBnXkFtZTYwNDMwMDk2._V1_Ratio0.6716_AL_.jpg,7.5,389802,8.0,0.000000,"[{'name': 'Jack Nicholson', 'image': 'upload.w..."
...,...,...,...,...,...,...,...,...
tt8936646,Extraction,2020,https://m.media-amazon.com/images/M/MV5BMDJiNzUwYzEtNmQ2Yy00NWE4LWEwNzctM2M0MjE0OGUxZTA3XkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_Ratio0.6716_AL_.jpg,6.8,230395,7.0,0.285714,"[{'name': 'Chris Hemsworth', 'image': 'upload...."
tt9100018,Cobweb,2023,https://m.media-amazon.com/images/M/MV5BMWI2NjQ2NGYtNDMwYS00YzNlLWFjZTctNzQ5MTRiYzQyZjMzXkEyXkFqcGdeQXVyMTUzMTg2ODkz._V1_Ratio0.6716_AL_.jpg,,0,4.0,0.250000,"[{'name': 'Lizzy Caplan', 'image': 'upload.wik..."
tt9362722,Spider-Man: Across the Spider-Verse,2023,https://m.media-amazon.com/images/M/MV5BMzI0NmVkMjEtYmY4MS00ZDMxLTlkZmEtMzU4MDQxYTMzMjU2XkEyXkFqcGdeQXVyMzQ0MzA0NTM@._V1_Ratio0.6716_AL_.jpg,8.9,155654,13.0,0.153846,"[{'name': 'Shameik Moore', 'image': 'upload.wi..."
tt9603212,Mission: Impossible - Dead Reckoning Part One,2023,https://m.media-amazon.com/images/M/MV5BMDkyMmZlN2EtMzhlNC00ODc5LTk0ODgtOWRlNzRkYzRkNTdmXkEyXkFqcGdeQXVyMDM2NDM2MQ@@._V1_Ratio0.6716_AL_.jpg,,0,7.0,0.142857,"[{'name': 'Tom Cruise', 'image': 'upload.wikim..."


In [793]:
v = nepo_dict.values.tolist()
c = nepo_dict.columns.values.tolist()

d = [dict(zip(c, x)) for x in v]
d

[{'cast': [{'name': 'Marlon Brando',
    'image': 'upload.wikimedia.org/wikipedia/commons/thumb/5/53/Marlon_Brando_publicity_for_One-Eyed_Jacks.png/220px-Marlon_Brando_publicity_for_One-Eyed_Jacks.png',
    'link': 'https://en.wikipedia.org/wiki/Marlon_Brando',
    'nepo': False,
    'parents': []},
   {'name': 'Al Pacino',
    'image': 'upload.wikimedia.org/wikipedia/commons/thumb/3/3e/Al_Pacino_2016_%2830401544240%29.jpg/220px-Al_Pacino_2016_%2830401544240%29.jpg',
    'link': 'https://en.wikipedia.org/wiki/Al_Pacino',
    'nepo': False,
    'parents': []},
   {'name': 'James Caan',
    'image': 'upload.wikimedia.org/wikipedia/commons/thumb/d/d7/James_Caan_%281976%29.jpg/220px-James_Caan_%281976%29.jpg',
    'link': 'https://en.wikipedia.org/wiki/James_Caan',
    'nepo': False,
    'parents': []},
   {'name': 'Richard Castellano',
    'image': 'upload.wikimedia.org/wikipedia/commons/thumb/f/fb/The_Super_cast_1972_%28cropped%29.JPG/220px-The_Super_cast_1972_%28cropped%29.JPG',
    'li

In [None]:
##########################
###   GET IMDB DATA    ###
##########################

In [None]:
# top_250_mov = imdb_whole_shebang(top_250_mov_url, "film")
top_250_tv = imdb_whole_shebang(top_250_tv_url, "tv")
pop_mov = imdb_whole_shebang(pop_mov_url, "film")
pop_tv = imdb_whole_shebang(pop_tv_url, 'tv')
box_office = imdb_whole_shebang(box_office_all_time_url, "film")

In [None]:
df = get_imdb_lists(pop_tv_url, api_key) # get list of titles from imdb

In [None]:
df['cast'] = df.apply(lambda row : wiki_cast(row['title'], row['year'], "tv series"), axis=1)

In [None]:
df = df[(df["cast"] != False)]

In [None]:
df = df.explode('cast').reset_index(drop=True) # expand list of cast into rows

In [None]:
df.loc[:,"nepos"] = df.apply(lambda row : wiki_scrape(row['cast']), axis=1)

In [None]:
noskip_df = df[df['cast'] != 'SKIP']
nolist_df = noskip_df[~noskip_df['cast'].str.contains("list", case=False)]
correct_df = nolist_df[~nolist_df['cast'].str.contains("various", case=False)]

In [None]:
fix_cast_imdb = df[(df['cast'] == 'SKIP') | df['cast'].str.contains("list", case=False) | df['cast'].str.contains("various", case=False)]
fix_cast_imdb.loc[:,"cast"] = fix_cast_imdb.apply(lambda row : title_specs(row['id']), axis=1) # get top cast from imdb
fix_cast_imdb = fix_cast_imdb.explode('cast').reset_index(drop=True) # expand list of cast into rows
fix_cast_imdb.loc[:,"nepos"] = fix_cast_imdb.apply(lambda row : wiki_scrape(row['cast']), axis=1)
fix_cast_imdb


In [None]:
df = pd.concat([correct_df, fix_cast_imdb]).reset_index(drop=True)
df.drop_duplicates(subset=['id','cast'], keep='last')

In [None]:
df_pct_nepo = pct_nepo(df, "id") # calculate pct nepo by title

In [None]:
df = df.merge(df_pct_nepo, on=['id'], how="left") # merge data frames

In [None]:
df.to_csv("poptv" + ".csv", index=False)

In [None]:
##########################
###  DUPED NEPO ACTORS ###
##########################

In [None]:
#cols_to_select = ['id', 'title', 'year', 'cast', 'nepos', 'pct_nepo']

# create one big fat df with all the diff imdb lists
# select desired cols
# top_250_mov_skinny = top_250_mov[cols_to_select]
# top_250_tv_skinny = top_250_tv[cols_to_select]
# box_office_skinny = box_office[cols_to_select]

# add column for type (tv or film)
top_250_mov['type'] = 'film'
top_250_tv['type'] = 'tv'
pop_mov['type'] = 'film'
pop_tv['type'] = 'tv'
#box_office_skinny['type'] = 'film'

all_df = [pop_mov, pop_tv]
df = pd.concat(all_df)

# drop actors+films groups that appear in >1 imdb list
df.drop_duplicates(subset=['id', 'cast'], keep=False)
df = df.drop(['crew', 'rank'], axis=1)
df['pct_nepo'] = df['pct_nepo'].replace(np.nan, 0) # no nepo babies in the top cast
df["imDbRatingCount"] = pd.to_numeric(df["imDbRatingCount"])

df.to_csv("all_imdb_data.csv", index=False)

### NEPOS ###
# create df with just nepo babies
nepos_df = df.loc[df['nepos'] != False]

# get count of how many titles a nepo baby appears in
nepos_num_titles = nepos_df.groupby(['cast']).size().reset_index(name='num_titles')
mean_nepos_repeats = nepos_num_titles['num_titles'].mean()

nepos_df.to_csv('nepos_only.csv', index=False)

### NOT NEPOS ###
non_nepo_df = df.loc[df['nepos'] == False]
non_nepo_num_titles = non_nepo_df.groupby(['cast']).size().reset_index(name='num_titles')
mean_non_nepo_repeats = non_nepo_num_titles['num_titles'].mean()

### OVERALL COUNTS ###
distinct_nepos = nepos_df.groupby(['cast'])
num_nepo_cast = distinct_nepos.ngroups

distinct_cast = df.groupby(['cast'])
num_cast = distinct_cast.ngroups

distinct_titles = df.groupby(['id'])
num_titles = distinct_titles.ngroups

stat_dict = {
    "total_cast": num_cast,
    "nepos_cast": num_nepo_cast,
    "num_titles": num_titles
    }

In [None]:
##########################
###     USER INPUT     ###
##########################

In [753]:
person_input = input("name: ")
person_cnxn = are_they_a_nepo(person_input)

if person_cnxn is False :
    print(f"{person_input} is not a nepo baby")
else :
    print(f'{person_input} is a nepo baby. Here are the connection links: {person_cnxn}')

name: Maude Apatow
Maude Apatow is a nepo baby. Here are the connection links: {'name': 'Maude Apatow', 'image': 'upload.wikimedia.org/wikipedia/commons/thumb/8/87/Maude_Apatow_at_SXSW_2018_%28cropped%29.jpg/220px-Maude_Apatow_at_SXSW_2018_%28cropped%29.jpg', 'link': 'https://en.wikipedia.org/wiki/Maude_Apatow', 'nepo': True, 'parents': [{'name': 'Judd Apatow', 'image': 'upload.wikimedia.org/wikipedia/commons/thumb/b/bb/Judd_Apatow_2012_Shankbone.JPG/220px-Judd_Apatow_2012_Shankbone.JPG', 'link': 'https://en.wikipedia.org/wiki/Judd_Apatow'}, {'name': 'Leslie Mann', 'image': 'upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Leslie_Mann_at_SXSW_Red_Carpet_premiere_of_BLOCKERS_%2839852917225%29_%28cropped%29.jpg/220px-Leslie_Mann_at_SXSW_Red_Carpet_premiere_of_BLOCKERS_%2839852917225%29_%28cropped%29.jpg', 'link': 'https://en.wikipedia.org/wiki/Leslie_Mann'}]}


In [None]:
url_search_base = "https://imdb-api.com/en/API/SearchTitle/k_11m5rq35/"
movie_input = input("movie or tv show: ")

data = requests.get(url_search_base + movie_input).json()
data['results']


In [None]:
movie_input = input("movie or tv show: ")
movie_choices = ia.search_movie(movie_input)

movie_choices_list = []

# only showing 5 possible options
for i in range(5) :
    
    index = [1, 2, 3, 4, 5]
    rank = index[i]
    
    movie_title = movie_choices[i]['title']
    movie_id = movie_choices[i].movieID
    
    movie = ia.get_movie(movie_id)
    
    short_cast_list = []
    try :
        cast = movie['cast']
        for i in range(3):
            short_cast_list.append(cast[i]['name'])
    
        formatted_cast_list = f"starring {short_cast_list[0]}, {short_cast_list[1]}, and {short_cast_list[2]}"
    except KeyError:
        formatted_cast_list = ""
    except IndexError:
        formatted_cast_list = ""
        
    try :
        year = movie['year']
        year = f" ({year})"
    except KeyError:
        year = ""
    print(f"{rank}: {movie_title}{year} {formatted_cast_list}")

movie_choice_input = input(f'people are not very creative with titles so there are a few titles - which number is the correct one?: ')

movie_id = movie_choices[int(movie_choice_input)-1].movieID
movie_url = imdb_base_url + "tt" + movie_id

movie_full_cast = title_specs("tt"+movie_id)

df = pd.DataFrame([movie_full_cast], columns=['title', 'cast', 'imdb_rating'])
df = df.explode('cast').reset_index(drop=True)
df.loc[:,"nepos"] = df.apply(lambda row : wiki_scrape(row['cast']), axis=1)
title_pct_nepo = pct_nepo(df, 'title')
pct_nepo_str = title_pct_nepo['pct_nepo'][0]

nepo_rows = df[df['nepos'] != False]

print(f'{movie_input} is {pct_nepo_str}% full of nepotism babies. The following people are the culprits: {nepo_rows}')
