# Celebrity Guest Stars (from Wikipedia)

Identify which episodes had Special Guest Stars. 

Unfortunately, there does not appear to be a simple list for this, so we have to use the full list of guest stars and reduce it.

In [1]:
# First 20 seasons
# https://en.wikipedia.org/wiki/List_of_The_Simpsons_guest_stars_(seasons_1%E2%80%9320)

# Seasons 21 - current
# https://en.wikipedia.org/wiki/List_of_The_Simpsons_guest_stars

from requests import get
from bs4 import BeautifulSoup
import pandas as pd

# Functions

In [2]:
def scrape_tables(wiki_url):
    """Download a page and return the first table carrying the 'sortable' class.

    Parameters
    ----------
    wiki_url : str
        Address of the page to fetch.

    Returns
    -------
    The first ``<table>`` element found with class ``sortable``, or ``None``
    when the page contains no such table.

    Raises
    ------
    requests.HTTPError
        When the server replies with a 4xx/5xx status.
    requests.Timeout
        When the server does not respond within 30 seconds.
    """
    # Request the page; the timeout stops a stalled connection from hanging the notebook
    response = get(wiki_url, timeout=30)

    # Fail loudly on an error status instead of silently parsing an error page
    response.raise_for_status()

    # Parse the content of the response with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')
    return page_html.find('table', class_='sortable')

In [3]:
def extract_row_details(rows):
    """Flatten table rows into lists of stripped cell text.

    For each row, the stripped text of every ``<td>`` cell is collected in
    order and the stripped text of the row's first ``<th>`` cell is appended
    as the last item. The first row is treated as the header and dropped.

    Parameters
    ----------
    rows : iterable
        Row elements exposing ``find`` and ``find_all`` (for example the
        ``<tr>`` tags returned by BeautifulSoup).

    Returns
    -------
    list of list
        One list per remaining row: the ``<td>`` texts followed by the
        ``<th>`` text, or ``None`` in the last position when the row has no
        ``<th>`` cell. Empty when ``rows`` is empty.
    """
    guest_list = []
    for row in rows:
        header_cell = row.find('th')
        # A row without a <th> has no name; record None instead of crashing
        name = header_cell.text.strip() if header_cell is not None else None
        cols = [cell.text.strip() for cell in row.find_all('td')]
        cols.append(name)
        guest_list.append(cols)

    # Drop the header row; slicing is a safe no-op when there were no rows
    return guest_list[1:]

In [4]:
def convert_row_array(guest_list):
    """Turn the extracted row lists into a DataFrame with named columns.

    Each inner list supplies one row; its items are labelled, in order,
    season, role, no, prodCode, epTitle and GuestStar.
    """
    column_names = ['season', 'role', 'no', 'prodCode', 'epTitle', 'GuestStar']
    return pd.DataFrame(guest_list, columns=column_names)

# Scrape wikipedia

In [5]:
url1 = 'https://en.wikipedia.org/wiki/List_of_The_Simpsons_guest_stars_(seasons_1%E2%80%9320)'
url2 = 'https://en.wikipedia.org/wiki/List_of_The_Simpsons_guest_stars'

In [6]:
guest_tables1 = scrape_tables(url1)
guest_tables2 = scrape_tables(url2)

In [7]:
# Pull every <tr> out of each scraped table, then stack the two row lists
rows1, rows2 = (table.find_all('tr') for table in (guest_tables1, guest_tables2))
rows = rows1 + rows2

# Reduce the combined rows to plain lists of cell text
guest_list = extract_row_details(rows)

In [8]:
# Build the frame and discard incomplete rows in a single expression
guests = convert_row_array(guest_list).dropna()

In [9]:
guests.head()

Unnamed: 0,season,role,no,prodCode,epTitle,GuestStar
0,1,Edna Krabappel Ms. Melon,002–102,7G02,"""Bart the Genius""",Marcia Wallace
1,1,Worker,003–103,7G03,"""Homer's Odyssey""[35]",Sam McMurray
2,1,Edna Krabappel,003–103,7G03,"""Homer's Odyssey""",Marcia Wallace
3,1,Ms. Barr,006–106,7G06,"""Moaning Lisa""",Miriam Flynn
4,1,Bleeding Gums Murphy,006–106,7G06,"""Moaning Lisa""[36]",Ron Taylor


In [10]:
guests['GuestStar'].value_counts()

Marcia Wallace              174
Phil Hartman                 52
Joe Mantegna                 34
Maurice LaMarche             25
Kelsey Grammer               22
Frank Welker                 21
Jon Lovitz                   20
Kevin Michael Richardson     17
Jackie Mason                 11
Glenn Close                  10
Michael Dees                  8
Terry W. Greene               8
Jane Kaczmarek                8
Dawnn Lewis                   8
Kipp Lennon                   8
Albert Brooks                 7
Valerie Harper                7
Sally Stevens                 7
Jan Hooks                     6
Stacy Keach                   6
Michael York                  6
Dana Gould                    6
George Takei                  5
J. K. Simmons                 5
Ken Burns                     4
Scott Thompson                4
Stephen Hawking               4
Eric Idle                     4
Renee Ridgeley                4
Charles Napier                4
                           ... 
Richard 

In [11]:
guests.to_excel('guest_stars.xlsx')

# Filtering out the 'Celebrity Guests'

Guests that only appeared once

In [12]:
# Number of rows credited to each guest name
ep_count = guests['GuestStar'].value_counts()
# Identify the names that occur exactly once
keep_names = ep_count.loc[ep_count.eq(1)].index

In [13]:
# Keep only the rows whose guest name is in the appeared-once set
guests_single_appearance = guests.loc[guests['GuestStar'].isin(keep_names)]

In [14]:
guests_single_appearance.head()

Unnamed: 0,season,role,no,prodCode,epTitle,GuestStar
3,1,Ms. Barr,006–106,7G06,"""Moaning Lisa""",Miriam Flynn
5,1,Cowboy Bob,007–107,7G09,"""The Call of the Simpsons""[37]",Albert Brooks[B]
9,1,Gendarme Officer,011–111,7G13,"""The Crepes of Wrath""",Christian Coffinet
11,1,Babysitter service receptionistDoofy the Elf,013–113,7G01,"""Some Enchanted Evening""[41]",June Foray
12,1,Ms. Botz / Lucille Botzcowski,013–113,7G01,"""Some Enchanted Evening""[42]",Penny Marshall


In [15]:
guests_single_appearance.shape

(722, 6)

Identify guests that are listed as 'Himself', 'Herself', 'Themselves', 'Narrator'

In [16]:
celeb_markers = ['Himself', 'Herself', 'Themselves', 'Narrator'] # People credited as themselves are most likely celebrity guests

In [17]:
# Select the rows whose role mentions each marker and stack the selections in
# marker order. A row whose role contains several markers appears once per
# marker, exactly as in the previous row-by-row accumulation.
# pd.concat is used because DataFrame.append was removed in pandas 2.0, and a
# single concat avoids re-copying the growing frame on every iteration.
guests_themselves = pd.concat(
    [guests[guests['role'].str.contains(role)] for role in celeb_markers]
)

In [18]:
guests_themselves.head()

Unnamed: 0,season,role,no,prodCode,epTitle,GuestStar
17,2,Himself,018–205,7F05,"""Dancin' Homer""[45]",Tony Bennett
25,2,Himself,024–211,7F11,"""One Fish, Two Fish, Blowfish, Blue Fish""[51]",Larry King
40,2,Himself,031–218,7F18,"""Brush with Greatness""[60]",Ringo Starr
48,3,Himself playing Bart Simpson,039–304,8F03,"""Bart the Murderer""[66]",Neil Patrick Harris
50,3,Fat TonyHimself playing Fat Tony,039–304,8F03,"""Bart the Murderer""[66]",Joe Mantegna


# Combine into a single data-frame

In [19]:
# Stack the credited-as-themselves rows on top of the single-appearance rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Duplicates are dropped BEFORE the index is rebuilt so the final index is
# contiguous (0..n-1) rather than having gaps where duplicate rows were removed.
guests_list = (
    pd.concat([guests_themselves, guests_single_appearance])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [20]:
guests_list.head()

Unnamed: 0,season,role,no,prodCode,epTitle,GuestStar
0,2,Himself,018–205,7F05,"""Dancin' Homer""[45]",Tony Bennett
1,2,Himself,024–211,7F11,"""One Fish, Two Fish, Blowfish, Blue Fish""[51]",Larry King
2,2,Himself,031–218,7F18,"""Brush with Greatness""[60]",Ringo Starr
3,3,Himself playing Bart Simpson,039–304,8F03,"""Bart the Murderer""[66]",Neil Patrick Harris
4,3,Fat TonyHimself playing Fat Tony,039–304,8F03,"""Bart the Murderer""[66]",Joe Mantegna


In [21]:
guests_list.shape

(827, 6)