# Celebrity Guest Stars (from Wikipedia)

Identify which episodes had Special Guest Stars. 

Unfortunately, there does not appear to be a simple list for this, so we have to use the full list of guest stars and reduce it.

In [8]:
# First 20 seasons
# https://en.wikipedia.org/wiki/List_of_The_Simpsons_guest_stars_(seasons_1%E2%80%9320)

# Seasons 21 - current
# https://en.wikipedia.org/wiki/List_of_The_Simpsons_guest_stars_(seasons_21%E2%80%93present)

from requests import get
from bs4 import BeautifulSoup
import pandas as pd

# Functions

In [9]:
def scrape_tables(wiki_url, timeout=30):
    """Download a Wikipedia page and return its first sortable table.

    Parameters
    ----------
    wiki_url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    bs4.element.Tag
        The first ``<table>`` element on the page that carries the
        ``sortable`` class. Only this one table is returned, even if the
        page contains several.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page contains no table with the ``sortable`` class.
    """
    # Without a timeout a stalled connection would hang the notebook forever.
    response = get(wiki_url, timeout=timeout)
    # Fail here on an HTTP error instead of parsing an error page.
    response.raise_for_status()

    # Parse the downloaded HTML and look up the guest table.
    page_html = BeautifulSoup(response.text, 'html.parser')
    guest_table = page_html.find('table', class_='sortable')
    if guest_table is None:
        # find() returns None when nothing matches; raise a readable error
        # rather than letting a later .find_all() fail on None.
        raise ValueError(f"No sortable table found at {wiki_url}")

    return guest_table

In [10]:
def extract_row_details(rows):
    """Convert table rows into lists of cell texts, guest name last.

    Parameters
    ----------
    rows : iterable
        ``<tr>`` elements (any objects offering ``find`` and ``find_all``).

    Returns
    -------
    list of list of str
        One entry per data row: the stripped text of every ``<td>`` cell,
        followed by the stripped text of the row's first ``<th>`` cell.

    Notes
    -----
    Header rows are recognised by having no ``<td>`` cells and are skipped
    wherever they occur, so rows from several tables can be concatenated
    before calling this function. Rows without a ``<th>`` cell carry no
    guest name and are skipped as well. An empty input gives an empty list.
    """
    guest_list = []
    for row in rows:
        cols = [cell.text.strip() for cell in row.find_all('td')]
        name_cell = row.find('th')
        if not cols or name_cell is None:
            # Header row (only <th> cells) or a row with no name cell.
            continue
        cols.append(name_cell.text.strip())
        guest_list.append(cols)

    return guest_list

In [11]:
def convert_row_array(guest_list):
    """Build the guest DataFrame from the scraped row lists.

    Each inner list supplies, in order, the values for the columns
    season, role, no, prodCode, epTitle and GuestStar.
    """
    column_names = ['season', 'role', 'no', 'prodCode', 'epTitle', 'GuestStar']
    return pd.DataFrame(data=guest_list, columns=column_names)

# Scrape wikipedia

In [12]:
# The guest-star list is split over two Wikipedia pages:
# url1 covers seasons 1-20, url2 covers season 21 onwards.
url1, url2 = (
    'https://en.wikipedia.org/wiki/List_of_The_Simpsons_guest_stars_(seasons_1%E2%80%9320)',
    'https://en.wikipedia.org/wiki/List_of_The_Simpsons_guest_stars_(seasons_21%E2%80%93present)',
)

In [13]:
# Download each page once and keep the guest table found on it.
guest_tables1, guest_tables2 = [scrape_tables(page_url) for page_url in (url1, url2)]

In [14]:
# Collect the <tr> elements of both tables into one list, url1's table first.
rows1, rows2 = [table.find_all('tr') for table in (guest_tables1, guest_tables2)]
rows = [*rows1, *rows2]

# One list of cell texts per table row.
guest_list = extract_row_details(rows)

In [15]:
# Build the DataFrame and discard every row that has a missing value in any column.
guests = convert_row_array(guest_list).dropna()

In [16]:
guests.head()

Unnamed: 0,season,role,no,prodCode,epTitle,GuestStar
0,1,Edna Krabappel Ms. Melon,002–102,7G02,"""Bart the Genius""",Marcia Wallace
1,1,Worker,003–103,7G03,"""Homer's Odyssey""[35]",Sam McMurray
2,1,Edna Krabappel,003–103,7G03,"""Homer's Odyssey""",Marcia Wallace
3,1,Ms. Barr,006–106,7G06,"""Moaning Lisa""",Miriam Flynn
4,1,Bleeding Gums Murphy,006–106,7G06,"""Moaning Lisa""[36]",Ron Taylor


In [17]:
guests.shape

(1594, 6)

In [18]:
guests['GuestStar'].value_counts()

Marcia Wallace      174
Phil Hartman         52
Joe Mantegna         43
Maurice LaMarche     27
Kelsey Grammer       22
                   ... 
Michelle Kwan         1
Warren Sapp           1
Yao Ming              1
50 Cent               1
Dick Van Dyke         1
Name: GuestStar, Length: 936, dtype: int64

In [19]:
#guests.to_excel('data/guest_stars.xlsx')

# Filtering out the 'Celebrity Guests'

### Limited Appearances

Keep guests that appeared at most 5 times (some actors, such as JK Simmons, have appeared 3 times).

In [20]:
# Upper limit on the number of rows a guest may have and still be kept
# for the manual celebrity review.
MAX_APPEARANCES = 5

# Number of rows per guest name.
ep_count = guests['GuestStar'].value_counts()

# Names of everyone listed at most MAX_APPEARANCES times. The Index is only
# used for membership tests, so the order of the names is irrelevant.
keep_names = ep_count[ep_count <= MAX_APPEARANCES].index

In [21]:
# Keep only the rows whose guest is among the limited-appearance names.
is_limited_guest = guests['GuestStar'].isin(keep_names)
guests_single_appearance = guests.loc[is_limited_guest]

In [22]:
guests_single_appearance.head()

Unnamed: 0,season,role,no,prodCode,epTitle,GuestStar
1,1,Worker,003–103,7G03,"""Homer's Odyssey""[35]",Sam McMurray
3,1,Ms. Barr,006–106,7G06,"""Moaning Lisa""",Miriam Flynn
4,1,Bleeding Gums Murphy,006–106,7G06,"""Moaning Lisa""[36]",Ron Taylor
5,1,Cowboy Bob,007–107,7G09,"""The Call of the Simpsons""[37]",Albert Brooks[B]
8,1,Gulliver Dark,010–110,7G10,"""Homer's Night Out""[39]",Sam McMurray


In [23]:
guests_single_appearance.shape

(1109, 6)

### Starring as themselves

Identify guests that are listed as 'Himself', 'Herself', 'Themselves', 'Narrator'

In [24]:
# Role texts that mark a celebrity guest: people credited as themselves
# are most likely celebrity guests.
celeb_markers = [
    'Himself',
    'Herself',
    'Themselves',
    'Narrator',
]

In [25]:
# Select the rows whose role text contains each marker, then concatenate once.
# (Growing a DataFrame with concat inside a loop copies the data on every pass
# and starts from an empty frame, which newer pandas versions warn about.)
# Rows stay grouped by marker in the order of celeb_markers; a role that
# contains more than one marker appears once per matching marker.
matches_per_marker = [
    guests[guests['role'].str.contains(marker)] for marker in celeb_markers
]
guests_themselves = pd.concat(matches_per_marker)

In [26]:
guests_themselves.head()

Unnamed: 0,season,role,no,prodCode,epTitle,GuestStar
17,2,Himself,018–205,7F05,"""Dancin' Homer""[45]",Tony Bennett
25,2,Himself,024–211,7F11,"""One Fish, Two Fish, Blowfish, Blue Fish""[51]",Larry King
40,2,Himself,031–218,7F18,"""Brush with Greatness""[60]",Ringo Starr
47,3,Himself playing Bart Simpson,039–304,8F03,"""Bart the Murderer""[65]",Neil Patrick Harris
49,3,Fat TonyHimself playing Fat Tony,039–304,8F03,"""Bart the Murderer""[65]",Joe Mantegna


### Remove starring as themselves from limited appearances

We do not need to manually go through any guest stars that are listed as 'themselves' etc., so we can remove these from the guests to be manually parsed.

In [80]:
# Rows already collected in guests_themselves need no manual review:
# drop them from the limited-appearance frame by index label.
b_index = guests_themselves.index
a_index = guests_single_appearance.index
mask = ~a_index.isin(b_index)  # True where the row is NOT in guests_themselves
guests_single_appearance = guests_single_appearance[mask]

In [81]:
guests_single_appearance.shape

(635, 6)

In [82]:
# Export the rows that still need a manual celebrity check to an Excel file
# for hand filtering.
guests_single_appearance.to_excel('data/guest_stars_limited.xlsx')

**At this point, the `guest_stars_limited` Excel file should be manually filtered so that it only includes 'real' celebrity guests (the result is reloaded below from `data/guest_stars_hand_filtered.xlsx`).**

- No one whose appearance is 'archival' footage was kept
- Has a Wikipedia page
- Known primarily for non-voice-acting work
- Not a writer on The Simpsons

# Combine into a single data-frame

In [27]:
# Flag every row of this frame as a guest credited as themselves.
guests_themselves = guests_themselves.assign(As_Themselves=1)

In [28]:
# Reload the hand filtered list
df_hand_filter = pd.read_excel('data/guest_stars_hand_filtered.xlsx', index_col=0)  
df_hand_filter['Character_Played'] = 1

In [29]:
df_hand_filter.head()

Unnamed: 0,season,role,no,prodCode,epTitle,GuestStar,Character_Played
1,1,Worker,003–103,7G03,"""Homer's Odyssey""[35]",Sam McMurray,1
4,1,Bleeding Gums Murphy,006–106,7G06,"""Moaning Lisa""[36]",Ron Taylor,1
5,1,Cowboy Bob,007–107,7G09,"""The Call of the Simpsons""[37]",Albert Brooks[B],1
8,1,Gulliver Dark,010–110,7G10,"""Homer's Night Out""[39]",Sam McMurray,1
12,1,Ms. Botz / Lucille Botzcowski,013–113,7G01,"""Some Enchanted Evening""[42]",Penny Marshall,1


In [30]:
# Stack both frames, renumber the rows from 0, then drop rows that are
# identical in every column. Duplicates are removed after renumbering, so
# the resulting index may contain gaps.
# NOTE(review): the As_Themselves / Character_Played flags take part in the
# comparison, so a credit present in both inputs would not be collapsed -
# confirm that the two inputs cannot overlap.
guests_list = (
    pd.concat([guests_themselves, df_hand_filter])
    .reset_index(drop=True)
    .drop_duplicates()
)

In [32]:
guests_list.shape

(971, 8)

In [34]:
# A flag is missing for rows that came from the other source frame: store 0 there.
guests_list = guests_list.fillna({'As_Themselves': 0, 'Character_Played': 0})

In [35]:
guests_list.head()

Unnamed: 0,season,role,no,prodCode,epTitle,GuestStar,As_Themselves,Character_Played
0,2,Himself,018–205,7F05,"""Dancin' Homer""[45]",Tony Bennett,1.0,0.0
1,2,Himself,024–211,7F11,"""One Fish, Two Fish, Blowfish, Blue Fish""[51]",Larry King,1.0,0.0
2,2,Himself,031–218,7F18,"""Brush with Greatness""[60]",Ringo Starr,1.0,0.0
3,3,Himself playing Bart Simpson,039–304,8F03,"""Bart the Murderer""[65]",Neil Patrick Harris,1.0,0.0
4,3,Fat TonyHimself playing Fat Tony,039–304,8F03,"""Bart the Murderer""[65]",Joe Mantegna,1.0,0.0


# Save curated Guest List

In [38]:
#guests_list.to_excel('data/guest_stars.xlsx')

In [37]:
# Persist the curated guest list as a pickle file in the ./data folder.
guests_list.to_pickle("./data/simpsons_guest_stars.pkl")