In [9]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import urllib.parse
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import re
# Register tqdm's progress_apply on pandas objects; the cells below call
# DataFrame.progress_apply and would fail without this.
tqdm.pandas()

# Number of seasons covered by this notebook. Used to filter contestants,
# to size season_table, and as the loop bound for the per-season scrapes.
NUM_SEASONS = 43

## Contestant Table

Pull contestant info from https://en.wikipedia.org/wiki/List_of_Survivor_(American_TV_series)_contestants

Get contestant name, age, hometown, profession, season, and placement

In [10]:
# Download the Wikipedia contestant list and stack every "wikitable" on the
# page into a single frame with lower-cased column names.
wiki_url = "https://en.wikipedia.org/wiki/List_of_Survivor_(American_TV_series)_contestants"
table_class = "wikitable sortable jquery-tablesorter"  # not referenced below
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')
tables = soup.findAll("table", class_='wikitable')
contestant_table = (
    pd.concat(pd.read_html(str(tables)), axis=0)
    .reset_index(drop=True)
    .rename(columns=str.lower)
    # Names used by the rest of the notebook.
    .rename(columns={'name': 'contestant_name', 'season': 'num_season'})
)

Label each contestant with the number of the season they appeared on

In [11]:
# Season labels in order of first appearance; a label's position (1-based)
# becomes its season number.
seasons = contestant_table.num_season.unique()
contestant_table["num_season"] = contestant_table["num_season"].apply(
    lambda label: np.where(seasons == label)[0][0] + 1
)
# Keep only the seasons this notebook covers.
contestant_table = contestant_table[contestant_table["num_season"] <= NUM_SEASONS]

Fix contestant names by taking their alias instead of their given name

Convert contestant placement (string) to numerical placement

In [12]:
def fix_name(x):
    """Return the name starting at a quoted alias, with the quotes removed.

    If ``x`` contains a double quote, everything from the first quote onward
    is kept, all double quotes are removed, and surrounding whitespace is
    stripped. Names without a double quote are returned unchanged.
    """
    if '"' not in x:
        return x
    # partition drops the first quote; replace removes any remaining ones.
    _, _, after_first_quote = x.partition('"')
    return after_first_quote.replace('"', "").strip()

In [13]:
def get_finish(f):
    """Convert a placement label into a numeric finishing position.

    Returns ``np.nan`` when ``str(f)`` is ``"nan"``. The labels "winner",
    "runner-up" / "co-runner up" and "2nd runner-up" (case-insensitive) map to
    1, 2 and 3. Any other label is parsed as an ordinal by dropping its last
    two characters (e.g. "10th" -> 10).
    """
    if str(f) == "nan":
        return np.nan
    named_places = {
        "winner": 1,
        "runner-up": 2,
        "co-runner up": 2,
        "2nd runner-up": 3,
    }
    label = f.lower()
    if label in named_places:
        return named_places[label]
    # Ordinal such as "4th" or "21st": strip the two-letter suffix.
    return int(f[:-2])

In [14]:
# Apply the two cleaners column-wise (with a progress bar): placement labels
# become numbers and names are reduced to their alias form.
contestant_table["finish"] = contestant_table["finish"].progress_apply(get_finish)
contestant_table["contestant_name"] = contestant_table["contestant_name"].progress_apply(fix_name)

100%|█████████████████████████████████████████████████████████████████████████████| 785/785 [00:00<00:00, 23830.58it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 785/785 [00:00<00:00, 55960.17it/s]


MANUAL CLEANUP: fix the remaining names by hand so that they match the names used on https://survivor.fandom.com/wiki/Main_Page

In [15]:
# Hand-maintained overrides: key = name as produced by fix_name, value = the
# replacement name. Later cells build survivor.fandom.com URLs from
# contestant_name and match it against fandom category member titles, so the
# values need to agree with the names used on that wiki.
name_fixes = {
    'Jonny Fairplay Dalton':'Jon Dalton',
    'J.P. Palyok': 'John Palyok',
    'Bubba Sampson': 'Travis Sampson',
    'Sarge Masters': 'Lea Masters',
    'Mikey B Bortone': 'Mikey Bortone',
    'J.T. Thomas, Jr.' : 'J.T. Thomas',
    'Papa Bear Caruso' : 'Mark Caruso',
    'Rodney Lavoie, Jr.' : 'Rodney Lavoie',
    'Joe Del Campo':'Joe del Campo',
    'The Wardog DaSilva':'Wardog DaSilva',
    'Amber Brkich':'Amber Mariano',
    'Kim Spradlin':'Kim Spradlin-Wolfe',
    'Candice Woodcock':'Candice Cody',
    'Wendy Jo DeSmidt-Kohlhoff':'Wendy DeSmidt-Kohlhoff',
    'Christine Shields-Markoski':'Christine Shields Markoski',
    'Flicka Smith' : 'Jessica Smith',
    'Mad Dog Hershey' : 'Maralyn Hershey',
    'Taylor Lee Stocker' : 'Taylor Stocker'
}

# The nested dict restricts the replacement to the contestant_name column.
contestant_table = contestant_table.replace({'contestant_name':name_fixes})

Get the gender of each contestant. We loop through the pages of male contestants and store all their names.

In [16]:
# Collect the titles of every member link in the "Male Contestants" category,
# following the pagination's "Next" link until there is none.
#
# The next-page link is recomputed from scratch on every page, so a page with
# no pagination block or no links cannot reuse a stale "Next" from the
# previous page (which would re-request the same URL forever) or hit an
# undefined variable.
url = "https://survivor.fandom.com/wiki/Category:Male_Contestants"
males = []
next_link = url
while next_link:
    response = requests.get(next_link)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Each member link's title attribute is stored as the contestant's name.
    mlinks = soup.findAll(class_="category-page__member-link")
    males.extend(mlink['title'] for mlink in mlinks)

    # A category that fits on one page may have no pagination element at all.
    pages = soup.find(class_="category-page__pagination")
    pagination_links = pages.findAll("a", href=True) if pages is not None else []

    next_link = None
    for link in pagination_links:
        span = link.find('span')
        # Links without a <span> label are neither "Next" nor "Previous".
        if span is not None and span.text == 'Next':
            next_link = link['href']
            break
               


In [17]:
# 'M' for names found in the scraped male list, 'F' for everyone else ...
contestant_table["gender"] = np.where(
    contestant_table.contestant_name.isin(males), 'M', 'F'
)
# ... then a manual override for the non-binary contestant.
contestant_table.loc[contestant_table.contestant_name == 'Evvie Jagoda', 'gender'] = 'N'

In [25]:
def clean_ethnicity_link(link):
    """Turn a category href into a lower-case, underscore-separated key.

    Hyphens become underscores, the text is lower-cased, and as many
    characters as in '/wiki/category:' are cut from the front and as many as
    in '_contestants' from the back.
    """
    prefix_len = len('/wiki/category:')
    suffix_len = len('_contestants')
    normalized = link.replace('-', '_').lower()
    return normalized[prefix_len:len(link) - suffix_len]

# --- Ethnicity flags -------------------------------------------------------
# The "Contestants by Ethnicity" page lists one sub-category per ethnicity.
# Each sub-category page is fetched and the titles of its member links are
# stored under a key derived from the sub-category href.
url = "https://survivor.fandom.com/wiki/Category:Contestants_by_Ethnicity"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
pages = soup.find_all(class_ = "category-page__member-link",href=True) #get ethnicity categories
start_url = "https://survivor.fandom.com"

contestants_by_ethnicity = {}
for page in pages:
    response = requests.get(start_url+page["href"])
    soup = BeautifulSoup(response.text, 'html.parser')
    contestant_links = soup.findAll(class_="category-page__member-link")
    contestants_by_ethnicity[clean_ethnicity_link(page['href'])] = [
        c_link["title"] for c_link in contestant_links
    ]

# One 0/1 column per ethnicity key.
for ethnicity, eth_list in contestants_by_ethnicity.items():
    contestant_table[ethnicity] = contestant_table['contestant_name'].isin(eth_list).astype('int64')

# NOTE: relies on the scrape having produced these three keys.
contestant_table['poc'] = contestant_table['african_american'] | contestant_table['asian_american'] | contestant_table['latin_american']

# --- Religion flags (hand-maintained lists) --------------------------------
# 'R.C. Saint-Amour' previously lacked the space after "R.C.", so it could not
# match a name produced by fix_name from 'Roberta "R.C." Saint-Amour'.
jewish = ['Ethan Zohn','Shawn Cohen','Eliza Orlins','Caryn Groedel','Jonathan Penner','Charlie Herschel'
          ,'Corinne Kaplan','Stephen Fishbach','John Cochran','R.C. Saint-Amour','David Samson','Garrett Adelstein',
          'Max Dawson','Adam Klein','Hannah Shapiro','Zeke Smith','Mike Zahalsky','Jacob Derwin',
          'Julie Rosenberg','Ronnie Bardah','Jason Linden','Sydney Segal','Tiffany Seely','Evvie Jagoda',
          'Liana Wallace','Zach Wurtenberger','Lindsay Dolashewich']
muslim = ['Ibrehem Rahman', 'Natalia Azoqa', 'Naseer Muttalif', 'Omar Zaheer']

# Explicit name -> list mapping instead of eval()-ing the column name on
# every row to look up a global variable.
religion_lists = {'jewish': jewish, 'muslim': muslim}
for religion, members in religion_lists.items():
    contestant_table[religion] = contestant_table['contestant_name'].isin(members).astype('int64')


In [28]:
# Titles of the member links on the "LGBT Contestants" category page.
url = "https://survivor.fandom.com/wiki/Category:LGBT_Contestants"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
pages = soup.findAll(class_ = "category-page__member-link", href=True)
lgbt = [page["title"] for page in pages]

# 1 when the contestant's name is in that list, otherwise 0.
contestant_table["lgbt"] = contestant_table["contestant_name"].isin(lgbt).astype('int64')

In [14]:
# Keep only the first two comma-separated parts of the hometown.
# Each part is stripped before re-joining: splitting on ',' leaves the leading
# space on the second part, so joining with ", " used to produce a double
# space ("City,  State").
contestant_table['hometown'] = contestant_table['hometown'].progress_apply(
    lambda hometown: ", ".join(part.strip() for part in hometown.split(',')[0:2])
)

100%|█████████████████████████████████████████████████████████████████████████████| 785/785 [00:00<00:00, 49795.51it/s]


In [15]:
# Full name -> two-letter code for US states/territories and Canadian
# provinces/territories.
# ORDER MATTERS: get_state_and_country treats the LAST 13 keys of this dict as
# Canadian (it slices list(keys())[-13:]). Keep the 13 Canadian entries at the
# end and add any new US entries above "Ontario".
states_and_provinces = {
"Alabama": "AL",
"Alaska": "AK",
"Arizona": "AZ",
"Arkansas": "AR",
"California": "CA",
"Colorado": "CO",
"Connecticut": "CT",
"Delaware": "DE",
"Florida": "FL",
"Georgia": "GA",
"Hawaii": "HI",
"Idaho": "ID",
"Illinois": "IL",
"Indiana": "IN",
"Iowa": "IA",
"Kansas": "KS",
"Kentucky": "KY",
"Louisiana": "LA",
"Maine": "ME",
"Maryland": "MD",
"Massachusetts": "MA",
"Michigan": "MI",
"Minnesota": "MN",
"Mississippi": "MS",
"Missouri": "MO",
"Montana": "MT",
"Nebraska": "NE",
"Nevada": "NV",
"New Hampshire": "NH",
"New Jersey": "NJ",
"New Mexico": "NM",
"New York": "NY",
"North Carolina": "NC",
"North Dakota": "ND",
"Ohio": "OH",
"Oklahoma": "OK",
"Oregon": "OR",
"Pennsylvania": "PA",
"Rhode Island": "RI",
"South Carolina": "SC",
"South Dakota": "SD",
"Tennessee": "TN",
"Texas": "TX",
"Utah": "UT",
"Vermont": "VT",
"Virginia": "VA",
"Washington": "WA",
"West Virginia": "WV",
"Wisconsin": "WI",
"Wyoming": "WY",
"District of Columbia": "DC",
"American Samoa": "AS",
"Guam": "GU",
"Northern Mariana Islands": "MP",
"Puerto Rico": "PR",
"United States Minor Outlying Islands": "UM",
"U.S. Virgin Islands": "VI",
"Ontario":"ON",
"Quebec":"QC",
"Saskatchewan":"SK",
"Alberta":"AB",
"British Columbia":"BC",
"Manitoba":"MB",
"New Brunswick":"NB",
"Nova Scotia":"NS",
"Newfoundland and Labrador":"NL",
"Prince Edward Island":"PE",
"Nunavut":"NU",
"Northwest Territories":"NT",
"Yukon":"YT"
}

# Inverse lookup: two-letter code -> full name.
states_and_provinces_abbr = dict(map(reversed, states_and_provinces.items()))

In [16]:
def get_state_and_country(x):
    """Derive ``(state, country)`` from the row's ``hometown`` string.

    The state is the text after the last ', ' in the hometown; a two-character
    value is expanded through ``states_and_provinces_abbr`` (an unknown code
    raises ``KeyError``). The country is "CA" when the state is one of the
    last 13 keys of ``states_and_provinces``, otherwise "US".
    """
    last_part = x['hometown'].split(', ')[-1].strip()
    state = states_and_provinces_abbr[last_part] if len(last_part) == 2 else last_part
    # The Canadian provinces/territories are the final 13 dict entries.
    canadian_names = list(states_and_provinces.keys())[-13:]
    country = "CA" if state in canadian_names else "US"
    return state, country

In [17]:
# get_state_and_country returns a (state, country) tuple per row; wrapping it
# in pd.Series makes apply return a two-column frame that fills both columns.
contestant_table[['state','country']] = contestant_table.progress_apply(lambda x : pd.Series(get_state_and_country(x)), axis = 1)

100%|██████████████████████████████████████████████████████████████████████████████| 785/785 [00:00<00:00, 2315.71it/s]


In [48]:
# For every contestant name, rank that name's rows by season number
# (1 = earliest season in the table, 2 = next, ...).
# The column is selected BEFORE ranking so only num_season is ranked, instead
# of ranking every column of the frame (including text columns) and then
# discarding all but one.
contestant_table["num_appearance"] = (
    contestant_table.groupby("contestant_name")["num_season"]
    .rank()
    .to_numpy()
    .astype(np.int32)
)

In [19]:
def get_birthdate(name):
    """Scrape a contestant's birthdate from their survivor.fandom.com page.

    Parameters
    ----------
    name : str
        Contestant name; spaces are replaced by underscores to form the URL.

    Returns
    -------
    A pandas Timestamp parsed from the page's ``bday`` element, or, failing
    that, from the first infobox data item. If neither can be parsed the name
    is printed and an empty string is returned.

    Notes
    -----
    Parsing failures are caught with ``except Exception`` rather than a bare
    ``except`` so that Ctrl-C (KeyboardInterrupt) still stops the scrape.
    """
    url = "https://survivor.fandom.com/wiki/" + name.replace(" ","_")
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Preferred source: the element with class "bday". Text after a '['
    # (e.g. a footnote marker) is discarded before parsing.
    try:
        return pd.to_datetime(soup.find(class_="bday").text.split('[')[0])
    except Exception:
        pass

    # Fallback: the value of the first infobox data item.
    try:
        infobox_item = soup.find(class_="pi-item pi-data pi-item-spacing pi-border-color")
        return pd.to_datetime(infobox_item.find(class_="pi-data-value pi-font").text.split('[')[0])
    except Exception:
        # Report the contestant so the date can be filled in by hand.
        print(name)
    return ""

In [20]:
# One HTTP request per contestant; the recorded run took several minutes.
contestant_table["birthdate"] = contestant_table["contestant_name"].progress_apply(get_birthdate)

100%|████████████████████████████████████████████████████████████████████████████████| 785/785 [06:45<00:00,  1.94it/s]


In [21]:
# Hand-entered per-season data. Every list below is positional: element i
# describes season i + 1, and all lists are combined into season_table, so
# they must all have NUM_SEASONS entries.

# Season titles; also used to build each season's survivor.fandom.com URL.
seasons = ['Survivor: Borneo', 'Survivor: The Australian Outback',
       'Survivor: Africa', 'Survivor: Marquesas', 'Survivor: Thailand',
       'Survivor: The Amazon', 'Survivor: Pearl Islands',
       'Survivor: All-Stars', 'Survivor: Vanuatu', 'Survivor: Palau',
       'Survivor: Guatemala', 'Survivor: Panama',
       'Survivor: Cook Islands', 'Survivor: Fiji', 'Survivor: China',
       'Survivor: Micronesia', 'Survivor: Gabon', 'Survivor: Tocantins',
       'Survivor: Samoa', 'Survivor: Heroes vs. Villains',
       'Survivor: Nicaragua', 'Survivor: Redemption Island',
       'Survivor: South Pacific', 'Survivor: One World',
       'Survivor: Philippines', 'Survivor: Caramoan',
       'Survivor: Blood vs. Water', 'Survivor: Cagayan',
       'Survivor: San Juan del Sur', 'Survivor: Worlds Apart',
       'Survivor: Cambodia', 'Survivor: Kaôh Rōng',
       'Survivor: Millennials vs. Gen X', 'Survivor: Game Changers',
       'Survivor: Heroes vs. Healers vs. Hustlers',
       'Survivor: Ghost Island', 'Survivor: David vs. Goliath',
       'Survivor: Edge of Extinction', 'Survivor: Island of the Idols',
       'Survivor: Winners at War', 'Survivor 41', 'Survivor 42', 'Survivor 43']

# Name of each season's merged tribe.
merged_tribes = ['Rattana', 'Barramundi', 'Moto Maji', 'Soliantu', 'Chuay Jai',
       'Jacaré', 'Balboa', 'Chaboga Mogo', 'Alinta', 'Koror', 'Xhakúm',
       'Gitanos', 'Aitutonga', 'Bula Bula', 'Hae Da Fung', 'Dabu',
       'Nobag', 'Forza', 'Aiga', 'Yin Yang', 'Libertad', 'Murlonio',
       'Te Tuna', 'Tikiano', 'Dangrayne', 'Enil Edam', 'Kasama',
       'Solarrion', 'Huyopa', 'Merica', 'Orkun', 'Dara', 'Vinaka',
       'Maku Maku', 'Solewa', 'Lavita', 'Kalokalo', 'Vata', 'Lumuwaku',
       'Koru', 'Viakana', 'Kula Kula', 'Gaia']

# Merge cutoff: made_merge flags contestants whose finish is <= this value.
num_merge = [10, 10, 10, 10,  8, 10, 10,  9, 10,  9, 10, 10,  9, 10, 10, 10,  9,
       10, 12, 10, 12, 12, 12, 12, 11, 12, 11, 11, 12, 12, 13, 11, 13, 13,
       12, 13, 13, 14, 13, 13, 12, 12, 12]

# Presumably the day on which the merge happened -- stored in season_table
# but not otherwise used in this notebook (TODO confirm meaning).
day_merge = [20, 20, 20, 20, 25, 19, 21, 26, 20, 22, 18, 16, 25, 22, 20, 22, 27,
       19, 19, 25, 19, 19, 19, 17, 17, 20, 19, 17, 16, 17, 17, 17, 21, 19,
       17, 20, 18, 17, 20, 19, 12, 12, 13]

# Jury size, used by made_jury for the upper bound of the jury range.
num_jury = [ 7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  9,  9,  7,  8,  7,
        7,  9,  9,  9,  9,  9,  9,  8,  8,  8,  9,  8,  8, 10,  8, 10, 10,
        8, 10, 10, 13, 10, 16,  8,  8, 8]

# Number of finalists: made_ftc flags contestants whose finish is <= this.
num_ftc = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 2, 3, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

# Number of tribe swaps; the tribe scrape reads num_swaps + 1 tribe cells per
# contestant row.
num_swaps = [0, 0, 1, 1, 0, 1, 0, 2, 1, 0, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 0, 0, 0]



In [22]:
# One row per season; the default 0-based index means row k holds season k + 1.
season_table = pd.DataFrame(dict(
    num_season=list(range(1, NUM_SEASONS + 1)),
    season=seasons,
    merged_tribe=merged_tribes,
    num_merge=num_merge,
    day_merge=day_merge,
    num_jury=num_jury,
    num_ftc=num_ftc,
    num_swaps=num_swaps,
))

In [23]:
# Cast size of each season = number of contestant rows carrying that season number.
season_table['num_contestants'] = season_table['num_season'].apply(
    lambda season_number: len(contestant_table[contestant_table.num_season == season_number])
)

In [24]:
def made_merge(x):
    """Return 1 if the row's ``finish`` is within its season's ``num_merge`` cutoff, else 0.

    The season is looked up in ``season_table`` by the label ``num_season - 1``.
    """
    merge_cutoff = season_table.loc[x.num_season - 1, "num_merge"]
    return 1 if x.finish <= merge_cutoff else 0

def made_ftc(x):
    """Return 1 if the row's ``finish`` is within its season's ``num_ftc`` cutoff, else 0.

    The season is looked up in ``season_table`` by the label ``num_season - 1``.
    """
    finalist_cutoff = season_table.loc[x.num_season - 1, "num_ftc"]
    return 1 if x.finish <= finalist_cutoff else 0

def made_jury(x, seasons_df=None):
    """Return 1 if the row's ``finish`` falls in its season's jury range, else 0.

    The jury range is every placement directly after the finalists:
    ``num_ftc < finish <= num_ftc + num_jury``.

    Parameters
    ----------
    x : row-like
        Must expose ``finish`` and ``num_season`` as attributes.
    seasons_df : DataFrame, optional
        Table with ``num_ftc`` and ``num_jury`` columns whose row label
        ``num_season - 1`` describes the season. Defaults to the module-level
        ``season_table``.

    Notes
    -----
    The upper bound used to be ``num_ftc.any() + num_jury``. ``.any()`` on a
    scalar collapses it to ``True`` (i.e. 1), so the bound was
    ``1 + num_jury`` and the last ``num_ftc - 1`` jurors of every season were
    flagged 0.
    """
    if seasons_df is None:
        seasons_df = season_table
    season_row = x.num_season - 1
    num_finalists = seasons_df.loc[season_row, "num_ftc"]
    num_jurors = seasons_df.loc[season_row, "num_jury"]
    if num_finalists < x.finish <= num_finalists + num_jurors:
        return 1
    return 0

# Derive the three 0/1 stage columns from each row's finish and season.
for flag_column, predicate in (('merge', made_merge), ('jury', made_jury), ('ftc', made_ftc)):
    contestant_table[flag_column] = contestant_table.apply(predicate, axis = 1)

# Manual overrides of the computed jury flag: (contestant, season number, value).
jury_modifications = [('Wendy Diaz', 38, 0), ('Sandra Diaz-Twine', 40, 0), ('Reem Daly', 38, 1), ('Amber Mariano', 40, 1)]

for name, ns, j in jury_modifications:
    is_target = (contestant_table.contestant_name == name) & (contestant_table.num_season == ns)
    contestant_table.loc[is_target, 'jury'] = j

In [25]:
# Scrape "votes against" for every contestant from each season's fandom page.
#
# The season's table is walked from its LAST row upwards. A row counts as a
# contestant row when the text of its last cell parses as an integer; the
# first such row is assigned finish 1, the next finish 2, and so on.
votes_list = []
season_list = []
finish_list = []
for row in tqdm(range(NUM_SEASONS)):
    season_name = season_table.loc[row,"season"]
    num_contestants = season_table.loc[row,"num_contestants"]
    num_season = season_table.loc[row,"num_season"]
    url = "https://survivor.fandom.com/wiki/"+season_name.replace(" ","_")
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Season 31's page is read from its second wikitable instead of the first.
    table_index = 1 if num_season == 31 else 0
    rows = soup.findAll(class_="wikitable")[table_index].tbody.findAll("tr")
    finish = 1
    i = 1
    # Bounded by len(rows): previously, running past the top of the table
    # raised IndexError inside a bare `except`, so the loop never ended when
    # fewer than num_contestants rows could be parsed.
    while finish <= num_contestants and i <= len(rows):
        try:
            votes = int(rows[-i].findAll("td")[-1].text)
        except (IndexError, ValueError):
            # Header/separator row (no cells) or a non-numeric last cell.
            votes = -1
        if votes != -1:
            votes_list.append(votes)
            season_list.append(num_season)
            finish_list.append(finish)
            finish += 1

        i+=1

    if finish <= num_contestants:
        # Make silently missing data visible instead of hanging.
        print(f"{season_name}: parsed {finish - 1} of {num_contestants} contestant rows")

votes_df = pd.DataFrame({"num_season":season_list, "finish":finish_list, "votes_against":votes_list})
contestant_table = pd.merge(contestant_table, votes_df, on=["num_season","finish"])

100%|██████████████████████████████████████████████████████████████████████████████████| 43/43 [00:27<00:00,  1.55it/s]


In [26]:
# NOTE: the .loc labels below are season_table's 0-based row labels, so label
# k addresses the row whose num_season is k + 1.
season_table["redemption_island"] = 0
# Rows 21, 22, 26 -> seasons 22, 23 and 27.
season_table.loc[[21,22,26],"redemption_island"] = 1
season_table["edge_of_extinction"] = 0
# Rows 37, 39 -> seasons 38 and 40.
season_table.loc[[37,39],"edge_of_extinction"] = 1
# Default season length, then the exceptions.
season_table["num_days"] = 39
# Row 1 -> season 2.
season_table.loc[1,"num_days"] = 42
# Rows 40-42 -> seasons 41, 42 and 43.
season_table.loc[[40,41, 42],"num_days"] = 26

In [27]:
# Per-season totals of the 0/1 demographic flags.
# The flag columns are selected BEFORE summing so that the text/date columns
# of contestant_table are never passed to sum() (recent pandas versions raise
# on such columns instead of silently dropping them).
season_demo_cols = ["african_american","asian_american","latin_american","poc","lgbt","jewish","muslim"]
total_demographic_counts = (
    contestant_table.groupby(["num_season"])[season_demo_cols]
    .sum()
    # Align to season_table's seasons; a season with no contestants yields NaN.
    .reindex(pd.Index(season_table.num_season, name="num_season"))
)

# num_season is the index of the totals; merge resolves it by level name.
season_table = pd.merge(season_table, total_demographic_counts, on=['num_season'])

In [28]:
# Boot order = num_contestants - finish + 1 (last place -> 1, winner -> cast size).
# The season sizes are looked up by row label (num_season - 1) and then
# re-indexed 0..n-1, so the subtraction lines up with contestant_table only
# while contestant_table itself has a default 0..n-1 index.
contestant_table['num_boot'] = season_table.loc[contestant_table.num_season - 1,"num_contestants"].reset_index(drop = True) - contestant_table['finish'] + 1

In [30]:
# Read the second wikitable on the fandom "Tribe" page and flatten it into a
# list of tribe names.
url = "https://survivor.fandom.com/wiki/Tribe"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.findAll("table",class_='wikitable')[1]
tribe_data = pd.read_html(str(table))[0]

# All columns except the first, rows up to label NUM_SEASONS - 1; empty cells
# (which stringify to 'nan') are discarded.
tribe_cells = tribe_data[tribe_data.columns[1:]].loc[:NUM_SEASONS - 1,:].to_numpy()
tribe_list = [tribe for tribe in np.ravel(tribe_cells) if str(tribe) != 'nan']

# Four names appended by hand.
tribe_list.extend(["David", "Vuku", "Goliath", "Jabeni"])

In [31]:
# Scrape up to three tribe memberships per contestant from each season page.
#
# Rows are walked from the bottom of the table upwards, exactly like the
# votes scrape: a row whose last cell parses as an integer is a contestant
# row, and contestant rows are numbered finish 1, 2, ... in that order.
# For such a row, cells 2 .. 2 + (number of swaps) are read as tribe names;
# values not present in tribe_list are left as NaN.
tlist = [[],[],[]]
slist = []
flist = []
known_tribes = set(tribe_list)  # hoisted: O(1) membership inside the loop
for row in tqdm(range(NUM_SEASONS)):
    season_name = season_table.loc[row,"season"]
    num_contestants = season_table.loc[row,"num_contestants"]
    num_season = season_table.loc[row,"num_season"]
    # Named season_swaps (not num_swaps) so the module-level num_swaps list
    # that season_table was built from is not overwritten by a scalar.
    season_swaps = season_table.loc[row,"num_swaps"]
    url = "https://survivor.fandom.com/wiki/"+season_name.replace(" ","_")
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Season 31's page is read from its second wikitable instead of the first.
    table_index = 1 if num_season == 31 else 0
    rows = soup.findAll(class_="wikitable")[table_index].tbody.findAll("tr")
    finish = 1
    i = 1
    # Bounded by len(rows) so a page with too few parseable rows cannot make
    # the loop spin forever (IndexError used to be swallowed by a bare except).
    while finish <= num_contestants and i <= len(rows):
        cells = rows[-i].findAll("td")
        try:
            votes = int(cells[-1].text)
        except (IndexError, ValueError):
            # Header/separator row (no cells) or a non-numeric last cell.
            votes = -1
        if votes != -1:
            tribes = [np.nan, np.nan, np.nan]
            for j in range(season_swaps + 1):
                tribe = cells[2+j].text.strip()
                if tribe in known_tribes:
                    tribes[j] = tribe
            tlist[0].append(tribes[0])
            tlist[1].append(tribes[1])
            tlist[2].append(tribes[2])
            slist.append(num_season)
            flist.append(finish)
            finish += 1

        i+=1

    if finish <= num_contestants:
        # Make silently missing data visible instead of hanging.
        print(f"{season_name}: parsed {finish - 1} of {num_contestants} contestant rows")

100%|██████████████████████████████████████████████████████████████████████████████████| 43/43 [00:26<00:00,  1.63it/s]


In [32]:
# Attach the scraped tribe columns to each contestant via (season, finish).
tribe_contestant_df = pd.DataFrame(dict(
    num_season=slist,
    finish=flist,
    tribe1=tlist[0],
    tribe2=tlist[1],
    tribe3=tlist[2],
))
contestant_table = pd.merge(
    contestant_table,
    tribe_contestant_df,
    how='left',
    on=['num_season','finish'],
)

In [33]:
def _tribe_iteration_table(tribe_col, iter_num):
    """Build one row per distinct (num_season, tribe) pair found in ``tribe_col``.

    Each row is tagged with ``iter_num`` and given the number of contestants
    of that season whose ``tribe_col`` equals the tribe.
    """
    pairs = contestant_table[["num_season", tribe_col]].drop_duplicates().to_numpy()
    table = pd.DataFrame(pairs, columns=["num_season", "tribe"])
    table["iter_num"] = iter_num
    table["num_contestants"] = table.apply(
        lambda x: len(contestant_table[(contestant_table.num_season == x.num_season)
                                       & (contestant_table[tribe_col] == x.tribe)]),
        axis=1,
    )
    return table


# Rows with a missing tribe are dropped for the 2nd and 3rd tribe columns
# only; the first table keeps them at this point.
tribe_table1 = _tribe_iteration_table("tribe1", 1)
tribe_table2 = _tribe_iteration_table("tribe2", 2).dropna()
tribe_table3 = _tribe_iteration_table("tribe3", 3).dropna()

tribe_table = (
    pd.concat([tribe_table1, tribe_table2, tribe_table3], ignore_index=True)
    .sort_values(["num_season", "iter_num"])
)
tribe_table['merge'] = 0

In [34]:
# Long form of contestant_table: one row per (contestant, tribe slot) that has
# a tribe. iter_num records which of tribe1/tribe2/tribe3 the row came from.
tribe_slots = ['tribe1', 'tribe2', 'tribe3']
expanded_contestant_table = pd.concat([
    contestant_table
    .drop([other for other in tribe_slots if other != slot], axis = 1)
    .rename(columns={slot: 'tribe'})
    .assign(iter_num = slot_number)
    for slot_number, slot in enumerate(tribe_slots, start=1)
]).dropna(subset = ['tribe'])

# Demographic totals per (season, tribe, slot).
# Fixes: this used to reference the undefined name `exp_contestant_table`
# (NameError on a fresh kernel). The flag columns are also selected before
# sum() so text/date columns are never summed.
tribe_demo_cols = ['african_american','asian_american','latin_american','poc','jewish','muslim','lgbt']
tribe_demos = (
    expanded_contestant_table
    .groupby(['num_season','tribe','iter_num'])[tribe_demo_cols]
    .sum()
    .reset_index()
)
tribe_table = pd.merge(tribe_table, tribe_demos, how='left',
                      on=['num_season','tribe','iter_num'])


In [36]:
# Head-count per gender code for every (season, tribe, slot) row of tribe_table.
for gender_column, gender_code in (('male', 'M'), ('female', 'F'), ('non_binary', 'N')):
    tribe_table[gender_column] = tribe_table.apply(
        lambda x, code=gender_code: len(
            expanded_contestant_table[(expanded_contestant_table.num_season == x.num_season)
                                      & (expanded_contestant_table.tribe == x.tribe)
                                      & (expanded_contestant_table.iter_num == x.iter_num)
                                      & (expanded_contestant_table.gender == code)]
        ),
        axis = 1,
    )

In [37]:
# One row per season for its merged tribe (iter_num 0, merge flag 1).
# NOTE: .unique() on merged_tribe assumes no two seasons share a merged-tribe
# name; a duplicate would make the column lengths disagree.
merged_tribe_table = pd.DataFrame({
    "num_season":season_table.num_season.unique(),
    "tribe": season_table.merged_tribe.unique(),
    "iter_num": np.zeros(len(season_table.index)),
    "merge": np.ones(len(season_table.index)),
    "num_contestants": season_table.num_merge
})

# Demographic totals of the contestants flagged merge == 1, per season.
# The flag columns are selected before sum() so text/date columns of
# contestant_table are never summed (recent pandas versions raise on them).
merge_demo_cols = ['african_american','asian_american','latin_american','poc','jewish','muslim','lgbt']
merge_demos = (
    contestant_table[contestant_table['merge'] == 1]
    .groupby('num_season')[merge_demo_cols]
    .sum()
    .reset_index()
)
merged_tribe_table = pd.merge(merged_tribe_table, merge_demos, how='left',
                              on=['num_season'])

In [38]:
# Head-count per gender code among each season's merge == 1 contestants.
for gender_column, gender_code in (('male', 'M'), ('female', 'F'), ('non_binary', 'N')):
    merged_tribe_table[gender_column] = merged_tribe_table.apply(
        lambda x, code=gender_code: len(
            contestant_table[(contestant_table.num_season == x.num_season)
                             & (contestant_table['merge'] == 1)
                             & (contestant_table.gender == code)]
        ),
        axis = 1,
    )

In [39]:
# Append the merged-tribe rows, then order by season, pre-merge rows first,
# and by tribe slot within those.
tribe_table = (
    pd.concat([tribe_table, merged_tribe_table], ignore_index = True)
    .sort_values(by=["num_season","merge","iter_num"], ascending=[True,True,True])
    .reset_index(drop=True)
)

In [40]:
def get_tribe_color(x):
    """Return the colour label for the row's ``tribe``.

    Four tribes are hard-coded. Any other tribe is located in ``tribe_data``:
    the first column containing it is found and element [1] of that column's
    label is returned (presumably the colour level of a two-level header --
    confirm against the scraped table).
    """
    tribe = x['tribe']
    if tribe in ('David', 'Vuku'):
        return 'Blue/Teal'
    if tribe in ('Goliath', 'Jabeni'):
        return 'Purple'
    columns_with_tribe = tribe_data.isin([tribe]).any()
    first_match = np.where(columns_with_tribe)[0][0]
    return tribe_data.columns[first_match][1]


In [41]:
# Look up every tribe's colour (the function is passed directly; no wrapper lambda needed).
tribe_table['color'] = tribe_table.progress_apply(get_tribe_color, axis = 1)

100%|███████████████████████████████████████████████████████████████████████████████| 225/225 [00:00<00:00, 618.82it/s]


In [46]:
# Discard rows that have no tribe name.
tribe_table = tribe_table[tribe_table['tribe'].notna()]

In [42]:
def get_quit_and_evac(row):
    """Scrape whether a contestant quit or was evacuated in a given appearance.

    Parameters
    ----------
    row : mapping
        Must provide ``contestant_name`` (used to build the fandom URL) and
        ``num_appearance`` (1-based; selects the n-th wikitable on the page).

    Returns
    -------
    (quit, evac) : tuple of int
        Each is 1 when the text before the first comma in the table's last or
        third-from-last cell equals 'Quit' / 'Evacuated', otherwise 0.
        Both are 0 when the table or its cells cannot be read.
    """
    name = row["contestant_name"]
    num_appearance = row["num_appearance"]
    url = "https://survivor.fandom.com/wiki/"+"_".join(name.split(" "))

    # The session is used as a context manager so its connection pool is
    # released after every call (one session is created per contestant).
    with requests.Session() as session:
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        response = session.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        response.close()

    tables = soup.findAll(class_='wikitable')
    # Local names avoid shadowing the built-in quit().
    quit_flag, evac_flag = 0, 0
    try:
        cells = tables[num_appearance-1].findAll("td")
        status1 = cells[-1].text.strip().split(",")[0]
        status2 = cells[-3].text.strip().split(",")[0]
        if status1 == 'Quit' or status2 == 'Quit':
            quit_flag = 1
        if status1 == 'Evacuated' or status2 == 'Evacuated':
            evac_flag = 1
    except Exception:
        # Best effort: a missing/short table means "no quit, no evac".
        # `Exception` (not a bare except) keeps Ctrl-C working during the scrape.
        pass

    return quit_flag, evac_flag

In [43]:
# One HTTP request per contestant. get_quit_and_evac returns a (quit, evac)
# tuple; pd.Series turns it into two columns.
contestant_table[["quit","evac"]] = contestant_table.progress_apply(lambda x: pd.Series(get_quit_and_evac(x)),axis=1)

100%|████████████████████████████████████████████████████████████████████████████████| 785/785 [05:45<00:00,  2.27it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 785/785 [06:27<00:00,  2.03it/s]


In [55]:
# Quits and evacuations per season. The grouped totals are re-indexed 0..n-1
# so they are assigned to season_table by position.
exit_totals = contestant_table[["num_season","quit","evac"]].groupby("num_season").sum()
season_table["num_quits"] = exit_totals["quit"].reset_index(drop=True)
season_table["num_evacs"] = exit_totals["evac"].reset_index(drop=True)

# Manual flags.
contestant_table["ejected"] = 0
contestant_table.loc[contestant_table.contestant_name=="Dan Spilo","ejected"] = 1
is_jack_s39 = (contestant_table.contestant_name == 'Jack Nichting') & (contestant_table.num_season == 39)
contestant_table.loc[is_jack_s39, 'jury'] = 1

In [44]:
def get_jury_votes(row):
    """Scrape the number of jury votes a finalist received.

    Parameters
    ----------
    row : mapping
        Must provide ``contestant_name``, ``num_appearance`` (1-based; selects
        the n-th wikitable on the contestant's page) and ``ftc``.

    Returns
    -------
    ``np.nan`` when ``ftc`` is 0 (no request is made). Otherwise the number of
    comma-separated entries in the table's second-to-last cell, or 0 when
    that cell is just '-'.
    """
    name = row["contestant_name"]
    num_appearance = row["num_appearance"]
    ftc = int(row["ftc"])
    if ftc == 0:
        return np.nan

    url = "https://survivor.fandom.com/wiki/"+"_".join(name.split(" "))
    # Context manager: the per-call session's connection pool is released
    # when the block exits instead of lingering until garbage collection.
    with requests.Session() as session:
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        response = session.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        response.close()

    tables = soup.findAll(class_='wikitable')
    table = tables[int(num_appearance) - 1]
    votes = table.findAll("td")[-2].text.strip().split(',')
    if votes == ['-']:
        return 0
    return len(votes)

In [45]:
# 'fmc' flag (presumably "fire-making challenge" -- confirm): set for every
# 4th-place finisher from season 35 onwards, plus the hand-listed
# (contestant, season number) pairs below.
contestant_table['fmc'] = 0
fourth_from_s35 = (contestant_table.num_season >= 35) & (contestant_table.finish == 4)
contestant_table.loc[fourth_from_s35, 'fmc'] = 1

fmcs = [('Bobby Jon Drinkard', 10), ('Jennifer Lyon', 10), ('Cirie Fields', 12), ('Becky Lee', 13), ('Matty Whitmore', 17),
       ('Rodney Lavoie', 30), ('Cydney Gillon', 32)]
for name, ns in fmcs:
    listed_row = (contestant_table.contestant_name == name) & (contestant_table.num_season == ns)
    contestant_table.loc[listed_row, 'fmc'] = 1

In [46]:
# get_jury_votes returns NaN without a request for rows with ftc == 0, so only
# finalists trigger an HTTP fetch here.
contestant_table["num_jury_votes"] = contestant_table.progress_apply(lambda x:get_jury_votes(x), axis = 1)

100%|████████████████████████████████████████████████████████████████████████████████| 785/785 [01:04<00:00, 12.13it/s]


In [47]:
# normalized_finish = 1 - finish / num_contestants of the same season:
# last place (finish == num_contestants) gets 0, the winner gets 1 - 1/n.
contestant_table['normalized_finish'] = contestant_table.apply(lambda x : 1 - x.finish / season_table.loc[season_table.num_season == x.num_season, 'num_contestants'].iloc[0], axis = 1)

In [48]:
# Names of all finish == 1 rows, re-indexed 0..k-1 and assigned by position.
# NOTE(review): this lines up with season_table only if contestant_table is
# ordered by season and each season has exactly one finish == 1 row -- confirm.
season_table['winner'] = contestant_table.loc[contestant_table.finish == 1, 'contestant_name'].reset_index(drop = True)

In [57]:
# Write the three tables as CSV in the working directory (row index omitted).
contestant_table.to_csv("contestant_table.csv", index=False)
season_table.to_csv("season_table.csv", index=False)
tribe_table.to_csv("tribe_table.csv", index=False)

In [58]:
# Also persist the same three tables as pickle files.
contestant_table.to_pickle('contestant_table.pkl')
season_table.to_pickle('season_table.pkl')
tribe_table.to_pickle('tribe_table.pkl')

In [68]:
# Reload the tables from the CSV files written above. This REPLACES the
# in-memory frames; column dtypes are re-inferred by read_csv from the text.
contestant_table = pd.read_csv('contestant_table.csv')
season_table = pd.read_csv('season_table.csv')
tribe_table = pd.read_csv('tribe_table.csv')