In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import html5lib

To start this whole thing out, I need to collect the urls from wikipedia that have contestant data. I do this for both the Bachelor and Bachelorette.

In [2]:
import numpy as np
bachelor_seasons = np.arange(9,21)

bachelor_urls = []
for season in bachelor_seasons:
    season_url = 'https://en.wikipedia.org/wiki/The_Bachelor_(season_{})'.format(season)
    bachelor_urls.append(season_url)

In [3]:
import numpy as np
bachelorette_seasons = np.arange(4,13)
bachelorette_seasons = np.insert(bachelorette_seasons,0,2)

bachelorette_urls = []
for season in bachelorette_seasons:
    season_url = 'https://en.wikipedia.org/wiki/The_Bachelorette_(season_{})'.format(season)
    bachelorette_urls.append(season_url)

Let's create a function that takes in a url and then creates a soup from the html. This will come in handy when we are collecting the data.

In [4]:
def makeSoup(url):
    response=requests.get(url)
    soup=BeautifulSoup(response.content,"lxml")
    return soup

In [111]:
def dataCollector(soup):
    import re
    # make the soup
    this = makeSoup(soup)
    
    # find the right table
    tables = this.findChildren('table')
    table = tables[1]
    
    # turn the data into a workable format
    data   = [[td.text for td in row.select('td')]
             for row in table.findAll('tr')]
    
    # create the header row and the body
    header = ['name','age', 'hometown', 'occupation', 'elimination']
    body = data[1:]
    cols   =  zip(*body)
    
    # create a dict with the data
    tbl_d  = {name:col for name, col in zip(header,cols)}
    
    # extract the season number from the original soup
    number = re.findall(r'\d+', soup)
    
    # remove brackets
    num= ''.join(number)
    
    # create a new key of seasons
    tbl_d['season'] = [num] * len(tbl_d['age'])
    
    new_names = []
    for name in tbl_d['name']:
        cleaned_name = re.sub(r'\[\w+\]', ' ', name)
        new_names.append(cleaned_name)

    tbl_d['name'] = new_names
    
    # find the first name with last name abbreviation
    name_abbreviation = []
    
    for name in tbl_d['name']:
        names = name.split(" ")

        new_names = []
        for name in names:
            cleaned_name = re.sub(r'\(\w+\)', ' ', name)
            new_names.append(cleaned_name)
        filtered = filter(lambda items: items.strip(), new_names) # remove blank space items 
        new_names = list(filtered)

        if len(new_names)== 1:
            new_name = new_names[0]
        else:
            new_name = "{} {}.".format(new_names[0], new_names[-1][0])
        name_abbreviation.append(new_name)
        
    tbl_d['name_abbreviation'] = name_abbreviation
    
    new_hometowns = []
    for hometown in tbl_d['hometown']:
        cleaned_hometown = re.sub(r'\[\w+\]', ' ', hometown)
        new_hometowns.append(cleaned_hometown)

    tbl_d['hometown'] = new_hometowns
    
    elimination_week = []

    for item in tbl_d['elimination']:
        if hasNumbers(item):
            digits = get_digits(item)
            episode = digits[0]
        else:
            episode = item
        elimination_week.append(episode)
    tbl_d['elimination'] = elimination_week
    
    # return dictionary
    return tbl_d

In [112]:
def frameMaker(urls):
    frames = []

    for url in urls:
        dictionary = dataCollector(url)
        frame = pd.DataFrame(dictionary)
        frames.append(frame)

    combined = pd.concat(frames)
    
    return combined.reset_index()

In [116]:
bachelorette_frame = frameMaker(bachelorette_urls)

In [113]:
bachelor_frame = frameMaker(bachelor_urls)

In [117]:
bachelorette_frame.tail(50)

Unnamed: 0,index,age,elimination,hometown,name,name_abbreviation,occupation,season
207,2,26,8.0,"Warsaw, Indiana","Benjamin ""Ben"" Higgins",Benjamin H.,Software Salesman,11
208,3,26,7.0,"Warwick, Rhode Island",Jared Haibon,Jared H.,Restaurant Manager,11
209,4,28,7.0,"Glasgow, Kentucky","Joseph ""Joe"" Bailey",Joseph B.,Insurance Agent,11
210,5,28,7.0,"Granite Bay, California","Christopher ""Chris"" Strandburg",Christopher S.,Dentist,11
211,6,26,6.0,"Falls Church, Virginia","Benjamin ""Ben"" Zorn",Benjamin Z.,Fitness Coach,11
212,7,28,6.0,"Stilwell, Kansas",Tanner Tolbert,Tanner T.,Auto Finance Manager,11
213,8,32,6.0,"Dacono, Colorado","John ""JJ"" Lane III",John I.,Former Investment Banker,11
214,9,31,5.0,"Ventura, California",Joshua Albers,Joshua A.,Industrial Welder,11
215,10,28,5.0,"Elgin, Illinois",Justin Reich,Justin R.,Fitness Trainer,11
216,11,28,5.0,"Ramsey, New Jersey",Ian Thomson,Ian T.,Executive Recruiter,11


In [84]:
def get_digits(str1):
    c = ""
    for i in str1:
        if i.isdigit():
            c += i
    return c

In [89]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

In [93]:
elimination_week = []

for item in bachelorette_frame['elimination']:
    if hasNumbers(item):
        season = get_digits(item)
    else:
        season = item
    elimination_week.append(season)
elimination_week

['Winner',
 '8',
 '6',
 '5',
 '4',
 '4',
 '4',
 '3',
 '3',
 '3',
 '2',
 '2',
 '2',
 '2',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 'Winner',
 '8',
 '7',
 '6',
 '5',
 '5',
 '4',
 '4',
 '4',
 '3',
 '3',
 '3',
 '2',
 '2',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '75',
 'Runner-up',
 '9',
 '8',
 '7',
 '7',
 '6',
 '6',
 '6',
 '5',
 '4',
 '4',
 '4',
 '3',
 '3',
 '3',
 '2',
 '2',
 '2',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 'Winner',
 'Runner-up',
 '9',
 '8',
 '7',
 '6',
 '6',
 '5',
 '5',
 '4',
 '4',
 '3',
 '3',
 '3',
 '2',
 '2',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 'Winner',
 'Runner-up',
 '9',
 '8',
 '7',
 '7',
 '6',
 '6',
 '5',
 '5',
 '5',
 '4',
 '3',
 '3',
 '3',
 '2',
 '2',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 'Winner',
 'Runner-up',
 '9',
 '8',
 '7',
 '7',
 '6',
 '6',
 '5',
 '5',
 '4',
 '4',
 '4',
 '3',
 '3',
 '3',
 '2',
 '2',
 '2',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 'Winner',
 

In [43]:
bachelorette_frame.to_csv('data/bachelorette_contestants.csv', index = False)

In [44]:
bachelor_frame.to_csv('data/bachelor_contestants.csv', index = False)