In [4]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import html5lib
import glob

### Writing and Reading Data

The following two functions save scraped data to disk and read it back in. Both function names are prefixed with the word "raw", which simply denotes that no preprocessing of the data is done: the page of interest is scraped and stored as-is. Both of these functions can also be found in the file `/data_grabber.py`.

In [170]:
# write scraped file 
# ------------------
def rawWriter(url,i, show = "bachelorette",
              out_dir = "/Users/jamesbain/Desktop/bachelor/data/raw",
              timeout = 30):
    """Download `url` and save the raw response body as `<out_dir>/<show><i>.html`.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    i : int or str
        Suffix appended to `show` to build the file name.
    show : str
        File-name prefix.
    out_dir : str
        Directory the file is written to (must already exist).
    timeout : float
        Seconds to wait for the server before giving up.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)

    # Without this check an error page (404, 500, ...) would be written to
    # disk and later parsed as if it were real data.
    response.raise_for_status()

    path = "{}/{}{}.html".format(out_dir, show, i)
    with open(path, "wb") as f:
        f.write(response.content)

        
# read file in
# -------------
def rawReader(file):
    """Parse a previously saved page and return it as a BeautifulSoup object.

    `file` is the path of the saved page; it is opened in binary mode and
    parsed with the "lxml" parser.
    """
    with open(file, "rb") as handle:
        return BeautifulSoup(handle, "lxml")

### String Processing

Strings pulled from the interwebs are rather messy. Below are a few helper functions that handle the problems I run into most often when cleaning them up. The first two extract the text found between two marker strings (the first searches from the left, the second from the right), and the third removes a given prefix from a string.

In [11]:
def findBetween( s, first, last ):
    """Return the part of `s` between the first `first` and the next `last`.

    The search for `last` starts right after the end of `first`.
    An empty string is returned when either marker cannot be found.
    """
    try:
        left = s.index(first) + len(first)
        right = s.index(last, left)
    except ValueError:
        # one of the two markers is missing
        return ""
    return s[left:right]

def findBetweenR( s, first, last ):
    """Return the part of `s` between the last `first` and the last `last`.

    `last` is searched for from the right, but only in the text that follows
    the final occurrence of `first`. An empty string is returned when either
    marker cannot be found.
    """
    try:
        left = s.rindex(first) + len(first)
        right = s.rindex(last, left)
    except ValueError:
        # one of the two markers is missing
        return ""
    return s[left:right]
    
def removePrefix(text, prefix):
    """Return `text` without a leading `prefix`; `text` is returned unchanged
    when it does not start with `prefix`."""
    if not text.startswith(prefix):
        return text
    return text[len(prefix):]

### Preprocessing       

In [13]:
def soupCleanser(soup, table_index=2):
    """Turn the call-out-order table of a parsed page into a DataFrame.

    The table's first two rows are treated as header rows. The `<td>` cells
    of every following row form the body: column 0 holds the contestant
    names, and each later column lists the names in that column's call-out
    order.

    Parameters
    ----------
    soup : parsed page offering `find_all` (e.g. a BeautifulSoup object)
    table_index : int
        Position of the table of interest among the page's `<table>` tags.

    Returns
    -------
    pandas.DataFrame
        One row per contestant. The "contestants" column holds the names.
        Every other column is labelled with a `<th>` text from the table's
        second row and holds the contestant's 1-based position in that
        column, or the string 'eliminated' when the name does not appear.

    Raises
    ------
    IndexError
        If the page has no table at `table_index`, or the table has no body
        rows / no body cells.
    """
    import re

    footnote = re.compile(r'\[\w+\]')  # footnote markers such as "[a]" or "[12]"

    table = soup.find_all('table')[table_index]
    rows = table.find_all('tr')

    # body cells, with footnote markers stripped
    body = [[footnote.sub('', td.text) for td in row.select('td')]
            for row in rows[2:]]

    # Rows may be shorter than the first one; pad them so that every column
    # can be read from every row.
    width = len(body[0])
    for row in body:
        row.extend([" "] * (width - len(row)))

    contestants = [row[0] for row in body]

    # create lists of call out
    positions = list(np.arange(1, len(body) + 1))
    columns = [contestants]
    # NOTE(review): the last body column is never converted (behaviour kept
    # from the original implementation) -- confirm that this is intended.
    for week in range(1, width - 1):
        order = dict(zip((row[week] for row in body), positions))
        columns.append([order.get(name, 'eliminated') for name in contestants])

    # find headers for data frame creation
    week_labels = [th.text for th in rows[1].select('th')]
    header = [footnote.sub('', label) for label in ["contestants"] + week_labels]

    # zip stops at the shorter of the two, so surplus labels are dropped
    return pd.DataFrame(dict(zip(header, columns)))

In [178]:
# NOTE(review): `soup` is not assigned by any cell above this one; the loading
# loop further down (`soup = rawReader(i)`) is what defines it, so on a fresh
# kernel that loop has to run before this cell.
soupCleanser(soup)

Unnamed: 0,1,2,3,4,5,6,7,contestants
0,8,7,4,5,3,4,eliminated,Sheena
1,1,9,6,1,2,2,1,Jenni
2,18,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,Kim
3,13,2,11,eliminated,eliminated,eliminated,eliminated,Sarah
4,3,4,9,2,4,3,3,Bettina
5,16,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,Jessica
6,21,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,Morgan
7,23,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,Regina
8,10,13,eliminated,eliminated,eliminated,eliminated,eliminated,Erin
9,25,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,Tauni


In [14]:
# Parse every saved page in the raw-data folder into its own DataFrame.
# glob returns matches in a filesystem-dependent order, so the paths are
# sorted to keep the order of `frames` the same from run to run.
files = sorted(glob.glob("/Users/jamesbain/Desktop/bachelor/data/raw/*"))
frames = []
for i in files:
    soup = rawReader(i)
    frame = soupCleanser(soup)
    frames.append(frame)

# old code

In [148]:
#soup = makeSoup('https://en.wikipedia.org/wiki/The_Bachelorette_(season_7)')

In [2]:
#def makeSoup(url):
#    response=requests.get(url)
#    soup=BeautifulSoup(response.content,"lxml")
#    return soup

In [69]:
#def dataCollector(soup):
#    import re
#    import numpy as np
#    # make the soup
#    this = makeSoup(soup)
#    
#    # find the right table
#    tables = this.findChildren('table')
#    table = tables[2]
#    
#    
#    # find the descriptive colors in hex format
#    descript_colors_cols = this.findChildren('dd')
#    styles = [color.span['style'] for color in descript_colors_cols]
#    colors = [findBetween(color,"background-color:",";") for color in styles]
#    
#    
#    messy_color_codes = [dd.text for dd in descript_colors_cols]
#    color_codes = [removePrefix(code,"\xa0\xa0\xa0\xa0 ") for code in messy_color_codes]
#    
#    color_dict = dict(zip(colors, color_codes))
#
#    # turn the data into a workable format
#    data   = [[td.text for td in row.select('td')]
#             for row in table.findAll('tr')]
#    
#    body = data[1:][1:]
#    cols   =  zip(*body)
#    
#    body_n = []
#
#    for i in body:
#        if len(i) == len(body[0]):
#            body_n.append(i)
#        else:
#            a = [" "]
#            a.extend(" " * (len(body[0])))
#            diff = len(a) - len(i) 
#            n = i.extend( a[0:diff])
#            body_n.append(n)
#                
#    # create a dict with the data
#    #tbl_d  = {name:col for name, col in zip(header,cols)}
#    contestants = [row[0] for row in body]
#    
#    #c2 = [row[2] for row in body]
#    #ci2 = list(np.arange(1,len(c2) + 1))
#    #c2dict = dict(zip(c2, ci2))
#    
#    #yup = [c2dict[x] for x in contestants]
#    #yup = [c2dict.get(k, 'eliminated') for k in contestants] # default to eliminated if name doesn't exist
#    
#    elimination_list = []
#    for i in list(np.arange(0,(len(body[0])-1))):
#        c = [row[i] for row in body]
#        ci = list(np.arange(1,len(c) + 1))
#        cdict = dict(zip(c, ci))
#        elim_col = [cdict.get(k, 'eliminated') for k in contestants]
#        elimination_list.append(elim_col)
#          
#    return elimination_list

In [70]:
#body = dataCollector('https://en.wikipedia.org/wiki/The_Bachelorette_(season_4)')