In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import html5lib
import glob

from data_grabber import RawProcessor

### Writing and Reading Data

The following two functions are a way to save scraped data and then read it back in. Both functions have the are prefixed with the word "raw". This simply denotes that no preprocessing of the data is done, just a pure scraping of the webpage of interest.  Both of these functions can be found in the file `/data_grabber.py`.

In [3]:
# write scraped file 
# ------------------
def rawWriter(url,i, show = "bachelorette"):
    
    name = requests.get(url)

    with open("/Users/jamesbain/Desktop/bachelor/data/raw/{}{}.html".format(show,i), "wb") as f:
        f.write(name.content)

        
# read file in
# -------------
def rawReader(file):
    with open(file, "rb") as f:
        soup = BeautifulSoup(f, "lxml")
    return soup

### String Processing

Strings pulled from the interwebs are rather messy. Below are a few functions that clean up some of the common problems that I have when cleaning up strings. The first two are variants of removing strings around another string and the third removes the prefix.

In [4]:
def findBetween( s, first, last ):
    try:
        start = s.index( first ) + len( first )
        end = s.index( last, start )
        return s[start:end]
    except ValueError:
        return ""

def findBetweenR( s, first, last ):
    try:
        start = s.rindex( first ) + len( first )
        end = s.rindex( last, start )
        return s[start:end]
    except ValueError:
        return ""
    
def removePrefix(text, prefix):
    return text[len(prefix):] if text.startswith(prefix) else text

### Preprocessing       

In [104]:
def soupCleanser(soup):
    import re
    import numpy as np
    import pandas as pd
    # make the soup
    
    # find the right table
    tables = soup.findChildren('table')
    table = tables[2]
    
    season = soup.findAll('h1')
    season_cleaned = findBetween(season[0].text, "The Bachelor (", ")")
    
    
    # find the descriptive colors in hex format
    descript_colors_cols = soup.findChildren('dd')
    styles = [color.span['style'] for color in descript_colors_cols]
    colors = [findBetween(color,"background-color:",";") for color in styles]
    
    
    messy_color_codes = [dd.text for dd in descript_colors_cols]
    color_codes = [removePrefix(code,"\xa0\xa0\xa0\xa0 ") for code in messy_color_codes]
    
    color_dict = dict(zip(colors, color_codes))

    # turn the data into a workable format
    data   = [[td.text for td in row.select('td')]
             for row in table.findAll('tr')]
    
    body = data[1:][1:]
    cols   =  zip(*body)
    
    body = [[re.sub(r'\[\w+\]', '', i) for i in row]
        for row in body]
    
    for i in body:
        if len(i) == len(body[0]):
            pass
        else:
            a = [" "]
            a.extend(" " * (len(body[0])))
            diff = len(a) - len(i) 
            n = i.extend( a[0:diff])
                
    contestants = [row[0] for row in body]
    
    # create lists of call out
    call_out_order_list = []
    for i in list(np.arange(0,(len(body[0])-1))):
        c = [row[i] for row in body] # column
        ci = list(np.arange(1,len(c) + 1)) # column index/counter
        cdict = dict(zip(c, ci)) # creaet dictionary
        col_out_col = [cdict.get(k, 'eliminated') for k in contestants] # call out order
        call_out_order_list.append(col_out_col)
    
    call_out_order_list[0] = contestants # replace first row with contestant names
    
    season_col = []
    for i in list(np.arange(0,len(contestants))):
        season_col.append(season_cleaned)
    
    
    
    # find headers for data frame creation
    d = [[td.text for td in row.select('th')]
        for row in table.findAll('tr')]
    
    season_headers = d[1]
    header = ["contestants"] + season_headers 
    
    
    header = [re.sub(r'\[\w+\]', '', i) for i in header]
    
    #call_out_order_list.append(season_col)
    #header.append("season")
    
    vov = dict(zip(header, call_out_order_list))
    # i need to debug this. missing last column
    return vov

In [105]:
soup = rawReader("/Users/jamesbain/Desktop/bachelor/data/raw/bachelor10.html")
soupCleanser(soup)

{'1': [5,
  10,
  12,
  2,
  18,
  7,
  16,
  19,
  20,
  15,
  14,
  13,
  11,
  1,
  6,
  3,
  9,
  17,
  21,
  24,
  4,
  23,
  25,
  22,
  8],
 '2': [13,
  8,
  11,
  10,
  'eliminated',
  5,
  'eliminated',
  'eliminated',
  'eliminated',
  6,
  9,
  12,
  14,
  1,
  3,
  4,
  2,
  'eliminated',
  'eliminated',
  'eliminated',
  7,
  'eliminated',
  'eliminated',
  'eliminated',
  15],
 '3': ['eliminated',
  8,
  10,
  12,
  'eliminated',
  3,
  'eliminated',
  'eliminated',
  'eliminated',
  5,
  6,
  11,
  'eliminated',
  9,
  4,
  1,
  2,
  'eliminated',
  'eliminated',
  'eliminated',
  7,
  'eliminated',
  'eliminated',
  'eliminated',
  'eliminated'],
 '4': ['eliminated',
  8,
  'eliminated',
  'eliminated',
  'eliminated',
  1,
  'eliminated',
  'eliminated',
  'eliminated',
  6,
  5,
  'eliminated',
  'eliminated',
  9,
  3,
  4,
  2,
  'eliminated',
  'eliminated',
  'eliminated',
  7,
  'eliminated',
  'eliminated',
  'eliminated',
  'eliminated'],
 '5': ['eliminated',
 

In [14]:
files = glob.glob("/Users/jamesbain/Desktop/bachelor/data/raw/*")
frames = []
for i in files:
    soup = rawReader(i)
    frame = soupCleanser(soup)
    frames.append(frame)

In [19]:
frames[12]

Unnamed: 0,1,2,3,4,5,6,7,8,9,contestants
0,2,6,1,2,5,6,2,1,eliminated,Lauren B.
1,4,2,10,8,4,1,4,2,3,Caila
2,7,11,12,9,10,eliminated,eliminated,eliminated,eliminated,Jennifer
3,6,13,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,Jami
4,13,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,Samantha
5,8,5,3,6,11,eliminated,eliminated,eliminated,eliminated,Jubilee
6,9,4,5,4,1,2,1,4,eliminated,Amanda
7,21,10,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,Lace
8,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,Lauren R.
9,eliminated,15,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,eliminated,Shushanna


# old code

In [148]:
#soup = makeSoup('https://en.wikipedia.org/wiki/The_Bachelorette_(season_7)')

In [2]:
#def makeSoup(url):
#    response=requests.get(url)
#    soup=BeautifulSoup(response.content,"lxml")
#    return soup

In [69]:
#def dataCollector(soup):
#    import re
#    import numpy as np
#    # make the soup
#    this = makeSoup(soup)
#    
#    # find the right table
#    tables = this.findChildren('table')
#    table = tables[2]
#    
#    
#    # find the descriptive colors in hex format
#    descript_colors_cols = this.findChildren('dd')
#    styles = [color.span['style'] for color in descript_colors_cols]
#    colors = [findBetween(color,"background-color:",";") for color in styles]
#    
#    
#    messy_color_codes = [dd.text for dd in descript_colors_cols]
#    color_codes = [removePrefix(code,"\xa0\xa0\xa0\xa0 ") for code in messy_color_codes]
#    
#    color_dict = dict(zip(colors, color_codes))
#
#    # turn the data into a workable format
#    data   = [[td.text for td in row.select('td')]
#             for row in table.findAll('tr')]
#    
#    body = data[1:][1:]
#    cols   =  zip(*body)
#    
#    body_n = []
#
#    for i in body:
#        if len(i) == len(body[0]):
#            body_n.append(i)
#        else:
#            a = [" "]
#            a.extend(" " * (len(body[0])))
#            diff = len(a) - len(i) 
#            n = i.extend( a[0:diff])
#            body_n.append(n)
#                
#    # create a dict with the data
#    #tbl_d  = {name:col for name, col in zip(header,cols)}
#    contestants = [row[0] for row in body]
#    
#    #c2 = [row[2] for row in body]
#    #ci2 = list(np.arange(1,len(c2) + 1))
#    #c2dict = dict(zip(c2, ci2))
#    
#    #yup = [c2dict[x] for x in contestants]
#    #yup = [c2dict.get(k, 'eliminated') for k in contestants] # default to eliminated if name doesn't exist
#    
#    elimination_list = []
#    for i in list(np.arange(0,(len(body[0])-1))):
#        c = [row[i] for row in body]
#        ci = list(np.arange(1,len(c) + 1))
#        cdict = dict(zip(c, ci))
#        elim_col = [cdict.get(k, 'eliminated') for k in contestants]
#        elimination_list.append(elim_col)
#          
#    return elimination_list

In [70]:
#body = dataCollector('https://en.wikipedia.org/wiki/The_Bachelorette_(season_4)')