In [1]:
# Use dataframes to get a 4x speedup (5 minutes instead of 21 minutes per batch).
# Utilizes Apache Spark.

In [2]:
!pip install bs4 tqdm

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm, tqdm_pandas, tqdm_notebook


In [4]:
# Register tqdm's pandas integration (this is what provides `progress_apply`).
tqdm.pandas()

In [5]:
# Host files online so they are usable with databricks.
# Read the gzip-compressed CSV of bios into a pandas DataFrame and print a
# summary of its columns.
# NOTE(review): this is a Dropbox public-folder (`/u/`) link; confirm it still resolves.
df = pd.read_csv('https://dl.dropboxusercontent.com/u/25590211/yale_bios.csv.gz?raw=1', compression='gzip', engine='python')
df.info()

In [6]:
# combine with labels
# header=None: the first line of the file is data, so pandas names the
# columns 0, 1, ... (presumably a single column of bio URLs — TODO confirm).
labels = pd.read_csv('https://dl.dropboxusercontent.com/u/25590211/yale_bio_urls.csv?raw=1', header=None)

In [7]:
# Sanity check: compare the shapes of the bios and the labels before the
# labels are attached as the index. The call form of print works on both
# Python 2 and Python 3 (the statement form is a syntax error on Python 3).
print(df.shape)
print(labels.shape)

In [8]:
# Label each bio row with the matching entry from `labels`. `labels` was read
# with header=None, so its first column is named 0. Assign that column's
# values as a flat, unnamed index instead of the DataFrame itself: index data
# must be 1-dimensional, and leaving the index unnamed means a later
# reset_index() exposes it as the 'index' column.
df.index = labels[0].values

In [9]:
# Remove the 'Unnamed: 0' column (axis=1 selects columns).
# Presumably a row-number column saved along with the CSV — TODO confirm.
df = df.drop(['Unnamed: 0'], axis=1)

In [10]:
# Turn the index into an ordinary column (read below as df['index']).
# This modifies df in place.
df.reset_index(level=0, inplace=True)

In [11]:
# Inspect the column created by reset_index.
df['index']

In [12]:
# Build one (label, markup) pair per row for Spark. Zipping the two columns
# directly avoids a slow row-by-row DataFrame.apply and yields a plain list
# of tuples instead of a pandas Series.
tuples = list(zip(df['index'], df['0']))

We'll need to break up the strings into blocks and only use the markup that we actually need.

In [14]:
# Distribute the (label, markup) pairs as a Spark RDD.
# `sc` is not defined in this notebook; presumably the SparkContext provided
# by the Databricks runtime — confirm.
DF = sc.parallelize(tuples)

In [15]:
# the output needs to be handled 1 at a time, and then dumped out carefully.
# df['soup'] = df['0'].progress_apply(lambda x: BeautifulSoup(x, 'lxml') )

In [16]:
# Use a distributed dataset to handle this more performantly.
# Preview the first five records of the RDD.
DF.take(5)

In [17]:
# Parse each record's markup into a BeautifulSoup tree, keyed by its label.
# The original conditional was left unfinished (a bare `if` is a SyntaxError).
# It is completed with the same string check used for HEADERS below: payloads
# that are not of type str (e.g. missing values) map to None.
SOUPS = DF.map(lambda x: (x[0], BeautifulSoup(x[1], 'lxml') if type(x[1]) == str else None))

In [18]:
# Preview two parsed records.
SOUPS.take(2)

In [19]:
def tableToDict(soup):
    """Given a headerless table, return a dict with all the fields.

    Each ``<tr>`` is read as a key/value pair: the first ``<td>`` is the
    field name and the second ``<td>`` is its value, both stripped of
    surrounding whitespace.

    Args:
        soup: A BeautifulSoup node (anything exposing ``find_all``) that
            contains the table rows.

    Returns:
        dict mapping field name to field value. Rows with fewer than two
        cells (spacers, single-cell headings) are skipped instead of
        raising IndexError.
    """
    table = {}
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:
            # Not a key/value row; nothing to record.
            continue
        table[str(cells[0].text.strip())] = cells[1].text.strip()

    return table
    
def soupToDict(soup):
    """Given a player's soup blob, parse their header.

    Args:
        soup: BeautifulSoup document for a single player page.

    Returns:
        dict of the fields found in the table of the first
        ``div.player-info`` block, plus a ``'name'`` entry taken from
        ``div.player-name span.name``. Pieces that are absent from the page
        are left out, so a page with neither yields an empty dict rather
        than raising (one malformed page would otherwise abort the whole
        distributed job).
    """
    playerData = {}

    # select() returns a list; it is empty when the page has no info block.
    infoBlocks = soup.select('div.player-info')
    infoTable = infoBlocks[0].find('table') if infoBlocks else None
    if infoTable is not None:
        playerData = tableToDict(infoTable)

    nameNodes = soup.select('div.player-name span.name')
    if nameNodes:
        playerData['name'] = nameNodes[0].text.strip()
    return playerData
def getSynopsis(soup):
    """Return the text of a player's bio section, or None when there is none.

    The bio is returned as one undivided blob. Splitting it into a chunk per
    <strong> header (one chunk for each top-level <p> under div.synopsis)
    was tried and dropped because the markup varies too much:
      * Ryan Brenner: b instead of strong tags
      * Jackson Stallings (Junior Year) - UL blobs mixed with P tags
      * Jackson Stallings (Freshman) - Clean example

    Some pages have no bio at all, e.g.
    http://yalebulldogs.com/sports/c-sail/2016-17/bios/buehler_patrick_nu3o?view=news
    """
    synopsis = soup.find('div', class_='synopsis')
    if not synopsis:
        # No synopsis block on this page.
        return None
    return synopsis.get_text()

In [20]:
# Parse every record's header fields in parallel, keyed by the record's label.
# Payloads that are not exactly of type str map to an empty dict.
HEADERS = DF.map(
    lambda x: (
        x[0],
        {} if type(x[1]) != str else soupToDict(BeautifulSoup(x[1], 'lxml')),
    )
)

In [21]:
# Mark the RDD for caching so that later actions on HEADERS can reuse the
# parsed results instead of re-parsing the markup.
HEADERS.persist()

In [22]:
# Run the job and pull every (label, header dict) pair back to the driver.
# `heads` is not referenced again in the visible cells.
heads = HEADERS.collect()

In [23]:
# Extract each record's bio text in parallel, keyed by the record's label.
# Non-string payloads yield None, the same "no bio" value that getSynopsis
# returns, so every value is either text or None. The earlier `{}` fallback
# mixed dicts into what is otherwise a text column.
SYNOPSIS = DF.map(lambda x: (x[0], getSynopsis(BeautifulSoup(x[1], 'lxml')) if type(x[1]) == str else None))

In [24]:
# Mark the RDD for caching so that the collect and join below can reuse it.
SYNOPSIS.persist()

In [25]:
# Run the job and pull every (label, synopsis) pair back to the driver.
# `save` is not referenced again in the visible cells.
save = SYNOPSIS.collect()

In [26]:
# Join the two pair RDDs on their key (the first tuple element).
# Each result record should have the form (key, (synopsis, headers)) — verify
# against the PySpark RDD.join documentation.
TOTAL = SYNOPSIS.join(HEADERS)

In [27]:
# Convert the header RDD to a Spark DataFrame. No column names are given
# here; they are assigned after the conversion to pandas.
dheads = HEADERS.toDF()

In [28]:
# Convert the joined RDD to a Spark DataFrame. No column names are given
# here; they are assigned after the conversion to pandas.
dtotal = TOTAL.toDF()

In [29]:
# Bring both Spark DataFrames back to the driver as pandas DataFrames.
newDF = dtotal.toPandas()
newHeads = dheads.toPandas()

In [30]:
# Name the two columns: the record key and its parsed header dict.
newHeads.columns = ['url', 'headers']

In [31]:
# NOTE(review): newDF comes from SYNOPSIS.join(HEADERS), so its second column
# presumably holds the joined (synopsis, headers) pair rather than the
# synopsis text alone — confirm before relying on the 'synopsis' name.
newDF.columns= ['url', 'synopsis']

In [32]:
# Combine the header frame with the synopsis frame on their shared 'url'
# column; the suffixes only apply if other column names collide.
merged = newHeads.merge(newDF, on='url', suffixes=['_l', '_r'])

In [33]:
# Use the url as the row index so it becomes the first column of the CSV.
indexed = merged.set_index('url')

In [34]:
# Write the result to local disk, requesting gzip compression.
# NOTE(review): the file name ends in .csv although gzip output is requested;
# the .gz suffix is only added by the later copy into /FileStore.
indexed.to_csv('/tmp/yale_bio_parsed.csv', compression='gzip')

In [35]:
# Presumably prints the current working directory; the trailing `!!` looks
# like a leftover — TODO confirm or remove.
pwd !!

In [36]:
# List the driver's local /tmp directory (file: scheme) to check the CSV was written.
dbutils.fs.ls("file:/tmp/")

In [37]:

# Show the current contents of /FileStore/ before copying the result there.
display(dbutils.fs.ls("/FileStore/"))

In [38]:
# Copy the local CSV into /FileStore/sports/, giving it a .gz suffix.
dbutils.fs.cp("file:/tmp/yale_bio_parsed.csv", "/FileStore/sports/yale_bio_parsed.csv.gz")

In [39]:
# access data via https://community.cloud.databricks.com/files/yale_bios.csv.gz

In [40]:
# NOTE(review): this reads yale_bio_parsed.csv relative to the working
# directory, while the CSV above was written to /tmp/ with compression='gzip'.
# Confirm the path, and that this does not compress the file a second time.
!gzip -c yale_bio_parsed.csv > yale_bio_parsed.csv.gz