In [1]:
# 11/5 Get Raw Player Bios
# Cameron Yick

# For data manipulation
import os
import pandas as pd
PROJ_ROOT = os.pardir

# For Scraping
from bs4 import BeautifulSoup
import requests
import time

# For munging
import re
import json

# Backoff time for large scrapes
THROTTLE_TIME = .05

# ipython magics
# %load_ext watermark
%matplotlib inline
# a nice bonus for making progress bars in ipython

from tqdm import tqdm, tqdm_pandas, tqdm_notebook


In [2]:
# Initialize reused objects
tqdm.pandas()
s = requests.Session()

In [4]:
playerPath = os.path.join(PROJ_ROOT, "data", "interim", "players.json")

In [6]:
df = pd.read_json(playerPath)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8702 entries, 0 to 999
Data columns (total 17 columns):
B/T                     398 non-null object
City                    8702 non-null object
Cl.                     8702 non-null object
Events                  1235 non-null object
High School             8702 non-null object
Hometown/High School    8702 non-null object
Hometown/Region         8702 non-null object
Ht.                     5067 non-null object
Name                    8702 non-null object
Name_link               8702 non-null object
No.                     4410 non-null object
Pos.                    4410 non-null object
Region                  8702 non-null object
Weapon                  291 non-null object
Wt.                     3252 non-null object
season                  8702 non-null object
sport                   8702 non-null object
dtypes: object(17)
memory usage: 1.2+ MB


In [8]:
# There is are 8618 unique bio links... hm. Scrape now, clean later.
df['Name_link'].unique().shape

(8618L,)

In [9]:
df['Name_link'].head()

0                /sports/m-basebl/mtt/brenner_ryan00.html
1                /sports/m-basebl/mtt/castaldi_joe00.html
10                 /sports/m-basebl/mtt/ludwig_pat00.html
100       /sports/m-basebl/2016-17/bios/cecere_bobby_a3p9
1000    /sports/m-crewlt/2015-16/bios/sanderson_eric_edr3
Name: Name_link, dtype: object

In [14]:
# Let's make some data soup!
num = 0
def makeAthleteSoup(link, session=s):
#     print("."),
    global num
    print num
    num +=1
    if not link:
        return None
    time.sleep(THROTTLE_TIME)
    result = session.get("http://www.yalebulldogs.com" + link)
    if result.status_code is 200:
        
        return BeautifulSoup(result.content, 'lxml').find('div', class_="bio-wrap")
    else:
        print("Site is down!")

In [10]:
bios = df['Name_link'].unique()

In [11]:
bios = pd.Series(bios)

In [13]:
bios.head()

0             /sports/m-basebl/mtt/brenner_ryan00.html
1             /sports/m-basebl/mtt/castaldi_joe00.html
2               /sports/m-basebl/mtt/ludwig_pat00.html
3      /sports/m-basebl/2016-17/bios/cecere_bobby_a3p9
4    /sports/m-crewlt/2015-16/bios/sanderson_eric_edr3
dtype: object

In [14]:
bios = pd.read_json('bio_urls', typ='series')

In [16]:
bios.head()

0                /sports/m-basebl/mtt/brenner_ryan00.html
1                /sports/m-basebl/mtt/castaldi_joe00.html
10      /sports/m-crewlt/2015-16/bios/straus-goldfarb_...
100     /sports/m-crewlt/2013-14/bios/wulbern_robert_4pfm
1000     /sports/m-footbl/2007-08/bios/coombs_matt00.html
dtype: object

In [None]:
soups = bios.map(makeAthleteSoup)

In [16]:
soups.to_json('bios.json')

In [22]:
pd.DataFrame(soups).to_pickle('bios.p')

In [27]:
pd.DataFrame(soups).to_csv('bios.csv.gz', compression='gzip')

In [36]:
# !~/cs323/hw4/encode -m 20 -d 3 < bios.p > bios.lzw

^C


In [38]:
# !ls -lS

total 3335480
-rw-r--r-- 1 cyy5 cyy5 1683240696 Nov  6 00:22 bios.hdf
-rw-r--r-- 1 cyy5 cyy5 1682183444 Nov  6 00:13 bios.p
-rw-r--r-- 1 cyy5 cyy5   37748736 Nov  6 00:33 bios.lzw
-rw-r--r-- 1 cyy5 cyy5   10055061 Nov  6 00:24 bios.csv.gz
-rw-r--r-- 1 cyy5 cyy5    1338990 Aug 16 14:35 AY15-16-cs223-20160816.tar.gz
-rw-r--r-- 1 cyy5 cyy5     533777 Nov  5 22:08 bio_urls
-rw-r--r-- 1 cyy5 cyy5     157502 Nov  6 00:31 3-cyy5-get-bios.ipynb
-rw-r--r-- 1 cyy5 cyy5     119543 Nov  5 23:50 bios.json
-rw-r--r-- 1 cyy5 cyy5      33148 Mar  1  2015 '~cs201'
-rw-r--r-- 1 cyy5 cyy5       7631 Nov  5 16:13 cy8-parse-covenantSoup.ipynb
drwxr-xr-x 3 cyy5 cyy5       4096 Feb  9  2015 Desktop
drwxr-xr-x 2 cyy5 cyy5       4096 Mar  2  2015 Documents
drwxr-xr-x 2 cyy5 cyy5       4096 Nov  3 16:30 Downloads
drwxr-xr-x 2 cyy5 cyy5       4096 Jan 21  2015 Music
drwxr-xr-x 2 cyy5 cyy5       4096 Sep 12 13:19 Pictures
drwxr-xr-x 2 cyy5 cyy5       4096 Jan 21  2015 Public
drwxrwxr-x 3 cyy5 cyy

In [29]:
d = pd.read_csv('yale_bios.csv.gz', compression='gzip')

In [34]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8618 entries, 0 to 8617
Data columns (total 2 columns):
Unnamed: 0    8618 non-null float64
0             8535 non-null object
dtypes: float64(1), object(1)
memory usage: 134.7+ KB


In [33]:
d['0'].iloc[1]

'<div class="bio-wrap clearfix" data-module="stats/responsive-container">\n<div class="related-bios clearfix"> <select class="roster-select-list">\n<option selected="selected">Related Bios</option>\n<option value="/sports/m-basebl/mtt/brenner_ryan00.html">Ryan Brenner</option>\n<option value="/sports/m-basebl/mtt/elmore_gant00.html">Gant Elmore</option>\n<option value="/sports/m-basebl/mtt/finneran_chris00.html">Chris Finneran</option>\n<option value="/sports/m-basebl/mtt/gruber_robert00.html">Robert Gruber</option>\n<option value="/sports/m-basebl/mtt/josselyn_brandon00.html">Brandon Josselyn</option>\n<option value="/sports/m-basebl/mtt/kolmar_andrew00.html">Andrew Kolmar</option>\n<option value="/sports/m-basebl/mtt/koulos_harry00.html">Harry Koulos</option>\n<option value="/sports/m-basebl/mtt/lally_vinny00.html">Vinny Lally</option>\n<option value="/sports/m-basebl/mtt/larsson-danforth_trygg00.html">Trygg Larsson-Danforth</option>\n<option value="/sports/m-basebl/mtt/ludwig_pat00.