In [1]:
# 11/5 Get Raw Player Bios
# Cameron Yick

# For data manipulation
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
PROJ_ROOT = os.pardir

# For Scraping
from bs4 import BeautifulSoup
import requests
import time

# For munging
import re
import json

# Backoff time for large scrapes
THROTTLE_TIME = .05

# ipython magics
# %load_ext watermark
%matplotlib inline
# a nice bonus for making progress bars in ipython

from tqdm import tqdm, tqdm_pandas, tqdm_notebook




In [2]:
# Initialize reused objects
tqdm.pandas()
s= requests.Session()
s.headers.update({'User-agent': 'Mozilla/5.0'})


In [3]:
# playerPath = os.path.join(PROJ_ROOT, "data", "interim", "players.json")

In [4]:
# use with temp auth token

In [5]:
ghPath = "https://raw.githubusercontent.com/hydrosquall/yale_athletics_data/master/data/interim/harvard_players.json?token=AImmM-Q_QprW_hYT6_KwnNcP21It4pS7ks5YPkJ2wA%3D%3D"

In [6]:
# use online read path for usage with a distributed crawler
df = pd.read_json(ghPath)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10896 entries, 0 to 9999
Data columns (total 17 columns):
B/T            488 non-null object
City           10896 non-null object
Event          1418 non-null object
High School    10896 non-null object
Hometown       10896 non-null object
Ht.            5445 non-null object
Name           10896 non-null object
Name_link      10896 non-null object
No.            6265 non-null object
Position       6490 non-null object
Region         10896 non-null object
S/C            580 non-null object
Weapon         243 non-null object
Wt.            4251 non-null object
Yr.            10896 non-null object
season         10896 non-null object
sport          10896 non-null object
dtypes: object(17)
memory usage: 1.5+ MB


In [8]:
# There is are 10806 unique bio links... hm. Scrape now, clean later.
df['Name_link'].unique().shape

(10806,)

In [9]:
df.duplicated(subset=['Name_link'],keep=False).sum()

180

In [10]:
# how did this guy show up twice?!
df[df['Name_link'] == "/sports/bsb/2008-09/bios/albright_tyler"]

Unnamed: 0,B/T,City,Event,High School,Hometown,Ht.,Name,Name_link,No.,Position,Region,S/C,Weapon,Wt.,Yr.,season,sport
0,R/R,Long Beach,,Woodrow Wilson,"Long Beach, Calif.",6-2,Tyler Albright,/sports/bsb/2008-09/bios/albright_tyler,6,C,Calif.,,,190,Jr.,2008-09,bsb
122,R/R,Long Beach,,Woodrow Wilson,"Long Beach, Calif.",6-2,Tyler Albright,/sports/bsb/2008-09/bios/albright_tyler,6,C,Calif.,,,190,Jr.,2009-10,bsb


In [11]:
df[df.duplicated(subset=['Name_link'],keep=False)].head()

Unnamed: 0,B/T,City,Event,High School,Hometown,Ht.,Name,Name_link,No.,Position,Region,S/C,Weapon,Wt.,Yr.,season,sport
0,R/R,Long Beach,,Woodrow Wilson,"Long Beach, Calif.",6-2,Tyler Albright,/sports/bsb/2008-09/bios/albright_tyler,6.0,C,Calif.,,,190.0,Jr.,2008-09,bsb
1,R/R,Austin,,St. Stephens Episcopal,"Austin, Texas",5-10,Cole Arledge,/sports/bsb/2008-09/bios/arledge_cole,8.0,C,Texas,,,180.0,Jr.,2008-09,bsb
10,S/R,Winnetka,,New Trier,"Winnetka, Ill.",6-3,Zach Hofeld,/sports/bsb/2008-09/bios/hofeld_zach,29.0,RHP,Ill.,,,205.0,Jr.,2008-09,bsb
10059,L/L,Carlsbad,,La Costa Canyon,"Carlsbad, Calif.",,Jackie Cooley,/sports/sball/2010-11/bios/cooley_jackie,9.0,Outfield,Calif.,,,,Freshman,2010-11,sball
10227,,Castle Rock,Diving,Rock Canyon,"Castle Rock, Colo.",,Tanille Paniogue,/sports/wswimdive/2008-09/bios/paniogue_tanille,,,Colo.,,,,Senior,2008-09,wswimdive


In [12]:
df['Name_link'].head()

0        /sports/bsb/2008-09/bios/albright_tyler
1          /sports/bsb/2008-09/bios/arledge_cole
10          /sports/bsb/2008-09/bios/hofeld_zach
100     /sports/bsb/2016-17/bios/allen_jake_gyf5
1000        /sports/mbkb/1984-85/Bios/Eric_Wanta
Name: Name_link, dtype: object

In [13]:
# Let's make some data soup!

def makeAthleteSoup(link, session=s):
    if not link:
        return None
    time.sleep(THROTTLE_TIME)
    result = session.get("http://www.gocrimson.com" + link)
    if result.status_code is 200:
        return BeautifulSoup(result.content, 'lxml').find('div', class_="bio-wrap")
    else:
        print("Site is down!")

In [14]:
bios = df['Name_link'].unique()

In [15]:
bios = pd.Series(bios)

## Main Loop

Download all the bios, contained within bio-wrapper!

In [None]:
soups = bios.progress_apply(lambda x: makeAthleteSoup(x))

  1%|          | 110/10806 [01:31<2:21:09,  1.26it/s]

In [None]:
soups.info()

In [None]:
pd.DataFrame(soups).to_csv('bios.csv.gz', compression='gzip')

In [16]:
soups.to_json('bios.json')

In [22]:
pd.DataFrame(soups).to_pickle('bios.p')

In [39]:
rm bios.hdf

In [36]:
!~/cs323/hw4/encode -m 20 -d 3 < bios.p > bios.lzw

^C


In [38]:
!ls -lS

total 3335480
-rw-r--r-- 1 cyy5 cyy5 1683240696 Nov  6 00:22 bios.hdf
-rw-r--r-- 1 cyy5 cyy5 1682183444 Nov  6 00:13 bios.p
-rw-r--r-- 1 cyy5 cyy5   37748736 Nov  6 00:33 bios.lzw
-rw-r--r-- 1 cyy5 cyy5   10055061 Nov  6 00:24 bios.csv.gz
-rw-r--r-- 1 cyy5 cyy5    1338990 Aug 16 14:35 AY15-16-cs223-20160816.tar.gz
-rw-r--r-- 1 cyy5 cyy5     533777 Nov  5 22:08 bio_urls
-rw-r--r-- 1 cyy5 cyy5     157502 Nov  6 00:31 3-cyy5-get-bios.ipynb
-rw-r--r-- 1 cyy5 cyy5     119543 Nov  5 23:50 bios.json
-rw-r--r-- 1 cyy5 cyy5      33148 Mar  1  2015 '~cs201'
-rw-r--r-- 1 cyy5 cyy5       7631 Nov  5 16:13 cy8-parse-covenantSoup.ipynb
drwxr-xr-x 3 cyy5 cyy5       4096 Feb  9  2015 Desktop
drwxr-xr-x 2 cyy5 cyy5       4096 Mar  2  2015 Documents
drwxr-xr-x 2 cyy5 cyy5       4096 Nov  3 16:30 Downloads
drwxr-xr-x 2 cyy5 cyy5       4096 Jan 21  2015 Music
drwxr-xr-x 2 cyy5 cyy5       4096 Sep 12 13:19 Pictures
drwxr-xr-x 2 cyy5 cyy5       4096 Jan 21  2015 Public
drwxrwxr-x 3 cyy5 cyy

In [29]:
d = pd.read_csv('bios.csv.gz', compression='gzip')

In [34]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8618 entries, 0 to 8617
Data columns (total 2 columns):
Unnamed: 0    8618 non-null float64
0             8535 non-null object
dtypes: float64(1), object(1)
memory usage: 134.7+ KB


In [33]:
d['0'].iloc[1]

'<div class="bio-wrap clearfix" data-module="stats/responsive-container">\n<div class="related-bios clearfix"> <select class="roster-select-list">\n<option selected="selected">Related Bios</option>\n<option value="/sports/m-basebl/mtt/brenner_ryan00.html">Ryan Brenner</option>\n<option value="/sports/m-basebl/mtt/elmore_gant00.html">Gant Elmore</option>\n<option value="/sports/m-basebl/mtt/finneran_chris00.html">Chris Finneran</option>\n<option value="/sports/m-basebl/mtt/gruber_robert00.html">Robert Gruber</option>\n<option value="/sports/m-basebl/mtt/josselyn_brandon00.html">Brandon Josselyn</option>\n<option value="/sports/m-basebl/mtt/kolmar_andrew00.html">Andrew Kolmar</option>\n<option value="/sports/m-basebl/mtt/koulos_harry00.html">Harry Koulos</option>\n<option value="/sports/m-basebl/mtt/lally_vinny00.html">Vinny Lally</option>\n<option value="/sports/m-basebl/mtt/larsson-danforth_trygg00.html">Trygg Larsson-Danforth</option>\n<option value="/sports/m-basebl/mtt/ludwig_pat00.