In [35]:
# Flatten Harvard data

# For data manipulation
import os
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
# Root used to build every data path below. os.pardir is the relative path
# "..", so it resolves against the directory the kernel is running in.
PROJ_ROOT = os.pardir

# For Scraping
from bs4 import BeautifulSoup
import requests
import time

# For munging
import re
import json
import math

In [4]:
# Location of the scraped roster JSON inside the interim data folder.
playerPath = os.path.join(os.path.join(PROJ_ROOT, "data", "interim"), "harvard_players.json")

In [5]:
# Load the roster JSON into a DataFrame.
players = pd.read_json(path_or_buf=playerPath)

In [6]:
# Preview the first rows of the roster to check that it loaded as expected.
players.head()

Unnamed: 0,B/T,City,Event,High School,Hometown,Ht.,Name,Name_link,No.,Position,Region,S/C,Weapon,Wt.,Yr.,season,sport
0,R/R,Long Beach,,Woodrow Wilson,"Long Beach, Calif.",6-2,Tyler Albright,/sports/bsb/2008-09/bios/albright_tyler,6,C,Calif.,,,190,Jr.,2008-09,bsb
1,R/R,Austin,,St. Stephens Episcopal,"Austin, Texas",5-10,Cole Arledge,/sports/bsb/2008-09/bios/arledge_cole,8,C,Texas,,,180,Jr.,2008-09,bsb
10,S/R,Winnetka,,New Trier,"Winnetka, Ill.",6-3,Zach Hofeld,/sports/bsb/2008-09/bios/hofeld_zach,29,RHP,Ill.,,,205,Jr.,2008-09,bsb
100,L/R,Charlottesville,,St. Anne's-Belfield,"Charlottesville, Va.",5-11,Jake Allen,/sports/bsb/2016-17/bios/allen_jake_gyf5,10,C,Va.,,,185,So.,2016-17,bsb
1000,,Greendale,,Greendale HS,"Greendale, Wis.",6-8,Eric Wanta,/sports/mbkb/1984-85/Bios/Eric_Wanta,15,Forward,Wis.,,,210,Freshman,1984-85,mbkb


In [7]:
# Size of the roster table as (rows, columns).
players.shape

(10896, 17)

In [8]:
# Load the gzip-compressed CSV of parsed bio pages from the interim data folder.
bio_parsed_path = os.path.join(os.path.join(PROJ_ROOT, "data", "interim"), "harvard_bio_parsed.csv.gz")
bios = pd.read_csv(filepath_or_buffer=bio_parsed_path, compression="gzip")


In [15]:
# Give the bio table explicit column names; "Name_link" is the key the roster
# merge joins on.
bios.columns = ["Name_link", "synopsis", "headers"]
# Keep the shape check as the cell's last expression: only the final expression
# of a cell is displayed, so placed first its value was silently discarded.
bios.shape

In [18]:
# Attach each parsed bio to the roster rows that share its Name_link.
# The inner join keeps only roster rows that have a matching bio entry.
combined = pd.merge(players, bios, how="inner", on="Name_link")

In [19]:
# Size after the inner merge; roster rows without a matching bio are dropped.
combined.shape

(10895, 19)

In [43]:
# Preview the merged frame, which now carries the bios' "synopsis" and
# "headers" columns next to the roster fields.
combined.head()

Unnamed: 0,B/T,City,Event,High School,Hometown,Ht.,Name,Name_link,No.,Position,Region,S/C,Weapon,Wt.,Yr.,season,sport,synopsis,headers
0,R/R,Long Beach,,Woodrow Wilson,"Long Beach, Calif.",6-2,Tyler Albright,/sports/bsb/2008-09/bios/albright_tyler,6,C,Calif.,,,190,Jr.,2008-09,bsb,\nJunior (2010):\n\n Named team's captain prio...,"{u'Position:': u'C', u'Year:': u'Jr.', u'Heigh..."
1,R/R,Long Beach,,Woodrow Wilson,"Long Beach, Calif.",6-2,Tyler Albright,/sports/bsb/2008-09/bios/albright_tyler,6,C,Calif.,,,190,Jr.,2009-10,bsb,\nJunior (2010):\n\n Named team's captain prio...,"{u'Position:': u'C', u'Year:': u'Jr.', u'Heigh..."
2,R/R,Austin,,St. Stephens Episcopal,"Austin, Texas",5-10,Cole Arledge,/sports/bsb/2008-09/bios/arledge_cole,8,C,Texas,,,180,Jr.,2008-09,bsb,\nJunior (2010):\n\n Started at catcher agains...,"{u'Position:': u'C', u'Year:': u'Jr.', u'Heigh..."
3,R/R,Austin,,St. Stephens Episcopal,"Austin, Texas",5-10,Cole Arledge,/sports/bsb/2008-09/bios/arledge_cole,8,C,Texas,,,180,Jr.,2009-10,bsb,\nJunior (2010):\n\n Started at catcher agains...,"{u'Position:': u'C', u'Year:': u'Jr.', u'Heigh..."
4,S/R,Winnetka,,New Trier,"Winnetka, Ill.",6-3,Zach Hofeld,/sports/bsb/2008-09/bios/hofeld_zach,29,RHP,Ill.,,,205,Jr.,2008-09,bsb,\nSophomore (2009):\n\nStruck out three batter...,"{u'Position:': u'RHP', u'Year:': u'Jr.', u'Hei..."


In [39]:
def formatBio(row):
    """Return the row's synopsis as a single line of text.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide a "synopsis" entry.

    Returns
    -------
    str
        The synopsis with every newline character removed, or "" when the
        synopsis is missing (a float such as NaN, or None).
    """
    synopsis = row["synopsis"]
    # A float here is presumably the NaN pandas uses for an empty CSV field;
    # None is treated the same way instead of raising during the substitution.
    if synopsis is None or isinstance(synopsis, float):
        return ""
    # Newlines are dropped, not replaced by a space, so text on adjacent lines
    # is joined directly. A literal replace is enough; no regex is involved.
    return synopsis.replace("\n", "")

In [44]:
# Build the cleaned bio text row by row, then store it as the "Bio" column.
bioColumn = combined.apply(formatBio, axis=1)
combined["Bio"] = bioColumn

In [47]:
# The raw synopsis is superseded by the cleaned "Bio" column; remove it in place.
del combined["synopsis"]

In [49]:
# Order rows by season label, descending, so later seasons come first
# (the labels are compared as strings).
playersSorted = combined.sort_values(by="season", ascending=False)

In [50]:
# Collapse the season-sorted rows to one row per athlete name. Because the rows
# run from the latest season to the earliest, first() draws from the latest
# season and last() from the earliest.
# NOTE(review): pandas' first()/last() skip nulls per column by default, so a
# resulting row can mix values from different seasons — confirm this is intended.
byName = playersSorted.groupby(by=["Name"])
groupedFirst = byName.first()
groupedLast = byName.last()

In [60]:
# Number of rows after grouping, i.e. one per distinct Name.
len(groupedLast)

4270

In [61]:
# Collapse each athlete's seasons into a single span: everything but the last
# two characters comes from the earliest season (groupedLast) and the final two
# characters from the latest season (groupedFirst), e.g. "2010-11" and
# "2012-13" give "2010-13".
#
# The whole column is assigned at once. Writing through
# groupedLast.iloc[i]["season"] is chained indexing: it targets a temporary row
# object and is not guaranteed to update groupedLast itself. Both frames come
# from the same groupby, so the two Series line up on the shared Name index.
groupedLast["season"] = groupedLast["season"].str[:-2] + groupedFirst["season"].str[-2:]

In [62]:
# Preview the per-name rows, including the rewritten "season" span.
groupedLast.head()

Unnamed: 0_level_0,B/T,City,Event,High School,Hometown,Ht.,Name_link,No.,Position,Region,S/C,Weapon,Wt.,Yr.,season,sport,headers,Bio
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
A.J. Jaffe,,St. Charles,,Marmion Academy,"St. Charles, Ill.",,/sports/wrest/2016-17/bios/jaffe_aj,,141,Ill.,,,,Freshman,2016-17,wrest,"{u'Hometown:': u'St. Charles, Ill.', u'Positio...",Before Harvard Four-time letterwinner at Marmi...
A.J. Kennedy,,Fonthill,,Notre Dame College,"Fonthill, Ont.",6-3,/sports/fball/2003-04/bios/a.j._kennedy,98.0,DE,Ont.,,,220.0,Freshman,2003-04,fball,"{u'Position:': u'DE', u'Year:': u'Freshman', u...",
AJ Carvalho,,Swansea,,Joseph Case,"Swansea, Mass.",,/sports/mcrew-lw/2010-11/bios/varsity/Carvalho...,,,Mass.,,,,Sophomore,2010-13,mcrew-lw,"{u'name': u'Antonio Carvalho', u'Year:': u'Sop...",Sophomore • 2011Did not see spring race action...
Aaki Vora,,Mumbai,Freestyle,The Cathedral and John Connon School,"Mumbai, India",,/sports/wswimdive/2016-17/bios/Vora_Aaki,,,India,,,,Freshman,2016-17,wswimdive,"{u'Hometown:': u'Mumbai, India', u'Position:':...",Before Harvard Best female athlete at the 2015...
Aaron Byrd,,Abilene,,Cooper,"Abilene, Texas",6-2,/sports/fball/2001-02/bios/aaron_byrd,3.0,DB,Texas,,,190.0,Freshman,2001-03,fball,"{u'Position:': u'DB', u'Year:': u'Freshman', u...",


In [63]:
# Write the one-row-per-name table to the interim data folder as UTF-8 CSV.
# Name is the index, so it is written out as the first column.
processedPath = os.path.join(os.path.join(PROJ_ROOT, "data", "interim"), "harvard_bios_processed.csv")
groupedLast.to_csv(path_or_buf=processedPath, encoding="utf-8")