In [50]:
# Flatten Harvard data
# For data manipulation
import os
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
# os.pardir is the parent-directory marker (".."), so every data path built
# from PROJ_ROOT is relative to the parent of the current working directory.
PROJ_ROOT = os.pardir

# For Scraping
from bs4 import BeautifulSoup
import requests
import time

# For munging
import re
import json
import math

In [51]:
# Location of the scraped player table inside the interim data folder.
interimDir = os.path.join(PROJ_ROOT, "data", "interim")
playerPath = os.path.join(interimDir, "harvard_players.json")

In [52]:
# Load the player table from the JSON file at playerPath into a DataFrame.
players = pd.read_json(playerPath)

In [53]:
# Preview the first rows of the player table.
players.head()

Unnamed: 0,B/T,City,Event,High School,Hometown,Ht.,Name,Name_link,No.,Position,Region,S/C,Weapon,Wt.,Yr.,season,sport
0,R/R,Long Beach,,Woodrow Wilson,"Long Beach, Calif.",6-2,Tyler Albright,/sports/bsb/2008-09/bios/albright_tyler,6,C,Calif.,,,190,Jr.,2008-09,bsb
1,R/R,Austin,,St. Stephens Episcopal,"Austin, Texas",5-10,Cole Arledge,/sports/bsb/2008-09/bios/arledge_cole,8,C,Texas,,,180,Jr.,2008-09,bsb
10,S/R,Winnetka,,New Trier,"Winnetka, Ill.",6-3,Zach Hofeld,/sports/bsb/2008-09/bios/hofeld_zach,29,RHP,Ill.,,,205,Jr.,2008-09,bsb
100,L/R,Charlottesville,,St. Anne's-Belfield,"Charlottesville, Va.",5-11,Jake Allen,/sports/bsb/2016-17/bios/allen_jake_gyf5,10,C,Va.,,,185,So.,2016-17,bsb
1000,,Greendale,,Greendale HS,"Greendale, Wis.",6-8,Eric Wanta,/sports/mbkb/1984-85/Bios/Eric_Wanta,15,Forward,Wis.,,,210,Freshman,1984-85,mbkb


In [54]:
# (rows, columns) of the player table.
players.shape

(10896, 17)

In [55]:
# Read the parsed bio table from the interim data folder.
bio_dir = os.path.join(PROJ_ROOT, "data", "interim")
bio_parsed_path = os.path.join(bio_dir, "harvard_bio_parsed.csv")
bios = pd.read_csv(bio_parsed_path)

In [56]:
# NOTE(review): bios.shape is not the last expression of the cell, so its
# value is computed but never displayed.
bios.shape
# Rename the columns by position; pandas raises a length-mismatch error if the
# loaded table does not have exactly three columns.
bios.columns = ["Name_link", "synopsis", "headers"]

In [57]:
# Attach the parsed bio to every player row; the inner join keeps only rows
# whose Name_link is present in both tables.
combined = pd.merge(players, bios, how="inner", on="Name_link")

In [58]:
# (rows, columns) of the merged player/bio table.
combined.shape

(10895, 19)

In [59]:
# Preview the first rows of the merged player/bio table.
combined.head()

Unnamed: 0,B/T,City,Event,High School,Hometown,Ht.,Name,Name_link,No.,Position,Region,S/C,Weapon,Wt.,Yr.,season,sport,synopsis,headers
0,R/R,Long Beach,,Woodrow Wilson,"Long Beach, Calif.",6-2,Tyler Albright,/sports/bsb/2008-09/bios/albright_tyler,6,C,Calif.,,,190,Jr.,2008-09,bsb,\r\nJunior (2010):\r\n\r\n Named team's captai...,"{""Position:"": ""C"", ""Year:"": ""Jr."", ""Height:"": ..."
1,R/R,Long Beach,,Woodrow Wilson,"Long Beach, Calif.",6-2,Tyler Albright,/sports/bsb/2008-09/bios/albright_tyler,6,C,Calif.,,,190,Jr.,2009-10,bsb,\r\nJunior (2010):\r\n\r\n Named team's captai...,"{""Position:"": ""C"", ""Year:"": ""Jr."", ""Height:"": ..."
2,R/R,Austin,,St. Stephens Episcopal,"Austin, Texas",5-10,Cole Arledge,/sports/bsb/2008-09/bios/arledge_cole,8,C,Texas,,,180,Jr.,2008-09,bsb,\r\nJunior (2010):\r\n\r\n Started at catcher ...,"{""Position:"": ""C"", ""Year:"": ""Jr."", ""Height:"": ..."
3,R/R,Austin,,St. Stephens Episcopal,"Austin, Texas",5-10,Cole Arledge,/sports/bsb/2008-09/bios/arledge_cole,8,C,Texas,,,180,Jr.,2009-10,bsb,\r\nJunior (2010):\r\n\r\n Started at catcher ...,"{""Position:"": ""C"", ""Year:"": ""Jr."", ""Height:"": ..."
4,S/R,Winnetka,,New Trier,"Winnetka, Ill.",6-3,Zach Hofeld,/sports/bsb/2008-09/bios/hofeld_zach,29,RHP,Ill.,,,205,Jr.,2008-09,bsb,\r\nSophomore (2009):\r\n\r\nStruck out three ...,"{""Position:"": ""RHP"", ""Year:"": ""Jr."", ""Height:""..."


In [60]:
def formatBio(row):
    """Return the row's synopsis text with newline characters removed.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must provide a "synopsis" entry.

    Returns
    -------
    str
        The synopsis with every "\\n" deleted, or "" when the synopsis is
        missing (NaN, None, pd.NA) or is any other float value.
    """
    synopsis = row["synopsis"]
    # Missing bios usually arrive as float NaN, but None / pd.NA are possible
    # too; treat all of them as "no bio" rather than letting the string
    # replacement below fail on a non-text value.
    if isinstance(synopsis, float) or pd.isnull(synopsis):
        return ""
    # Only "\n" is stripped; a "\r" left over from "\r\n" endings is kept.
    return synopsis.replace("\n", "")

In [61]:
# Run formatBio on every row, then store the result as the new "Bio" column.
bioText = combined.apply(formatBio, axis=1)
combined["Bio"] = bioText

In [62]:
# Remove the "synopsis" column from the frame in place.
combined.drop("synopsis", axis=1, inplace=True)

In [63]:
# Order the rows by season label, highest (latest) first.
playersSorted = combined.sort_values(by="season", ascending=False)

In [64]:
# Range of years each player was involved in the sport: rows are ordered by
# season descending, so per name first() carries the latest season and last()
# the earliest one.
# NOTE(review): grouping is by "Name" only, so different people who share a
# name would land in the same group — confirm this is acceptable.
byName = playersSorted.groupby(by=["Name"])
groupedFirst = byName.first()
groupedLast = byName.last()

In [65]:
# Number of groups, i.e. distinct values of "Name".
len(groupedLast)

4270

In [66]:
# Collapse each player's seasons into one span: the earliest season (held in
# groupedLast) with its last two characters replaced by the last two characters
# of the latest season (held in groupedFirst), e.g. "2010-11" and "2012-13"
# become "2010-13".
# The whole column is assigned at once. The previous row-by-row write,
# `groupedLast.iloc[i]["season"] = ...`, was a chained assignment on a
# temporary row and is not guaranteed to reach groupedLast.
# `.values` pairs the two frames by position, as the row-by-row version did.
groupedLast["season"] = (
    groupedLast["season"].str[:-2] + groupedFirst["season"].str[-2:].values
)

In [67]:
groupedLast.head()

Unnamed: 0_level_0,B/T,City,Event,High School,Hometown,Ht.,Name_link,No.,Position,Region,S/C,Weapon,Wt.,Yr.,season,sport,headers,Bio
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
A.J. Jaffe,,St. Charles,,Marmion Academy,"St. Charles, Ill.",,/sports/wrest/2016-17/bios/jaffe_aj,,141,Ill.,,,,Freshman,2016-17,wrest,"{""Hometown:"": ""St. Charles, Ill."", ""Position:""...",\rBefore Harvard Four-time letterwinner at Mar...
A.J. Kennedy,,Fonthill,,Notre Dame College,"Fonthill, Ont.",6-3,/sports/fball/2003-04/bios/a.j._kennedy,98.0,DE,Ont.,,,220.0,Freshman,2003-04,fball,"{""Position:"": ""DE"", ""Year:"": ""Freshman"", ""Heig...",
AJ Carvalho,,Swansea,,Joseph Case,"Swansea, Mass.",,/sports/mcrew-lw/2010-11/bios/varsity/Carvalho...,,,Mass.,,,,Sophomore,2010-13,mcrew-lw,"{""name"": ""Antonio Carvalho"", ""Year:"": ""Sophomo...",\rSophomore • 2011\r\rDid not see spring race ...
Aaki Vora,,Mumbai,Freestyle,The Cathedral and John Connon School,"Mumbai, India",,/sports/wswimdive/2016-17/bios/Vora_Aaki,,,India,,,,Freshman,2016-17,wswimdive,"{""Hometown:"": ""Mumbai, India"", ""Position:"": ""F...",\rBefore Harvard\r\r Best female athlete at th...
Aaron Byrd,,Abilene,,Cooper,"Abilene, Texas",6-2,/sports/fball/2001-02/bios/aaron_byrd,3.0,DB,Texas,,,190.0,Freshman,2001-03,fball,"{""Position:"": ""DB"", ""Year:"": ""Freshman"", ""Heig...",


In [75]:
# Export data to a pickle file (the CSV export below is disabled)
processedPath = os.path.join(PROJ_ROOT, "data", "interim", "harvard_bios_processed")
# groupedLast.to_csv(processedPath, encoding='utf-8')
# Move "Name" out of the index into a regular column. The guard keeps the cell
# safe to re-run: a second reset_index() would add a stray "index" column.
if groupedLast.index.name == "Name":
    groupedLast = groupedLast.reset_index()
groupedLast.to_pickle(processedPath + ".p")

In [77]:
# Preview the exported frame; after reset_index "Name" is a regular column.
groupedLast.head()

Unnamed: 0,Name,B/T,City,Event,High School,Hometown,Ht.,Name_link,No.,Position,Region,S/C,Weapon,Wt.,Yr.,season,sport,headers,Bio
0,A.J. Jaffe,,St. Charles,,Marmion Academy,"St. Charles, Ill.",,/sports/wrest/2016-17/bios/jaffe_aj,,141,Ill.,,,,Freshman,2016-17,wrest,"{""Hometown:"": ""St. Charles, Ill."", ""Position:""...",\rBefore Harvard Four-time letterwinner at Mar...
1,A.J. Kennedy,,Fonthill,,Notre Dame College,"Fonthill, Ont.",6-3,/sports/fball/2003-04/bios/a.j._kennedy,98.0,DE,Ont.,,,220.0,Freshman,2003-04,fball,"{""Position:"": ""DE"", ""Year:"": ""Freshman"", ""Heig...",
2,AJ Carvalho,,Swansea,,Joseph Case,"Swansea, Mass.",,/sports/mcrew-lw/2010-11/bios/varsity/Carvalho...,,,Mass.,,,,Sophomore,2010-13,mcrew-lw,"{""name"": ""Antonio Carvalho"", ""Year:"": ""Sophomo...",\rSophomore • 2011\r\rDid not see spring race ...
3,Aaki Vora,,Mumbai,Freestyle,The Cathedral and John Connon School,"Mumbai, India",,/sports/wswimdive/2016-17/bios/Vora_Aaki,,,India,,,,Freshman,2016-17,wswimdive,"{""Hometown:"": ""Mumbai, India"", ""Position:"": ""F...",\rBefore Harvard\r\r Best female athlete at th...
4,Aaron Byrd,,Abilene,,Cooper,"Abilene, Texas",6-2,/sports/fball/2001-02/bios/aaron_byrd,3.0,DB,Texas,,,190.0,Freshman,2001-03,fball,"{""Position:"": ""DB"", ""Year:"": ""Freshman"", ""Heig...",
