In [262]:
# Flattening players.json file into more usable data

In [263]:
# For data manipulation
import os
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
# Parent-directory marker (os.pardir); the data paths in this notebook are joined onto it.
PROJ_ROOT = os.pardir

# For Scraping
from bs4 import BeautifulSoup
import requests
import time

# For munging
import re
import json

In [264]:
# Both input files live under <PROJ_ROOT>/data/interim.
interimDir = os.path.join(PROJ_ROOT, "data", "interim")
playersPath = os.path.join(interimDir, "players.json")
biosPath = os.path.join(interimDir, "yale_names_bios.csv.gz")

In [265]:
# Read the gzip-compressed bios CSV and the players JSON into DataFrames.
bios = pd.read_csv(biosPath, compression="gzip")
players = pd.read_json(playersPath)

In [266]:
# Preview the first five player rows.
players.head()

Unnamed: 0,B/T,City,Cl.,Events,High School,Hometown/High School,Hometown/Region,Ht.,Name,Name_link,No.,Pos.,Region,Weapon,Wt.,season,sport
0,,Manhasset,Freshman,,Chaminade,"Manhasset, N.Y. / Chaminade","Manhasset, N.Y.",5-11,Ryan Brenner,/sports/m-basebl/mtt/brenner_ryan00.html,11.0,C,N.Y.,,180.0,2008-09,Baseball
1,,Sunrise,Junior,,Pine Crest School,"Sunrise, Fla. / Pine Crest School","Sunrise, Fla.",6-2,Joe Castaldi,/sports/m-basebl/mtt/castaldi_joe00.html,34.0,P,Fla.,,190.0,2008-09,Baseball
10,,Okemos,Freshman,,Okemos,"Okemos, Mich. / Okemos","Okemos, Mich.",6-2,Pat Ludwig,/sports/m-basebl/mtt/ludwig_pat00.html,26.0,P,Mich.,,175.0,2008-09,Baseball
100,R/R,-,Freshman,,-,- / -,-,,Bobby Cecere,/sports/m-basebl/2016-17/bios/cecere_bobby_a3p9,,RHP,-,,,2016-17,Baseball
1000,,Rapid City,Fr.,,Stevens High School,"Rapid City, South Dakota / Stevens High School","Rapid City, South Dakota",,Eric Sanderson,/sports/m-crewlt/2015-16/bios/sanderson_eric_edr3,,,South Dakota,,,2015-16,Crew (Lightweight)


In [267]:
# Preview the first five bio rows.
bios.head()

Unnamed: 0.1,Unnamed: 0,Name,Bio
0,0,Ryan Brenner,"2009: Made 22 appearances, including 20 starts..."
1,1,Joe Castaldi,"2010: Team captain.2009: Made 12 appearances, ..."
2,2,Adam Straus-Goldfarb,2015-16 (Junior): Three seat of the fourth var...
3,3,Robert Wulbern,2012-13: Rowed in stroke-seat of the 2V at Eas...
4,4,Matt Coombs,2006 Backup DB who played in all but one game....


In [268]:
# Caution: the players and bios frames do not hold the same number of players.
# Keep only the bios rows that have no missing value in any column.
bios = bios.dropna(axis=0, how="any")

In [269]:
# Drop the leftover "Unnamed: 0" column from bios.
# errors="ignore" keeps this cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
bios = bios.drop(["Unnamed: 0"], axis=1, errors="ignore")

In [270]:
# Attach the bios columns to players by matching on Name; the left join keeps
# every players row, including names that have no bio.
players = pd.merge(players, bios, how="left", on="Name")

In [271]:
# Load the parsed bio headers table.
# compression is passed explicitly, matching how the other .csv.gz input (bios)
# is read, so the load does not depend on pandas inferring gzip from the file
# extension.
headersPath = os.path.join(PROJ_ROOT, "data", "interim", "yale_bio_parsed.csv.gz")
headers = pd.read_csv(headersPath, compression="gzip")

In [272]:
# Preview the first five header rows.
headers.head(n=5)

Unnamed: 0,url,synopsis,headers
0,/sports/m-basebl/mtt/brenner_ryan00.html,0,"{""Position:"": ""C"", ""Height:"": ""5-11"", ""Year:"":..."
1,/sports/m-basebl/mtt/castaldi_joe00.html,0,"{""Position:"": ""P"", ""Height:"": ""6-2"", ""Year:"": ..."
2,/sports/m-basebl/mtt/ludwig_pat00.html,0,"{""Hometown:"": ""Princeton, N.J."", ""Major:"": ""Ph..."
3,/sports/m-basebl/2016-17/bios/cecere_bobby_a3p9,0,"{""name"": ""Robert Wulbern"", ""Year:"": ""Jr."", ""Ma..."
4,/sports/m-crewlt/2015-16/bios/sanderson_eric_edr3,0,"{""Position:"": ""DB"", ""Year:"": ""Senior"", ""Height..."


In [273]:
# Disabled (kept for reference): trims the first 9 and last 2 characters from each url.
# headers["url"] = headers.apply(lambda x: x["url"][9:-2], axis=1)

In [274]:
# Rename "url" to "Name_link" so it matches the join key used in players.
# Renaming by name (rather than overwriting headers.columns positionally)
# leaves the other columns untouched and cannot mislabel them if the CSV's
# column order or count changes.
headers = headers.rename(columns={"url": "Name_link"})

In [275]:
# Bring the "headers" column onto each player row, matched on Name_link
# (left join, so players rows without a match are kept).
headerCols = headers[["Name_link", "headers"]]
players = pd.merge(players, headerCols, how="left", on="Name_link")

In [276]:
# Inspect the headers value of the row labelled 0.
players.loc[0, "headers"]

'{"Position:": "C", "Height:": "5-11", "Year:": "Freshman", "Birth Date:": "09/26/1990", "Hometown:": "Manhasset, N.Y.", "Weight:": "180", "High School:": "Chaminade", "name": "Ryan Brenner"}'

In [277]:
# Order rows newest season first, then collapse to one row per player Name.
# With that ordering, groupedFirst draws its values from each player's most
# recent season and groupedLast from the oldest one.
playersSorted = players.sort_values(by="season", ascending=False)
byName = playersSorted.groupby("Name")
groupedFirst = byName.first()
groupedLast = byName.last()

In [278]:
# Spot check: last two characters of the season string for the row at position 2500.
groupedLast["season"].iloc[2500][-2:]

u'09'

In [279]:
# Inspect the headers value of the first row.
# groupedLast is indexed by player Name, so the row is picked by position with
# .iloc; plain [0] on a non-integer index relies on a positional fallback that
# newer pandas deprecates.
groupedLast["headers"].iloc[0]

'{"Position:": "G", "Year:": "So.", "Height:": "5-7", "Hometown:": "Fayetteville, NY", "Weight:": "170", "High School:": "Jamesville-Dewitt", "name": "Josh Greenberg"}'

In [280]:
# Rewrite each player's season as a span: the start of the earliest season
# (groupedLast) joined with the two-digit end year of the latest season
# (groupedFirst), e.g. "2006-07" and "2009-10" become "2006-10".
# The previous row loop assigned through groupedLast.iloc[i]["season"], which
# is chained indexing: pandas may hand back a temporary copy of the row, in
# which case the write is silently lost. Assigning the whole column at once
# always lands in groupedLast. Both frames are indexed by Name from the same
# groupby, so the two Series line up row for row.
groupedLast["season"] = groupedLast["season"].str[:-2] + groupedFirst["season"].str[-2:]

In [283]:
# Show the first 40 rows of the per-player table.
groupedLast.head(n=40)

Unnamed: 0_level_0,B/T,City,Cl.,Events,High School,Hometown/High School,Hometown/Region,Ht.,Name_link,No.,Pos.,Region,Weapon,Wt.,season,sport,Bio,headers
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
A.J. Haase,,Bonne Terre,Fr.,,North County,"Bonne Terre, MO / North County","Bonne Terre, MO",6-4,/sports/m-footbl/2006-07/bios/haase_a.j.00.html,,TE,MO,,245.0,2006-10,Football,2008: Started eight games at TE... LedYale end...,"{""Position:"": ""G"", ""Year:"": ""So."", ""Height:"": ..."
AJ Edwards,,Seattle,Fr.,,South Kent School,"Seattle, Wash. / South Kent School","Seattle, Wash.",6-5,/sports/m-baskbl/2013-14/bios/edwards_aj_geat,25.0,G,Wash.,,190.0,2013-17,Basketball,2013-14: Appeared in five games… Scored five p...,"{""Position:"": ""Pole Vault"", ""name"": ""Brendan S..."
Aarica West,,Buena Park,Freshman,,Vista Murrieta,"Buena Park, Calif. / Vista Murrieta","Buena Park, Calif.",5-10,/sports/w-baskbl/2009-10/bios/west_aarica,31.0,G/F,Calif.,,,2009-12,Basketball,Before Yale: Played four years scholasticallyi...,"{""Hometown:"": ""Concord, MA"", ""High School:"": ""..."
Aaron Ault,,Altamonte Springs,Fr.,,Lake Brantley,"Altamonte Springs, FL / Lake Brantley","Altamonte Springs, FL",6-3,/sports/m-footbl/2012-13/bios/ault_aaron_lcur,93.0,LB,FL,,194.0,2012-16,Football,"2013: Played in all 10 games, mostly onspecial...","{""Position:"": ""DB"", ""Year:"": ""So."", ""Height:"":..."
Aaron Bosgang,,Port Washington,Fr.,,Paul D. Schreiber High School,"Port Washington, N.Y. / Paul D. Schreiber High...","Port Washington, N.Y.",,/sports/m-crewlt/2016-17/bios/bosgang_aaron_uh2u,,,N.Y.,,,2016-17,Crew (Lightweight),Off the Water: President of National Honor Soc...,"{""Position:"": ""OL"", ""Year:"": ""Jr."", ""Height:"":..."
Aaron Fuchs,,Somerset West,Senior,,Panel Vallei,"Somerset West, South Africa / Panel Vallei","Somerset West, South Africa",,/sports/m-squash/2009-10/bios/fuchs_aaron,,,South Africa,,,2009-10,Squash,"Ibegan my squash adventure at the age of 14, m...","{""name"": ""Duncan Logie"", ""Height:"": ""6-6"", ""Ye..."
Aaron Greenberg,,Eden Prairie,Fr.,Freestyle/Breaststroke/Backstroke,Eden Prairie High School,"Eden Prairie, Minn. / Eden Prairie High School","Eden Prairie, Minn.",,/sports/m-swim/2013-14/bios/greenberg_aaron_ifc2,,,Minn.,,,2013-17,Swimming & Diving,Best Times: 50 Freestyle: 19.74 100 ...,"{""Hometown:"": ""Woodbridge, Conn."", ""Position:""..."
Aaron Jones,,"St. Catharines, Ont., Canada",Fr.,,Governor Simcoe,"St. Catharines, Ont., Canada / Governor Simcoe","St. Catharines, Ont., Canada",,/sports/m-crewlt/2013-14/bios/jones_aaron_g4f2,,,NOREGION?,,,2013-17,Crew (Lightweight),Before Yale: Rowed for St. Catharines Rowing C...,"{""Position:"": ""LB"", ""Year:"": ""Senior"", ""Height..."
Aaron Rodriguez,,Cypress,Jr.,,Cy-Fair,"Cypress, TX / Cy-Fair","Cypress, TX",5-5,/sports/m-soccer/2003-04/bios/rodriguez_aaron0...,7.0,Midfield,TX,,135.0,2003-05,Soccer,"Rodriguez, who had an excellent spring season,...","{""name"": ""Hugh O'Cinneide"", ""Year:"": ""Senior"",..."
Aaron Seriff-Cullick,,-,Fr.,Diving,-,- / -,-,,/sports/m-swim/2009-10/bios/seriff-cullick_aar...,,,-,,,2009-13,Swimming & Diving,2012-2013: Earned Winter Academic All-Ivy reco...,"{""Hometown:"": ""Richboro, Pa."", ""Position:"": ""D..."


In [284]:
# Save the per-player table as a UTF-8 CSV in the interim data folder.
processedPath = os.path.join(
    os.path.join(PROJ_ROOT, "data", "interim"),
    "yale_bios_processed.csv",
)
groupedLast.to_csv(processedPath, encoding='utf-8')