In [2]:
# Combine two dataframes into one common dataframe,
# and dump to SQL file for upload to Web app
import os
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
PROJ_ROOT = os.pardir

# For munging
import re
import json

In [3]:
yalePath = os.path.join(PROJ_ROOT, "data", "interim", "yale_bios_processed.csv")
harvardPath = os.path.join(PROJ_ROOT, "data", "interim", "harvard_bios_processed.csv")

yale = pd.read_csv(yalePath)
harvard = pd.read_csv(harvardPath)

In [4]:
yale.shape

(3004, 18)

In [5]:
harvard.shape

(4270, 19)

In [6]:
yale.columns

Index([u'Name', u'B/T', u'City', u'Cl.', u'Events', u'High School',
       u'Hometown/High School', u'Hometown/Region', u'Ht.', u'Name_link',
       u'No.', u'Pos.', u'Region', u'Weapon', u'Wt.', u'season', u'sport',
       u'Bio'],
      dtype='object')

In [7]:
harvard.columns

Index([u'Name', u'B/T', u'City', u'Event', u'High School', u'Hometown', u'Ht.',
       u'Name_link', u'No.', u'Position', u'Region', u'S/C', u'Weapon', u'Wt.',
       u'Yr.', u'season', u'sport', u'headers', u'Bio'],
      dtype='object')

In [8]:
yale.head()

Unnamed: 0,Name,B/T,City,Cl.,Events,High School,Hometown/High School,Hometown/Region,Ht.,Name_link,No.,Pos.,Region,Weapon,Wt.,season,sport,Bio
0,A.J. Haase,,Bonne Terre,Fr.,,North County,"Bonne Terre, MO / North County","Bonne Terre, MO",6-4,/sports/m-footbl/2006-07/bios/haase_a.j.00.html,,TE,MO,,245.0,2006-10,Football,2010-11: NCAA East Regional All-TournamentTeam...
1,AJ Edwards,,Seattle,Fr.,,South Kent School,"Seattle, Wash. / South Kent School","Seattle, Wash.",6-5,/sports/m-baskbl/2013-14/bios/edwards_aj_geat,25.0,G,Wash.,,190.0,2013-17,Basketball,2010: Appeared in five games... Recorded a 9.0...
2,Aarica West,,Buena Park,Freshman,,Vista Murrieta,"Buena Park, Calif. / Vista Murrieta","Buena Park, Calif.",5-10,/sports/w-baskbl/2009-10/bios/west_aarica,31.0,G/F,Calif.,,,2009-12,Basketball,2014-15 (Senior): Named captain of Yale Heavyw...
3,Aaron Ault,,Altamonte Springs,Fr.,,Lake Brantley,"Altamonte Springs, FL / Lake Brantley","Altamonte Springs, FL",6-3,/sports/m-footbl/2012-13/bios/ault_aaron_lcur,93.0,LB,FL,,194.0,2012-16,Football,Notes: Owns school record for career goals aga...
4,Aaron Bosgang,,Port Washington,Fr.,,Paul D. Schreiber High School,"Port Washington, N.Y. / Paul D. Schreiber High...","Port Washington, N.Y.",,/sports/m-crewlt/2016-17/bios/bosgang_aaron_uh2u,,,N.Y.,,,2016-17,Crew (Lightweight),2015: Played in all 10 games…Had 11 solo tackl...


In [9]:
harvard.head()

Unnamed: 0,Name,B/T,City,Event,High School,Hometown,Ht.,Name_link,No.,Position,Region,S/C,Weapon,Wt.,Yr.,season,sport,headers,Bio
0,A.J. Jaffe,,St. Charles,,Marmion Academy,"St. Charles, Ill.",,/sports/wrest/2016-17/bios/jaffe_aj,,141,Ill.,,,,Freshman,2016-17,wrest,"{u'Hometown:': u'St. Charles, Ill.', u'Positio...",Before Harvard Four-time letterwinner at Marmi...
1,A.J. Kennedy,,Fonthill,,Notre Dame College,"Fonthill, Ont.",6-3,/sports/fball/2003-04/bios/a.j._kennedy,98.0,DE,Ont.,,,220.0,Freshman,2003-04,fball,"{u'Position:': u'DE', u'Year:': u'Freshman', u...",
2,AJ Carvalho,,Swansea,,Joseph Case,"Swansea, Mass.",,/sports/mcrew-lw/2010-11/bios/varsity/Carvalho...,,,Mass.,,,,Sophomore,2010-13,mcrew-lw,"{u'name': u'Antonio Carvalho', u'Year:': u'Sop...",Sophomore • 2011Did not see spring race action...
3,Aaki Vora,,Mumbai,Freestyle,The Cathedral and John Connon School,"Mumbai, India",,/sports/wswimdive/2016-17/bios/Vora_Aaki,,,India,,,,Freshman,2016-17,wswimdive,"{u'Hometown:': u'Mumbai, India', u'Position:':...",Before Harvard Best female athlete at the 2015...
4,Aaron Byrd,,Abilene,,Cooper,"Abilene, Texas",6-2,/sports/fball/2001-02/bios/aaron_byrd,3.0,DB,Texas,,,190.0,Freshman,2001-03,fball,"{u'Position:': u'DB', u'Year:': u'Freshman', u...",


Definitive list of columns:
1. Name
2. B/T
3. City
4. Events
5. High School
6. Hometown
7. Height
8. No.
9. Position
10. Region
11. S/C
12. Weapon
13. Weight
14. Year
15. Active Seasons
16. Sport
17. Headers
18. Bio

Next step involves adding blank data to tables when either does not contain the above columns, and cleaning up some of the data to ensure consistency.

In [10]:
yale.columns

Index([u'Name', u'B/T', u'City', u'Cl.', u'Events', u'High School',
       u'Hometown/High School', u'Hometown/Region', u'Ht.', u'Name_link',
       u'No.', u'Pos.', u'Region', u'Weapon', u'Wt.', u'season', u'sport',
       u'Bio'],
      dtype='object')

In [11]:
# Insert missing columns
yale["S/C"] = None
yale["Headers"] = None
yale["College"] = "Yale"
# Re-order, then rename
# Yale
yaleColumns = ["Name", "B/T", "City", "Events", "High School", "Hometown/High School",
               "Ht.", "No.", "Pos.", "Region", "S/C", "Weapon", "Wt.", "Cl.", "season", "sport", "Headers", "Bio", "College"]

In [12]:
yale = yale[yaleColumns]

In [13]:
harvard.columns

Index([u'Name', u'B/T', u'City', u'Event', u'High School', u'Hometown', u'Ht.',
       u'Name_link', u'No.', u'Position', u'Region', u'S/C', u'Weapon', u'Wt.',
       u'Yr.', u'season', u'sport', u'headers', u'Bio'],
      dtype='object')

In [14]:
harvard["College"] = "Harvard"
harvardColumns = ["Name", "B/T", "City", "Event", "High School", "Hometown",
               "Ht.", "No.", "Position", "Region", "S/C", "Weapon", "Wt.", "Yr.", "season", "sport", "headers", "Bio", "College"]

In [15]:
harvard = harvard[harvardColumns]

In [16]:
harvard.shape

(4270, 19)

In [17]:
yale.shape

(3004, 19)

In [18]:
# Normalize column names
normColumns = ["Name", "B/T", "City", "Events", "High School", "Hometown",
               "Ht.", "No.", "Position", "Region", "S/C", "Weapon", "Wt.", 
               "Class", "Active Seasons", "Sport", "Misc", "Bio", "College"]
yale.columns = normColumns
harvard.columns = normColumns

In [19]:
yale.head()

Unnamed: 0,Name,B/T,City,Events,High School,Hometown,Ht.,No.,Position,Region,S/C,Weapon,Wt.,Class,Active Seasons,Sport,Misc,Bio,College
0,A.J. Haase,,Bonne Terre,,North County,"Bonne Terre, MO / North County",6-4,,TE,MO,,,245.0,Fr.,2006-10,Football,,2010-11: NCAA East Regional All-TournamentTeam...,Yale
1,AJ Edwards,,Seattle,,South Kent School,"Seattle, Wash. / South Kent School",6-5,25.0,G,Wash.,,,190.0,Fr.,2013-17,Basketball,,2010: Appeared in five games... Recorded a 9.0...,Yale
2,Aarica West,,Buena Park,,Vista Murrieta,"Buena Park, Calif. / Vista Murrieta",5-10,31.0,G/F,Calif.,,,,Freshman,2009-12,Basketball,,2014-15 (Senior): Named captain of Yale Heavyw...,Yale
3,Aaron Ault,,Altamonte Springs,,Lake Brantley,"Altamonte Springs, FL / Lake Brantley",6-3,93.0,LB,FL,,,194.0,Fr.,2012-16,Football,,Notes: Owns school record for career goals aga...,Yale
4,Aaron Bosgang,,Port Washington,,Paul D. Schreiber High School,"Port Washington, N.Y. / Paul D. Schreiber High...",,,,N.Y.,,,,Fr.,2016-17,Crew (Lightweight),,2015: Played in all 10 games…Had 11 solo tackl...,Yale


In [20]:
stacked = pd.concat([yale, harvard], axis=0)

In [21]:
# Drop some of the columns we don't need
finalColumns = ["Name", "City", "High School", "Hometown",
                "Ht.", "No.", "Position", "Region", "Wt.",
                "Active Seasons", "Misc", "Bio", "College"]
stacked = stacked[finalColumns]

In [22]:
# Seems about right.
processedPath = os.path.join(PROJ_ROOT, "data", "processed", "player_bios_processed.csv")
stacked.to_csv(processedPath, encoding='utf-8')