In [353]:
# Combine two dataframes into one common dataframe,
# and dump to SQL file for upload to Web app
import os
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
PROJ_ROOT = os.pardir

# For munging
import re
import json

In [354]:
yalePath = os.path.join(PROJ_ROOT, "data", "interim", "yale_bios_processed.csv")
harvardPath = os.path.join(PROJ_ROOT, "data", "interim", "harvard_bios_processed.p")

yale = pd.read_csv(yalePath)
harvard = pd.read_pickle(harvardPath)

In [355]:
yale.columns

Index([u'Name', u'B/T', u'City', u'Cl.', u'Events', u'High School',
       u'Hometown/High School', u'Hometown/Region', u'Ht.', u'Name_link',
       u'No.', u'Pos.', u'Region', u'Weapon', u'Wt.', u'season', u'sport',
       u'Bio', u'headers'],
      dtype='object')

In [356]:
harvard.shape

(4270, 19)

In [357]:
yale.columns

Index([u'Name', u'B/T', u'City', u'Cl.', u'Events', u'High School',
       u'Hometown/High School', u'Hometown/Region', u'Ht.', u'Name_link',
       u'No.', u'Pos.', u'Region', u'Weapon', u'Wt.', u'season', u'sport',
       u'Bio', u'headers'],
      dtype='object')

In [358]:
harvard.columns

Index([       u'Name',         u'B/T',        u'City',       u'Event',
       u'High School',    u'Hometown',         u'Ht.',   u'Name_link',
               u'No.',    u'Position',      u'Region',         u'S/C',
            u'Weapon',         u'Wt.',         u'Yr.',      u'season',
             u'sport',     u'headers',         u'Bio'],
      dtype='object')

In [359]:
yale["headers"][0]

'{"Position:": "TE", "Year:": "Fr.", "Height:": "6-4", "Previous College:": "Iowa State", "Years at Yale:": "***", "Hometown:": "Bonne Terre, MO", "Weight:": "245", "High School:": "North County", "name": "A.J. Haase"}'

In [360]:
harvard.head()

Unnamed: 0,Name,B/T,City,Event,High School,Hometown,Ht.,Name_link,No.,Position,Region,S/C,Weapon,Wt.,Yr.,season,sport,headers,Bio
0,A.J. Jaffe,,St. Charles,,Marmion Academy,"St. Charles, Ill.",,/sports/wrest/2016-17/bios/jaffe_aj,,141,Ill.,,,,Freshman,2016-17,wrest,"{""Hometown:"": ""St. Charles, Ill."", ""Position:""...",\rBefore Harvard Four-time letterwinner at Mar...
1,A.J. Kennedy,,Fonthill,,Notre Dame College,"Fonthill, Ont.",6-3,/sports/fball/2003-04/bios/a.j._kennedy,98.0,DE,Ont.,,,220.0,Freshman,2003-04,fball,"{""Position:"": ""DE"", ""Year:"": ""Freshman"", ""Heig...",
2,AJ Carvalho,,Swansea,,Joseph Case,"Swansea, Mass.",,/sports/mcrew-lw/2010-11/bios/varsity/Carvalho...,,,Mass.,,,,Sophomore,2010-13,mcrew-lw,"{""name"": ""Antonio Carvalho"", ""Year:"": ""Sophomo...",\rSophomore • 2011\r\rDid not see spring race ...
3,Aaki Vora,,Mumbai,Freestyle,The Cathedral and John Connon School,"Mumbai, India",,/sports/wswimdive/2016-17/bios/Vora_Aaki,,,India,,,,Freshman,2016-17,wswimdive,"{""Hometown:"": ""Mumbai, India"", ""Position:"": ""F...",\rBefore Harvard\r\r Best female athlete at th...
4,Aaron Byrd,,Abilene,,Cooper,"Abilene, Texas",6-2,/sports/fball/2001-02/bios/aaron_byrd,3.0,DB,Texas,,,190.0,Freshman,2001-03,fball,"{""Position:"": ""DB"", ""Year:"": ""Freshman"", ""Heig...",


Definitive list of columns:
1. Name
2. B/T
3. City
4. Events
5. High School
6. Hometown
7. Height
8. No.
9. Position
10. Region
11. S/C
12. Weapon
13. Weight
14. Year
15. Active Seasons
16. Sport
17. Headers
18. Bio

Next step involves adding blank data to tables when either does not contain the above columns, and cleaning up some of the data to ensure consistency.

In [361]:
yale.columns

Index([u'Name', u'B/T', u'City', u'Cl.', u'Events', u'High School',
       u'Hometown/High School', u'Hometown/Region', u'Ht.', u'Name_link',
       u'No.', u'Pos.', u'Region', u'Weapon', u'Wt.', u'season', u'sport',
       u'Bio', u'headers'],
      dtype='object')

In [362]:
# Insert missing columns
yale["S/C"] = None
yale["College"] = "Yale"
# Re-order, then rename
# Yale
yaleColumns = ["Name", "B/T", "City", "Events", "High School", "Hometown/High School",
               "Ht.", "No.", "Pos.", "Region", "S/C", "Weapon", "Wt.", "Cl.", "season", "sport", "headers", "Bio", "College"]

In [363]:
yale = yale[yaleColumns]

In [364]:
# STILL WORKING

In [365]:
harvard.columns

Index([       u'Name',         u'B/T',        u'City',       u'Event',
       u'High School',    u'Hometown',         u'Ht.',   u'Name_link',
               u'No.',    u'Position',      u'Region',         u'S/C',
            u'Weapon',         u'Wt.',         u'Yr.',      u'season',
             u'sport',     u'headers',         u'Bio'],
      dtype='object')

In [366]:
harvard["College"] = "Harvard"
harvardColumns = ["Name", "B/T", "City", "Event", "High School", "Hometown",
               "Ht.", "No.", "Position", "Region", "S/C", "Weapon", "Wt.", "Yr.", "season", "sport", "headers", "Bio", "College"]

In [367]:
harvard = harvard[harvardColumns]

In [368]:
harvard.shape

(4270, 19)

In [369]:
yale.shape

(3004, 19)

In [370]:
# Normalize column names
normColumns = ["Name", "B/T", "City", "Events", "High School", "Hometown",
               "Ht.", "No.", "Position", "Region", "S/C", "Weapon", "Wt.", 
               "Class", "Active Seasons", "Sport", "Misc", "Bio", "College"]
yale.columns = normColumns
harvard.columns = normColumns

In [371]:
yale["Hometown"] = yale['Hometown'].apply(lambda x: x.split("/")[0].strip() if type(x) == str else "")

In [372]:
yale["Misc"][0]

'{"Position:": "TE", "Year:": "Fr.", "Height:": "6-4", "Previous College:": "Iowa State", "Years at Yale:": "***", "Hometown:": "Bonne Terre, MO", "Weight:": "245", "High School:": "North County", "name": "A.J. Haase"}'

In [373]:
stacked = pd.concat([yale, harvard], axis=0)

In [374]:
# Drop some of the columns we don't need
finalColumns = ["Name", "High School", "Hometown",
                "Ht.", "No.", "Position", "Wt.",
                "Active Seasons", "Misc", "Bio", "College"]
stacked = stacked[finalColumns]

In [375]:
stacked.head()

Unnamed: 0,Name,High School,Hometown,Ht.,No.,Position,Wt.,Active Seasons,Misc,Bio,College
0,A.J. Haase,North County,"Bonne Terre, MO",6-4,,TE,245.0,2006-10,"{""Position:"": ""TE"", ""Year:"": ""Fr."", ""Height:"":...",2008: Started eight games at TE... LedYale end...,Yale
1,AJ Edwards,South Kent School,"Seattle, Wash.",6-5,25.0,G,190.0,2013-17,"{""Position:"": ""G"", ""Year:"": ""Fr."", ""Height:"": ...",2013-14: Appeared in five games… Scored five p...,Yale
2,Aarica West,Vista Murrieta,"Buena Park, Calif.",5-10,31.0,G/F,,2009-12,"{""Position:"": ""G/F"", ""Year:"": ""Freshman"", ""Hei...",Before Yale: Played four years scholasticallyi...,Yale
3,Aaron Ault,Lake Brantley,"Altamonte Springs, FL",6-3,93.0,LB,194.0,2012-16,"{""Position:"": ""LB"", ""Year:"": ""Fr."", ""Height:"":...","2013: Played in all 10 games, mostly onspecial...",Yale
4,Aaron Bosgang,Paul D. Schreiber High School,"Port Washington, N.Y.",,,,,2016-17,"{""Hometown:"": ""Port Washington, N.Y."", ""Major:...",Off the Water: President of National Honor Soc...,Yale


In [376]:
indices = range(0, 7274)
stacked["Student_ID"] = indices

In [377]:
stacked = stacked.set_index(["Student_ID"])

In [378]:
stacked.shape

(7274, 11)

In [379]:
stacked.columns

Index([u'Name', u'High School', u'Hometown', u'Ht.', u'No.', u'Position',
       u'Wt.', u'Active Seasons', u'Misc', u'Bio', u'College'],
      dtype='object')

In [380]:
# Split up the active seasons into start and end
stacked["StartSeason"] = stacked.apply(lambda x: int(x["Active Seasons"][0:4]), axis=1)
stacked["EndSeason"] = stacked.apply(lambda x: int(x["Active Seasons"][0:2] +
                                                   x["Active Seasons"][-2:]),
                                     axis=1)

In [381]:
stacked = stacked.drop("Active Seasons", axis=1)

In [382]:
stacked.head()

Unnamed: 0_level_0,Name,High School,Hometown,Ht.,No.,Position,Wt.,Misc,Bio,College,StartSeason,EndSeason
Student_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,A.J. Haase,North County,"Bonne Terre, MO",6-4,,TE,245.0,"{""Position:"": ""TE"", ""Year:"": ""Fr."", ""Height:"":...",2008: Started eight games at TE... LedYale end...,Yale,2006,2010
1,AJ Edwards,South Kent School,"Seattle, Wash.",6-5,25.0,G,190.0,"{""Position:"": ""G"", ""Year:"": ""Fr."", ""Height:"": ...",2013-14: Appeared in five games… Scored five p...,Yale,2013,2017
2,Aarica West,Vista Murrieta,"Buena Park, Calif.",5-10,31.0,G/F,,"{""Position:"": ""G/F"", ""Year:"": ""Freshman"", ""Hei...",Before Yale: Played four years scholasticallyi...,Yale,2009,2012
3,Aaron Ault,Lake Brantley,"Altamonte Springs, FL",6-3,93.0,LB,194.0,"{""Position:"": ""LB"", ""Year:"": ""Fr."", ""Height:"":...","2013: Played in all 10 games, mostly onspecial...",Yale,2012,2016
4,Aaron Bosgang,Paul D. Schreiber High School,"Port Washington, N.Y.",,,,,"{""Hometown:"": ""Port Washington, N.Y."", ""Major:...",Off the Water: President of National Honor Soc...,Yale,2016,2017


In [383]:
# def strToDict(string):
#     if string:
#         string = string.encode('utf-8')
#         string = string.replace(" u'", " \"")
#         string = string.replace("': ", "\": ")
#         string = string.replace("', ", "\", ")
#         string = string.replace(" u\"", " \"")
#         string = string[0] + "\"" + string[3:]
#         string = string[:-2] + "\"" + string[-1]
#         try:
#             return json.loads(string, encoding="cp1252")
#         except:
#             print(string)
#             return {}
#     return {}

In [384]:
# stacked["Misc"] = stacked["Misc"].map(strToDict)

In [385]:
stacked.shape
print(json.loads(stacked["Misc"][104]))

{u'Hometown:': u'Shrub Oak, NY', u'Year:': u'Freshman', u'High School:': u'Hackley School', u'name': u'Alexandra Cadicamo'}


In [386]:
def extractMajors(row):
    if row["Misc"] and not pd.isnull(row["Misc"]):
        misc = json.loads(row["Misc"])
        if misc and "Major:" in misc:
            return misc["Major:"]
    return ""

In [387]:
def height2float(height):
    if not pd.isnull(height) and height:
        pair = None
        if '-' in height:
            pair = height.split('-')
        elif "'" in height:
            pair = height.split("'")
            pair[1] = pair[1][:-1] # Remove the "
        elif "0" in height:
            pair = height.split("0")
        if len(pair) == 1:
            pair.append(float(0))
        try:
            pair = map(float, pair)              # convert strings to ints
        except:
            print(pair)
        return (12 * pair[0] + pair[1])    # assumes imperial units (12 inches per foot)  
        return -1

In [388]:
stacked['Ht.'] = stacked['Ht.'].map(height2float)

[u'6', u'']
[u'6', u'']


In [389]:
stacked["Major"] = stacked.apply(extractMajors, axis=1)

In [390]:
majors = stacked.groupby("Major").count().sort_values("Name", ascending=False)

In [391]:
# Seems about right.
processedPath = os.path.join(PROJ_ROOT, "data", "processed", "player_bios_processed.csv")
stacked.to_csv(processedPath, encoding='utf-8')

In [392]:
majorsPath = os.path.join(PROJ_ROOT, "data", "processed", "by_major.csv")
majors.to_csv(majorsPath, encoding='utf-8')