# Python script for joining and manipulating tabular data with pandas

In [1]:
# LIBRARIES TO USE ###################################################
import pandas

In [24]:
# UTILITY FUNCTIONS ##################################################
def get_dataframe(filename):
    """Load a CSV file into a pandas DataFrame.

    ``filename`` is handed to ``pandas.read_csv`` unchanged and with default
    parsing options; the resulting DataFrame is returned.
    """
    frame = pandas.read_csv(filename)
    return frame

def join_data(data_one, data_two, cols, join_type='inner', rename=None):
    """Merge two DataFrames on one or more shared key columns.

    Args:
        data_one: Left-hand DataFrame.
        data_two: Right-hand DataFrame. It is not modified; any renaming is
            applied to a renamed copy.
        cols: Column name, or list of column names, to join on. The names
            must be present in both frames once ``rename`` has been applied.
        join_type: Forwarded to ``pandas.merge`` as ``how``
            (defaults to ``'inner'``).
        rename: Optional mapping of ``{old_name: new_name}`` applied to the
            columns of ``data_two`` before merging. ``None`` or an empty
            mapping means no renaming.

    Returns:
        The merged DataFrame produced by ``pandas.merge``.
    """
    if rename:  # None and empty mappings both mean "nothing to rename"
        # Rename column labels only; the row index is left untouched because
        # a column-on-column merge ignores the indexes of both frames.
        data_two = data_two.rename(columns=rename)

    return pandas.merge(data_one, data_two, how=join_type, on=cols)

def write(data, filename, cols=None):
    """Write a DataFrame to CSV without its row index.

    Args:
        data: DataFrame to write.
        filename: Destination passed straight to ``DataFrame.to_csv``
            (a path or a writable file-like object).
        cols: Optional sequence of column names to write. ``None`` or an
            empty sequence writes every column.
    """
    # len() is used instead of truthiness so array-like selections
    # (e.g. a pandas Index) are accepted as well as plain lists.
    selected = cols if cols is not None and len(cols) > 0 else None
    data.to_csv(filename, columns=selected, index=False)

## Examples

In [25]:
# BASIC JOIN EXAMPLE #################################################
# Inner-join the tournament seeds file with the team conferences file on
# the (TeamID, Season) pair and save the combined table.
cols = ['TeamID', 'Season']
df1 = get_dataframe('data/NCAATourneySeeds.csv')
df2 = get_dataframe('data/TeamConferences.csv')
joined = join_data(data_one=df1, data_two=df2, cols=cols, join_type='inner')
write(joined, filename='cleaned/ConferenceSeeds.csv')

In [26]:
# ORGANIZE THE NCAA TOURNAMENT RESULTS BASED ON THE SEED #############
seeds = get_dataframe( 'data/NCAATourneySeeds.csv' )
results = get_dataframe( 'data/NCAATourneyCompactResults.csv' )

# Attach the winning team's seed: the seeds columns are renamed so they line
# up with the results' WTeamID key and land in a WSeed column.
join_cols = ['WTeamID', 'Season']
rename = { 'TeamID':'WTeamID', 'Seed':'WSeed' }
winner_joined = join_data( results, seeds, cols=join_cols, rename=rename )

# Same again for the losing team, producing an LSeed column.
join_cols = ['LTeamID', 'Season']
rename = { 'TeamID':'LTeamID', 'Seed':'LSeed' }
both_joined = join_data( winner_joined, seeds, cols=join_cols, rename=rename )

# Keep only characters 1-2 of each seed string and convert them to integers.
# NOTE(review): assumes every seed is a string whose 2nd and 3rd characters
# are digits - confirm against the seeds file.
# Vectorised string slicing replaces a per-element Python lambda.
both_joined['WSeed'] = both_joined['WSeed'].str[1:3].astype( 'int64' )
both_joined['LSeed'] = both_joined['LSeed'].str[1:3].astype( 'int64' )

# Loser seed minus winner seed: positive when the winner had the numerically
# lower seed. Plain column arithmetic avoids a row-by-row apply and also
# yields a proper (empty) column when there are no rows.
both_joined['SeedDiff'] = both_joined['LSeed'] - both_joined['WSeed']

write_cols = ['Season', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WSeed', 'LSeed', 'SeedDiff']
write( both_joined, 'cleaned/TourneyResultsWithSeeds.csv', cols=write_cols )

# Read the file back to check what was written; the bare expression displays
# the first five rows when run as a notebook cell.
finished = get_dataframe( 'cleaned/TourneyResultsWithSeeds.csv' )
finished.head(5)

Unnamed: 0,Season,WTeamID,WScore,LTeamID,LScore,WSeed,LSeed,SeedDiff
0,1985,1116,63,1234,54,9,8,-1
1,1985,1120,59,1345,58,11,6,-5
2,1985,1120,66,1242,64,11,3,-8
3,1985,1207,68,1250,43,1,16,15
4,1985,1207,63,1396,46,1,8,7
