In [186]:
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [178]:
# read datast(s)
with open('../data/Coaches.csv', 'rb') as f:
    coaches = pd.read_csv(f).fillna(0)

with open('../data/stadiums-geocoded.csv', 'rb') as f:
    stadium = pd.read_csv(f).fillna(0)

with open('../data/graduation-rates.csv', 'rb') as f:
    grad_rates = pd.read_csv(f).fillna(0)

with open('../data/2017-season.csv', 'rb') as f:
    season_2017 = pd.read_csv(f).fillna(0)

In [179]:
# define monetary columns
monetary_columns = [
    'SchoolPay',
    'TotalPay',
    'Bonus',
    'BonusPaid',
    'AssistantPay',
    'Buyout'
]

# convert lowercase
coaches = coaches.apply(lambda x: x.astype(str).str.lower())
stadium = stadium.apply(lambda x: x.astype(str).str.lower())
grad_rates = grad_rates.apply(lambda x: x.astype(str).str.lower())
season_2017 = season_2017.apply(lambda x: x.astype(str).str.lower())

# remove non-numeric to empty space
coaches[monetary_columns] = coaches[monetary_columns].replace('[\$,--]', '', regex=True)

# coerce monetary to numeric
coaches[monetary_columns] = coaches[monetary_columns].apply(pd.to_numeric, errors='coerce', axis=1).fillna(0)

# column names to lowercase
coaches.columns = coaches.columns.str.lower()
stadium.columns = stadium.columns.str.lower()
grad_rates.columns = grad_rates.columns.str.lower()
season_2017.columns = season_2017.columns.str.lower()

In [180]:
# convert column name: allows dataframe merge
stadium = stadium.rename(columns={'team': 'school'})

# stadium only columns
stadium = stadium[['stadium', 'school', 'capacity', 'built', 'expanded']]

# replace acronym with school name
stadium['school'].replace(['ucf'], 'central florida')
stadium['school'].replace(['usf'], 'south florida')
stadium['school'].replace(['utsa'], 'texas-san antonio')
stadium['school'].replace(['byu'], 'brigham young')
stadium['school'].replace(['utep'], 'texas-el paso')
stadium['school'].replace(['ucf'], 'central florida')
stadium['school'].replace(['tcu'], 'texas christian')
stadium['school'].replace(['unlv'], 'nevada-las vegas')
stadium['school'].replace(['smu'], 'southern methodist')
stadium['school'].replace(['niu'], 'northern illinois')
stadium['school'].replace(['miami (oh)'], 'miami (ohio)')
stadium['school'].replace(['fiu'], 'florida international')
stadium['school'].replace(['umass'], 'massachusetts')
stadium['school'].replace(['yale bulldogs'], 'connecticut')

# merge coaches + stadium
merged_df = pd.merge(coaches, stadium, on='school', how='inner')

# merge graduation rates
merged_df = pd.merge(merged_df, grad_rates, on='school', how='inner')

# merge 2017 season
merged_df = pd.merge(merged_df, season_2017, on='school', how='inner')

In [188]:
# local variables
regr = linear_model.LinearRegression()

# split train + test
train, test = train_test_split(merged_df, test_size=0.33)

                school conference               coach  schoolpay   totalpay  \
32    georgia southern   sun belt       chad lunsford   650000.0   650000.0   
40          iowa state     big 12       matt campbell  3500000.0  3500000.0   
93              toledo        mac        jason candle  1125000.0  1125000.0   
98                utah     pac-12    kyle whittingham  3787917.0  3787917.0   
86            syracuse        acc         dino babers  2401206.0  2401206.0   
12         boise state   mt. west        bryan harsin  1650010.0  1650010.0   
45      louisiana tech      c-usa          skip holtz   700000.0   700000.0   
92          texas tech     big 12     kliff kingsbury  3703975.0  3703975.0   
68                ohio        mac        frank solich   580331.0   580331.0   
25       east carolina        aac  scottie montgomery  1102500.0  1102500.0   
99          utah state   mt. west          matt wells   900000.0   900000.0   
0            air force   mt. west        troy calhou