In [29]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn import svm
from sklearn import cross_validation
from sklearn import metrics
import re
pd.options.display.max_columns = 1000

In [30]:
seasons = [2013, 2014]
rb_column_names = ['Name', 'Team', 'Games', 'RunAtt', 'RunYards', 'RunTD', 'Targets', 'Rec', 'RecYards', 'RecTD', 'FFP', 'FFPPG']

RB_2014_1 = pd.read_csv('data/2014/RB_2014_1.csv',skiprows=1, thousands=',')
RB_2014_2 = pd.read_csv('data/2014/RB_2014_2.csv',skiprows=1, thousands=',')
RB_2013_1 = pd.read_csv('data/2013/RB_2013_1.csv',skiprows=1, thousands=',')
RB_2013_2 = pd.read_csv('data/2013/RB_2013_2.csv',skiprows=1, thousands=',')
RB_2012_1 = pd.read_csv('data/2012/RB_2012_1.csv',skiprows=1, thousands=',')
RB_2012_2 = pd.read_csv('data/2012/RB_2012_2.csv',skiprows=1, thousands=',')
RB_2011_1 = pd.read_csv('data/2011/RB_2011_1.csv',skiprows=1, thousands=',')
RB_2011_2 = pd.read_csv('data/2011/RB_2011_2.csv',skiprows=1, thousands=',')

RB_DF_LIST = [RB_2014_1, RB_2014_2, RB_2013_1, RB_2013_2, RB_2012_1, RB_2012_2, RB_2011_1, RB_2011_2]

In [31]:
for df in RB_DF_LIST:
    df.columns = rb_column_names
    df['Name'] = df['Name'].str.replace('[^a-z]', '',flags=re.IGNORECASE)

In [32]:
RB_2014 = RB_2014_1.append(RB_2014_2, ignore_index=True)
RB_2013 = RB_2013_1.append(RB_2013_2, ignore_index=True)
RB_2012 = RB_2012_1.append(RB_2011_2, ignore_index=True)
RB_2011 = RB_2011_1.append(RB_2011_2, ignore_index=True)

In [33]:
RB_sub1 = pd.merge(RB_2014, RB_2013, on='Name', how='outer', suffixes=('_2014', '_2013'))
RB_sub2 = pd.merge(RB_2012, RB_2011, on='Name', how='outer', suffixes=('_2012', '_2011'))
RB_Total = pd.merge(RB_sub1, RB_sub2, on='Name', how='outer')

In [34]:
RB_Total = RB_Total.fillna(0)
RB_Total

Unnamed: 0,Name,Team_2014,Games_2014,RunAtt_2014,RunYards_2014,RunTD_2014,Targets_2014,Rec_2014,RecYards_2014,RecTD_2014,FFP_2014,FFPPG_2014,Team_2013,Games_2013,RunAtt_2013,RunYards_2013,RunTD_2013,Targets_2013,Rec_2013,RecYards_2013,RecTD_2013,FFP_2013,FFPPG_2013,Team_2012,Games_2012,RunAtt_2012,RunYards_2012,RunTD_2012,Targets_2012,Rec_2012,RecYards_2012,RecTD_2012,FFP_2012,FFPPG_2012,Team_2011,Games_2011,RunAtt_2011,RunYards_2011,RunTD_2011,Targets_2011,Rec_2011,RecYards_2011,RecTD_2011,FFP_2011,FFPPG_2011
0,DeMarcoMurray,DAL,16,393,1845,13,64,57,416,0,294.1,18.4,DAL,14,217,1124,9,66,53,348,1,205.2,14.7,DAL,10,161,663,4,41,34,247,0,111.0,11.1,DAL,13,163,895,2,35,26,183,0,119.8,9.2
1,LeVeonBell,PIT,16,290,1361,8,105,83,854,3,287.5,18.0,PIT,13,244,860,8,66,45,408,0,172.8,13.3,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0
2,MarshawnLynch,SEA,16,280,1306,13,48,37,367,4,265.3,16.6,SEA,16,301,1257,12,44,36,316,2,239.3,15.0,SEA,16,315,1590,11,30,23,196,1,246.6,15.4,SEA,15,285,1204,12,41,28,212,1,215.6,14.4
3,MattForte,CHI,16,266,1038,6,130,102,808,4,240.6,15.0,CHI,16,288,1341,9,95,75,592,3,261.3,16.3,CHI,15,248,1094,5,60,44,340,1,177.4,11.8,CHI,12,204,999,3,76,52,490,1,168.9,14.1
4,ArianFoster,HOU,13,260,1246,8,59,38,327,5,231.3,17.8,HOU,8,121,542,1,35,22,183,1,84.5,10.6,HOU,16,351,1411,15,58,40,217,2,260.8,16.3,HOU,13,278,1224,10,72,53,617,2,250.1,19.2
5,EddieLacy,GB,16,246,1139,9,55,42,427,4,230.6,14.4,GB,15,284,1178,11,44,35,257,0,207.5,13.8,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0
6,JamaalCharles,KC,15,205,1039,9,59,40,291,5,213.0,14.2,KC,15,259,1288,12,104,70,693,7,308.1,20.5,KC,16,284,1513,5,49,36,236,1,204.9,12.8,0,0,0,0,0,0,0,0,0,0.0,0.0
7,JustinForsett,BAL,16,235,1266,8,59,44,263,0,200.9,12.6,0,0,0,0,0,0,0,0,0,0.0,0.0,SEA,16,46,145,1,34,23,128,0,33.3,2.1,SEA,16,46,145,1,34,23,128,0,33.3,2.1
8,LamarMiller,MIA,16,216,1099,8,52,38,275,1,185.4,11.6,MIA,16,177,709,2,35,26,170,0,97.9,6.1,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0
9,JeremyHill,CIN,16,222,1124,9,32,27,215,0,183.9,11.5,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0


In [35]:
stats_column_names = ['Games', 'RunAtt', 'RunYards', 'RunTD', 'Targets', 'Rec', 'RecYards', 'RecTD', 'FFP', 'FFPPG']
stats_column_names_2012 = [element + '_2012' for element in stats_column_names]
stats_column_names_2011 = [element + '_2011' for element in stats_column_names]

In [36]:
train_stat_colls = stats_column_names_2012 + stats_column_names_2011
train_stat_colls =  ['Name', 'FFP_2013', 'RunAtt_2013'] + train_stat_colls
train_stat_colls

['Name',
 'FFP_2013',
 'RunAtt_2013',
 'Games_2012',
 'RunAtt_2012',
 'RunYards_2012',
 'RunTD_2012',
 'Targets_2012',
 'Rec_2012',
 'RecYards_2012',
 'RecTD_2012',
 'FFP_2012',
 'FFPPG_2012',
 'Games_2011',
 'RunAtt_2011',
 'RunYards_2011',
 'RunTD_2011',
 'Targets_2011',
 'Rec_2011',
 'RecYards_2011',
 'RecTD_2011',
 'FFP_2011',
 'FFPPG_2011']

In [37]:
train_df = RB_Total[train_stat_colls]
train_df

Unnamed: 0,Name,FFP_2013,RunAtt_2013,Games_2012,RunAtt_2012,RunYards_2012,RunTD_2012,Targets_2012,Rec_2012,RecYards_2012,RecTD_2012,FFP_2012,FFPPG_2012,Games_2011,RunAtt_2011,RunYards_2011,RunTD_2011,Targets_2011,Rec_2011,RecYards_2011,RecTD_2011,FFP_2011,FFPPG_2011
0,DeMarcoMurray,205.2,217,10,161,663,4,41,34,247,0,111.0,11.1,13,163,895,2,35,26,183,0,119.8,9.2
1,LeVeonBell,172.8,244,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0
2,MarshawnLynch,239.3,301,16,315,1590,11,30,23,196,1,246.6,15.4,15,285,1204,12,41,28,212,1,215.6,14.4
3,MattForte,261.3,288,15,248,1094,5,60,44,340,1,177.4,11.8,12,204,999,3,76,52,490,1,168.9,14.1
4,ArianFoster,84.5,121,16,351,1411,15,58,40,217,2,260.8,16.3,13,278,1224,10,72,53,617,2,250.1,19.2
5,EddieLacy,207.5,284,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0
6,JamaalCharles,308.1,259,16,284,1513,5,49,36,236,1,204.9,12.8,0,0,0,0,0,0,0,0,0.0,0.0
7,JustinForsett,0.0,0,16,46,145,1,34,23,128,0,33.3,2.1,16,46,145,1,34,23,128,0,33.3,2.1
8,LamarMiller,97.9,177,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0
9,JeremyHill,0.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0


In [38]:
X_train = np.array(train_df.drop(['Name', 'FFP_2013'], axis=1))
y_train = np.ravel(train_df['FFP_2013'])

In [39]:
X_train

array([[ 217. ,   10. ,  161. , ...,    0. ,  119.8,    9.2],
       [ 244. ,    0. ,    0. , ...,    0. ,    0. ,    0. ],
       [ 301. ,   16. ,  315. , ...,    1. ,  215.6,   14.4],
       ..., 
       [   0. ,    0. ,    0. , ...,    2. ,   90.9,    6.5],
       [   0. ,    0. ,    0. , ...,    1. ,   85.7,   14.3],
       [   0. ,    0. ,    0. , ...,    0. ,   81. ,    7.4]])

In [40]:
gbr = GradientBoostingRegressor(n_estimators=500)
rfr = RandomForestRegressor(n_estimators=500)
score = cross_validation.cross_val_score(gbr, X_train, y_train, cv=3)

In [41]:
score

array([ 0.82447019,  0.65351916, -0.5817107 ])

In [42]:
gbr.fit(X_train, y_train)
rfr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [43]:
stats_column_names_2013 = [element + '_2013' for element in stats_column_names]
stats_column_names_2013 = stats_column_names_2013

In [44]:
pred_df = RB_Total[['Name', 'RunAtt_2014'] + stats_column_names_2013 + stats_column_names_2012]
pred_df

Unnamed: 0,Name,RunAtt_2014,Games_2013,RunAtt_2013,RunYards_2013,RunTD_2013,Targets_2013,Rec_2013,RecYards_2013,RecTD_2013,FFP_2013,FFPPG_2013,Games_2012,RunAtt_2012,RunYards_2012,RunTD_2012,Targets_2012,Rec_2012,RecYards_2012,RecTD_2012,FFP_2012,FFPPG_2012
0,DeMarcoMurray,393,14,217,1124,9,66,53,348,1,205.2,14.7,10,161,663,4,41,34,247,0,111.0,11.1
1,LeVeonBell,290,13,244,860,8,66,45,408,0,172.8,13.3,0,0,0,0,0,0,0,0,0.0,0.0
2,MarshawnLynch,280,16,301,1257,12,44,36,316,2,239.3,15.0,16,315,1590,11,30,23,196,1,246.6,15.4
3,MattForte,266,16,288,1341,9,95,75,592,3,261.3,16.3,15,248,1094,5,60,44,340,1,177.4,11.8
4,ArianFoster,260,8,121,542,1,35,22,183,1,84.5,10.6,16,351,1411,15,58,40,217,2,260.8,16.3
5,EddieLacy,246,15,284,1178,11,44,35,257,0,207.5,13.8,0,0,0,0,0,0,0,0,0.0,0.0
6,JamaalCharles,205,15,259,1288,12,104,70,693,7,308.1,20.5,16,284,1513,5,49,36,236,1,204.9,12.8
7,JustinForsett,235,0,0,0,0,0,0,0,0,0.0,0.0,16,46,145,1,34,23,128,0,33.3,2.1
8,LamarMiller,216,16,177,709,2,35,26,170,0,97.9,6.1,0,0,0,0,0,0,0,0,0.0,0.0
9,JeremyHill,222,0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0


In [45]:
X_pred = np.array(pred_df.drop(['Name'], axis=1))
y_pred = gbr.predict(X_pred)

In [46]:
y_pred

array([  2.69435592e+02,   2.54212388e+02,   2.16245735e+02,
         2.29317937e+02,   1.98608223e+02,   2.39086276e+02,
         1.39587032e+02,   1.40863222e+02,   1.71769406e+02,
         1.43601832e+02,   1.10187748e+02,   2.69385368e+02,
         1.47314117e+02,   1.94921515e+02,   1.32921660e+02,
         1.62406549e+02,   1.13234696e+02,   1.45161712e+02,
         1.09725052e+02,   1.40224803e+02,   1.34096981e+02,
         1.33436702e+02,   1.05274184e+02,   8.80235251e+01,
         1.10187748e+02,   7.41867543e+01,   7.80821888e+01,
         9.50408921e+01,   1.18191447e+02,   1.29256925e+02,
         9.96584694e+01,   1.66373906e+02,   9.42389588e+01,
         1.20432369e+02,   1.36110917e+02,   9.41885478e+01,
         7.99882602e+01,   2.96513468e+01,   1.28737856e+02,
         1.66373906e+02,   1.11088872e+02,   3.45174833e+01,
         1.17986697e+02,   6.10922654e+01,   8.18030604e+01,
         6.45739522e+01,   3.64442767e+01,   9.87486618e+01,
         9.98635579e+01,

In [47]:
val_df = RB_Total[['Name','FFP_2014']]
val_df['pred 2014'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [48]:
val_df

Unnamed: 0,Name,FFP_2014,pred 2014
0,DeMarcoMurray,294.1,269.435592
1,LeVeonBell,287.5,254.212388
2,MarshawnLynch,265.3,216.245735
3,MattForte,240.6,229.317937
4,ArianFoster,231.3,198.608223
5,EddieLacy,230.6,239.086276
6,JamaalCharles,213.0,139.587032
7,JustinForsett,200.9,140.863222
8,LamarMiller,185.4,171.769406
9,JeremyHill,183.9,143.601832


In [49]:
print(score)

[ 0.82447019  0.65351916 -0.5817107 ]
