In [21]:
#train
import read_player_stats
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.learning_curve import learning_curve
from sklearn import cross_validation
from sklearn import metrics
pd.options.display.max_columns = 1000
import matplotlib.pyplot as plt
%matplotlib inline

In [11]:
def make_total_data(seasons=range(2004,2014),pages=[0,1]):
	total_df = None
	for season in seasons:
		for page in pages:
			df = read_player_stats.rb_stats(season,page)
			if total_df is None:
				total_df = df
			else:
				total_df = total_df.append(df, ignore_index = True)
			
	total_df.sort_index(by=['Name','Season'], inplace=True)
	return total_df

def merge_seasons(df_season1, df_season2):
	df_dropped1 = df_season1.drop(['Season', 'Team'], axis=1)
	df_dropped2 = df_season2.drop(['Season', 'Team'], axis=1)
	
	merged = pd.merge(df_dropped1,df_dropped2, on='Name', how='outer', suffixes=('_1', '_2'))
	return merged

def make_training_df(total_df, seasons=range(2004,2013)):
	training_data_df = None
	for season in seasons[:-1]:
		df1 = total_df[total_df.Season == season]
		df2 = total_df[total_df.Season == season+1]
		label_df = total_df[total_df.Season == season+2]
		labeled = pd.merge(merge_seasons(df1, df2), label_df[['Name','FFP']], on='Name')
		if training_data_df is None:
			training_data_df = labeled
		else:
			training_data_df = training_data_df.append(labeled, ignore_index=True)
		
	training_data_nadropped = training_data_df.dropna() #dont train on missing seasons
	return training_data_nadropped

In [12]:
total_df = make_total_data(seasons=range(2004,2015), pages=[0,1,2,3])
train_df = make_training_df(total_df, seasons=range(2004,2012))

In [13]:
X_train = np.array(train_df.drop(['Name','FFP'], axis=1))
y_train = np.array(train_df['FFP'])

In [14]:
model = RandomForestRegressor(n_estimators=500)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [15]:
eval_df = make_training_df(total_df, seasons=range(2012,2015))
X_eval = np.array(eval_df.drop(['Name','FFP'], axis=1))

In [16]:
y_eval = model.predict(X_eval)

In [17]:
pred_summary = eval_df[['Name']]
pred_summary['Predicted'] = y_eval
actual = total_df[total_df.Season == 2014]
actual = actual[['Name','FFP']]
pred_summary = pd.merge(pred_summary, actual, on='Name', how='inner')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [18]:
pred_summary

Unnamed: 0,Name,Predicted,FFP
0,AdrianPeterson,151.963093,9.3
1,AhmadBradshaw,87.268600,120.5
2,AlfonsoSmith,33.163000,3.0
3,AlfredMorris,124.852500,170.9
4,AnthonyDixon,34.444800,60.1
5,AnthonySherman,12.896800,13.9
6,AntoneSmith,46.755500,66.6
7,ArianFoster,138.712821,235.3
8,BenTate,93.601850,67.0
9,BernardPierce,100.667320,49.6


In [19]:
full_train = make_training_df(total_df, seasons=range(2004,2015))
X_full = np.array(full_train.drop(['Name','FFP'], axis=1))
y_full = np.array(full_train['FFP'])
cv_scores = cross_validation.cross_val_score(RandomForestRegressor(n_estimators=500), X_full, y_full, cv=10)

In [20]:
cv_scores

array([ 0.54288366,  0.5257067 ,  0.46848983,  0.34646579,  0.52774278,
        0.33385591,  0.60291736,  0.54334547,  0.50768803,  0.41507144])