In [1]:
import pickle

import matplotlib.pylab as plt
import pandas as pd
import seaborn as sns
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

%matplotlib inline



In [None]:
# Load the pre-split training/validation features and targets.
# `index=False` is a DataFrame.to_csv option; pd.read_csv does not accept it
# and raises TypeError, so the files are read with the default settings.
train_df = pd.read_csv('../Data/train_df_0313.csv')
y_train = pd.read_csv('../Data/y_train_0313.csv')
valid_df = pd.read_csv('../Data/valid_df_0313.csv')
y_valid = pd.read_csv('../Data/y_valid_0313.csv')

In [None]:
IDcol = 'Player_ID'
# NOTE(review): `target` was never defined in the visible cells. 'NewGameFanPTs'
# is the column plot_predictions compares the predictions against, so it is
# presumably the target -- confirm before relying on this.
target = 'NewGameFanPTs'

# Every column of the training frame is a feature except the target, the
# player identifiers and the rank label. The original cell iterated over an
# undefined `data` frame; the training frame is what modelfit receives.
non_feature_columns = set([target, IDcol, 'fullName', 'Rank_dup'])
predictors = [x for x in train_df.columns if x not in non_feature_columns]

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

In [None]:
def modelfit(alg, train_df,y_train,predictors,useTrainCV=True, cv_folds=3,early_stopping_rounds=50):
    """Fit ``alg`` on ``train_df[predictors]``, optionally tuning the tree count first.

    Parameters
    ----------
    alg : estimator exposing ``get_xgb_params``, ``get_params``, ``set_params``
        and ``fit`` (an ``XGBRegressor`` in this notebook).
    train_df : DataFrame holding at least the ``predictors`` columns.
    y_train : target values; must expose ``.values`` when ``useTrainCV`` is True.
    predictors : list of column names used as features.
    useTrainCV : when True, run ``xgb.cv`` with early stopping and set
        ``n_estimators`` on ``alg`` to the number of rows in the CV result.
    cv_folds : number of folds passed to ``xgb.cv``.
    early_stopping_rounds : early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    The same ``alg`` object, fitted.
    """
    features = train_df[predictors]

    # Cross validation for selecting the number of estimators (trees).
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        # Select the feature columns *before* dropping to a NumPy array:
        # an ndarray cannot be indexed with a list of column names.
        xgtrain = xgb.DMatrix(features.values, label=y_train.values)
        # NOTE(review): `show_progress` is the keyword this notebook was written
        # against; later xgboost releases appear to use `verbose_eval` instead --
        # confirm against the installed version.
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds,
                          show_progress=True)
        n_rounds = cvresult.shape[0]
        alg.set_params(n_estimators=n_rounds)
        # Single-argument print call behaves the same on Python 2 and 3.
        print("number of estomators : " + str(n_rounds))

    # Fit the algorithm on the data.
    alg.fit(features, y_train, eval_metric='rmse')

    return alg

In [None]:
# Baseline hyperparameters for the first model. n_estimators is only the
# starting value: modelfit (with its default useTrainCV=True) replaces it with
# the number of boosting rounds reported by cross-validation.
xgb1_params = {
    'learning_rate': 0.1,
    'n_estimators': 500,
    'max_depth': 4,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'nthread': -1,
    'scale_pos_weight': 1,
    'seed': 27,
}
xgb1 = XGBRegressor(**xgb1_params)
model1 = modelfit(xgb1, train_df, y_train, predictors)

In [None]:
# Persist the fitted model next to the input data so it can be reloaded later.
with open('../Data/xgboost_model1_0313.pickle', 'wb') as model_file:
    pickle.dump(model1, model_file)

## Model evaluation

In [None]:
def get_performance(xgbmodel,train_df, y_train, valid_df, y_valid,predictors):
    """Print RMSE and R^2 of ``xgbmodel`` on the training and validation data.

    Parameters
    ----------
    xgbmodel : fitted model exposing ``predict``.
    train_df, valid_df : DataFrames containing the ``predictors`` columns.
    y_train, y_valid : true target values matching the two frames.
    predictors : list of feature column names passed to ``predict``.

    Returns
    -------
    None. The report is written to stdout; the validation scores are printed
    under the label "Test".
    """
    def _rmse(actual, predicted):
        # Root of the mean squared error computed by sklearn.metrics.
        return metrics.mean_squared_error(actual, predicted) ** 0.5

    train_predictions = xgbmodel.predict(train_df[predictors])
    # Single-argument print calls produce the same output on Python 2 and 3,
    # whereas the former print statements were a SyntaxError on Python 3.
    print("\nModel Report")
    print("RMSE(Train) : %.4g" % _rmse(y_train, train_predictions))
    print("R^2 Score (Train): %f" % metrics.r2_score(y_train, train_predictions))

    test_result = xgbmodel.predict(valid_df[predictors])
    print("RMSE(Test) : %.4g" % _rmse(y_valid, test_result))
    print('R^2 Score (Test): %f' % metrics.r2_score(y_valid, test_result))

In [None]:
# Report RMSE and R^2 for model1 on the training and validation frames.
get_performance(model1,train_df, y_train, valid_df, y_valid, predictors)

In [None]:
from ggplot import *
def plot_predictions(xgbmodel,predictors,pred_label,data):
    """Scatter-plot model predictions against the ``NewGameFanPTs`` column.

    Parameters
    ----------
    xgbmodel : fitted model exposing ``predict``.
    predictors : list of feature column names passed to ``predict``.
    pred_label : name of the prediction column; also used in the plot title.
    data : DataFrame with the ``predictors``, ``NewGameFanPTs`` and
        ``Rank_dup`` columns. It is not modified.

    Returns
    -------
    The ggplot object: points coloured by ``Rank_dup`` plus a reference line
    with intercept 0 and slope 1.
    """
    # Work on a copy: writing the prediction column into the caller's frame
    # would leak it into any feature list later rebuilt from that frame.
    plot_data = data.copy()
    plot_data[pred_label] = xgbmodel.predict(plot_data[predictors])

    plot_obj = (
        ggplot(aes(x='NewGameFanPTs', y=pred_label, colour='Rank_dup'), data=plot_data)
        + geom_point()
        + ggtitle(pred_label + " Prediction Performance")
        + geom_abline(intercept=0, slope=1)
    )
    return plot_obj

In [None]:
# Plot model1's predictions for the training frame under the label 'Train_pred'.
plot_predictions(model1,predictors,'Train_pred',train_df)

# Testing

In [None]:
def get_experiment_set(games_list, game_month=3,
                       player_logs_path='Data/allplayerFantasyGameLogs.pickle'):
    """Build the roster of players taking part in the given games.

    Parameters
    ----------
    games_list : list of matchup strings in ``'AWAY@HOME'`` form
        (e.g. ``'IND@ATL'``).
    game_month : value written to the ``GameMonth`` column (default 3, the
        value that used to be hard-coded).
    player_logs_path : pickle file holding a DataFrame with at least the
        ``fullName``, ``Player_ID``, ``position1`` and ``Team`` columns.

    Returns
    -------
    DataFrame with one row per distinct (fullName, Player_ID, position1, Team)
    whose team plays in ``games_list``, plus ``Home`` (1 for the home side,
    0 otherwise), ``OpponentTeam`` and ``GameMonth``.

    Raises
    ------
    ValueError
        If an entry of ``games_list`` is not of the ``'AWAY@HOME'`` form.
    """
    home_teams = []
    away_teams = []
    for game in games_list:
        parts = game.split('@')
        if len(parts) != 2 or not parts[0] or not parts[1]:
            raise ValueError("game must look like 'AWAY@HOME', got %r" % (game,))
        away_teams.append(parts[0])
        home_teams.append(parts[1])

    # Each team maps to the team it faces, in both directions.
    matchup_map = {}
    for home, away in zip(home_teams, away_teams):
        matchup_map[home] = away
        matchup_map[away] = home

    # Get all the players in the games today.
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # files produced by this project.
    with open(player_logs_path, 'rb') as handle:
        player_logs = pickle.load(handle)

    roster_columns = ['fullName', 'Player_ID', 'position1', 'Team']
    playing_today = player_logs['Team'].isin(home_teams + away_teams)
    # Copy so the column assignments below never write into a view of the logs.
    todayPlayers_df = player_logs.loc[playing_today, roster_columns].drop_duplicates().copy()

    todayPlayers_df['Home'] = todayPlayers_df['Team'].isin(home_teams).astype(int)
    todayPlayers_df['OpponentTeam'] = todayPlayers_df['Team'].map(matchup_map)
    todayPlayers_df['GameMonth'] = game_month

    # TODO: integrate the player feature table
    return todayPlayers_df
    
    
    

In [20]:
# Load the pickled fantasy game logs (bound to the name playerBios).
# NOTE(review): this path is relative to 'Data/', while the CSV cells read
# from '../Data/' -- confirm the working directory.
with open('Data/allplayerFantasyGameLogs.pickle', 'rb') as logs_file:
    playerBios = pickle.load(logs_file)

In [48]:
# Rebuild, at notebook scope, the names the step-by-step cells rely on
# (teams, HomeTeam, matchup_map). They mirror the locals of
# get_experiment_set; the games are the ones shown in the games_list output.
games_list = ['IND@ATL', 'UTA@SAC', 'MIL@BKN', 'NY@LAL']
OpponentTeam = [game.split('@')[0] for game in games_list]
HomeTeam = [game.split('@')[1] for game in games_list]
teams = HomeTeam + OpponentTeam
matchup_map = {}
for home, away in zip(HomeTeam, OpponentTeam):
    matchup_map[home] = away
    matchup_map[away] = home

# Keep only the identifying columns for players whose team plays today.
todayPlayers_df = playerBios[playerBios['Team'].isin(pd.Series(teams))][['fullName','Player_ID','position1','Team']]

In [49]:
# Collapse repeated rows so each distinct combination of the selected columns appears once.
todayPlayers_df = todayPlayers_df.drop_duplicates()

In [51]:
# Home flag: 1 when the player's team is listed in HomeTeam, otherwise 0.
todayPlayers_df['Home'] = todayPlayers_df['Team'].map(lambda x:1 if x in HomeTeam else 0)

In [71]:
# Look up each player's opponent through matchup_map (raises KeyError for a team missing from the map).
todayPlayers_df['OpponentTeam'] = todayPlayers_df['Team'].map(lambda x: matchup_map[x])

In [79]:
# Display the matchup strings (split on '@' by get_experiment_set).
games_list

['IND@ATL', 'UTA@SAC', 'MIL@BKN', 'NY@LAL']

In [10]:
import pickle

# Load the pickled player biography table. This rebinds playerBios, the same
# name the game-log cell uses for a different file.
with open('Data/allPlayerBios.pickle', 'rb') as bios_file:
    playerBios = pickle.load(bios_file)

In [12]:
# Preview the first three rows, transposed so each column name becomes a row.
playerBios.head(3).transpose()

Unnamed: 0,0,1,2
PERSON_ID,203112,203919,203500
FIRST_NAME,Quincy,Jordan,Steven
LAST_NAME,Acy,Adams,Adams
DISPLAY_FIRST_LAST,Quincy Acy,Jordan Adams,Steven Adams
DISPLAY_LAST_COMMA_FIRST,"Acy, Quincy","Adams, Jordan","Adams, Steven"
DISPLAY_FI_LAST,Q. Acy,J. Adams,S. Adams
BIRTHDATE,1990-10-06T00:00:00,1994-07-08T00:00:00,1993-07-20T00:00:00
SCHOOL,Baylor,UCLA,Pittsburgh
COUNTRY,USA,USA,New Zealand
LAST_AFFILIATION,Baylor/USA,UCLA/USA,Pittsburgh/New Zealand
