In [221]:
import pandas as pd
import pylab as p
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit, train_test_split
import scipy
import random
import time
from multiprocessing import Pool

In [222]:
# Directory holding the NFL 2000-2012 CSV files. Every table below is read from
# here, so pointing the notebook at another copy of the data is a one-line change.
DATA_DIR = '/Users/jostheim/workspace/kaggle/data/nfl/NFLData_2000-2012'


def _read_table(file_name, **read_csv_kwargs):
    """Read one CSV from DATA_DIR, forwarding any keyword arguments to pd.read_csv."""
    return pd.read_csv('{0}/{1}'.format(DATA_DIR, file_name), **read_csv_kwargs)


core_df = _read_table('CORE.csv')
game_df = _read_table('GAMES.csv')
team_df = _read_table('TEAM.csv', index_col=[0])
# CORE.csv is read a second time, indexed by its second column, so that the
# joins below can line it up with the index of the pass and rush tables.
# NOTE(review): presumably that column is the play id - confirm against the data.
plays_df = _read_table('CORE.csv', index_col=[1])
pass_df = _read_table('PASS.csv', index_col=[0])
rush_df = _read_table('RUSH.csv', index_col=[0])
# Attach the CORE columns to every pass and rush play.
pass_df = pass_df.join(plays_df)
rush_df = rush_df.join(plays_df)


#win_orig_df = win_orig_df.join(game_df)
offense_df = _read_table('OFFENSE.csv', index_col=[0])
defense_df = _read_table('DEFENSE.csv', index_col=[0])
players_df = _read_table('PLAYERS.csv')
# players_df['player_id'] = players_df.index
# offense_df = offense_df.join(players_df)
# defense_df = defense_df.join(players_df)


In [223]:
def get_num_plays(row):
    """Count the pass and rush plays of row's game that involve row's player.

    A pass play counts when the player is its PSR or its TRG value; a rush play
    counts when the player is its BC value. Reads the module-level pass_df and
    rush_df frames.
    """
    game_id = row['GID']
    player = row['PLAYER']

    in_pass_play = (pass_df['PSR'] == player) | (pass_df['TRG'] == player)
    pass_plays = pass_df[(pass_df['GID'] == game_id) & in_pass_play]

    rush_plays = rush_df[(rush_df['GID'] == game_id) & (rush_df['BC'] == player)]

    return len(pass_plays) + len(rush_plays)

def get_position(row):
    """Return the POS1 value of the first players_df record matching row['PLAYER'].

    Raises IndexError when players_df holds no record for that player.
    """
    is_this_player = players_df['PLAYER'] == row['PLAYER']
    return players_df.loc[is_this_player, 'POS1'].values[0]

    
    
# Empty placeholder; nothing in this cell adds to it.
features = []

# Tag every offensive and defensive stat line with the player's play count for
# that game (SNP) and the player's position (POS). Both helpers take a row, so
# they can be handed to apply() directly.
offense_df['SNP'] = offense_df.apply(get_num_plays, axis=1)
offense_df['POS'] = offense_df.apply(get_position, axis=1)
defense_df['SNP'] = defense_df.apply(get_num_plays, axis=1)
defense_df['POS'] = defense_df.apply(get_position, axis=1)

In [224]:
# Show which columns the offense table carries after SNP and POS were added.
print(offense_df.columns)

Index([u'GID', u'PLAYER', u'PA', u'PC', u'PY', u'INT', u'TDP', u'RA', u'SRA', u'RY', u'TDR', u'TRG', u'REC', u'RECY', u'TDRE', u'FUML', u'PENY', u'FPTS', u'GAME', u'SEAS', u'YEAR', u'TEAM', u'SNP', u'POS'], dtype='object')


In [225]:
# Columns the feature builders skip entirely.
ignore_columns = [
    'PLAYER', 'GID', 'TID', 'TNAME', 'FPTS', 'POS',
]

# Columns whose values are turned into 1.0-valued "IS_..." indicator features.
categorical_to_binary_columns = [
    'V', 'H', 'STAD', 'WDIR', 'COND', 'SURF', 'TEAM', 'DAY',
]

# Columns of the game table whose text values are parsed into floats.
convert_to_float_columns = [
    'HUMD', 'WSPD',
]


In [226]:
def _to_float_or_zero(raw):
    """Parse raw as a float; return 0.0 when it is not numeric text or is NaN."""
    try:
        parsed = float(str(raw).strip())
    except ValueError:
        return 0.0
    # NaN is the only float that differs from itself.
    return 0.0 if parsed != parsed else parsed


def get_game_features(index, game, my_team):
    """Build a feature dict from the first row of a game frame.

    Parameters:
        index: label embedded in every generated feature name.
        game: DataFrame whose first row describes the game; must have a 'V' column.
        my_team: team name compared with the game's 'V' value.

    Returns:
        dict with
        * 'is_visitor': 1.0 when my_team equals the 'V' value, else 0.0;
        * 'IS_<index>_<value>': 1.0 for each column in categorical_to_binary_columns
          whose value is not the two-character string backslash-N
          (NOTE(review): presumably the export's NULL marker - confirm);
        * '<index>_<column>': a float for columns in convert_to_float_columns
          (0.0 when the value is not numeric) and the raw value for every other
          column outside ignore_columns.

    Raises:
        IndexError: when game has no rows.
    """
    feature = {}
    # setup the features from the game data (weather, point spread and stuff)
    feature['is_visitor'] = 1.0 if my_team == game['V'].values[0] else 0.0

    for key in game.columns:
        if key in ignore_columns:
            continue
        value = game[key].values[0]
        if key in categorical_to_binary_columns:
            # Written as "\\N" because a bare "\N" is a syntax error on Python 3;
            # both spell the same two characters.
            if value != "\\N":
                feature['IS_{0}_{1}'.format(index, value)] = 1.0
        elif key in convert_to_float_columns:
            # float() instead of an isdigit() gate: readings such as "7.5" or "-3"
            # are kept rather than silently replaced by 0.0.
            feature['{0}_{1}'.format(index, key)] = _to_float_or_zero(value)
        else:
            feature['{0}_{1}'.format(index, key)] = value
    return feature

def get_team_game_features(index, team_game_data, game, players_team):
    """Build a feature dict from the player's own team row(s) of one game.

    Parameters:
        index: label embedded in every generated feature name.
        team_game_data: DataFrame of team rows for the game; needs a 'TNAME' column.
        game: unused; kept so existing call sites keep working.
        players_team: only rows whose 'TNAME' equals this value contribute.

    Returns:
        dict with 'IS_<index>_<value>_my' = 1.0 for columns in
        categorical_to_binary_columns and '<index>_my_<column>' = value for the
        other columns outside ignore_columns. Empty when no row matches.
    """
    feature = {}
    for _, team_row in team_game_data.iterrows():
        # Rows of other teams are skipped, so every feature emitted here carries
        # the 'my' tag.
        if team_row['TNAME'] != players_team:
            continue
        # setup the features from the team's row
        for key, val in zip(team_row.index, team_row):
            if key in ignore_columns:
                continue
            if key in categorical_to_binary_columns:
                # val is already the scalar cell value; it has no .values attribute.
                feature['IS_{0}_{1}_my'.format(index, val)] = 1.0
            else:
                feature['{0}_my_{1}'.format(index, key)] = val
    return feature

In [231]:
def get_features_for_player(index, offesne_player_row):
    """Turn one stat-line row into a feature dict.

    Columns in ignore_columns are dropped. A column in categorical_to_binary_columns
    becomes 'IS_<value>_<index>' = 1.0; every other column becomes
    '<column>_<index>' = value.
    """
    feature = {}
    kept = [
        (column, value)
        for column, value in zip(offesne_player_row.index, offesne_player_row)
        if column not in ignore_columns
    ]
    for column, value in kept:
        if column in categorical_to_binary_columns:
            feature["IS_{0}_{1}".format(value, index)] = 1.0
        else:
            feature["{0}_{1}".format(column, index)] = value
    return feature

def get_career_features_for_player(stats_df, player_id):
    """Aggregate every usable column over all stats_df rows of one player.

    For each column that is neither in ignore_columns nor of object dtype, the
    result holds 'career_<column>_sum', 'career_<column>_mean' and
    'career_<column>_std'.
    """
    feature = {}
    all_player = stats_df[stats_df['PLAYER'] == player_id]
    for col_name, col_dtype in zip(all_player.columns, all_player.dtypes):
        if col_name in ignore_columns:
            continue
        if str(col_dtype) == "object":
            continue
        values = all_player[col_name]
        feature["career_{0}_sum".format(col_name)] = values.sum()
        feature["career_{0}_mean".format(col_name)] = values.mean()
        feature["career_{0}_std".format(col_name)] = values.std()
    return feature

def get_moving_mean_features(index, moving_mean):
    """Summarise each usable column of the moving_mean frame.

    Columns in ignore_columns or categorical_to_binary_columns are skipped. For
    the rest the result holds 'moving_average_<index>_<column>_sum', '..._mean'
    and '..._std'.
    """
    feature = {}
    for col_name in moving_mean.columns:
        if col_name in ignore_columns or col_name in categorical_to_binary_columns:
            continue
        column = moving_mean[col_name]
        feature["moving_average_{0}_{1}_sum".format(index, col_name)] = column.sum()
        feature["moving_average_{0}_{1}_mean".format(index, col_name)] = column.mean()
        feature["moving_average_{0}_{1}_std".format(index, col_name)] = column.std()
    return feature
    
def get_historical_features_for_player(stats_df, player_id, game_id, my_team, opp_team,
                                       history_window=4):
    """Build the feature dict for one player going into one game.

    Parameters:
        stats_df: per-game stat lines with at least PLAYER, GID and FPTS columns.
        player_id: PLAYER value to build features for.
        game_id: GID of the game being predicted.
        my_team, opp_team: unused; kept so existing call sites keep working.
        history_window: how many of the most recent earlier games to expand into
            per-game and moving-average features (default 4).

    Returns:
        {} when the player has no rows in stats_df. Otherwise a dict holding
        * 'target': the player's FPTS in game_id (absent when the player has no
          row for that game);
        * the career aggregates over the player's earlier games;
        * for the k-th most recent earlier game (k = 0 is the latest), that
          game's stat features and the moving-average features over games 0..k.

    Only rows with GID < game_id feed the features, so neither the game being
    predicted nor any later game leaks into them.
    NOTE(review): this assumes GID increases chronologically - confirm.
    """
    feature = {}
    player_plays = stats_df[stats_df['PLAYER'] == player_id]
    if len(player_plays) == 0:
        return feature

    # target is this game's fantasy points: one scalar, not the whole FPTS column
    this_game = player_plays[player_plays['GID'] == game_id]
    if len(this_game) > 0:
        feature['target'] = this_game['FPTS'].values[0]

    prior_plays = player_plays[player_plays['GID'] < game_id]
    feature.update(get_career_features_for_player(prior_plays, player_id))

    # DataFrame.sort was renamed sort_values in later pandas; use whichever exists.
    sort_rows = getattr(prior_plays, 'sort_values', None) or prior_plays.sort
    recent_games = sort_rows(['GID'], ascending=[False])[:history_window]

    for k, (_, player_row) in enumerate(recent_games.iterrows()):
        # setup the features for the player in that earlier game
        feature.update(get_features_for_player(k, player_row))
        # the window grows one game at a time: games 0..k
        feature.update(get_moving_mean_features(k, recent_games[:k + 1]))
    return feature

def _ordered_player_features(stats_df, game_players, game_id, my_team, other_team):
    """Return [(player_id, features, pos), ...] ordered by POS, then SNP descending."""
    # DataFrame.sort was renamed sort_values in later pandas; use whichever exists.
    sort_rows = getattr(game_players, 'sort_values', None) or game_players.sort
    ordered = sort_rows(['POS', 'SNP'], ascending=[True, False])
    return [
        (row['PLAYER'],
         get_historical_features_for_player(stats_df, row['PLAYER'], game_id, my_team, other_team),
         row['POS'])
        for _, row in ordered.iterrows()
    ]


def _add_prefixed(feature, prefix, player_features):
    """Copy player_features into feature under prefix, leaving out the 'target' entry."""
    for key, val in player_features.items():
        if key != 'target':
            feature['{0}_{1}'.format(prefix, key)] = val


def get_features_for_game(game_id, team, opp_team):
    """Build one feature row per offensive player of team in the given game.

    Offensive players come from offense_df (rows of team), defensive players from
    defense_df (rows of opp_team). Both groups are ordered by POS and then by SNP
    descending, and that order fixes the number in every feature prefix.

    Each returned dict describes one "focus" offensive player:
    * 'target' - the focus player's own target, when his features carry one;
    * 'off_<pos>_current_<key>' - the focus player's features;
    * 'off_<pos>_<j>_<key>' - features of the j-th offensive player;
    * 'def_<pos>_<j>_<key>' - features of the j-th defensive player.
    The 'target' entries of the other players are never copied: they are results
    of the very game being predicted.

    Returns a list of dicts, empty when team has no offensive rows in the game.
    """
    offense_game_players = offense_df[(offense_df['GID'] == game_id) & (offense_df['TEAM'] == team)]
    defense_game_players = defense_df[(defense_df['GID'] == game_id) & (defense_df['TEAM'] == opp_team)]

    print("game_id: {0} num_players: {1} {2}".format(
        game_id, len(offense_game_players), len(defense_game_players)))

    # Lists keep the POS/SNP ordering; the dicts used before did not guarantee
    # any iteration order, which made the numbered prefixes arbitrary.
    # we go through the "team" players as offense ...
    offense_features = _ordered_player_features(offense_df, offense_game_players, game_id, team, opp_team)
    # ... and the "opp_team" is the defense
    defense_features = _ordered_player_features(defense_df, defense_game_players, game_id, opp_team, team)

    game_features = []
    for player_id_focus, features_focus, _ in offense_features:
        # A fresh dict per focus player; rows must not share state.
        feature = {}
        if 'target' in features_focus:
            feature['target'] = features_focus['target']
        for j, (player_id, features, pos) in enumerate(offense_features):
            # the focus player gets the "current" tag instead of his rank
            if player_id == player_id_focus:
                prefix = "off_{0}_current".format(pos)
            else:
                prefix = "off_{0}_{1}".format(pos, j)
            _add_prefixed(feature, prefix, features)
        for j, (player_id, features, pos) in enumerate(defense_features):
            _add_prefixed(feature, "def_{0}_{1}".format(pos, j), features)
        game_features.append(feature)
    return game_features

In [None]:
total_count = len(game_df.index)
all_features = []
# Walk the games from the highest GID down. DataFrame.sort was renamed
# sort_values in later pandas; use whichever this installation provides.
game_df = (getattr(game_df, 'sort_values', None) or game_df.sort)(['GID'], ascending=[False])
for game_count, (game_index, game) in enumerate(game_df.iterrows()):
    game_id = game['GID']
    teams = [game['H'], game['V']]
    # Each game contributes rows from both sides: every team is treated once as
    # the offense, with the other team as its opponent.
    for team, opp_team in zip(teams, reversed(teams)):
        all_features += get_features_for_game(game_id, team, opp_team)
    # Report progress once per game (not once per team) every 1000 games.
    if game_count % 1000 == 0:
        print("{0}/{1}".format(game_count, total_count))
        

game_id: 3455 num_players: 8 17
0/3455

In [None]:
# Build the model matrix from the rows collected by the game loop. The list named
# `features` is only the empty placeholder created earlier, so it must not be
# used here.
features_df = pd.DataFrame(all_features)
# Players do not all contribute the same keys; missing entries become 0.0.
features_df = features_df.fillna(0.0)
# Show a sample instead of dumping the whole frame.
print(features_df.head())
# for i, col in enumerate(features_df.columns):
#     print features_df.columns[i], features_df.dtypes[i]

In [None]:
features_df = features_df.fillna(0.0)
# Without a 'target' column there is nothing to fit; say so here instead of
# failing later with a NameError on `targets`.
if 'target' not in features_df.columns:
    raise ValueError("features_df has no 'target' column; cannot build the regression targets")
# pop() hands back the column and removes it from features_df, so the model
# never sees the value it is asked to predict.
targets = features_df.pop('target')
x_train, x_test, y_train, y_test = train_test_split(features_df, targets, test_size=0.33, random_state=42)
cfr = ExtraTreesRegressor(
    n_estimators=100,
    max_features=None,
    min_samples_leaf=1,
    # oob_score needs bootstrap sampling to be switched on
    bootstrap=True,
    oob_score=True,
    n_jobs=7,
    random_state=0,
    verbose=1,
)
cfr.fit(x_train, y_train)



In [None]:
# Score the fitted model on the held-out split.
score = cfr.score(x_test, y_test)
print(score)

In [None]:
print("Features sorted by their score:")
# Names come from features_df (its 'target' column was removed before the split)
# rather than from x_train: depending on the scikit-learn version the split may
# hand back plain arrays, which have no .columns.
importance_by_feature = [
    (round(importance, 4), feature_name)
    for importance, feature_name in zip(cfr.feature_importances_, features_df.columns)
]
print(sorted(importance_by_feature, reverse=True))

In [None]:
# Predicted values (y axis) against the held-out true values (x axis).
predictions = cfr.predict(x_test)
plt.scatter(x=y_test, y=predictions)
plt.show()