In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier



In [2]:
teams_dict = {}
with open("teams.json", 'r') as _f:
    teams_dict = json.load(_f)

def convert_team(name):
    #name : full name 'Charlotte Hornets' --> return : abbreviation 'CHO'
    
    convert_dict = {}
    for team in teams_dict:
        convert_dict[team['teamName']] = team['abbreviation']
    
    #team name changes in the past few years
    name_changes = {
        'Charlotte Hornets': 'CHO',
        'Charlotte Bobcats': 'CHA',
        'New Orleans Hornets': 'NOH',
        'New Orleans Pelicans': 'NOP',
        'Brooklyn Nets': 'BRK'
    }
    if name in name_changes:
        return name_changes[name]
    
    return convert_dict[name]

In [3]:
schedules = pd.read_csv('2012_2017_schedules.csv', index_col=0)
played_schedule = schedules.dropna()
played_schedule['start_time'] = pd.to_datetime(played_schedule['start_time'])\
                                .dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

box_scores = pd.read_csv('2012_2017_box_scores.csv', index_col=0)
box_scores['date'] = pd.to_datetime(box_scores['date'])

season_stats = pd.read_csv('2012_2017_season_player_stats.csv', index_col=0)

In [4]:
played_schedule.head()

Unnamed: 0,home_team_name,home_team_score,start_time,visiting_team_name,visiting_team_score
0,Cleveland Cavaliers,94.0,2012-10-30 19:00:00-04:00,Washington Wizards,84.0
1,Los Angeles Lakers,91.0,2012-10-30 19:30:00-04:00,Dallas Mavericks,99.0
2,Miami Heat,120.0,2012-10-30 20:00:00-04:00,Boston Celtics,107.0
3,Chicago Bulls,93.0,2012-10-31 19:00:00-04:00,Sacramento Kings,87.0
4,Detroit Pistons,96.0,2012-10-31 19:30:00-04:00,Houston Rockets,105.0


In [5]:
def get_starters(home_team, visitor_team, date):
     
#     home_team: team abbreviation ex. 'CLE'
#     visitor_team: same thing ex.'BOS'
#     date: datetime object --> datetime.date(YYYY, MM, DD)
#     returns list of lists of tuples: [[('Lebron', 'James'), ...], [('Kyrie', 'Irving'), ...]]

    df_slice = box_scores.query('team=="'+ home_team +'" or team=="'+ visitor_team +'"')
    game_players = df_slice[df_slice['date'] == date] 
    
    home_players = game_players[game_players['team']==home_team]
    home_starters_df = home_players.sort_values('seconds_played', ascending=False).head(5)
    home_starters = [(p['first_name'], p['last_name']) for _,p in home_starters_df.iterrows()]
    
    visitor_players = game_players[game_players['team']==visitor_team]
    visitor_starters_df = visitor_players.sort_values('seconds_played', ascending=False).head(5)
    visitor_starters = [(p['first_name'], p['last_name']) for _,p in visitor_starters_df.iterrows()]
    return home_starters, visitor_starters

In [6]:
def safe_divide (x, y):
    if y == 0:
        return 0
    return x / y

In [34]:
def get_previous_wins(team, opponent, date): 
#     team: full name ex 'Cleveland Cavaliers'
#     opponent: same thing ex 'Boston Celtics'
#     date: datetime object --> datetime.date(YYYY, MM, DD)
#     returns dict of a bunch of statistics
    
    
    # NBA season starts in late fall and ends in May/June
    # Here, we get a lower bound for games in *this season*
    lower_year = date.year
    if date.month < 7:
        lower_year -= 1
    lower_bound = pd.to_datetime('09-01-' + str(lower_year)).date()
    
    # Grab only the games that our team was involved in
    df_slice = played_schedule[(played_schedule['start_time'] < date) & (played_schedule['start_time'] > lower_bound)]
    our_team_slice = df_slice.query('home_team_name=="' + team + '" or visiting_team_name=="' + team + '"')
    
    # Get our home and visiting wins
    home_wins = our_team_slice.apply(lambda row: row['home_team_name'] == team and\
                                     row['home_team_score'] > row['visiting_team_score'], axis=1)
    home_wins = home_wins.sum() if not home_wins.empty else 0
    visiting_wins = our_team_slice.apply(lambda row: row['visiting_team_name'] == team and\
                                     row['home_team_score'] < row['visiting_team_score'], axis=1)
    visiting_wins = visiting_wins.sum() if not visiting_wins.empty else 0
    
    #get total number of games at home and away
    home_games_played = our_team_slice.apply(lambda row: row['home_team_name'] == team, axis=1)
    home_games_played = home_games_played.sum() if not home_games_played.empty else 0
    visiting_games_played = our_team_slice.apply(lambda row: row['visiting_team_name'] == team, axis=1)
    visiting_games_played = visiting_games_played.sum() if not visiting_games_played.empty else 0
    
    # Calculate current win streak against all teams (if applicable)
    avg_home_points_for = []
    avg_points_given_up_at_home = []
    avg_away_points_for = []
    avg_points_given_up_away = []
    for _, row in our_team_slice.sort_values('start_time', ascending=False).iterrows():
        if row['home_team_name'] == team:
            avg_home_points_for.append(row['home_team_score'])
            avg_points_given_up_at_home.append(row['visiting_team_score'])
        elif row ['visiting_team_name'] == team:
            avg_away_points_for.append(row['visiting_team_score'])
            avg_points_given_up_away.append(row['home_team_score'])
        else:
            break
    
    win_streak = 0
    for _,row in our_team_slice.sort_values('start_time', ascending=False).iterrows():
        if row['home_team_name'] == team and row['home_team_score'] > row['visiting_team_score']:
            win_streak += 1
        elif row['visiting_team_name'] == team and row['home_team_score'] < row['visiting_team_score']:
            win_streak += 1
        else:
            break
    
    # Calculate current loss streak against all teams (if applicable)
    losing_streak = 0
    if win_streak > 0:
        pass
    else:
        for _,row in our_team_slice.sort_values('start_time', ascending=False).iterrows():
            if row['home_team_name'] == team and row['home_team_score'] < row['visiting_team_score']:
                losing_streak += 1
            elif row['visiting_team_name'] == team and row['home_team_score'] > row['visiting_team_score']:
                losing_streak += 1
            else:
                break
                
    # Calculate the result of the season series against this opponent (includes home + away games)
    season_series = our_team_slice.query('home_team_name=="' + opponent + '" or visiting_team_name=="' + opponent + '"')
    season_series_wins = 0
    season_series_losses = 0
    for _,row in season_series.iterrows():
        if row['home_team_name'] == team:
            if row['home_team_score'] > row['visiting_team_score']:
                season_series_wins += 1
            else:
                season_series_losses +=1
        elif row['visiting_team_name'] == team:
            if row['home_team_score'] < row['visiting_team_score']:
                season_series_wins += 1
            else:
                season_series_losses += 1
    
    return {
        'wins_as_home_team': home_wins,
        'wins_as_visiting_team': visiting_wins,
        'total_current_wins': (home_wins + visiting_wins),
        'current_win_streak': win_streak,
        'current_losing_streak': losing_streak,
        'season_series_wins': season_series_wins,
        'season_series_losses': season_series_losses,
        'home_win_percent': safe_divide(home_wins, home_games_played),
        'visiting_win_percent': safe_divide(visiting_wins, visiting_games_played),
        'total_win_percent': safe_divide((home_wins + visiting_wins),\
                                         (home_games_played + visiting_games_played)),
        'avg_points_scored_at_home': safe_divide(sum(avg_home_points_for), len(avg_home_points_for)),
        'avg_points_given_up_at_home': safe_divide(sum(avg_points_given_up_at_home), len(avg_points_given_up_at_home)),
        'avg_points_scored_away': safe_divide(sum(avg_away_points_for), len(avg_away_points_for)),
        'avg_points_given_up_away': safe_divide(sum(avg_points_given_up_away),len(avg_points_given_up_away)),
        'avg_points_for': safe_divide(sum(avg_home_points_for + avg_away_points_for),\
                                      len(avg_home_points_for + avg_away_points_for)),
        'avg_points_given_up': safe_divide(sum(avg_points_given_up_at_home + avg_points_given_up_away),\
                                          len(avg_points_given_up_at_home + avg_points_given_up_away))
    }

In [38]:
def get_player_to_date_stats(player,date):

#     player: player name in a tuple ('first_name','last_name')
#     date: datetime object --> datetime.date(YYYY, MM, DD)
#     returns dict of player to date features

    # NBA season starts in late fall and ends in May/June
    # Get a lower bound for games in *this season*
    lower_year = date.year
    if date.month < 7:
        lower_year -= 1
    lower_bound = pd.to_datetime('09-01-' + str(lower_year)).date()
    
    # Grab only the games that player was involved in
    df_slice = box_scores[(box_scores['date'] < date) & (box_scores['date'] > lower_bound)]
    player_slice = df_slice[(df_slice['first_name'] == player[0]) & (df_slice['last_name'] == player[1])]
    
    # Calculate player statistics
    if player_slice.points.mean() > 0:
        points_per_game = player_slice.points.mean()
    else:
        points_per_game = 0
    if player_slice.total_rebounds.mean() > 0:
        rebounds_per_game = player_slice.total_rebounds.mean()
    else:
        rebounds_per_game = 0
    if player_slice.assists.mean() > 0:
        assists_per_game = player_slice.assists.mean()
    else:
        assists_per_game = 0
    if player_slice.field_goal_attempts.mean() > 0:
        field_goal_percent = (player_slice.field_goals.mean()/player_slice.field_goal_attempts.mean())
    else:
        field_goal_percent = 0
    if player_slice.free_throw_attempts.mean() > 0:
        free_throw_percent = (player_slice.free_throws.mean()/player_slice.free_throw_attempts.mean())
    else:
        free_throw_percent = 0
    if player_slice.three_point_field_goal_attempts.mean() > 0:
        three_point_percent = (player_slice.three_point_field_goals.mean()/\
                               player_slice.three_point_field_goal_attempts.mean())
    else:
        three_point_percent = 0
    
    # Return statistics in a dictionary
    return {
        'points_per_game': points_per_game,
        'rebounds_per_game': rebounds_per_game,
        'assists_per_game': assists_per_game,
        'field_goal_percent': field_goal_percent,
        'free_throw_percent': free_throw_percent,
        'three_point_percent': three_point_percent
    }

In [10]:
# %%time

# X, Y = [], []
# df_regr_list = []

# for _, row in played_schedule.iterrows():   
#     # This dict contains all of the feature keys and values for this game
#     game_dict = {}
    
#     # Get the teams and starting players for this game
#     game_date = row['start_time'].date()
#     home = convert_team(row['home_team_name'])
#     visitors = convert_team(row['visiting_team_name'])
#     home_starters, visitor_starters = get_starters(home, visitors, game_date)
    
#     # Add some housekeeping features to track this particular game
#     game_dict['home_team_name'] = row['home_team_name']
#     game_dict['home_team_abbr'] = home
#     game_dict['visiting_team_name'] = row['visiting_team_name']
#     game_dict['visiting_team_abbr'] = visitors
#     game_dict['game_date'] = game_date
    
#     # Create some number of player-based features for our starters
#     game_dict['home_starters_points_per_game'] = 0
#     game_dict['home_starters_rebounds_per_game'] = 0
#     game_dict['home_starters_assists_per_game'] = 0
#     for player_i in range(len(home_starters)):
#         player_i_stats = get_player_to_date_stats(home_starters[player_i], game_date)
#         game_dict['home_starters_points_per_game'] += player_i_stats['points_per_game']
#         game_dict['home_starters_rebounds_per_game'] += player_i_stats['rebounds_per_game']
#         game_dict['home_starters_assists_per_game'] += player_i_stats['assists_per_game']

        
#     # Create some number of player-based features for the visitors
#     game_dict['visitor_starters_points_per_game'] = 0
#     game_dict['visitor_starters_rebounds_per_game'] = 0
#     game_dict['visitor_starters_assists_per_game'] = 0
#     for player_i in range(len(visitor_starters)):
#         player_i_stats = get_player_to_date_stats(visitor_starters[player_i],game_date)
#         game_dict['visitor_starters_points_per_game'] += player_i_stats['points_per_game']
#         game_dict['visitor_starters_rebounds_per_game'] += player_i_stats['rebounds_per_game']
#         game_dict['visitor_starters_assists_per_game'] += player_i_stats['assists_per_game']
    
    
        
#     # Create the win-based features for the teams
#     game_dict['home_won'] = row['home_team_score'] > row['visiting_team_score']
#     home_win_stats = get_previous_wins(row['home_team_name'], row['visiting_team_name'], game_date)
#     game_dict['home_team_total_wins'] = home_win_stats['total_current_wins']
#     game_dict['home_team_wins_as_home'] = home_win_stats['wins_as_home_team']
#     game_dict['home_team_wins_as_visitor'] = home_win_stats['wins_as_visiting_team']
#     game_dict['home_team_current_win_streak'] = home_win_stats['current_win_streak']
#     game_dict['home_team_current_losing_streak'] = home_win_stats['current_losing_streak']
#     game_dict['home_team_season_series_wins'] = home_win_stats['season_series_wins']
#     game_dict['home_team_season_series_losses'] = home_win_stats['season_series_losses']
#     visiting_win_stats = get_previous_wins(row['visiting_team_name'], row['home_team_name'], game_date)
#     game_dict['visiting_team_total_wins'] = visiting_win_stats['total_current_wins']
#     game_dict['visiting_team_wins_as_home'] = visiting_win_stats['wins_as_home_team']
#     game_dict['visiting_team_wins_as_visitor'] = visiting_win_stats['wins_as_visiting_team']
#     game_dict['visiting_team_current_win_streak'] = visiting_win_stats['current_win_streak']
#     game_dict['visiting_team_current_losing_streak'] = visiting_win_stats['current_losing_streak']
#     game_dict['visiting_team_season_series_wins'] = visiting_win_stats['season_series_wins']
#     game_dict['visiting_team_season_series_losses'] = visiting_win_stats['season_series_losses']
    
#     df_regr_list.append(game_dict)

In [None]:
# # Generate feature DataFrame and also save to CSV so we don't have to re-run later
# df_regr = pd.DataFrame(df_regr_list)
# df_regr.to_csv('df_regr.csv')

In [None]:
# %%time
# # Limit our feature set to these features for now
# features = ['home_team_current_losing_streak', 'home_team_current_win_streak', 'home_team_season_series_losses',
#            'home_team_season_series_wins', 'home_team_total_wins', 
#             'home_team_wins_as_home', 'home_team_wins_as_visitor','visiting_team_current_losing_streak', 
#             'visiting_team_current_win_streak', 'visiting_team_season_series_losses',
#             'visiting_team_season_series_wins', 'visiting_team_total_wins', 'visiting_team_wins_as_home', 
#             'visiting_team_wins_as_visitor']
# X_features_subset = df_regr[features]

# # Standard train/test/split, fit, and predict step
# #70% of all stats, 30% of all stats, 70% of win/loss, 30% of win/loss
# X_train, X_test, y_train, y_test = train_test_split(X_features_subset, df_regr['home_won'], test_size=.3)
# classifier = DecisionTreeClassifier()
# classifier = classifier.fit(X_train, y_train)
# predictions = classifier.predict(X_test)

# #predicted wins (true positive), actual wins (false positive)
# #predicted losses (false negative), actual losses (true negative)

# print(confusion_matrix(y_test, predictions))
# print("Accuracy: {0:.1f}%".format(accuracy_score(y_test, predictions) * 100))

In [100]:
from sklearn.model_selection import GridSearchCV

In [None]:
%%time
dtc_parameters = {
    'criterion':['gini','entropy'],
    'splitter':['best','random'],
    'max_depth':[None,1,2,4,5,6,7,8,10,12],
    'min_samples_split':[2,3,4,5,6,7,10],
    'max_features':[None,'log2','sqrt',2,4,6,8,10,12]
}
#cv = number of cross validation folds
dtc_gs = GridSearchCV(DecisionTreeClassifier(), dtc_parameters, cv=5, verbose = 0) 

In [None]:
%%time
dtc_gs.fit(X_features_subset,df_regr['home_won'])

In [None]:
print(dtc_gs.best_score_)
print(dtc_gs.best_params_)
print(dtc_gs.best_estimator_)

In [None]:
#find feature importance
fi = pd.DataFrame({
    'feature' : X_features_subset.columns, 
    'importance': dtc_gs.best_estimator_.feature_importances_
})
fi.sort_values('importance', ascending = False, inplace = True)
fi

In [None]:
features = ['home_team_current_losing_streak', 'home_team_season_series_losses',
           'home_team_season_series_wins', 'home_team_total_wins', 'home_team_wins_as_home', 
            'home_team_wins_as_visitor', 'visiting_team_current_win_streak',
            'visiting_team_season_series_wins', 'visiting_team_total_wins', 
            'visiting_team_wins_as_home', 'visiting_team_wins_as_visitor']
X_features_subset = df_regr[features]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_features_subset, df_regr['home_won'], test_size=.3)
classifier = DecisionTreeClassifier()
classifier = classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

#predicted wins (true positive), actual wins (false positive)
#predicted losses (false negative), actual losses (true negative)

print(confusion_matrix(y_test, predictions))
print("Accuracy: {0:.1f}%".format(accuracy_score(y_test, predictions) * 100))

In [None]:
%%time
dtc_parameters = {
    'criterion':['gini','entropy'],
    'splitter':['best','random'],
    'max_depth':[None,1,2,4,5,6,7,8,10,12],
    'min_samples_split':[2,3,4,5,6,7,10],
    'max_features':[None,'log2','sqrt',2,4,6,8,9]
}
#cv = number of cross validation folds
dtc_gs = GridSearchCV(DecisionTreeClassifier(), dtc_parameters, cv=5, verbose = 0) 

In [None]:
%%time
dtc_gs.fit(X_features_subset,df_regr['home_won'])

In [None]:
print(dtc_gs.best_score_)
print(dtc_gs.best_params_)
print(dtc_gs.best_estimator_)

In [11]:
%%time

X, Y = [], []
df_regr_list = []

for _, row in played_schedule.iterrows():   
    # This dict contains all of the feature keys and values for this game
    game_dict = {}
    
    # Get the teams and starting players for this game
    game_date = row['start_time'].date()
    home = convert_team(row['home_team_name'])
    visitors = convert_team(row['visiting_team_name'])
    home_starters, visitor_starters = get_starters(home, visitors, game_date)
    
    # Add some housekeeping features to track this particular game
    game_dict['home_team_name'] = row['home_team_name']
    game_dict['home_team_abbr'] = home
    game_dict['visiting_team_name'] = row['visiting_team_name']
    game_dict['visiting_team_abbr'] = visitors
    game_dict['game_date'] = game_date
    
    # Create some number of player-based features for our starters
    game_dict['home_starters_points_per_game'] = 0
    game_dict['home_starters_rebounds_per_game'] = 0
    game_dict['home_starters_assists_per_game'] = 0
    for player_i in range(len(home_starters)):
        player_i_stats = get_player_to_date_stats(home_starters[player_i], game_date)
        game_dict['home_starters_points_per_game'] += player_i_stats['points_per_game']
        game_dict['home_starters_rebounds_per_game'] += player_i_stats['rebounds_per_game']
        game_dict['home_starters_assists_per_game'] += player_i_stats['assists_per_game']

        
    # Create some number of player-based features for the visitors
    game_dict['visitor_starters_points_per_game'] = 0
    game_dict['visitor_starters_rebounds_per_game'] = 0
    game_dict['visitor_starters_assists_per_game'] = 0
    for player_i in range(len(visitor_starters)):
        player_i_stats = get_player_to_date_stats(visitor_starters[player_i],game_date)
        game_dict['visitor_starters_points_per_game'] += player_i_stats['points_per_game']
        game_dict['visitor_starters_rebounds_per_game'] += player_i_stats['rebounds_per_game']
        game_dict['visitor_starters_assists_per_game'] += player_i_stats['assists_per_game']
    
    
        
    # Create the win-based features for the teams
    game_dict['home_won'] = row['home_team_score'] > row['visiting_team_score']
    home_win_stats = get_previous_wins(row['home_team_name'], row['visiting_team_name'], game_date)
    game_dict['home_team_total_wins'] = home_win_stats['total_current_wins']
    game_dict['home_team_wins_as_home'] = home_win_stats['wins_as_home_team']
    game_dict['home_team_wins_as_visitor'] = home_win_stats['wins_as_visiting_team']
    game_dict['home_team_current_win_streak'] = home_win_stats['current_win_streak']
    game_dict['home_team_current_losing_streak'] = home_win_stats['current_losing_streak']
    game_dict['home_team_season_series_wins'] = home_win_stats['season_series_wins']
    game_dict['home_team_season_series_losses'] = home_win_stats['season_series_losses']
    game_dict['home_team_home_win_percent'] = home_win_stats['home_win_percent']
    game_dict['home_team_away_win_percent'] = home_win_stats['visiting_win_percent']
    game_dict['home_team_total_win_percent'] = home_win_stats['total_win_percent']
    
    visiting_win_stats = get_previous_wins(row['visiting_team_name'], row['home_team_name'], game_date)
    game_dict['visiting_team_total_wins'] = visiting_win_stats['total_current_wins']
    game_dict['visiting_team_wins_as_home'] = visiting_win_stats['wins_as_home_team']
    game_dict['visiting_team_wins_as_visitor'] = visiting_win_stats['wins_as_visiting_team']
    game_dict['visiting_team_current_win_streak'] = visiting_win_stats['current_win_streak']
    game_dict['visiting_team_current_losing_streak'] = visiting_win_stats['current_losing_streak']
    game_dict['visiting_team_season_series_wins'] = visiting_win_stats['season_series_wins']
    game_dict['visiting_team_season_series_losses'] = visiting_win_stats['season_series_losses']
    game_dict['visiting_team_home_win_percent'] = visiting_win_stats['home_win_percent']
    game_dict['visiting_team_away_win_percent'] = visiting_win_stats['visiting_win_percent']
    game_dict['visiting_team_total_win_percent'] = visiting_win_stats['total_win_percent']
    
    df_regr_list.append(game_dict)

CPU times: user 14min 25s, sys: 1.8 s, total: 14min 27s
Wall time: 14min 29s


In [12]:
df_regr = pd.DataFrame(df_regr_list)
df_regr.to_csv('df_regr.csv')

In [13]:
%%time
# Limit our feature set to these features for now
features = ['home_team_current_losing_streak', 'home_team_current_win_streak', 'home_team_season_series_losses',
            'home_team_season_series_wins', 'home_team_total_wins', 'home_team_wins_as_home', 
            'home_team_wins_as_visitor','visiting_team_current_losing_streak', 
            'visiting_team_current_win_streak', 'visiting_team_season_series_losses',
            'visiting_team_season_series_wins', 'visiting_team_total_wins', 'visiting_team_wins_as_home', 
            'visiting_team_wins_as_visitor', 'home_team_home_win_percent', 'home_team_away_win_percent',
            'home_team_total_win_percent', 'visiting_team_home_win_percent', 'visiting_team_away_win_percent',
            'visiting_team_total_win_percent']
X_features_subset = df_regr[features]

# Standard train/test/split, fit, and predict step
#70% of all stats, 30% of all stats, 70% of win/loss, 30% of win/loss
X_train, X_test, y_train, y_test = train_test_split(X_features_subset, df_regr['home_won'], test_size=.3)
classifier = DecisionTreeClassifier()
classifier = classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

#predicted wins (true positive), actual wins (false positive)
#predicted losses (false negative), actual losses (true negative)

print(confusion_matrix(y_test, predictions))
print("Accuracy: {0:.1f}%".format(accuracy_score(y_test, predictions) * 100))

[[390 432]
 [421 728]]
Accuracy: 56.7%
CPU times: user 70 ms, sys: 0 ns, total: 70 ms
Wall time: 64.3 ms


In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
%%time
dtc_parameters = {
    'criterion':['gini','entropy'],
    'splitter':['best','random'],
    'max_depth':[None,1,2,4,5,6,7,8,10,12],
    'min_samples_split':[2,3,4,5,6,7,10],
    'max_features':[None,'log2','sqrt',2,4,6,8,10,12,14,17,19]
}
#cv = number of cross validation folds
dtc_gs = GridSearchCV(DecisionTreeClassifier(), dtc_parameters, cv=5, verbose = 0) 

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 140 µs


In [16]:
%%time
dtc_gs.fit(X_features_subset,df_regr['home_won'])

CPU times: user 4min 12s, sys: 820 ms, total: 4min 12s
Wall time: 4min 13s


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'splitter': ['best', 'random'], 'criterion': ['gini', 'entropy'], 'max_depth': [None, 1, 2, 4, 5, 6, 7, 8, 10, 12], 'min_samples_split': [2, 3, 4, 5, 6, 7, 10], 'max_features': [None, 'log2', 'sqrt', 2, 4, 6, 8, 10, 12, 14, 17, 19]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [17]:
print(dtc_gs.best_score_)
print(dtc_gs.best_params_)
print(dtc_gs.best_estimator_)

0.652001826762
{'splitter': 'best', 'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 3, 'max_features': 14}
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=14, max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [18]:
#find feature importance
fi = pd.DataFrame({
    'feature' : X_features_subset.columns, 
    'importance': dtc_gs.best_estimator_.feature_importances_
})
fi.sort_values('importance', ascending = False, inplace = True)
fi

Unnamed: 0,feature,importance
16,home_team_total_win_percent,0.39915
19,visiting_team_total_win_percent,0.327836
15,home_team_away_win_percent,0.077509
18,visiting_team_away_win_percent,0.073964
17,visiting_team_home_win_percent,0.056541
4,home_team_total_wins,0.026049
6,home_team_wins_as_visitor,0.011141
1,home_team_current_win_streak,0.01073
14,home_team_home_win_percent,0.009597
8,visiting_team_current_win_streak,0.007483


In [19]:
features = ['home_team_current_win_streak','home_team_total_wins', 'home_team_wins_as_visitor', 
            'visiting_team_current_win_streak', 'home_team_home_win_percent', 
            'home_team_away_win_percent', 'home_team_total_win_percent', 
            'visiting_team_home_win_percent', 'visiting_team_away_win_percent',
            'visiting_team_total_win_percent']
X_features_subset = df_regr[features]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_features_subset, df_regr['home_won'], test_size=.3)
classifier = DecisionTreeClassifier()
classifier = classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

#predicted wins (true positive), actual wins (false positive)
#predicted losses (false negative), actual losses (true negative)

print(confusion_matrix(y_test, predictions))
print("Accuracy: {0:.1f}%".format(accuracy_score(y_test, predictions) * 100))

[[421 417]
 [415 718]]
Accuracy: 57.8%


In [21]:
%%time
dtc_parameters = {
    'criterion':['gini','entropy'],
    'splitter':['best','random'],
    'max_depth':[None,1,2,3,4,5,6,7,8,9,10],
    'min_samples_split':[1,2,3,4,5,6,7,10],
    'max_features':[None,'log2','sqrt',2,3,4,5,6,7,8,9]
}
#cv = number of cross validation folds
dtc_gs = GridSearchCV(DecisionTreeClassifier(), dtc_parameters, cv=5, verbose = 0) 

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 64.8 µs


In [22]:
%%time
dtc_gs.fit(X_features_subset,df_regr['home_won'])

CPU times: user 4min 2s, sys: 760 ms, total: 4min 3s
Wall time: 4min 3s


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'splitter': ['best', 'random'], 'criterion': ['gini', 'entropy'], 'max_depth': [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [1, 2, 3, 4, 5, 6, 7, 10], 'max_features': [None, 'log2', 'sqrt', 2, 3, 4, 5, 6, 7, 8, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [23]:
print(dtc_gs.best_score_)
print(dtc_gs.best_params_)
print(dtc_gs.best_estimator_)

0.651392906074
{'splitter': 'best', 'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 7, 'max_features': 3}
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=3, max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=7,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [24]:
#find feature importance
fi = pd.DataFrame({
    'feature' : X_features_subset.columns, 
    'importance': dtc_gs.best_estimator_.feature_importances_
})
fi.sort_values('importance', ascending = False, inplace = True)
fi

Unnamed: 0,feature,importance
9,visiting_team_total_win_percent,0.417359
4,home_team_home_win_percent,0.309739
6,home_team_total_win_percent,0.171321
5,home_team_away_win_percent,0.07672
7,visiting_team_home_win_percent,0.017501
3,visiting_team_current_win_streak,0.00736
0,home_team_current_win_streak,0.0
1,home_team_total_wins,0.0
2,home_team_wins_as_visitor,0.0
8,visiting_team_away_win_percent,0.0


In [39]:
%%time

X, Y = [], []
df_regr_list = []

for _, row in played_schedule.iterrows():   
    # This dict contains all of the feature keys and values for this game
    game_dict = {}
    
    # Get the teams and starting players for this game
    game_date = row['start_time'].date()
    home = convert_team(row['home_team_name'])
    visitors = convert_team(row['visiting_team_name'])
    home_starters, visitor_starters = get_starters(home, visitors, game_date)
    
    # Add some housekeeping features to track this particular game
    game_dict['home_team_name'] = row['home_team_name']
    game_dict['home_team_abbr'] = home
    game_dict['visiting_team_name'] = row['visiting_team_name']
    game_dict['visiting_team_abbr'] = visitors
    game_dict['game_date'] = game_date
    
    game_dict['home_starters_points_per_game'] = 0
    game_dict['home_starters_rebounds_per_game'] = 0
    game_dict['home_starters_assists_per_game'] = 0
    game_dict['home_starters_field_goal_percent'] = 0
    game_dict['home_starters_free_throw_percent'] = 0
    game_dict['home_starters_three_point_percent'] = 0
    for player_i in range(len(home_starters)):
        player_i_stats = get_player_to_date_stats(home_starters[player_i],game_date)
        game_dict['home_starters_points_per_game'] += player_i_stats['points_per_game']
        game_dict['home_starters_rebounds_per_game'] += player_i_stats['rebounds_per_game']
        game_dict['home_starters_assists_per_game'] += player_i_stats['assists_per_game']
        game_dict['home_starters_field_goal_percent'] += player_i_stats['field_goal_percent']
        game_dict['home_starters_free_throw_percent'] += player_i_stats['free_throw_percent']
        game_dict['home_starters_three_point_percent'] += player_i_stats['three_point_percent']
        

    game_dict['visitor_starters_points_per_game'] = 0
    game_dict['visitor_starters_rebounds_per_game'] = 0
    game_dict['visitor_starters_assists_per_game'] = 0
    game_dict['visitor_starters_field_goal_percent'] = 0
    game_dict['visitor_starters_free_throw_percent'] = 0
    game_dict['visitor_starters_three_point_percent'] = 0
    for player_i in range(len(visitor_starters)):
        player_i_stats = get_player_to_date_stats(visitor_starters[player_i],game_date)
        game_dict['visitor_starters_points_per_game'] += player_i_stats['points_per_game']
        game_dict['visitor_starters_rebounds_per_game'] += player_i_stats['rebounds_per_game']
        game_dict['visitor_starters_assists_per_game'] += player_i_stats['assists_per_game']
        game_dict['visitor_starters_field_goal_percent'] += player_i_stats['field_goal_percent']
        game_dict['visitor_starters_free_throw_percent'] += player_i_stats['free_throw_percent']
        game_dict['visitor_starters_three_point_percent'] += player_i_stats['three_point_percent']
        
        
    # Create the win-based features for the teams
    game_dict['home_won'] = row['home_team_score'] > row['visiting_team_score']
    game_dict['is_home'] = 1
    home_win_stats = get_previous_wins(row['home_team_name'], row['visiting_team_name'], game_date)
    visiting_win_stats = get_previous_wins(row['visiting_team_name'], row['home_team_name'], game_date)

    game_dict['home_team_home_win_percent'] = home_win_stats['home_win_percent']
    game_dict['home_team_away_win_percent'] = home_win_stats['visiting_win_percent']
    game_dict['home_team_total_win_percent'] = home_win_stats['total_win_percent']
    game_dict['home_team_home_points'] = home_win_stats['avg_points_scored_at_home']
    game_dict['home_team_home_points_given_up'] = home_win_stats['avg_points_given_up_at_home']
    game_dict['home_team_away_points'] = home_win_stats['avg_points_scored_away']
    game_dict['home_team_away_points_given_up'] = home_win_stats['avg_points_given_up_away']
    game_dict['home_team_points'] = home_win_stats['avg_points_for']
    game_dict['home_team_points_given_up'] = home_win_stats['avg_points_given_up']

    
    game_dict['visiting_team_home_win_percent'] = visiting_win_stats['home_win_percent']
    game_dict['visiting_team_away_win_percent'] = visiting_win_stats['visiting_win_percent']
    game_dict['visiting_team_total_win_percent'] = visiting_win_stats['total_win_percent']
    game_dict['visiting_team_home_points'] = visiting_win_stats['avg_points_scored_at_home']
    game_dict['visiting_team_home_points_given_up'] = visiting_win_stats['avg_points_given_up_at_home']
    game_dict['visiting_team_away_points'] = visiting_win_stats['avg_points_scored_away']
    game_dict['visiting_team_away_points_given_up'] = visiting_win_stats['avg_points_given_up_away']
    game_dict['visiting_team_points'] = visiting_win_stats['avg_points_for']
    game_dict['visiting_team_points_given_up'] = visiting_win_stats['avg_points_given_up']
    
    game_dict['win_percent_difference_home'] = (home_win_stats['total_win_percent']-visiting_win_stats['total_win_percent'])
    
    
    df_regr_list.append(game_dict)

CPU times: user 18min 4s, sys: 2.27 s, total: 18min 6s
Wall time: 18min 7s


In [41]:
df_regr = pd.DataFrame(df_regr_list)
df_regr.to_csv('df_regr.csv')
#back to back 1 or 0

In [42]:
%%time
# Limit our feature set to these features for now
features = ['home_starters_points_per_game', 'home_starters_rebounds_per_game', 'home_starters_assists_per_game',
            'home_starters_field_goal_percent', 'home_starters_free_throw_percent', 'home_starters_three_point_percent',
            'visitor_starters_points_per_game', 'visitor_starters_rebounds_per_game', 'visitor_starters_assists_per_game',
            'visitor_starters_field_goal_percent', 'visitor_starters_free_throw_percent', 'visitor_starters_three_point_percent',
            'is_home', 'home_team_home_win_percent', 'home_team_away_win_percent', 'home_team_total_win_percent',
            'home_team_home_points', 'home_team_home_points_given_up', 'home_team_away_points',
            'home_team_away_points_given_up', 'home_team_points', 'home_team_points_given_up',
            'visiting_team_home_win_percent', 'visiting_team_away_win_percent', 'visiting_team_total_win_percent',
            'visiting_team_home_points', 'visiting_team_home_points_given_up', 'visiting_team_away_points',
            'visiting_team_away_points_given_up', 'visiting_team_points', 'visiting_team_points_given_up',
            'win_percent_difference_home']
X_features_subset = df_regr[features]

# Standard train/test/split, fit, and predict step
#70% of all stats, 30% of all stats, 70% of win/loss, 30% of win/loss
X_train, X_test, y_train, y_test = train_test_split(X_features_subset, df_regr['home_won'], test_size=.3)
classifier = DecisionTreeClassifier()
classifier = classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

#predicted wins (true positive), actual wins (false positive)
#predicted losses (false negative), actual losses (true negative)

print(confusion_matrix(y_test, predictions))
print("Accuracy: {0:.1f}%".format(accuracy_score(y_test, predictions) * 100))

[[393 422]
 [396 760]]
Accuracy: 58.5%
CPU times: user 210 ms, sys: 0 ns, total: 210 ms
Wall time: 205 ms


In [43]:
%%time
dtc_parameters = {
    'criterion':['gini','entropy'],
    'splitter':['best','random'],
    'max_depth':[None,1,4,7,10,13,16,19,22,25,28],
    'min_samples_split':[1,2,3,4,5,6,7,10],
    'max_features':[None,'log2','sqrt',4,7,10,13,16,19,22,25,28]
}
#cv = number of cross validation folds
dtc_gs = GridSearchCV(DecisionTreeClassifier(), dtc_parameters, cv=5, verbose = 0) 

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 85.4 µs


In [44]:
%%time
dtc_gs.fit(X_features_subset,df_regr['home_won'])

CPU times: user 21min 46s, sys: 1.12 s, total: 21min 47s
Wall time: 21min 48s


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'splitter': ['best', 'random'], 'criterion': ['gini', 'entropy'], 'max_depth': [None, 1, 4, 7, 10, 13, 16, 19, 22, 25, 28], 'min_samples_split': [1, 2, 3, 4, 5, 6, 7, 10], 'max_features': [None, 'log2', 'sqrt', 4, 7, 10, 13, 16, 19, 22, 25, 28]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [45]:
print(dtc_gs.best_score_)
print(dtc_gs.best_params_)
print(dtc_gs.best_estimator_)

0.657482112955
{'splitter': 'random', 'criterion': 'entropy', 'max_depth': 4, 'min_samples_split': 7, 'max_features': 25}
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=25, max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=7,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random')


In [46]:
#find feature importance
fi = pd.DataFrame({
    'feature' : X_features_subset.columns, 
    'importance': dtc_gs.best_estimator_.feature_importances_
})
fi.sort_values('importance', ascending = False, inplace = True)
fi

Unnamed: 0,feature,importance
15,home_team_total_win_percent,0.459102
24,visiting_team_total_win_percent,0.248863
22,visiting_team_home_win_percent,0.08793
9,visitor_starters_field_goal_percent,0.056188
6,visitor_starters_points_per_game,0.03124
13,home_team_home_win_percent,0.03059
7,visitor_starters_rebounds_per_game,0.022536
2,home_starters_assists_per_game,0.019903
31,win_percent_difference_home,0.018567
27,visiting_team_away_points,0.018421


In [94]:
%%time
# Limit our feature set to these features for now
features = ['home_team_total_win_percent', 'visiting_team_total_win_percent', 'visiting_team_home_win_percent',
            'visitor_starters_field_goal_percent', 'visitor_starters_points_per_game', 'home_team_home_win_percent',
            'visitor_starters_rebounds_per_game', 'home_starters_assists_per_game', 'win_percent_difference_home',
            'visiting_team_away_points', 'home_team_home_points_given_up']

X_features_subset = df_regr[features]

# Standard train/test/split, fit, and predict step
#70% of all stats, 30% of all stats, 70% of win/loss, 30% of win/loss
X_train, X_test, y_train, y_test = train_test_split(X_features_subset, df_regr['home_won'], test_size=.3)
classifier = DecisionTreeClassifier()
classifier = classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

#predicted wins (true positive), actual wins (false positive)
#predicted losses (false negative), actual losses (true negative)

print(confusion_matrix(y_test, predictions))
print("Accuracy: {0:.1f}%".format(accuracy_score(y_test, predictions) * 100))

[[388 434]
 [427 722]]
Accuracy: 56.3%
CPU times: user 70 ms, sys: 0 ns, total: 70 ms
Wall time: 73.3 ms


In [96]:
%%time
dtc_parameters = {
    'criterion':['gini','entropy'],
    'splitter':['best','random'],
    'max_depth':[None,1,2,4,6,8,10],
    'min_samples_split':[1,2,3,4,5,6],
    'max_features':[None,'log2','sqrt',2,4,5,6,7,8,9,10,11]
}
#cv = number of cross validation folds
dtc_gs = GridSearchCV(DecisionTreeClassifier(), dtc_parameters, cv=5, verbose = 0) 

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 81.3 µs


In [97]:
%%time
dtc_gs.fit(X_features_subset,df_regr['home_won'])

CPU times: user 3min 24s, sys: 360 ms, total: 3min 24s
Wall time: 3min 24s


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'splitter': ['best', 'random'], 'criterion': ['gini', 'entropy'], 'max_depth': [None, 1, 2, 4, 6, 8, 10], 'min_samples_split': [1, 2, 3, 4, 5, 6], 'max_features': [None, 'log2', 'sqrt', 2, 4, 5, 6, 7, 8, 9, 10, 11]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [98]:
print(dtc_gs.best_score_)
print(dtc_gs.best_params_)
print(dtc_gs.best_estimator_)

0.655959811235
{'splitter': 'random', 'criterion': 'gini', 'max_depth': 6, 'min_samples_split': 3, 'max_features': 11}
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=11, max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='random')


In [99]:
#find feature importance
fi = pd.DataFrame({
    'feature' : X_features_subset.columns, 
    'importance': dtc_gs.best_estimator_.feature_importances_
})
fi.sort_values('importance', ascending = False, inplace = True)
fi

Unnamed: 0,feature,importance
8,win_percent_difference_home,0.486368
5,home_team_home_win_percent,0.201502
1,visiting_team_total_win_percent,0.065011
2,visiting_team_home_win_percent,0.064029
0,home_team_total_win_percent,0.036137
7,home_starters_assists_per_game,0.033181
4,visitor_starters_points_per_game,0.030872
6,visitor_starters_rebounds_per_game,0.030741
10,home_team_home_points_given_up,0.028663
9,visiting_team_away_points,0.011841


In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [91]:
%%time
# Limit our feature set to these features for now
features = ['home_starters_points_per_game', 'home_starters_rebounds_per_game', 'home_starters_assists_per_game',
            'home_starters_field_goal_percent', 'home_starters_free_throw_percent', 'home_starters_three_point_percent',
            'visitor_starters_points_per_game', 'visitor_starters_rebounds_per_game', 'visitor_starters_assists_per_game',
            'visitor_starters_field_goal_percent', 'visitor_starters_free_throw_percent', 'visitor_starters_three_point_percent',
            'is_home', 'home_team_home_win_percent', 'home_team_away_win_percent', 'home_team_total_win_percent',
            'home_team_home_points', 'home_team_home_points_given_up', 'home_team_away_points',
            'home_team_away_points_given_up', 'home_team_points', 'home_team_points_given_up',
            'visiting_team_home_win_percent', 'visiting_team_away_win_percent', 'visiting_team_total_win_percent',
            'visiting_team_home_points', 'visiting_team_home_points_given_up', 'visiting_team_away_points',
            'visiting_team_away_points_given_up', 'visiting_team_points', 'visiting_team_points_given_up',
            'win_percent_difference_home']
X_features_subset = df_regr[features]

# Standard train/test/split, fit, and predict step
#70% of all stats, 30% of all stats, 70% of win/loss, 30% of win/loss
X_train, X_test, y_train, y_test = train_test_split(X_features_subset, df_regr['home_won'], test_size=.3)
classifier = LogisticRegression()
classifier = classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

#predicted wins (true positive), actual wins (false positive)
#predicted losses (false negative), actual losses (true negative)

print(confusion_matrix(y_test, predictions))
print("Accuracy: {0:.1f}%".format(accuracy_score(y_test, predictions) * 100))

[[394 403]
 [243 931]]
Accuracy: 67.2%
CPU times: user 180 ms, sys: 30 ms, total: 210 ms
Wall time: 192 ms
