## Feature Engineering

While this dataset contains some useful features in predicting game outcomes, there are some features that need to be created or extracted from the data before they can be used for analysis. 

The features available in the dataset are 'SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP', 'WL' (win/loss), 'MIN' (game length in minutes), 'PTS' (points scored by team), 'FGM' (field goal makes), 'FGA' (field goal attempts), 'FG_PCT' (field goal percentage), 'FG3M' (3-point makes), 'FG3A' (3-point attempts), 'FG3_PCT' (3-point percentage), 'FTM' (free throw makes), 'FTA' (free throw attempts), 'FT_PCT' (free throw percentage), 'OREB' (offensive rebounds), 'DREB' (defensive rebounds), 'REB' (total rebounds), 'AST' (assists), 'STL' (steals), 'BLK' (blocks), 'TOV' (turnovers), 'PF' (personal fouls), and 'PLUS_MINUS'.

The point of this analysis is not to use current game statistics to predict the outcome of a game. For example, I would not want to use field goal attempts in the current game as an input into my model. Instead, I want my model to have information that is accessible before the start of the game, which includes contextual factors such as home/away and number of rest days, as well as statistics from previous games. For these purposes, I will be creating the below features:
* 'win_lose_margin': Margin of victory or loss 
* 'home': Home/away binary feature
* 'win': Win binary feature
* 'matchup_id': Unique identifier for matchup pairs
* 'three_matchup_win_pct': Percent of games won by a team in the previous three matchups against the same opponent
* 'rest_days': Number of rest days since last game
* 'win_streak': Count of how many games in a row the team has won, resets to zero if they lost the last game 
* 'win_record', 'loss_record': Team's win and loss record for the season
* 'season_win_pct': Percent of games a team has won so far this season

In addition, I will create rolling averages for important team statistics. The rolling average is for the last 10 games a team played and is calculated for the following features: 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS', 'win_lose_margin', 'win', 'home', and 'rest_days'.

In [242]:
# Creating win_lose_margin feature: the team's points minus the opponent's points
game_ids = league_games['GAME_ID'].unique()

# Work per game instead of looping over every game/team pair: with two rows
# per GAME_ID, the opponent's points are the game total minus the team's own.
points_by_game = league_games.groupby('GAME_ID')['PTS']
game_total_pts = points_by_game.transform('sum')
scored_rows_per_game = points_by_game.transform('count')

win_lose_margin = league_games['PTS'] - (game_total_pts - league_games['PTS'])

# A margin is only defined when exactly two scored rows share the GAME_ID;
# anything else (a missing opponent row, a missing PTS value) is left as NaN
# rather than raising or producing a misleading number.
league_games['win_lose_margin'] = win_lose_margin.where(scored_rows_per_game == 2).astype(float)

# Creating home/away feature
# regex=False so the '.' in 'vs.' is matched literally instead of as a wildcard.
# NOTE(review): assumes MATCHUP reads 'AAA vs. BBB' for home games and
# 'AAA @ BBB' for away games -- confirm against the raw data.
league_games['home'] = league_games['MATCHUP'].str.contains('vs.', regex=False).astype(int)

# Creating win binary feature
league_games['win'] = league_games['WL'] == 'W'

# Creating matchup IDs
# These two columns are simply the last / first three characters of MATCHUP.
# NOTE(review): despite the names they are positional, not true home/away
# labels; that does not matter for matchup_id because the pair is sorted.
league_games['home_team_abv'] = league_games['MATCHUP'].str[-3:]
league_games['away_team_abv'] = league_games['MATCHUP'].str[:3]

# Sorting the pair gives both teams in a matchup the same identifier,
# whichever side of MATCHUP each one appears on.
league_games['matchup_id'] = [
    '-'.join(sorted(pair))
    for pair in zip(league_games['home_team_abv'], league_games['away_team_abv'])
]

# Convert to real datetimes before sorting so the order is chronological
# even if GAME_DATE arrives as text in a non-sortable format.
league_games['GAME_DATE'] = pd.to_datetime(league_games['GAME_DATE'])
league_games = league_games.sort_values('GAME_DATE')

# Creating three-matchup win percent
# For each (matchup, team) pair: the share of wins over the three previous
# meetings. shift(1) excludes the current game so only information available
# before tip-off is used; rows with fewer than three prior meetings stay NaN.
# transform (rather than apply) returns a result indexed like league_games,
# so the assignment lines up row by row.
# NOTE(review): relies on league_games already being in chronological order.
league_games['three_matchup_win_pct'] = (
    league_games.groupby(['matchup_id', 'TEAM_ID'])['win']
    .transform(lambda results: results.astype(float).rolling(3, min_periods=3).mean().shift(1))
)

# Creating rest_days feature: days between a team's previous game and this one
league_games['GAME_DATE'] = pd.to_datetime(league_games['GAME_DATE'])
league_games = league_games.sort_values(by='GAME_DATE')
team_games = league_games.groupby('TEAM_ID')
league_games['last_game_date'] = team_games['GAME_DATE'].shift(1)
league_games['rest_days'] = (league_games['GAME_DATE'] - league_games['last_game_date']).dt.days

# Imputing outliers/missing values for rest_days with the most common (mode) number of rest days in dataset
# Some games are an entire summer season apart so it doesn't make sense to leave them, and we can't model if the df has missing values
from scipy import stats  # not needed for the imputation below; left in place in case later cells use `stats`

MAX_REST_DAYS = 100  # gaps longer than this are treated as off-season breaks, not rest
needs_imputing = league_games['rest_days'].isna() | (league_games['rest_days'] > MAX_REST_DAYS)

# Take the mode over the valid gaps only. Series.mode() returns the modes in
# sorted order, so .iloc[0] picks the smallest value on a tie.
typical_rest_days = league_games.loc[~needs_imputing, 'rest_days'].mode()
if not typical_rest_days.empty:
    league_games.loc[needs_imputing, 'rest_days'] = typical_rest_days.iloc[0]

# Drop 'last_game_date' feature
league_games = league_games.drop(columns='last_game_date')

# Creating win_streak: the number of consecutive wins a team carries INTO each
# game (the current game's own result is not counted). Rows are walked in
# their existing order, tracking each team's running streak and last result.
league_games = league_games.reset_index()
league_games['win_streak'] = 0
current_streak = {}   # TEAM_ID -> streak value written on that team's latest row
current_results = {}  # TEAM_ID -> result of that team's latest row

streak_values = []
for team_id, won in zip(league_games['TEAM_ID'], league_games['win']):
    previous_result = current_results.get(team_id)

    # Explicit comparison: both "no earlier game" (None) and a previous loss
    # (False) reset the streak to zero.
    if previous_result == True:
        entering_streak = current_streak.get(team_id, 0) + 1
    else:
        entering_streak = 0

    current_streak[team_id] = entering_streak
    streak_values.append(entering_streak)
    current_results[team_id] = won

if streak_values:
    league_games['win_streak'] = streak_values

# Creating rolling averages for features
rolling_features = ['MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
       'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS', 'win_lose_margin', 
       'TEAM_ID', 'SEASON_ID', 'win', 'home', 'rest_days']

# TEAM_ID / SEASON_ID only define the groups. They are passed to groupby as
# keys rather than averaged as data columns, so there are no TEAM_ID_MA /
# SEASON_ID_MA columns to clean up afterwards.
group_cols = ['TEAM_ID', 'SEASON_ID']
stat_cols = [col for col in rolling_features if col not in group_cols]

# Cast to float so boolean/integer features (e.g. 'win', 'home') average cleanly.
rolling_df = league_games[stat_cols].astype(float)


def find_team_averages(team, window=10):
    """Return the rolling mean of the previous `window` rows of `team`.

    `team` is the data for one group (a Series or DataFrame). The result is
    shifted by one row so each value summarises only the rows before it and
    never the current row; positions without `window` earlier rows are NaN.
    """
    rolling = team.rolling(window).mean().shift(1)
    return rolling


# transform keeps the original row index, so the averages line up with
# league_games when concatenated column-wise below.
# NOTE(review): relies on league_games being in chronological order with a
# unique index at this point.
rolling_df = rolling_df.groupby([league_games[col] for col in group_cols]).transform(find_team_averages)

rolling_cols = [f"{col}_MA" for col in rolling_df.columns]
rolling_df.columns = rolling_cols
league_games = pd.concat([league_games, rolling_df], axis=1)

# Creating variables for season win/loss record
# win_record / loss_record hold the team's tally for the season BEFORE the
# current game, so the first game of every (team, season) starts at 0-0.
# NOTE(review): relies on league_games being in chronological order.
season_keys = [league_games['TEAM_ID'], league_games['SEASON_ID']]
won = league_games['win'].astype(int)
season_results = won.groupby(season_keys)

# cumcount() numbers each team-season's rows 0, 1, 2, ... which is exactly
# the number of games already played; cumsum() includes the current game,
# so its own result is subtracted back out.
games_played = season_results.cumcount()
league_games['win_record'] = season_results.cumsum() - won
league_games['loss_record'] = games_played - league_games['win_record']

# Creating a variable for win percentage
# Before a team's first game of the season there is nothing to divide by;
# those rows are given a numeric 0 so the column stays numeric.
league_games['season_win_pct'] = (
    league_games['win_record'] / games_played.where(games_played > 0)
).fillna(0.0)