# Feature Generation
This notebook contains the feature engineering for the second model iteration.

# Imports

In [1]:
# Necessary to import custom modules: the working directory is switched
# before the project-local `src` package is imported further down.
# NOTE(review): the absolute path is hard-coded; presumably it is the project
# root inside the notebook container -- confirm before running elsewhere.
# Relative paths used later in this notebook (e.g. "./data/features") resolve
# against this directory.
import os
from pathlib import Path
os.chdir("/home/jovyan/work")

import pandas as pd


from src.features import get_feature

### Features per team

In [2]:
# Load the "avg_starter_score_preceeding_season" feature table via the project
# helper and preview its first rows.
# NOTE(review): presumably get_feature returns a pandas DataFrame with one row
# per team and game -- confirm against src.features.
df_avg_starter_pts = get_feature("avg_starter_score_preceeding_season")
df_avg_starter_pts.head()

Unnamed: 0,avg_pts,team,team_abbreviation,game,game_id,game_type,season,preceeding_season
0,865.5,Atlanta Hawks,ATL,ATL at HOU,34683,regular_season,2016/2017,2015/2016
1,865.5,Atlanta Hawks,ATL,UTA at ATL,34712,regular_season,2016/2017,2015/2016
2,865.5,Atlanta Hawks,ATL,TOR at ATL,34909,regular_season,2016/2017,2015/2016
3,968.75,Atlanta Hawks,ATL,HOU at ATL,34024,regular_season,2016/2017,2015/2016
4,861.0,Atlanta Hawks,ATL,ATL at ORL,34807,regular_season,2016/2017,2015/2016


### All games with score

In [3]:
# Load the "all_games_with_id" table (used below as the source of home/guest
# team names, scores and the home_win flag) and preview its first rows.
df_games = get_feature("all_games_with_id")
df_games.head()

Unnamed: 0,game_id,season,team_home,team_guest,score_home,score_guest,home_win
0,43285,2017/2018,Boston Celtics,Brooklyn Nets,110,97,1
1,43253,2017/2018,Chicago Bulls,Brooklyn Nets,96,124,0
2,43238,2017/2018,Milwaukee Bucks,Brooklyn Nets,111,119,0
3,43216,2017/2018,Philadelphia 76ers,Brooklyn Nets,121,95,1
4,43201,2017/2018,Miami Heat,Brooklyn Nets,109,110,0


### Merging

In [4]:
# Reduce the per-team feature frame to what the joins below need.
starter_pts_slim = df_avg_starter_pts.drop(
    columns=["game", "season", "preceeding_season", "team_abbreviation"]
)

# Home side: preceding-season starter average for the team listed as home team.
home_keys = df_games.drop(
    columns=["team_guest", "season", "score_home", "score_guest", "home_win"]
)
df_home = starter_pts_slim.merge(
    home_keys,
    left_on=["game_id", "team"],
    right_on=["game_id", "team_home"],
).drop(columns=["team_home"])

# Guest side: same join, matched against the guest team of each game.
guest_keys = df_games.drop(
    columns=["team_home", "season", "score_home", "score_guest", "home_win"]
)
df_guest = starter_pts_slim.merge(
    guest_keys,
    left_on=["game_id", "team"],
    right_on=["game_id", "team_guest"],
).drop(columns=["team_guest"])

### Final merging

In [5]:
# Put the home row and the guest row of every game side by side; column names
# present in both frames receive a "_home" / "_guest" suffix.
home_vs_guest = df_home.merge(
    df_guest,
    on="game_id",
    suffixes=("_home", "_guest"),
)

# Attach season, scores and the win flag; the team names already come from
# the paired frame, so they are dropped from the games table first.
game_outcomes = df_games.drop(columns=["team_home", "team_guest"])
home_vs_guest = home_vs_guest.merge(game_outcomes, on="game_id")

# Keep only the columns that make up the feature set, in this order.
output_columns = [
    "season",
    "team_home",
    "team_guest",
    "avg_pts_home",
    "avg_pts_guest",
    "score_home",
    "score_guest",
    "home_win",
]
df_final = home_vs_guest[output_columns]
df_final.head()

Unnamed: 0,season,team_home,team_guest,avg_pts_home,avg_pts_guest,score_home,score_guest,home_win
0,2016/2017,Atlanta Hawks,Utah Jazz,865.5,873.25,95,120,0
1,2016/2017,Atlanta Hawks,Toronto Raptors,865.5,890.25,105,99,1
2,2016/2017,Atlanta Hawks,Houston Rockets,968.75,1229.75,112,97,1
3,2016/2017,Atlanta Hawks,Brooklyn Nets,602.5,834.0,92,107,0
4,2016/2017,Atlanta Hawks,Boston Celtics,818.0,1148.25,123,116,1


### Persistence

In [34]:
# Write the second-iteration feature set to disk.
# mkdir(..., exist_ok=True) creates the directory (and any missing parents)
# only when needed, replacing the separate exists() check-then-create step.
p = Path("./data/features")
p.mkdir(parents=True, exist_ok=True)
# Build the file path from the directory so the location is defined only once.
df_final.to_csv(p / "second_iteration.csv", index=False)

***
# Additional Stats
In addition to the simple point average, all stats are averaged and provided as features.

### Features per team

In [7]:
# Load the "avg_starter_complete_stats_preceeding_season" feature table via the
# project helper and preview its first rows.
# NOTE(review): presumably the same layout as the point-average table, but with
# one avg_* column per stat -- confirm against src.features.
df_avg_starter_complete = get_feature("avg_starter_complete_stats_preceeding_season")
df_avg_starter_complete.head()

Unnamed: 0,avg_GamesPlayed,avg_Fg2PtAtt,avg_Fg2PtAttPerGame,avg_Fg2PtMade,avg_Fg2PtMadePerGame,avg_Fg2PtPct,avg_Fg3PtAtt,avg_Fg3PtAttPerGame,avg_Fg3PtMade,avg_Fg3PtMadePerGame,...,avg_PlusMinusPerGame,avg_MinSeconds,avg_MinSecondsPerGame,team,team_abbreviation,game,game_id,game_type,season,preceeding_season
0,73.25,479.5,5.75,239.0,2.75,50.25,229.75,2.5,77.0,0.5,...,1.5,108224.5,1434.0,Atlanta Hawks,ATL,ATL at LAC,34786,regular_season,2016/2017,2015/2016
1,10.0,61.666667,5.666667,30.333333,2.666667,46.666667,27.333333,2.333333,7.333333,0.333333,...,-0.333333,12871.666667,1286.666667,Atlanta Hawks,ATL,ATL at WAS,40318,playoff,2016/2017,2015/2016
2,80.8,452.8,5.2,229.2,2.4,51.0,259.2,2.6,92.0,0.6,...,2.0,126112.6,1559.2,Atlanta Hawks,ATL,MIN at ATL,34370,regular_season,2016/2017,2015/2016
3,71.333333,304.0,3.666667,154.666667,1.666667,52.666667,168.333333,2.0,55.666667,0.333333,...,1.0,84851.666667,1166.0,Atlanta Hawks,ATL,ATL at WAS,35003,regular_season,2016/2017,2015/2016
4,80.5,511.0,6.0,259.0,2.75,51.25,224.75,2.25,75.5,0.5,...,1.25,121627.25,1510.0,Atlanta Hawks,ATL,ATL at NOP,34479,regular_season,2016/2017,2015/2016


### All games with score

In [8]:
# Load the "all_games_with_id" table again and preview it.
# NOTE(review): this repeats the load from the first part of the notebook
# under the same variable name; presumably kept so this section can be run
# on its own -- confirm whether the reload is needed.
df_games = get_feature("all_games_with_id")
df_games.head()

Unnamed: 0,game_id,season,team_home,team_guest,score_home,score_guest,home_win
0,43285,2017/2018,Boston Celtics,Brooklyn Nets,110,97,1
1,43253,2017/2018,Chicago Bulls,Brooklyn Nets,96,124,0
2,43238,2017/2018,Milwaukee Bucks,Brooklyn Nets,111,119,0
3,43216,2017/2018,Philadelphia 76ers,Brooklyn Nets,121,95,1
4,43201,2017/2018,Miami Heat,Brooklyn Nets,109,110,0


### Merging

In [32]:
# Reduce the complete-stats feature frame to what the joins below need.
starter_stats_slim = df_avg_starter_complete.drop(
    columns=["game", "season", "preceeding_season", "team_abbreviation"]
)

# Home side: averaged preceding-season starter stats for the home team of each game.
home_keys_complete = df_games.drop(
    columns=["team_guest", "season", "score_home", "score_guest", "home_win"]
)
df_home_complete = starter_stats_slim.merge(
    home_keys_complete,
    left_on=["game_id", "team"],
    right_on=["game_id", "team_home"],
).drop(columns=["team_home"])

# Guest side: same join, matched against the guest team of each game.
guest_keys_complete = df_games.drop(
    columns=["team_home", "season", "score_home", "score_guest", "home_win"]
)
df_guest_complete = starter_stats_slim.merge(
    guest_keys_complete,
    left_on=["game_id", "team"],
    right_on=["game_id", "team_guest"],
).drop(columns=["team_guest"])

### Final Merging

In [45]:
# final merging: pair the home and guest rows of every game (shared column
# names get a "_home" / "_guest" suffix), then attach season, scores and the
# win flag from the games table.
df_final_complete = df_home_complete.merge(
    df_guest_complete,
    on="game_id",
    suffixes=("_home", "_guest")
).merge(
    df_games.drop(columns=["team_home", "team_guest"]),
    on="game_id"
)

# select relevant columns: identifiers and targets first, then every averaged
# stat (all of them carry the "avg" prefix plus the side suffix).
cols = (
    ["season", "team_home", "team_guest", "score_home", "score_guest", "home_win"]
    + [c for c in df_final_complete.columns if c.startswith("avg")]
)
# Index with the list directly instead of columns.intersection(cols): a missing
# expected column now raises a KeyError instead of being dropped silently, and
# the output keeps the column order declared in `cols`.
df_final_complete = df_final_complete[cols]

### Persistence

In [46]:
# Write the advanced (all averaged stats) feature set to disk.
# mkdir(..., exist_ok=True) creates the directory (and any missing parents)
# only when needed, replacing the separate exists() check-then-create step.
p = Path("./data/features")
p.mkdir(parents=True, exist_ok=True)
# Build the file path from the directory so the location is defined only once.
df_final_complete.to_csv(p / "second_iteration_advanced.csv", index=False)