Dataset from https://github.com/vaastav/Fantasy-Premier-League

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
%tensorflow_version 2.x
import tensorflow as tf
import pandas as pd
import os
import numpy as np

print(tf.__version__)

!git clone https://github.com/vaastav/Fantasy-Premier-League
 
!pip install -q tqdm

TensorFlow 2.x selected.
2.1.0-rc1
fatal: destination path 'Fantasy-Premier-League' already exists and is not an empty directory.


In [0]:
DATA_ROOT = os.path.join(os.getcwd(), 'Fantasy-Premier-League/data/')
PLAYER_ROOT = '/content/drive/My Drive/KTH   Studier/Åk 5/Scalable Machine Learning'

seasons = sorted(os.listdir(DATA_ROOT))[:-1]
season_file_root = [os.path.join(DATA_ROOT, season) for season in seasons]
season_file_root = season_file_root

## Read and Clean Data

In [0]:
def col_null_to_avg(data, column):
  numeric_column = pd.to_numeric(data[column], errors='coerce', downcast='float')
  avg = np.average(numeric_column[numeric_column.notnull()])
  numeric_column[numeric_column.isnull()] = avg
  return numeric_column


def get_players_dataset():
  season_players = []

  for path, season in zip(season_file_root, seasons):
      players = pd.read_csv(os.path.join(path, 'players_raw.csv'))
      players['season'] = season

      players['full_name'] = players['first_name'] + '_' + players['second_name']
      players['had_news'] = players['news'].notnull()

      season_players.append(players)
      

  # create series of all unique player names
  all_players = pd.concat(season_players, axis=0, ignore_index=True, sort=False)

  # Creating better values for chance_of_playing_next_round and chance_of_playing_this_round

  all_players['chance_of_playing_this_round'] = col_null_to_avg(all_players, 'chance_of_playing_this_round')
  all_players['chance_of_playing_next_round'] = col_null_to_avg(all_players, 'chance_of_playing_next_round')

  return all_players

def get_teams_dataset():
  teams = pd.read_csv(os.path.join(season_file_root[-1], 'teams.csv'))
  return teams[team_columns]

def get_gws_dataset():
  season_gws = []

  for path, season in zip(season_file_root, seasons):
    weeks_gws = []
    weeks = os.listdir(os.path.join(path, 'gws'))
    weeks = filter(lambda filename: filename.startswith('gw'), weeks)
    weeks = map(lambda filename: (int(filename.split('.')[0][2:]), filename), weeks)
    weeks = sorted(weeks)

    for week, week_path in weeks:
      gws = pd.read_csv(os.path.join(path, 'gws', week_path), encoding='latin-1')
      gws['season'] = season
      gws['week'] = week
      weeks_gws.append(gws)

    
    weeks_gws = pd.concat(weeks_gws, axis=0, ignore_index=True, sort=False)
    season_gws.append(weeks_gws)

  all_gws = pd.concat(season_gws, axis=0, ignore_index=True, sort=False)

  return all_gws

def get_players_for_season(path, season):
  players = pd.read_csv(os.path.join(path, 'players_raw.csv'))
  
  players['season'] = season
  players['full_name'] = players['first_name'] + '_' + players['second_name']
  players['had_news'] = players['news'].notnull()

  player_list_file_name = 'player_list_' + season + '.csv'
  player_teams = pd.read_csv(os.path.join(PLAYER_ROOT, player_list_file_name))[['team_code', 'team']]
  player_teams = player_teams.rename(columns={"team": "team_name"})

  players = pd.merge(players, player_teams, on="team_code", right_index=False, how='left', sort=False)

  return players

def get_gws_for_season(path, season):
  weeks_gws = []
  weeks = os.listdir(os.path.join(path, 'gws'))
  weeks = filter(lambda filename: filename.startswith('gw'), weeks)
  weeks = map(lambda filename: (int(filename.split('.')[0][2:]), filename), weeks)
  weeks = sorted(weeks)

  for week, week_path in weeks:
    gws = pd.read_csv(os.path.join(path, 'gws', week_path), encoding='latin-1')
    gws['season'] = season
    gws['week'] = week
    weeks_gws.append(gws)

  
  weeks_gws = pd.concat(weeks_gws, axis=0, ignore_index=True, sort=False)
  return weeks_gws

def get_dataset_for_season(path, season):
  gws_for_season = get_gws_for_season(path, season)
  players_for_season = get_players_for_season(path, season)

  team_name_list = players_for_season.groupby('team').first()[['team_name']]
  team_name_list = team_name_list.rename(columns={"team_name": "opponent_name"})
  gws_for_season = gws_for_season.merge(team_name_list, left_on='opponent_team', right_on="team")

  players_for_season = players_for_season[['id', 'element_type', 'team_name']]
  return gws_for_season.join(players_for_season.set_index('id'), on='element')

In [0]:
def clean_from_space(name):
  parts = name.split()
  return '_'.join(parts)

def ends_with_number(str):
  return str[-1].isdigit()

def clean_name(name):
  space_free_name = clean_from_space(name)
  if not ends_with_number(space_free_name):
    return space_free_name
  
  parts = space_free_name.split('_')
  filtered_parts = parts[:-1]
  return '_'.join(filtered_parts)

### Tokenize teams

In [0]:
def tokenize_teams(df):
  tokenized = df.copy()

  tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  tokenizer.fit_on_texts(tokenized['team_name'])

  tokenized['team_index'] = tokenized.apply (lambda row: tokenizer.word_index[row['team_name']], axis=1)
  tokenized['opponent_index'] = tokenized.apply (lambda row: tokenizer.word_index[row['opponent_name']], axis=1)

  return tokenized

### One hot encode teams

In [0]:
def one_hot_encode_team(df):
  teams = df['team_index'].unique()
  teams.sort()
  
  for team in teams:
    team_col = 'in_team_' + str(team)
    df[team_col] = np.repeat(0, len(df))

    df.loc[df['team_index'] == team, team_col] = 1

  return df

### Tokenize names

In [0]:
def tokenize_names(df):
  tokenized = df.copy()

  tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  tokenizer.fit_on_texts(tokenized['name'])

  tokenized['player_index'] = tokenized.apply (lambda row: tokenizer.word_index[row['name']], axis=1)

  return tokenized

### Figure out position

| Index | Position |
|-------|----------|
| 1     | GK       |
| 2     | DEF      |
| 3     | MID      |
| 4     | FWD      |

In [0]:
def store_postion(original):
  copy = original.copy()

  copy['is_forward'] = np.repeat(0, len(copy))
  copy['is_midfielder'] = np.repeat(0, len(copy))
  copy['is_goalkeeper'] = np.repeat(0, len(copy))
  copy['is_defense'] = np.repeat(0, len(copy))

  copy.loc[copy['element_type'] == 1, 'is_goalkeeper'] = 1
  copy.loc[copy['element_type'] == 2, 'is_defense'] = 1
  copy.loc[copy['element_type'] == 3, 'is_midfielder'] = 1
  copy.loc[copy['element_type'] == 4, 'is_forward'] = 1

  copy.drop(columns=['element', 'element_type'])

  return copy

### Calculate fixtures

$fixture = fixture - 10(week - 1)$

In [0]:
def calculate_fixture (row):
  return row['fixture'] - 10 * (row['week'] - 1)
   
def calculate_fixtures(original):
  copy = original.copy()

  copy['fixture'] = copy.apply (lambda row: calculate_fixture(row), axis=1)
  
  return copy

### Calculate next team matches

In [0]:
def add_opponent_matches(df):
  copy = df.copy()

  teams = copy['team_index'].unique()
  weeks = copy['week'].unique()
  teams.sort()
  weeks.sort()
  
  for team in teams:
    copy['w_opp_' + str(team)] = np.repeat(0, len(copy))

  for s in seasons:
    for t1 in teams:
      for week in weeks:
        opponent_weeks = copy[
          (copy['season'] == s)
          & (copy['team_index'] == t1)
          & (copy['week'] == week)
        ]['opponent_index'].value_counts() #.mode().values[0]


        # opponent_index = []
        for oponent, member_count in opponent_weeks.iteritems():
          if member_count < 10:
            break
          
          # Adding stats for this game across all weeks
          for w in weeks:
            delta = max(week - w, 0)
            
            factor = 0
            if (delta > 0):
              factor = 1 / delta

            # Break after a while
            if factor < 0.1:
              break

            copy.loc[
              (copy['season'] == s)
              & (copy['team_index'] == t1)
              & (copy['week'] == w),
              'w_opp_' + str(oponent)
            ] = factor

  return copy

### Filter Out Unnecessary Columns


In [0]:
def drop_columns(original):
  copy = original.copy()

  return copy.drop(columns=[
    'id',
    'kickoff_time',
    'kickoff_time_formatted',
    'ea_index',
    'element',
    'element_type',
    'opponent_team'
  ])

## Calculate labels

### Calculate points per week (long term)

In [0]:
def calculate_points_per_week(original):
  copy = original.copy()

  copy['long_term_points'] = np.repeat(0, len(copy))

  for s in seasons:
    this_season = copy[copy['season'] == s]
    players = this_season['player_index'].unique()
    
    for p in players:
      points = this_season[this_season['player_index'] == p]['total_points'].values
      
      current_sum = 0
      long_term_points = []

      for i, point in enumerate(reversed(points)):
        current_sum += point
        long_term_points.append(current_sum/(i + 1))

      long_term_points = list(reversed(long_term_points))

      # Add to player
      copy.loc[(copy['season'] == s) & (copy['player_index'] == p), 'long_term_points'] = long_term_points
      
  return copy

### Calculate points per week (short term)

In [0]:
MAX_RANGE_SHORT_TERM_POINTS = 4

def calculate_short_term_points_per_week(original):
  copy = original.copy()

  copy['short_term_points'] = np.repeat(0, len(copy))

  for s in seasons:
    this_season = copy[copy['season'] == s]
    players = this_season['player_index'].unique()
    
    for p in players:
      points = this_season[this_season['player_index'] == p]['total_points'].values
      
      short_term_points = []
      last_points = list()

      for i, point in enumerate(reversed(points)):
        last_points.insert(0, point)
        if len(last_points) > MAX_RANGE_SHORT_TERM_POINTS:
          last_points.pop()

        short_term_sum = 0
        for i, l_point in enumerate(last_points):
          short_term_sum += l_point / (i + 1)

        short_term_points.append(short_term_sum)

      short_term_points = list(reversed(short_term_points))

      # Add to player
      copy.loc[(copy['season'] == s) & (copy['player_index'] == p), 'short_term_points'] = short_term_points
      
  return copy

### Calculate predicted cost (short term)

In [0]:
MAX_RANGE_SHORT_TERM_COST = 4

def calculate_short_term_cost_changes(original):
  copy = original.copy()

  copy['short_term_cost_change'] = np.repeat(0, len(copy))

  for s in seasons:
    this_season = copy[copy['season'] == s]
    players = this_season['player_index'].unique()
    
    for p in [players[0]]:
      points = this_season[this_season['player_index'] == p]['value'].values
      
      short_term_cost_change = []
      last_cost_change = list()

      for i, point in enumerate(reversed(points)):
        last_cost_change.insert(0, point)
        if len(last_cost_change) > MAX_RANGE_SHORT_TERM_COST:
          last_cost_change.pop()

        short_term_sum = 0
        for l_point in last_cost_change:
          short_term_sum += l_point

        if i == 0:
          short_term_cost_change.append(0)
        else:
          short_term_cost_change.append(((short_term_sum - point) / (len(last_cost_change) - 1)) - point)


      short_term_cost_change = list(reversed(short_term_cost_change))

      # Add to player
      copy.loc[(copy['season'] == s) & (copy['player_index'] == p), 'short_term_cost_change'] = short_term_cost_change
      
  return copy

## Put it all together

In [0]:
def get_full_dataset():
  season_datasets = []

  for path, season in zip(season_file_root, seasons):
    season_dataset = get_dataset_for_season(path, season)
    season_datasets.append(season_dataset)

  all_dataset = pd.concat(season_datasets, axis=0, ignore_index=True, sort=False)
  
  all_dataset['name'] = all_dataset['name'].apply(clean_name)
  all_dataset['name'] = all_dataset['name'].str.lower()

  return drop_columns(
      add_opponent_matches(
        calculate_short_term_cost_changes(
          calculate_short_term_points_per_week(
            calculate_points_per_week(
              calculate_fixtures(
                store_postion(
                  tokenize_names(
                    one_hot_encode_team(
                      tokenize_teams(all_dataset))))))))))

df = get_full_dataset()

In [0]:
df

Unnamed: 0,name,assists,attempted_passes,big_chances_created,big_chances_missed,bonus,bps,clean_sheets,clearances_blocks_interceptions,completed_passes,creativity,dribbles,errors_leading_to_goal,errors_leading_to_goal_attempt,fixture,fouls,goals_conceded,goals_scored,ict_index,influence,key_passes,loaned_in,loaned_out,minutes,offside,open_play_crosses,own_goals,penalties_conceded,penalties_missed,penalties_saved,recoveries,red_cards,round,saves,selected,tackled,tackles,target_missed,team_a_score,team_h_score,...,in_team_26,in_team_27,player_index,is_forward,is_midfielder,is_goalkeeper,is_defense,long_term_points,short_term_points,short_term_cost_change,w_opp_1,w_opp_2,w_opp_3,w_opp_4,w_opp_5,w_opp_6,w_opp_7,w_opp_8,w_opp_9,w_opp_10,w_opp_11,w_opp_12,w_opp_13,w_opp_14,w_opp_15,w_opp_16,w_opp_17,w_opp_18,w_opp_19,w_opp_20,w_opp_21,w_opp_22,w_opp_23,w_opp_24,w_opp_25,w_opp_26,w_opp_27,week_raw,total_points_raw,cost_raw
0,aaron_cresswell,0.0,0.000000,0.0,0.0,0.0,0.142857,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.865546,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.003810,0.000000,0.000,0.0,0.142857,0.285714,...,0,0,1,0,0,0,1,1.578947,2.333333,-3.0,0.333333,0.0,0.142857,0.111111,0.0,0.0,0.2,0.0,0.0,0.0,0.0,1.0,0.0,0.5,0.0,0.1,0.0,0.0,0.0,0.125,0.0,0.166667,0.0,0.0,0.25,0.0,0.0,1,0,55
1,adrián_san_miguel_del_castillo,0.0,0.155556,0.0,0.0,0.0,0.263158,0.0,0.000000,0.071856,0.000000,0.000000,0.0,0.0,0.865546,0.000000,0.285714,0.0,0.096463,0.182152,0.000000,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.363636,0.0,0.0,0.285714,0.025670,0.000000,0.000,0.0,0.142857,0.285714,...,0,0,2,0,0,1,0,1.684211,4.500000,0.0,0.333333,0.0,0.142857,0.111111,0.0,0.0,0.2,0.0,0.0,0.0,0.0,1.0,0.0,0.5,0.0,0.1,0.0,0.0,0.0,0.125,0.0,0.166667,0.0,0.0,0.25,0.0,0.0,1,2,50
2,andré_ayew,0.0,0.038889,0.0,0.0,0.0,0.180451,0.0,0.000000,0.017964,0.007022,0.000000,0.0,0.0,0.865546,0.000000,0.000000,0.0,0.016077,0.001222,0.000000,0.0,0.0,0.377778,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.000000,0.083230,0.000000,0.125,0.0,0.142857,0.285714,...,0,0,293,0,1,0,0,2.342105,5.166667,0.0,0.333333,0.0,0.142857,0.111111,0.0,0.0,0.2,0.0,0.0,0.0,0.0,1.0,0.0,0.5,0.0,0.1,0.0,0.0,0.0,0.125,0.0,0.166667,0.0,0.0,0.25,0.0,0.0,1,1,75
3,andy_carroll,0.0,0.155556,0.0,0.0,0.0,0.195489,0.0,0.107143,0.083832,0.099473,0.076923,0.0,0.0,0.865546,0.111111,0.285714,0.0,0.102894,0.064792,0.083333,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.227273,0.0,0.0,0.000000,0.072952,0.333333,0.125,0.0,0.142857,0.285714,...,0,0,3,1,0,0,0,1.815789,3.000000,0.0,0.333333,0.0,0.142857,0.111111,0.0,0.0,0.2,0.0,0.0,0.0,0.0,1.0,0.0,0.5,0.0,0.1,0.0,0.0,0.0,0.125,0.0,0.166667,0.0,0.0,0.25,0.0,0.0,1,2,65
4,angelo_ogbonna,0.0,0.000000,0.0,0.0,0.0,0.142857,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.865546,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.012408,0.000000,0.000,0.0,0.142857,0.285714,...,0,0,4,0,0,0,1,1.184211,0.333333,0.0,0.333333,0.0,0.142857,0.111111,0.0,0.0,0.2,0.0,0.0,0.0,0.0,1.0,0.0,0.5,0.0,0.1,0.0,0.0,0.0,0.125,0.0,0.166667,0.0,0.0,0.25,0.0,0.0,1,0,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67931,stefan_johansen,0.0,0.000000,0.0,0.0,0.0,0.142857,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.815126,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,1.0,0.000000,0.000780,0.000000,0.000,0.0,0.571429,0.000000,...,0,1,823,0,1,0,0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.000000,0.0,0.0,0.00,0.0,0.0,38,0,50
67932,timothy_fosu-mensah,0.0,0.000000,0.0,0.0,0.0,0.142857,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.815126,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,1.0,0.000000,0.005743,0.000000,0.000,0.0,0.571429,0.000000,...,0,1,114,0,0,0,1,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.000000,0.0,0.0,0.00,0.0,0.0,38,0,45
67933,tim_ream,0.0,0.000000,0.0,0.0,0.0,0.142857,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.815126,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,1.0,0.000000,0.005426,0.000000,0.000,0.0,0.571429,0.000000,...,0,1,824,0,0,0,1,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.000000,0.0,0.0,0.00,0.0,0.0,38,0,42
67934,tom_cairney,0.0,0.416667,0.0,0.0,0.0,0.263158,0.0,0.035714,0.395210,0.212405,0.153846,0.0,0.0,0.815126,0.222222,0.571429,0.0,0.241158,0.088020,0.166667,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.227273,0.0,1.0,0.000000,0.008117,0.083333,0.250,0.0,0.571429,0.000000,...,0,1,825,0,1,0,0,2.000000,2.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.000000,0.0,0.0,0.00,0.0,0.0,38,2,46


## Preprocess

- Scale numerical values
- Transform boolean values to categorical values
- One hot encode categorical values

In [0]:
teams = df['team_index'].unique()
teams.sort()

opponent_matches_columns = map(lambda team: 'w_opp_' + str(team), teams)
team_index_columns = map(lambda team: 'in_team_' + str(team), teams)

information_columns = [
  'name',
  'season',
  'player_index',
  'team_name',
  'opponent_name',
  'opponent_index',
  'week_raw',
  'total_points_raw',
  'cost_raw'
]

boolean_columns = [
  'was_home'
]

categorical_columns = [
  *boolean_columns,
  *team_index_columns,
  'is_forward',
  'is_midfielder',
  'is_goalkeeper',
  'is_defense'
]

numerical_columns = [
  *opponent_matches_columns,
  'assists',
  'attempted_passes',
  'big_chances_created',
  'big_chances_missed',
  'bonus',
  'bps',
  'clean_sheets',
  'clearances_blocks_interceptions',
  'completed_passes',
  'creativity',
  'dribbles',
  'errors_leading_to_goal',
  'errors_leading_to_goal_attempt',
  'fixture',
  'fouls',
  'goals_conceded',
  'goals_scored',
  'ict_index',
  'influence',
  'key_passes',
  'loaned_in',
  'loaned_out',
  'minutes',
  'offside',
  'open_play_crosses',
  'own_goals',
  'penalties_conceded',
  'penalties_missed',
  'penalties_saved',
  'recoveries',
  'red_cards',
  'round',
  'saves',
  'selected',
  'tackled',
  'tackles',
  'target_missed',
  'team_a_score',
  'team_h_score',
  'threat',
  'total_points',
  'transfers_balance',
  'transfers_in',
  'transfers_out',
  'value',
  'winning_goals',
  'yellow_cards',
  'week'
]

labels = [
  'long_term_points',
  'short_term_points',
  'short_term_cost_change'
]

features = [
  *numerical_columns,
  *categorical_columns
]

# df.copy().drop(columns=[
#   *information_columns,
#   *features,
#   *labels
# ]).dtypes

### Numerical Values

In [0]:
from sklearn.preprocessing import MinMaxScaler

def normalize_numerical_values(df):
  df['week_raw'] = df['week']
  df['total_points_raw'] = df['total_points']
  df['cost_raw'] = df['value']

  scaler = MinMaxScaler()
  df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

  return df

### Boolean values

In [0]:
def cast_boolean_to_int(df):
  df[boolean_columns] = df[boolean_columns].apply(lambda was_home: was_home * 1)
  return df

### Applying scaling and preprocessing

In [0]:
df = cast_boolean_to_int(normalize_numerical_values(df))

## Save and Read to and from CSV file

In [0]:
processed_dataset_path = os.path.join(PLAYER_ROOT, 'processed_dataset.csv')

In [0]:
df.to_csv(processed_dataset_path)

In [0]:
df = pd.read_csv(processed_dataset_path)
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

count    67936.000000
mean         0.520496
std          0.298672
min          0.000000
25%          0.270270
50%          0.513514
75%          0.783784
max          1.000000
Name: week, dtype: float64

## Create Tensorflow Dataset

In [0]:
from tqdm import tqdm

WINDOW_SIZE = 38

def get_windowed_dataset(dataframe, group_key, features, labels):
  _dataframe = dataframe[[group_key, *features, *labels, 'week_raw', 'total_points_raw', 'season']]
  grouping = _dataframe.groupby(_dataframe[group_key])
  
  dataset_features = []
  dataset_labels = []
  meta_data = []
  for g_id, g_data in tqdm(grouping):
    for stride in range(len(g_data) - WINDOW_SIZE + 1):
      feature_values = g_data[features][stride: stride + WINDOW_SIZE].values
      label_values = g_data[labels][stride: stride + WINDOW_SIZE].values
      week = g_data['week_raw'][stride: stride + WINDOW_SIZE].values
      total_points = g_data['total_points_raw'][stride: stride + WINDOW_SIZE].values
      season = g_data['season'][stride: stride + WINDOW_SIZE].values
      
      dataset_features.append(feature_values)
      dataset_labels.append(label_values[-1])

      meta_data.append((g_id, week[-1], total_points[-1], season[-1]))

  dataset = tf.data.Dataset.from_tensor_slices(((dataset_features), dataset_labels))

  return dataset, meta_data

df_train = df[(df['season'] == '2016-17') | (df['season'] == '2017-18')].copy()
df_test = df[(df['season'] == '2017-18') | (df['season'] == '2018-19')].copy()
ds_train, _meta_data = get_windowed_dataset(df_train, 'player_index', features, labels)
ds_test, ds_test_meta_data = get_windowed_dataset(df_test, 'player_index', features, labels)

ds_train = ds_train.shuffle(500).batch(50)
ds_train = ds_train.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)


  0%|          | 0/897 [00:00<?, ?it/s][A
  0%|          | 2/897 [00:00<01:18, 11.39it/s][A
  0%|          | 4/897 [00:00<01:18, 11.38it/s][A
  1%|          | 6/897 [00:00<01:16, 11.69it/s][A
  1%|          | 8/897 [00:00<01:14, 11.93it/s][A
  1%|          | 10/897 [00:00<01:14, 11.97it/s][A
  1%|▏         | 12/897 [00:01<01:13, 11.97it/s][A
  2%|▏         | 14/897 [00:01<01:12, 12.12it/s][A
  2%|▏         | 16/897 [00:01<01:13, 12.04it/s][A
  2%|▏         | 18/897 [00:01<01:12, 12.06it/s][A
  2%|▏         | 20/897 [00:01<01:11, 12.19it/s][A
  2%|▏         | 22/897 [00:01<01:11, 12.32it/s][A
  3%|▎         | 24/897 [00:01<01:11, 12.24it/s][A
  3%|▎         | 26/897 [00:02<01:10, 12.34it/s][A
  3%|▎         | 28/897 [00:02<01:14, 11.71it/s][A
  3%|▎         | 30/897 [00:02<01:14, 11.57it/s][A
  4%|▎         | 32/897 [00:02<01:12, 11.86it/s][A
  4%|▍         | 34/897 [00:02<01:11, 12.06it/s][A
  4%|▍         | 36/897 [00:02<01:11, 12.08it/s][A
  4%|▍         | 38/897 

## Create Tensorflow Model

In [0]:
# from tensorflow.keras import utils

num_features = len(features)
num_labels = len(labels)

input_1 = tf.keras.layers.Input((WINDOW_SIZE, num_features), name="input_1")
dense_1 = tf.keras.layers.Dense(num_features * 2, "relu", name="dense_1")(input_1)
batch_1 = tf.keras.layers.BatchNormalization(name="batch_1")(dense_1)
dense_2 = tf.keras.layers.Dense(num_features, "relu", name="dense_2")(batch_1)
batch_2 = tf.keras.layers.BatchNormalization(name="batch_2")(dense_2)
lstm_1 = tf.keras.layers.LSTM(num_features * 2, name="lstm_1")(batch_2)
dense_3 = tf.keras.layers.Dense(num_features, "relu", name="dense_3")(lstm_1)
batch_3 = tf.keras.layers.BatchNormalization(name="batch_3")(dense_3)
dense_4 = tf.keras.layers.Dense(num_features * 2, "relu", name="dense_4")(batch_3)
batch_4 = tf.keras.layers.BatchNormalization(name="batch_4")(dense_4)
dense_5 = tf.keras.layers.Dense(num_features, "relu", name="dense_5")(batch_4)
batch_5 = tf.keras.layers.BatchNormalization(name="batch_5")(dense_5)
output_1 =  tf.keras.layers.Dense(num_labels, "linear", name="output_1")(batch_5)

model = tf.keras.Model(inputs=input_1, outputs=output_1, name="model")
model.compile("adam", "mse", metrics=["accuracy"])

# utils.plot_model(model, show_shapes=True)

## Train model

In [0]:
model.fit(ds_train, epochs=30)

Train for 317 steps
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fe16574bd68>

## Compile final results

In [0]:
_val, lab = zip(*ds_test.as_numpy_iterator())
lab = list(lab)

real_values = pd.DataFrame.from_records(lab, columns=["long_real", "short_real", "cost_real"])

prediction = model.predict(ds_test.batch(100))

prediction_values = pd.DataFrame.from_records(prediction, columns=["long_pred", "short_pred", "cost_pred"])

In [0]:
results = pd.merge(real_values, prediction_values, left_index=True, right_index=True).sort_index(axis=1)
df_meta = pd.DataFrame.from_records(ds_test_meta_data, columns=["player_id", "week", "total_points", "season"])

results = pd.merge(df_meta, results, left_index=True, right_index=True)

results

Unnamed: 0,player_id,week,total_points,season,cost_pred,cost_real,long_pred,long_real,short_pred,short_real
0,1,38,2,2017-18,0.006080,0.000000,1.886834,2.000000,3.332764,2.000000
1,1,1,0,2018-19,-0.012661,-4.666667,1.544490,0.894737,2.242678,1.666667
2,1,25,2,2018-19,-0.005661,1.333333,1.853603,0.918919,3.614124,3.000000
3,1,19,2,2018-19,0.003674,0.000000,1.148359,0.888889,2.846313,2.250000
4,1,37,0,2018-19,-0.006991,1.333333,0.500508,0.857143,1.528017,0.583333
...,...,...,...,...,...,...,...,...,...,...
15374,865,25,2,2018-19,-0.009120,0.000000,0.853458,2.000000,1.801127,2.000000
15375,866,25,0,2018-19,0.001557,0.000000,0.632868,0.000000,1.119330,0.000000
15376,867,37,2,2018-19,0.012554,0.000000,2.531351,2.000000,4.955649,2.000000
15377,868,37,2,2018-19,0.004106,0.000000,1.003423,2.000000,4.630850,2.000000


## Save Final Results and Model

Save

In [0]:
file_path_results = os.path.join(PLAYER_ROOT, 'final_results.csv')
results.to_csv(file_path_results)

In [0]:
file_path_model = os.path.join(PLAYER_ROOT, 'model.ckpt')
model.save_weights(file_path_model)

Load

In [0]:
file_path_results = os.path.join(PLAYER_ROOT, 'final_results.csv')
results = pd.read_csv(file_path_results)
results = results.loc[:, ~results.columns.str.contains('^Unnamed')]

In [0]:
file_path_model = os.path.join(PLAYER_ROOT, 'model.ckpt')
model.load_weights(file_path_model)