<a href="https://colab.research.google.com/github/konamilk/mlb-player-digital-engagement-forecasting/blob/main/mlb_v1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import sys
from pathlib import Path
if 'google.colab' in sys.modules:
    INPUT = Path('/content/input/')
elif 'kaggle_web_client' in sys.modules:
    INPUT = Path('../input/mlb-player-digital-engagement-forecasting')

In [28]:
if 'google.colab' in sys.modules:
  !mkdir ~/.kaggle
  !cp /content/drive/MyDrive/.kaggle/kaggle.json ~/.kaggle
  !chmod 600 ~/.kaggle/kaggle.json
  !pip install kaggle > /dev/null

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [29]:
if 'google.colab' in sys.modules:
    !pip install git+https://github.com/pfnet-research/xfeat.git > /dev/null
else:
    !pip install ../input/xfeatwhl/xfeat-0.1.1-py3-none-any.whl > /dev/null

  Running command git clone -q https://github.com/pfnet-research/xfeat.git /tmp/pip-req-build-pvnnnljw


In [30]:
if 'google.colab' in sys.modules:
  !kaggle competitions download -c mlb-player-digital-engagement-forecasting
  !mkdir input
  !unzip -o '*.zip' -d ./input/
  !rm *.zip
  !mv *.csv ./input/

competition.cpython-37m-x86_64-linux-gnu.so: Skipping, found more recently modified local copy (use --force to force download)
__init__.py: Skipping, found more recently modified local copy (use --force to force download)
Downloading awards.csv to /content
  0% 0.00/820k [00:00<?, ?B/s]
100% 820k/820k [00:00<00:00, 57.7MB/s]
Downloading teams.csv to /content
  0% 0.00/3.68k [00:00<?, ?B/s]
100% 3.68k/3.68k [00:00<00:00, 3.68MB/s]
Downloading train.csv.zip to /content
 97% 454M/470M [00:03<00:00, 141MB/s]
100% 470M/470M [00:03<00:00, 133MB/s]
Downloading seasons.csv to /content
  0% 0.00/824 [00:00<?, ?B/s]
100% 824/824 [00:00<00:00, 785kB/s]
Downloading players.csv to /content
  0% 0.00/173k [00:00<?, ?B/s]
100% 173k/173k [00:00<00:00, 55.8MB/s]
Downloading example_sample_submission.csv to /content
  0% 0.00/191k [00:00<?, ?B/s]
100% 191k/191k [00:00<00:00, 179MB/s]
Downloading example_test.csv.zip to /content
  0% 0.00/3.96M [00:00<?, ?B/s]
100% 3.96M/3.96M [00:00<00:00, 110MB/s]
mkdi

In [31]:
import numpy as np
import pandas as pd
import gc
import pickle
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.functional as F
from torch.utils import data

In [32]:
SEED = 1234

TRAIN = INPUT/'train.csv'
TEAMS = INPUT/'awards.csv'
PLAYERS = INPUT/'players.csv'
AWARDS = INPUT/'awards.csv'
SEASONS = INPUT/'seasons.csv'

In [33]:
df = pd.read_csv(TRAIN)

In [34]:
def unpack_json(json_str):
    return np.nan if pd.isna(json_str) else pd.read_json(json_str)

In [35]:
import datetime
def next_date_as_int(date_as_int: int):
  dd = date_as_int % 100
  date_as_int = date_as_int // 100
  mm = date_as_int % 100
  yyyy = date_as_int // 100
  dt = datetime.datetime(yyyy, mm, dd)
  next_dt = dt + datetime.timedelta(days=1)
  return next_dt.year * 10000 + next_dt.month * 100 + next_dt.day


def int_date(dt: datetime):
  return dt.year * 10000 + dt.month * 100 + dt.day

In [36]:
class TrainDataset(object):
  def __init__(self, df_train, batch_size=5):
    self.df_train = df_train
    self.current = 0
    self.batch_size = batch_size
    df_player = pd.read_csv(PLAYERS)
    df_player[df_player['playerForTestSetAndFuturePreds'] == True]
    self.playerId =  df_player['playerId']
  
  def __iter__(self):
    return self

  def __next__(self):
    start = self.current * self.batch_size
    end = (self.current + 1) * self.batch_size
    self.current += 1

    if start >= self.df_train.shape[0]:
      raise StopIteration()
    
    dates = self.df_train[start:end].date.unique()

    temps = []
    for date in dates:
      df_temp = pd.DataFrame(self.playerId)
      df_temp['date'] = date
      df_temp['date_playerId'] = str(next_date_as_int(date)) + '_' + df_temp['playerId'].astype(str)
      df_temp['target1'] = 0.0
      df_temp['target2'] = 0.0
      df_temp['target3'] = 0.0
      df_temp['target4'] = 0.0
      df_temp = df_temp.drop(['playerId'], axis=1)
      temps.append(df_temp)

    return  self.df_train[start:end].drop('nextDayPlayerEngagement', axis=1).set_index('date'), pd.concat(temps, axis=0).set_index('date'), self.df_train[start:end].nextDayPlayerEngagement

In [112]:
from xfeat import SelectCategorical, LabelEncoder, Pipeline, ConcatCombination, SelectNumerical, \
    ArithmeticCombinations, TargetEncoder, aggregation, GBDTFeatureSelector, GBDTFeatureExplorer

class PreProcessor(object):
  def __init__(self):
    self.label_encoder = Pipeline([
      SelectCategorical(exclude_cols=['engagementMetricsDate']),
      LabelEncoder(output_suffix=""),
      ])
    self.df_rosters = pd.DataFrame()
    self.df_playerTwitterFollowers = pd.DataFrame()
    self.df_teamTwitterFollowers = pd.DataFrame()
    self.df_games = pd.DataFrame()

  def pre_process(self, df_dairy: pd.DataFrame, df_submission: pd.DataFrame, phase = "train") -> pd.DataFrame:

    X = df_submission.drop(['target1', 'target2', 'target3', 'target4'], axis=1)
    X['today'] = pd.to_datetime(X.index.to_numpy().astype(str))
    X['year'] = X['today'].dt.year
    X['month'] = X['today'].dt.month
    X['day'] = X['today'].dt.day
    X['dayofweek'] = X['today'].dt.dayofweek
    X['engagementMetricsDate'] = pd.to_datetime(X['date_playerId'].str.split('_', expand = True)[0])
    X['playerId'] = X['date_playerId'].str.split('_', expand = True)[1].astype(int)

    # join rosters

    if df_dairy['rosters'].notnull().sum() > 0:
      df_rosters = pd.concat(map(lambda x: unpack_json(x), df_dairy[df_dairy['rosters'].notnull()]['rosters']), axis=0)
      df_rosters['gameDate'] = pd.to_datetime(df_rosters['gameDate'])
      self.df_rosters = pd.concat([self.df_rosters, df_rosters],axis=0).drop_duplicates()
    X = pd.merge(X, self.df_rosters, how='left', left_on=['today', 'playerId'], right_on=['gameDate', 'playerId']).drop(['status'], axis=1)

    # join player twitter followers
    if df_dairy['playerTwitterFollowers'].notnull().sum() > 0:
      df_playerTwitterFollowers = pd.concat(map(lambda x: unpack_json(x),df_dairy[df_dairy['playerTwitterFollowers'].notnull()]['playerTwitterFollowers']),axis=0)
      df_playerTwitterFollowers = df_playerTwitterFollowers[['date', 'playerId', 'numberOfFollowers']]
      df_playerTwitterFollowers['year'] = df_playerTwitterFollowers['date'].dt.year
      df_playerTwitterFollowers['month'] = df_playerTwitterFollowers['date'].dt.month
      df_playerTwitterFollowers = df_playerTwitterFollowers.drop('date', axis=1)
      self.df_playerTwitterFollowers = pd.concat([self.df_playerTwitterFollowers, df_playerTwitterFollowers], axis=0).drop_duplicates()
    X = pd.merge(X, self.df_playerTwitterFollowers, how='left', on=['playerId', 'year', 'month'])

    # join team twitter followers
    if df_dairy['teamTwitterFollowers'].notnull().sum() > 0:
      df_teamTwitterFollowers = pd.concat(map(lambda x: unpack_json(x),df_dairy[df_dairy['teamTwitterFollowers'].notnull()]['teamTwitterFollowers']),axis=0)
      df_teamTwitterFollowers = df_teamTwitterFollowers[['date', 'teamId', 'numberOfFollowers']]
      df_teamTwitterFollowers['year'] = df_teamTwitterFollowers['date'].dt.year
      df_teamTwitterFollowers['month'] = df_teamTwitterFollowers['date'].dt.month
      df_teamTwitterFollowers = df_teamTwitterFollowers.drop('date', axis=1)
      self.df_teamTwitterFollowers = pd.concat([self.df_teamTwitterFollowers, df_teamTwitterFollowers], axis=0).drop_duplicates()
    X = pd.merge(X, self.df_teamTwitterFollowers, how='left', on=['teamId', 'year', 'month'])

    # join games
    if df_dairy['games'].notnull().sum() > 0:
      df_games = pd.concat(map(lambda x: unpack_json(x),df_dairy[df_dairy['games'].notnull()]['games']),axis=0)
      new_columns = ['gamePk', 'gameType', 'season', 'gameDate', 'gameTimeUTC', 'resumeDate',
            'resumedFrom', 'codedGameState', 'detailedGameState', 'isTie',
            'gameNumber', 'doubleHeader', 'dayNight', 'scheduledInnings',
            'gamesInSeries', 'seriesDescription', 'teamId', 'teamName',
            'teamAbbrev', 'teamWins', 'teamLosses', 'teamWinPct', 'teamWinner',
            'teamScore', 'opponentId', 'opponentName', 'opponentAbbrev', 'opponentWins',
            'opponentLosses', 'opponentWinPct', 'opponentWinner', 'opponentScore']
      df_games_home = df_games.copy()
      df_games_home.columns = new_columns
      df_games_away = df_games.copy()[['gamePk', 'gameType', 'season', 'gameDate', 'gameTimeUTC', 'resumeDate',
            'resumedFrom', 'codedGameState', 'detailedGameState', 'isTie',
            'gameNumber', 'doubleHeader', 'dayNight', 'scheduledInnings',
            'gamesInSeries', 'seriesDescription', 'awayId', 'awayName', 'awayAbbrev', 'awayWins',
            'awayLosses', 'awayWinPct', 'awayWinner', 'awayScore', 'homeId', 'homeName',
            'homeAbbrev', 'homeWins', 'homeLosses', 'homeWinPct', 'homeWinner',
            'homeScore']]
      df_games_away.columns = new_columns
      df_games_2 = pd.concat([df_games_home, df_games_away], axis=0).drop(['teamName', 'teamAbbrev', 'opponentName', 'opponentAbbrev', 'opponentWinner'], axis=1)
      df_games_2['today'] = pd.to_datetime(df_games_2['gameDate'])
      self.df_games = pd.concat([self.df_games, df_games_2], axis=0).drop_duplicates(['teamId', 'today'])
    X = pd.merge(X, self.df_games, how='left', on=['teamId', 'today'])
    del new_columns, df_games, df_games_home, df_games_away, df_games_2

    # label encoding
    if phase == 'train':
      X = X.drop(['today', 'date_playerId'], axis=1)
      encoded_X = self.label_encoder.fit_transform(X)
      X = pd.concat([X['engagementMetricsDate'], encoded_X, SelectNumerical().fit_transform(X)], axis=1)
    elif phase == 'eval':
      X = X.drop(['today', 'date_playerId'], axis=1)
      encoded_X = self.label_encoder.transform(X)
      X = pd.concat([X['engagementMetricsDate'], encoded_X, SelectNumerical().fit_transform(X)], axis=1)
    
    return X


In [113]:
# make Traing
df_train = df[df.date <= 20210331]

dts = TrainDataset(df_train, df_train.shape[0])
df_dairy, df_submission, targets = next(dts)
pre = PreProcessor()

X = pre.pre_process(df_dairy, df_submission)

# join targets
df_targets = pd.concat(map(lambda z: unpack_json(z), targets), axis=0)
df_targets['engagementMetricsDate'] = pd.to_datetime(df_targets['engagementMetricsDate'])
y = pd.merge(X[['engagementMetricsDate', 'playerId']], df_targets, on=['engagementMetricsDate', 'playerId'], how='left')
del df_targets



In [114]:
# make Validation
df_valid = df[(df.date >= 20210401) & (df.date <= 20210430)]
dts = TrainDataset(df_valid, df_valid.shape[0])

df_dairy, df_submission, targets = next(dts)

X_val = pre.pre_process(df_dairy, df_submission, phase='eval')
df_targets = pd.concat(map(lambda z: unpack_json(z), targets), axis=0)
df_targets['engagementMetricsDate'] = pd.to_datetime(df_targets['engagementMetricsDate'])
y_val = pd.merge(X_val[['engagementMetricsDate', 'playerId']], df_targets, on=['engagementMetricsDate', 'playerId'], how='left')

del df_targets

print(df_valid.shape, X_val.shape, y_val.shape)

(30, 12) (61830, 37) (61830, 6)


In [115]:
gc.collect()

256

In [116]:
import lightgbm as lgb

params = {'objective': 'regression',
             'metric': 'rmse',
             'verbose': -1,
             'feature_pre_filter': False,
            #  'lambda_l1': 1.9246603611247695,
            #  'lambda_l2': 0.0015207873611208637,
             'num_leaves': 45,
            #  'feature_fraction': 0.616,
             'bagging_fraction': 1.0,
             'bagging_freq': 0,
             'min_child_samples': 20,
             'seed': SEED,
}

target_columns = ['target1', 'target2', 'target3', 'target4']

models = []
for target_column in target_columns:
  lgb_train = lgb.Dataset(X.drop(['engagementMetricsDate'], axis=1), y[target_column])
  lgb_valid = lgb.Dataset(X_val.drop(['engagementMetricsDate'], axis=1), y_val[target_column])
  model = lgb.train(params, lgb_train, num_boost_round=200, valid_sets=lgb_valid, early_stopping_rounds=10)
  models.append(model)

[1]	valid_0's rmse: 4.63459
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's rmse: 4.60187
[3]	valid_0's rmse: 4.5751
[4]	valid_0's rmse: 4.55896
[5]	valid_0's rmse: 4.54009
[6]	valid_0's rmse: 4.51929
[7]	valid_0's rmse: 4.5096
[8]	valid_0's rmse: 4.50052
[9]	valid_0's rmse: 4.48827
[10]	valid_0's rmse: 4.47512
[11]	valid_0's rmse: 4.47126
[12]	valid_0's rmse: 4.46642
[13]	valid_0's rmse: 4.45969
[14]	valid_0's rmse: 4.45072
[15]	valid_0's rmse: 4.43942
[16]	valid_0's rmse: 4.43198
[17]	valid_0's rmse: 4.43108
[18]	valid_0's rmse: 4.42532
[19]	valid_0's rmse: 4.41867
[20]	valid_0's rmse: 4.41102
[21]	valid_0's rmse: 4.40675
[22]	valid_0's rmse: 4.40212
[23]	valid_0's rmse: 4.3991
[24]	valid_0's rmse: 4.39603
[25]	valid_0's rmse: 4.39237
[26]	valid_0's rmse: 4.39031
[27]	valid_0's rmse: 4.39026
[28]	valid_0's rmse: 4.3879
[29]	valid_0's rmse: 4.38607
[30]	valid_0's rmse: 4.38632
[31]	valid_0's rmse: 4.38669
[32]	valid_0's rmse: 4.38682
[33]	valid_0's rmse: 4.

In [117]:
import json
cv_scores = {}
for t_col, model in zip(target_columns, models):
  print(model.best_score['valid_0']['rmse'])
  cv_scores[t_col] = model.best_score['valid_0']['rmse']

with open('cv_scores.json', 'w') as fp:
  json.dump(cv_scores, fp)

4.348670739193442
5.507390538954544
4.181408051138141
4.466732911119326


In [118]:
VIRTUAL_TEST = True

if VIRTUAL_TEST:
  df_test = df[(df.date >= 20210401) & (df.date <= 20210430)]
  iter_test = TrainDataset(df_valid, 5)
  for df_dairy, df_submission, _ in iter_test:
    X = pre.pre_process(df_dairy, df_submission, phase='eval')
    X = X.drop('engagementMetricsDate', axis=1)
    for (i, model) in enumerate(models):
      df_submission[target_columns[i]] = np.clip(model.predict(X), 0.0, 100.0)    

In [121]:
if 'kaggle_secrets' in sys.modules:  # only run while on Kaggle
  import mlb

  env = mlb.make_env()
  iter_test = env.iter_test()

  for df_dairy, df_submission in iter_test:
    X = pre.pre_process(df_dairy, df_submission, phase='eval')
    X = X.drop('engagementMetricsDate', axis=1)

    for (i, model) in enumerate(models):
      df_submission[target_columns[i]] = np.clip(model.predict(X), 0.0, 100.0)

    env.predict(df_submission)