<a href="https://colab.research.google.com/github/konamilk/mlb-player-digital-engagement-forecasting/blob/main/MLB_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from requests import get
# Ask the notebook server's sessions endpoint for the first session's file name
# and keep the part before the first dot.
# NOTE(review): the address is hard-coded, so this presumably only resolves
# inside a Colab runtime - confirm before running elsewhere.
# The timeout makes an unreachable endpoint fail fast instead of blocking forever.
NOTEBOOK = get('http://172.28.0.2:9000/api/sessions', timeout=10).json()[0]['name'].split('.')[0]
NOTEBOOK

'MLB_baseline'

In [2]:
import sys
from pathlib import Path
# Pick the data directory for the current host.
if 'google.colab' in sys.modules:
    INPUT = Path('/content/input/')
elif 'kaggle_web_client' in sys.modules:
    INPUT = Path('../input/')
else:
    # Fallback for any other host (e.g. a local Jupyter server): without it
    # INPUT would stay undefined and every later cell would fail with NameError.
    INPUT = Path('./input/')

In [3]:
if 'google.colab' in sys.modules:
  !mkdir ~/.kaggle
  !cp /content/drive/MyDrive/.kaggle/kaggle.json ~/.kaggle
  !chmod 600 ~/.kaggle/kaggle.json
  !pip install kaggle



In [4]:
if 'google.colab' in sys.modules:
  !kaggle competitions download -c mlb-player-digital-engagement-forecasting
  !mkdir input
  !unzip -o '*.zip' -d ./input/
  !rm *.zip
  !mv *.csv ./input/

Downloading competition.cpython-37m-x86_64-linux-gnu.so to /content
  0% 0.00/443k [00:00<?, ?B/s]
100% 443k/443k [00:00<00:00, 42.7MB/s]
Downloading __init__.py to /content
  0% 0.00/59.0 [00:00<?, ?B/s]
100% 59.0/59.0 [00:00<00:00, 32.2kB/s]
Downloading seasons.csv to /content
  0% 0.00/824 [00:00<?, ?B/s]
100% 824/824 [00:00<00:00, 2.19MB/s]
Downloading example_test.csv.zip to /content
  0% 0.00/3.96M [00:00<?, ?B/s]
100% 3.96M/3.96M [00:00<00:00, 64.7MB/s]
Downloading players.csv to /content
  0% 0.00/173k [00:00<?, ?B/s]
100% 173k/173k [00:00<00:00, 145MB/s]
Downloading train.csv.zip to /content
 96% 452M/470M [00:04<00:00, 134MB/s]
100% 470M/470M [00:04<00:00, 106MB/s]
Downloading awards.csv to /content
  0% 0.00/820k [00:00<?, ?B/s]
100% 820k/820k [00:00<00:00, 191MB/s]
Downloading teams.csv to /content
  0% 0.00/3.68k [00:00<?, ?B/s]
100% 3.68k/3.68k [00:00<00:00, 2.66MB/s]
Downloading example_sample_submission.csv to /content
  0% 0.00/191k [00:00<?, ?B/s]
100% 191k/191k [00:0

In [5]:
import numpy as np
import pandas as pd
import gc
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.functional as F
from torch.utils import data

In [6]:
# Locations of the competition CSV files under INPUT.
TRAIN = INPUT/'train.csv'
# Fixed: this previously pointed at 'awards.csv', duplicating AWARDS below.
TEAMS = INPUT/'teams.csv'
PLAYERS = INPUT/'players.csv'
AWARDS = INPUT/'awards.csv'
SEASONS = INPUT/'seasons.csv'

In [7]:
# Load the whole training table; later cells filter it by its `date` column and
# parse its JSON-string columns with unpack_json.
df = pd.read_csv(TRAIN)

In [8]:
def unpack_json(json_str):
    """Parse one JSON-encoded cell into a DataFrame.

    Parameters
    ----------
    json_str : str or missing value
        JSON document as text, or anything ``pd.isna`` reports as missing.

    Returns
    -------
    pandas.DataFrame or float
        The parsed frame, or ``np.nan`` when the cell is missing.
    """
    from io import StringIO  # local import keeps this cell self-contained

    if pd.isna(json_str):
        return np.nan
    # Recent pandas deprecates passing literal JSON text to read_json; a text
    # buffer is accepted by every version. Non-string inputs are forwarded
    # untouched so existing callers keep working.
    source = StringIO(json_str) if isinstance(json_str, str) else json_str
    return pd.read_json(source)

In [9]:
import datetime
def next_date_as_int(date_as_int: int):
  """Return the calendar day after `date_as_int`, both encoded as yyyymmdd integers.

  The input is split into year, month and day by successive division by 100;
  `datetime.datetime` raises ValueError when those parts do not form a real date.
  """
  year_and_month, day = divmod(date_as_int, 100)
  year, month = divmod(year_and_month, 100)
  following = datetime.datetime(year, month, day) + datetime.timedelta(days=1)
  return following.year * 10000 + following.month * 100 + following.day

In [10]:
class TrainDataset(object):
  """One-shot iterator that walks the training frame in consecutive row batches.

  Every step returns a 3-tuple:
    * the batch rows without the 'nextDayPlayerEngagement' column,
    * a zero-filled submission-style frame with one row per (batch date, player),
    * the batch's 'nextDayPlayerEngagement' column.
  The player list is taken from the first date of example_sample_submission.csv.
  """

  def __init__(self, df_train, batch_size=5):
    """Keep the training frame and read the player ids from the sample submission under INPUT."""
    self.df_train = df_train
    self.current = 0  # index of the next batch to hand out
    self.batch_size = batch_size

    sample = pd.read_csv(INPUT/'example_sample_submission.csv')
    self.df_example_sample_submission = sample

    # Keys look like '<date>_<playerId>'; keep the id part of the first date's rows.
    on_first_date = sample.date == sample.date[0]
    player_ids = sample[on_first_date].date_playerId.map(lambda key: int(key.split('_')[1]))
    player_ids.name = 'playerId'
    self.playerId = player_ids

  def __iter__(self):
    """Return the dataset itself; the position is not rewound."""
    return self

  def __next__(self):
    """Return the next (rows, blank submission, targets) triple or raise StopIteration."""
    batch_no = self.current
    self.current = batch_no + 1  # advanced even when the data is exhausted

    start = batch_no * self.batch_size
    end = (batch_no + 1) * self.batch_size
    if start >= self.df_train.shape[0]:
      raise StopIteration()

    batch = self.df_train[start:end]

    def blank_rows(date):
      # One zero-target row per player; the key carries the day AFTER `date`.
      key_prefix = str(next_date_as_int(date)) + '_'
      return (
        pd.DataFrame(self.playerId)
        .assign(date=date)
        .assign(date_playerId=lambda frame: key_prefix + frame['playerId'].astype(str))
        .assign(target1=0.0, target2=0.0, target3=0.0, target4=0.0)
        .drop(columns='playerId')
      )

    blanks = [blank_rows(date) for date in batch.date.unique()]

    return (
      batch.drop(columns='nextDayPlayerEngagement'),
      pd.concat(blanks, axis=0),
      batch.nextDayPlayerEngagement,
    )

In [16]:
def pre_process(df_dairy: pd.DataFrame, df_submission: pd.DataFrame) -> pd.DataFrame:
  """Build the feature frame by left-joining daily data onto the submission rows.

  Parameters
  ----------
  df_dairy : pd.DataFrame
      Daily rows whose 'rosters', 'playerTwitterFollowers', 'teamTwitterFollowers'
      and 'games' columns hold JSON strings or nulls; every non-null cell is parsed
      with unpack_json. (The name is presumably a typo for "daily".)
  df_submission : pd.DataFrame
      Rows with 'date', 'date_playerId' and 'target1'..'target4' columns.

  Returns
  -------
  pd.DataFrame
      The submission rows without the target columns, plus calendar parts of
      'date', 'engagementMetricsDate', 'playerId' and the joined roster,
      Twitter-follower and game columns.

  Notes
  -----
  NOTE(review): each pd.concat below receives an empty iterable when its source
  column is entirely null, which pandas rejects with ValueError - confirm callers
  never pass such a batch (e.g. a single day without Twitter data).
  """
  X = df_submission.drop(['target1', 'target2', 'target3', 'target4'], axis=1)
  # 'date' is parsed from its string form, then split into calendar features.
  X['date'] = pd.to_datetime(X['date'].astype(str))
  X['year'] = X['date'].dt.year
  X['month'] = X['date'].dt.month
  X['day'] = X['date'].dt.day
  X['dayofweek'] = X['date'].dt.dayofweek
  # 'date_playerId' is '<date>_<playerId>': the first part becomes the
  # engagement date, the second the integer player id.
  X['engagementMetricsDate'] = pd.to_datetime(X['date_playerId'].str.split('_', expand = True)[0])
  X['playerId'] = X['date_playerId'].str.split('_', expand = True)[1].astype(int)

  # join rosters: match each row's (date, playerId) against the roster's
  # (gameDate, playerId); 'gameDate' and 'status' are dropped afterwards.
  df_rosters = pd.concat(map(lambda x: unpack_json(x), df_dairy[df_dairy['rosters'].notnull()]['rosters']), axis=0)
  df_rosters['gameDate'] = pd.to_datetime(df_rosters['gameDate'])
  X = pd.merge(X, df_rosters, how='left', left_on=['date', 'playerId'], right_on=['gameDate', 'playerId']).drop(['gameDate','status'], axis=1)
  del df_rosters

  # join player twitter followers: joined on (playerId, year, month), not on the
  # exact date. The `.dt` access assumes unpack_json already produced a datetime
  # 'date' column - TODO confirm.
  df_playerTwitterFollowers = pd.concat(map(lambda x: unpack_json(x),df_dairy[df_dairy['playerTwitterFollowers'].notnull()]['playerTwitterFollowers']),axis=0)
  df_playerTwitterFollowers = df_playerTwitterFollowers[['date', 'playerId', 'numberOfFollowers']]
  df_playerTwitterFollowers['year'] = df_playerTwitterFollowers['date'].dt.year
  df_playerTwitterFollowers['month'] = df_playerTwitterFollowers['date'].dt.month
  df_playerTwitterFollowers = df_playerTwitterFollowers.drop('date', axis=1)
  X = pd.merge(X, df_playerTwitterFollowers, how='left', on=['playerId', 'year', 'month'])
  del df_playerTwitterFollowers

  # join team twitter followers: joined on (teamId, year, month). This frame also
  # carries 'numberOfFollowers', so the result ends up with
  # 'numberOfFollowers_x' (player) and 'numberOfFollowers_y' (team).
  df_teamTwitterFollowers = pd.concat(map(lambda x: unpack_json(x),df_dairy[df_dairy['teamTwitterFollowers'].notnull()]['teamTwitterFollowers']),axis=0)
  df_teamTwitterFollowers = df_teamTwitterFollowers[['date', 'teamId', 'numberOfFollowers']]
  df_teamTwitterFollowers['year'] = df_teamTwitterFollowers['date'].dt.year
  df_teamTwitterFollowers['month'] = df_teamTwitterFollowers['date'].dt.month
  df_teamTwitterFollowers = df_teamTwitterFollowers.drop('date', axis=1)
  X = pd.merge(X, df_teamTwitterFollowers, how='left', on=['teamId', 'year', 'month'])
  del df_teamTwitterFollowers

  # join games: reshape each game into two rows, one per participating team,
  # using team*/opponent* column names in place of home*/away*.
  df_games = pd.concat(map(lambda x: unpack_json(x),df_dairy[df_dairy['games'].notnull()]['games']),axis=0)
  new_columns = ['gamePk', 'gameType', 'season', 'gameDate', 'gameTimeUTC', 'resumeDate',
        'resumedFrom', 'codedGameState', 'detailedGameState', 'isTie',
        'gameNumber', 'doubleHeader', 'dayNight', 'scheduledInnings',
        'gamesInSeries', 'seriesDescription', 'teamId', 'teamName',
        'teamAbbrev', 'teamWins', 'teamLosses', 'teamWinPct', 'teamWinner',
        'teamScore', 'opponentId', 'opponentName', 'opponentAbbrev', 'opponentWins',
        'opponentLosses', 'opponentWinPct', 'opponentWinner', 'opponentScore']
  # Home perspective: columns are relabelled purely by position.
  # NOTE(review): this assumes the parsed games frame has exactly these 32
  # columns, home* block before away* block - confirm against the data.
  df_games_home = df_games.copy()
  df_games_home.columns = new_columns
  # Away perspective: select away* before home* so the same positional
  # relabelling puts the away team into the team* columns.
  df_games_away = df_games.copy()[['gamePk', 'gameType', 'season', 'gameDate', 'gameTimeUTC', 'resumeDate',
        'resumedFrom', 'codedGameState', 'detailedGameState', 'isTie',
        'gameNumber', 'doubleHeader', 'dayNight', 'scheduledInnings',
        'gamesInSeries', 'seriesDescription', 'awayId', 'awayName', 'awayAbbrev', 'awayWins',
        'awayLosses', 'awayWinPct', 'awayWinner', 'awayScore', 'homeId', 'homeName',
        'homeAbbrev', 'homeWins', 'homeLosses', 'homeWinPct', 'homeWinner',
        'homeScore']]
  df_games_away.columns = new_columns
  df_games_2 = pd.concat([df_games_home, df_games_away], axis=0).drop(['teamName', 'teamAbbrev', 'opponentName', 'opponentAbbrev', 'opponentWinner'], axis=1)
  df_games_2['date'] = pd.to_datetime(df_games_2['gameDate'])
  # NOTE(review): a team with more than one game on a date would presumably
  # duplicate that player's row in this left merge - confirm if row counts matter.
  X = pd.merge(X, df_games_2, how='left', on=['teamId', 'date'])
  del new_columns, df_games, df_games_home, df_games_away, df_games_2

  gc.collect()

  return X


In [17]:
# Keep one year of training rows: 2020-04-01 through 2021-03-31, both inclusive.
df_train = df[df.date.between(20200401, 20210331)]
# A batch as large as the frame makes the first step return every row at once.
dts = TrainDataset(df_train, len(df_train))

df_dairy, df_submission, targets = next(dts)

X = pre_process(df_dairy, df_submission)

# join targets: parse each day's engagement JSON, then align it to X's keys.
df_targets = pd.concat(map(unpack_json, targets), axis=0)
df_targets['engagementMetricsDate'] = pd.to_datetime(df_targets['engagementMetricsDate'])
y = X[['engagementMetricsDate', 'playerId']].merge(
    df_targets,
    on=['engagementMetricsDate', 'playerId'],
    how='left',
)
del df_targets



In [18]:
# Display the targets left-joined onto X's (engagementMetricsDate, playerId) keys.
y

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4
0,2020-04-02,656669,0.007064,6.289308,0.003501,0.491642
1,2020-04-02,543475,0.017660,5.503145,0.008168,0.721075
2,2020-04-02,592866,0.038851,4.874214,0.005835,0.917732
3,2020-04-02,452678,0.109490,12.421384,0.036174,2.556539
4,2020-04-02,570257,0.000000,3.144654,0.009335,0.360538
...,...,...,...,...,...,...
437786,2021-04-01,593590,0.000000,0.048041,0.000000,0.000000
437787,2021-04-01,642180,0.000972,5.133484,0.000398,0.834084
437788,2021-04-01,663399,0.000707,0.267655,0.000398,0.039159
437789,2021-04-01,664199,0.013513,2.100062,0.000796,0.203626


In [19]:
# Display the feature frame returned by pre_process.
X

Unnamed: 0,date,date_playerId,year,month,day,dayofweek,engagementMetricsDate,playerId,teamId,statusCode,numberOfFollowers_x,numberOfFollowers_y,gamePk,gameType,season,gameDate,gameTimeUTC,resumeDate,resumedFrom,codedGameState,detailedGameState,isTie,gameNumber,doubleHeader,dayNight,scheduledInnings,gamesInSeries,seriesDescription,teamWins,teamLosses,teamWinPct,teamWinner,teamScore,opponentId,opponentWins,opponentLosses,opponentWinPct,opponentScore
0,2020-04-01,20200402_656669,2020,4,1,2,2020-04-02,656669,114.0,A,4964.0,1072516.0,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2020-04-01,20200402_543475,2020,4,1,2,2020-04-02,543475,140.0,A,,1435517.0,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2020-04-01,20200402_592866,2020,4,1,2,2020-04-02,592866,134.0,A,34102.0,754173.0,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2020-04-01,20200402_452678,2020,4,1,2,2020-04-02,452678,120.0,A,,767011.0,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2020-04-01,20200402_570257,2020,4,1,2,2020-04-02,570257,140.0,A,,1435517.0,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437786,2021-03-31,20210401_593590,2021,3,31,2,2021-04-01,593590,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
437787,2021-03-31,20210401_642180,2021,3,31,2,2021-04-01,642180,147.0,A,41318.0,3465925.0,,,,,,,,,,,,,,,,,,,,,,,,,,
437788,2021-03-31,20210401_663399,2021,3,31,2,2021-04-01,663399,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
437789,2021-03-31,20210401_664199,2021,3,31,2,2021-04-01,664199,109.0,A,1137.0,586762.0,,,,,,,,,,,,,,,,,,,,,,,,,,


In [33]:
if 'google.colab' in sys.modules:
  # Install xfeat directly from its GitHub repository (no version or commit pin).
  # NOTE(review): the xfeat import cell that follows recorded an ImportError in
  # the saved run, so this install alone may not be enough - confirm.
  !pip install git+https://github.com/pfnet-research/xfeat.git

Collecting git+https://github.com/pfnet-research/xfeat.git
  Cloning https://github.com/pfnet-research/xfeat.git to /tmp/pip-req-build-hp32k8c4
  Running command git clone -q https://github.com/pfnet-research/xfeat.git /tmp/pip-req-build-hp32k8c4
Building wheels for collected packages: xfeat
  Building wheel for xfeat (setup.py) ... [?25l[?25hdone
  Created wheel for xfeat: filename=xfeat-0.1.1-cp37-none-any.whl size=39635 sha256=b804d3931a8732df2cd44e2f995dac30f235393241dc3a3b1f5ac31aa2dbf614
  Stored in directory: /tmp/pip-ephem-wheel-cache-e2lm4l_o/wheels/82/f3/da/060c6ceac1125aa285b041284b7ec5324be345b865c713da9c
Successfully built xfeat


In [34]:
from xfeat import SelectCategorical, LabelEncoder, Pipeline, ConcatCombination, SelectNumerical, \
    ArithmeticCombinations, TargetEncoder, aggregation, GBDTFeatureSelector, GBDTFeatureExplorer

ImportError: ignored

In [None]:
import lightgbm as lgb

# LightGBM settings for an RMSE regression model.
# NOTE(review): the unrounded regularisation values look like the result of an
# automated hyper-parameter search - confirm where they came from.
params = dict(
    objective='regression',
    metric='rmse',
    verbose=-1,
    feature_pre_filter=False,
    lambda_l1=1.9246603611247695,
    lambda_l2=0.0015207873611208637,
    num_leaves=45,
    feature_fraction=0.616,
    bagging_fraction=1.0,
    bagging_freq=0,
    min_child_samples=20,
)

In [20]:
# Force a full garbage-collection pass; the displayed value is what gc.collect() returns.
gc.collect()

828

In [15]:
# Submission loop: iterate over the test batches served by the competition's
# `mlb` environment and hand a prediction frame back for each one.
# NOTE(review): this guard looks for 'kaggle_secrets' while the INPUT cell looks
# for 'kaggle_web_client' - confirm both are loaded on Kaggle.
if 'kaggle_secrets' in sys.modules:  # only run while on Kaggle
    import mlb

    env = mlb.make_env()
    iter_test = env.iter_test()

    for (test_df, sample_prediction_df) in iter_test:

        # Example: unpack a dataframe from a json column
        # (today_games is not used again in the visible code.)
        today_games = unpack_json(test_df['games'].iloc[0])

        # Make your predictions for the next day's engagement
        # Placeholder: only target1 is overwritten, with a constant; the other
        # target columns are submitted exactly as they arrived.
        sample_prediction_df['target1'] = 100.00

        # Submit your predictions
        env.predict(sample_prediction_df)