# Import package

In [1]:
# Import Packages

import os
import mlb
import glob
import csv
import gc
import time

from pathlib import Path
from functools import reduce

from tqdm import tqdm
import lightgbm as lgbm

import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch import optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

# For data preprocess
import numpy as np
import pandas as pd
import csv
import os

# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from datetime import datetime, timedelta

from sklearn.model_selection import GroupShuffleSplit
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor
import pickle
pd.options.mode.chained_assignment = None

In [2]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/model-ckpt/cb2
/kaggle/input/model-ckpt/be_lgbm3.pkl
/kaggle/input/model-ckpt/af_lgbm2.pkl
/kaggle/input/model-ckpt/be_lgbm2.pkl
/kaggle/input/model-ckpt/down_lgbm3.pkl
/kaggle/input/model-ckpt/down_lgbm4.pkl
/kaggle/input/model-ckpt/af_lgbm3.pkl
/kaggle/input/model-ckpt/up_lgbm2.pkl
/kaggle/input/model-ckpt/up_lgbm1.pkl
/kaggle/input/model-ckpt/be_lgbm1.pkl
/kaggle/input/model-ckpt/up_lgbm3.pkl
/kaggle/input/model-ckpt/af_lgbm1.pkl
/kaggle/input/model-ckpt/cb3
/kaggle/input/model-ckpt/down_lgbm2.pkl
/kaggle/input/model-ckpt/down_lgbm1.pkl
/kaggle/input/model-ckpt/af_lgbm4.pkl
/kaggle/input/model-ckpt/cb4
/kaggle/input/model-ckpt/cb1
/kaggle/input/model-ckpt/be_lgbm4.pkl
/kaggle/input/model-ckpt/up_lgbm4.pkl
/kaggle/input/mlbscheduleformattedfordigitalengagement/schedule_2021.csv
/kaggle/input/mlb-player-digital-engagement-forecasting/players.csv
/kaggle/input/mlb-player-digital-engagement-forecasting/example_sample_submission.csv
/kaggle/input/mlb-player-digital-engageme

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int64)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float64)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df


In [4]:
def stat_feature_player(df, col, target = 'target', group = 'playerId'):

    for i in col:
        mean = pd.DataFrame(df.groupby([i, group, 'year'])['target1', 'target2', 'target3', 'target4'].mean())
        mean.reset_index(inplace=True)
        mean.drop_duplicates(subset = [group, i, 'year'], inplace = True)
        mean = mean.rename(columns = {'target1': f'mean_{i}_target1', 'target2': f'mean_{i}_target2', 'target3': f'mean_{i}_target3', 'target4': f'mean_{i}_target4'})
        
        std = pd.DataFrame(df.groupby([i, group, 'year'])['target1', 'target2', 'target3', 'target4'].std())
        std.reset_index(inplace=True)
        std.drop_duplicates(subset = [group, i, 'year'], inplace = True)
        std = std.rename(columns = {'target1': f'std_{i}_target1', 'target2': f'std_{i}_target2', 'target3': f'std_{i}_target3', 'target4': f'std_{i}_target4'})
        
        med = pd.DataFrame(df.groupby([i, group, 'year'])['target1', 'target2', 'target3', 'target4'].median())
        med.reset_index(inplace=True)
        med.drop_duplicates(subset = [group, i, 'year'], inplace = True)
        med = med.rename(columns = {'target1': f'med_{i}_target1', 'target2': f'med_{i}_target2', 'target3': f'med_{i}_target3', 'target4': f'med_{i}_target4'})
        
        min = pd.DataFrame(df.groupby([i, group, 'year'])['target1', 'target2', 'target3', 'target4'].min())
        min.reset_index(inplace=True)
        min.drop_duplicates(subset = [group, i, 'year'], inplace = True)
        min = min.rename(columns = {'target1': f'min_{i}_target1', 'target2': f'min_{i}_target2', 'target3': f'min_{i}_target3', 'target4': f'min_{i}_target4'})

        max = pd.DataFrame(df.groupby([i, group, 'year'])['target1', 'target2', 'target3', 'target4'].max())
        max.reset_index(inplace=True)
        max.drop_duplicates(subset = [group, i, 'year'], inplace = True)
        max = max.rename(columns = {'target1': f'max_{i}_target1', 'target2': f'max_{i}_target2', 'target3': f'max_{i}_target3', 'target4': f'max_{i}_target4'})

        df = df.merge(mean, how = 'left', on = [group, i, 'year'])
        df = df.merge(std, how = 'left', on = [group, i, 'year'])
        df = df.merge(med, how = 'left', on = [group, i, 'year'])
        df = df.merge(min, how = 'left', on = [group, i, 'year'])
        df = df.merge(max, how = 'left', on = [group, i, 'year'])
 
    return df

def stat_feature(df, col, target = 'target', group = 'playerId'):
    for i in col:
        mean = pd.DataFrame(df.groupby([i, 'year'])[f'{target}1', f'{target}2', f'{target}3', f'{target}4'].mean())
        mean.reset_index(inplace=True)
        mean.drop_duplicates(subset = [i, 'year'], inplace = True)
        mean = mean.rename(columns = {f'{target}1': f'mean_{i}_target1', f'{target}2': f'mean_{i}_target2', f'{target}3': f'mean_{i}_target3', f'{target}4': f'mean_{i}_target4'})
        
        std = pd.DataFrame(df.groupby([i, 'year'])[f'{target}1', f'{target}2', f'{target}3', f'{target}4'].std())
        std.reset_index(inplace=True)
        std.drop_duplicates(subset = [i, 'year'], inplace = True)
        std = std.rename(columns = {f'{target}1': f'std_{i}_target1', f'{target}2': f'std_{i}_target2', f'{target}3': f'std_{i}_target3', f'{target}4': f'std_{i}_target4'})
        
        med = pd.DataFrame(df.groupby([i, 'year'])[f'{target}1', f'{target}2', f'{target}3', f'{target}4'].median())
        med.reset_index(inplace=True)
        med.drop_duplicates(subset = [i, 'year'], inplace = True)
        med = med.rename(columns = {f'{target}1': f'med_{i}_target1', f'{target}2': f'med_{i}_target2', f'{target}3': f'med_{i}_target3', f'{target}4': f'med_{i}_target4'})
        
        min = pd.DataFrame(df.groupby([i, 'year'])[f'{target}1', f'{target}2', f'{target}3', f'{target}4'].min())
        min.reset_index(inplace=True)
        min.drop_duplicates(subset = [i, 'year'], inplace = True)
        min = min.rename(columns = {f'{target}1': f'min_{i}_target1', f'{target}2': f'min_{i}_target2', f'{target}3': f'min_{i}_target3', f'{target}4': f'min_{i}_target4'})

        max = pd.DataFrame(df.groupby([i, 'year'])[f'{target}1', f'{target}2', f'{target}3', f'{target}4'].max())
        max.reset_index(inplace=True)
        max.drop_duplicates(subset = [i, 'year'], inplace = True)
        max = max.rename(columns = {f'{target}1': f'max_{i}_target1', f'{target}2': f'max_{i}_target2', f'{target}3': f'max_{i}_target3', f'{target}4': f'max_{i}_target4'})

        df = df.merge(mean, how = 'left', on = [i, 'year'])
        df = df.merge(std, how = 'left', on = [i, 'year'])
        df = df.merge(med, how = 'left', on = [i, 'year'])
        df = df.merge(min, how = 'left', on = [i, 'year'])
        df = df.merge(max, how = 'left', on = [i, 'year'])
    return df

def get_stat(merge, date_left, date_right, merge_col, reg2020, reg2021):
    tmp1 = reg2020.merge(merge, how = 'left', left_on = [date_left, 'playerId'], right_on=[date_right, 'playerId'])
    tmp2 = reg2021.merge(merge, how = 'left', left_on = [date_left, 'playerId'], right_on=[date_right, 'playerId'])
    
    tmp1 = stat_feature_player(tmp1, col = [f'{merge_col}'])
    tmp1.drop_duplicates(subset=['playerId', merge_col], keep = 'last', inplace = True)
    

    tmp2 = stat_feature_player(tmp2, col = [f'{merge_col}'])
    tmp2.drop_duplicates(subset=['playerId', f'{merge_col}'], keep = 'last', inplace = True)
    
    stat = pd.concat([tmp1,tmp2])
    stat.drop_duplicates(subset = ['playerId', f'{merge_col}'], keep = 'last', inplace = True)

    return stat 

def stat_year(merge, date_left, date_right, merge_col, reg):
    tmp = reg.merge(merge, how = 'left', left_on = [date_left, 'playerId'], right_on=[date_right, 'playerId'])
    tmp1 = stat_feature_player(tmp, col = [f'{merge_col}'])
    tmp1.drop_duplicates(subset = ['playerId', f'{merge_col}', 'year'], keep = 'last', inplace = True)

    return tmp1

def get_transactions(df, col, target = 'target', group = 'playerId'):

    for i in col:
        mean = pd.DataFrame(df.groupby([i, group])['target1', 'target2', 'target3', 'target4'].mean())
        mean.reset_index(inplace=True)
        mean.drop_duplicates(subset = [group, i], inplace = True)
        mean = mean.rename(columns = {'target1': f'mean_{i}_target1', 'target2': f'mean_{i}_target2', 'target3': f'mean_{i}_target3', 'target4': f'mean_{i}_target4'})
        
        std = pd.DataFrame(df.groupby([i, group])['target1', 'target2', 'target3', 'target4'].std())
        std.reset_index(inplace=True)
        std.drop_duplicates(subset = [group, i], inplace = True)
        std = std.rename(columns = {'target1': f'std_{i}_target1', 'target2': f'std_{i}_target2', 'target3': f'std_{i}_target3', 'target4': f'std_{i}_target4'})
        
        med = pd.DataFrame(df.groupby([i, group])['target1', 'target2', 'target3', 'target4'].median())
        med.reset_index(inplace=True)
        med.drop_duplicates(subset = [group, i], inplace = True)
        med = med.rename(columns = {'target1': f'med_{i}_target1', 'target2': f'med_{i}_target2', 'target3': f'med_{i}_target3', 'target4': f'med_{i}_target4'})
        
        min = pd.DataFrame(df.groupby([i, group])['target1', 'target2', 'target3', 'target4'].min())
        min.reset_index(inplace=True)
        min.drop_duplicates(subset = [group, i], inplace = True)
        min = min.rename(columns = {'target1': f'min_{i}_target1', 'target2': f'min_{i}_target2', 'target3': f'min_{i}_target3', 'target4': f'min_{i}_target4'})

        max = pd.DataFrame(df.groupby([i, group])['target1', 'target2', 'target3', 'target4'].max())
        max.reset_index(inplace=True)
        max.drop_duplicates(subset = [group, i], inplace = True)
        max = max.rename(columns = {'target1': f'max_{i}_target1', 'target2': f'max_{i}_target2', 'target3': f'max_{i}_target3', 'target4': f'max_{i}_target4'})

        df = df.merge(mean, how = 'left', on = [group, i])
        df = df.merge(std, how = 'left', on = [group, i])
        df = df.merge(med, how = 'left', on = [group, i])
        df = df.merge(min, how = 'left', on = [group, i])
        df = df.merge(max, how = 'left', on = [group, i])
 
    return df
pd.options.mode.chained_assignment = None

## Load Data

In [5]:
input_file_path = Path('/kaggle/input/mlb-player-digital-engagement-forecasting/')

In [6]:
for file in ['train_updated']:
    # drop playerTwitterFollowers, teamTwitterFollowers from example_test
    df = pd.read_csv(input_file_path / f"{file}.csv").dropna(axis=1,how='all')
    daily_data_nested_df_names = df.drop('date', axis = 1).columns.values.tolist()
    
    for df_name in daily_data_nested_df_names:
        date_nested_table = df[['date', df_name]]
        date_nested_table = (date_nested_table[~pd.isna(date_nested_table[df_name])].reset_index(drop = True))
        daily_dfs_collection = []

        for date_index, date_row in date_nested_table.iterrows():
            daily_df = pd.read_json(date_row[df_name])
            daily_df['dailyDataDate'] = date_row['date']
            daily_dfs_collection = daily_dfs_collection + [daily_df]

        # Concatenate all daily dfs into single df for each row
        unnested_table = (pd.concat(daily_dfs_collection,ignore_index = True).
          # Set and reset index to move 'dailyDataDate' to front of df
          set_index('dailyDataDate').
          reset_index()
          )
        
        if df_name == 'nextDayPlayerEngagement':
            targets = unnested_table.copy()
        elif df_name == 'teamBoxScores':
            team_scores = unnested_table.copy()
        elif df_name == 'playerBoxScores':
            scores = unnested_table.copy()
        elif df_name == 'rosters':
            rosters = unnested_table.copy()

        #reduce_mem_usage(unnested_table).to_csv(f"{file}_{df_name}.csv")
        #print('\n'*2)

        # Clean up tables and collection of daily data frames for this df
        del(date_nested_table, daily_dfs_collection, unnested_table)
        gc.collect()


In [7]:
targets.drop(columns = 'dailyDataDate', inplace = True)
scores.drop(columns = 'dailyDataDate', inplace = True)
team_scores.drop(columns = 'dailyDataDate', inplace = True)
rosters.drop(columns = 'dailyDataDate', inplace = True)


In [8]:
my_players = pd.read_csv('../input/mlb-player-digital-engagement-forecasting/players.csv')
players = pd.read_csv('../input/mlb-player-digital-engagement-forecasting/players.csv')

# Select Interval

In [9]:
targets['engagementMetricsDate']=pd.to_datetime(targets['engagementMetricsDate'])
targets['date'] = targets['engagementMetricsDate']- timedelta(days=1)
targets['year'] = pd.to_datetime(targets['date']).dt.year
rosters['gameDate'] = pd.to_datetime(rosters['gameDate'])
scores['gameDate'] = pd.to_datetime(scores['gameDate'])
team_scores['gameDate'] = pd.to_datetime(team_scores['gameDate'])


In [10]:
temp2018 = targets[(targets['date'] > '2018-03-28') & (targets['date'] < '2018-09-28')]
temp2019 = targets[(targets['date'] > '2019-03-19') & (targets['date'] < '2019-09-30')]
temp2020 = targets[(targets['date'] > "2020-07-22") & (targets['date'] < "2020-09-28")]
temp4 = targets[(targets['date'] >= '2021-03-31') & (targets['date'] < '2021-04-30')]
temp5 = targets[(targets['date'] >= '2021-04-30') & (targets['date'] < '2021-05-31')]
temp6 = targets[(targets['date'] >= '2021-05-31') & (targets['date'] < '2021-06-30')]
temp7 = targets[(targets['date'] >= '2021-06-30')]

frames = [temp2018, temp2019, temp2020, temp4, temp5, temp6, temp7]

targets = pd.concat(frames)
del temp2018
del temp2019
del temp2020
gc.collect()

0

# Set id

In [11]:
id = pd.DataFrame(players.loc[players['playerForTestSetAndFuturePreds'] == True]['playerId'])
id_test = pd.DataFrame(players.loc[players['playerForTestSetAndFuturePreds'] == True]['playerId'])

# Pre process

In [12]:
targets = targets.merge(id, how = 'right', on = ['playerId'])
scores = scores.merge(id, how = 'right', on = ['playerId'])
my_players = my_players.merge(id, how = 'right', on = ['playerId'])
players = players.merge(id, how ='right', on = ['playerId'])


## my_players

In [13]:
a = targets[['playerId', 'target1', 'target2', 'target3', 'target4', 'year']]
tmp = a.groupby(['playerId', 'year']).mean().reset_index()
my_players = my_players.merge(tmp, how = 'left', on = 'playerId')
my_players = my_players.rename(columns = {'target1': 'player_target1', 'target2': 'player_target2', 'target3': 'player_target3', 'target4': 'player_target4'})

for i in range(1, 5):
    my_players[f'tier_target{i}'] = np.ones(len(my_players))*7
    my_players[f'tier_target{i}'] = np.where(my_players[f'player_target{i}'] > 0.15, 6, my_players[f'tier_target{i}'])
    my_players[f'tier_target{i}'] = np.where(my_players[f'player_target{i}'] > 0.6, 5, my_players[f'tier_target{i}'])
    my_players[f'tier_target{i}'] = np.where(my_players[f'player_target{i}'] > 1.5, 4, my_players[f'tier_target{i}'])
    my_players[f'tier_target{i}'] = np.where(my_players[f'player_target{i}'] > 4, 3, my_players[f'tier_target{i}'])
    my_players[f'tier_target{i}'] = np.where(my_players[f'player_target{i}'] > 10, 2, my_players[f'tier_target{i}'])
    my_players[f'tier_target{i}'] = np.where(my_players[f'player_target{i}'] > 30, 1, my_players[f'tier_target{i}'])

In [14]:
my_players['mlbDebutDate']=pd.to_datetime(my_players['mlbDebutDate'])
my_players['debut_year'] = 2021-my_players['mlbDebutDate'].dt.year
my_players['debut_year'] = my_players['debut_year'].fillna(0)

In [15]:
my_players = my_players.drop(['playerName', 'DOB', 'mlbDebutDate', 'birthCity', 'primaryPositionCode','birthCountry',
 'playerForTestSetAndFuturePreds','birthStateProvince', 'heightInches', 'weight','primaryPositionName', 'player_target1', 'player_target2', 'player_target3', 'player_target4'], axis = 1)

## ply_box, team_box

In [16]:
scores = scores.groupby(['playerId', 'gameDate']).sum().reset_index()
team_scores = team_scores.groupby(['teamId', 'gameDate']).sum().reset_index()

In [17]:
team_scores = team_scores.add_suffix('_team')
team_scores.rename(columns={'gamePk_team': 'gamePk', 'home_team':'home','teamId_team' : 'teamId', 'gameDate_team':'gameDate'}, inplace = True)

In [18]:
bat_cols = ['battingOrder', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs','saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',  'gamesPlayedBatting',]
pitch_cols = [ 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching','gamesPlayedPitching',]

In [19]:
for i in bat_cols:
    scores[i] = np.where(scores['gamesPlayedBatting'] > 1, scores[i] // 2, scores[i])
for i in pitch_cols:
    scores[i] = np.where(scores['gamesPlayedPitching'] > 1, scores[i] // 2, scores[i])

## Stat_bp

In [20]:
stat_bp = targets.merge(scores, left_on = ['playerId', 'date'], right_on = ['playerId', 'gameDate'], how ='left')
stat_bp['bat/pitch'] = np.where((stat_bp['gamesPlayedBatting'] == 1) & (stat_bp['gamesPlayedPitching'] == 1), 3, -1)
stat_bp['bat/pitch'] = np.where((stat_bp['gamesPlayedBatting'] == 0) & (stat_bp['gamesPlayedPitching'] == 1), 2, stat_bp['bat/pitch'])
stat_bp['bat/pitch'] = np.where((stat_bp['gamesPlayedBatting'] == 1) & (stat_bp['gamesPlayedPitching'] == 0), 1, stat_bp['bat/pitch'])
stat_bp['bat/pitch'] = np.where((stat_bp['gamesPlayedBatting'] != stat_bp['gamesPlayedBatting']) & (stat_bp['gamesPlayedPitching'] != stat_bp['gamesPlayedPitching']), 0, stat_bp['bat/pitch'])

stat_bp = stat_feature_player(stat_bp[['target1', 'target2', 'target3', 'target4', 'bat/pitch', 'playerId', 'year']], col = [f'bat/pitch'])
stat_bp.drop_duplicates(subset = ['playerId', f'bat/pitch', 'year'], keep = 'last', inplace = True)
stat_bp.drop(columns = ['target1', 'target2', 'target3', 'target4'], inplace = True)
stat_bp = stat_bp.reset_index(drop = True)
gc.collect()

  after removing the cwd from sys.path.
  if __name__ == '__main__':
  


0

In [21]:
homeRuns_cols = [ 'year', 'playerId', 'gameDate', 'homeRuns', 'target1', 'target2', 'target3', 'target4']
stat_homeRuns = targets.merge(scores, left_on = ['playerId', 'date'], right_on = ['playerId', 'gameDate'], how ='left')
stat_homeRuns = stat_feature_player(stat_homeRuns[homeRuns_cols], col = ['homeRuns'], target = 'target', group = 'playerId')
stat_homeRuns.drop(columns = ['target1', 'target2', 'target3', 'target4'], inplace = True)
stat_homeRuns.drop_duplicates(subset = ['playerId', 'homeRuns', 'year'], keep = 'last', inplace = True)
stat_homeRUns = stat_homeRuns.reset_index(drop = True)
gc.collect()
stat_homeRuns

  after removing the cwd from sys.path.
  if __name__ == '__main__':
  


Unnamed: 0,year,playerId,gameDate,homeRuns,mean_homeRuns_target1,mean_homeRuns_target2,mean_homeRuns_target3,mean_homeRuns_target4,std_homeRuns_target1,std_homeRuns_target2,...,med_homeRuns_target3,med_homeRuns_target4,min_homeRuns_target1,min_homeRuns_target2,min_homeRuns_target3,min_homeRuns_target4,max_homeRuns_target1,max_homeRuns_target2,max_homeRuns_target3,max_homeRuns_target4
182,2018,593590,NaT,,,,,,,,...,,,,,,,,,,
376,2019,593590,NaT,,,,,,,,...,,,,,,,,,,
443,2020,593590,NaT,,,,,,,,...,,,,,,,,,,
539,2021,593590,2021-07-04,1.0,2.296119,2.136592,0.084838,0.761338,0.850626,1.758569,...,0.084838,0.761338,1.694636,0.893096,0.003460,0.173215,2.897602,3.380088,0.166216,1.349460
551,2021,593590,2021-07-16,0.0,0.470284,2.042623,0.249732,0.530611,1.112139,2.196526,...,0.002185,0.291463,0.000369,0.174956,0.000307,0.052601,4.145526,9.246246,2.800154,3.099156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656040,2018,680430,NaT,,,,,,,,...,,,,,,,,,,
656234,2019,680430,NaT,,,,,,,,...,,,,,,,,,,
656301,2020,680430,NaT,,,,,,,,...,,,,,,,,,,
656395,2021,680430,2021-07-02,0.0,0.041593,1.577498,0.000000,0.394730,,,...,0.000000,0.394730,0.041593,1.577498,0.000000,0.394730,0.041593,1.577498,0.000000,0.394730


In [22]:
gamesPlayedBatting_cols = ['year', 'playerId', 'gameDate', 'gamesPlayedBatting', 'target1', 'target2', 'target3', 'target4']
stat_gamesPlayedBatting = targets.merge(scores, left_on = ['playerId', 'date'], right_on = ['playerId', 'gameDate'], how ='left')
stat_gamesPlayedBatting = stat_feature_player(stat_gamesPlayedBatting[gamesPlayedBatting_cols], ['gamesPlayedBatting'], target = 'target', group = 'playerId')
stat_gamesPlayedBatting.drop(columns = ['target1', 'target2', 'target3', 'target4'], inplace = True)
stat_gamesPlayedBatting.drop_duplicates(subset = ['playerId', 'gamesPlayedBatting', 'year'], keep = 'last', inplace = True)
gc.collect()
stat_gamesPlayedBatting

  after removing the cwd from sys.path.
  if __name__ == '__main__':
  


Unnamed: 0,year,playerId,gameDate,gamesPlayedBatting,mean_gamesPlayedBatting_target1,mean_gamesPlayedBatting_target2,mean_gamesPlayedBatting_target3,mean_gamesPlayedBatting_target4,std_gamesPlayedBatting_target1,std_gamesPlayedBatting_target2,...,med_gamesPlayedBatting_target3,med_gamesPlayedBatting_target4,min_gamesPlayedBatting_target1,min_gamesPlayedBatting_target2,min_gamesPlayedBatting_target3,min_gamesPlayedBatting_target4,max_gamesPlayedBatting_target1,max_gamesPlayedBatting_target2,max_gamesPlayedBatting_target3,max_gamesPlayedBatting_target4
182,2018,593590,NaT,,,,,,,,...,,,,,,,,,,
376,2019,593590,NaT,,,,,,,,...,,,,,,,,,,
443,2020,593590,NaT,,,,,,,,...,,,,,,,,,,
551,2021,593590,2021-07-16,1.0,0.673155,2.053064,0.23141,0.556248,1.217589,2.107119,...,0.002832,0.291463,0.000369,0.174956,0.000307,0.052601,4.145526,9.246246,2.800154,3.099156
552,2021,593590,NaT,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656040,2018,680430,NaT,,,,,,,,...,,,,,,,,,,
656234,2019,680430,NaT,,,,,,,,...,,,,,,,,,,
656301,2020,680430,NaT,,,,,,,,...,,,,,,,,,,
656395,2021,680430,2021-07-02,0.0,0.041593,1.577498,0.00000,0.394730,,,...,0.000000,0.394730,0.041593,1.577498,0.000000,0.394730,0.041593,1.577498,0.000000,0.394730


In [23]:
gamesPlayedPitching_cols = ['year', 'playerId', 'gameDate', 'gamesPlayedPitching', 'target1', 'target2', 'target3', 'target4']
stat_gamesPlayedPitching = targets.merge(scores, left_on = ['playerId', 'date'], right_on = ['playerId', 'gameDate'], how ='left')
stat_gamesPlayedPitching = stat_feature_player(stat_gamesPlayedPitching[gamesPlayedPitching_cols], ['gamesPlayedPitching'], target = 'target', group = 'playerId')
stat_gamesPlayedPitching.drop(columns = ['target1', 'target2', 'target3', 'target4'], inplace = True)
stat_gamesPlayedPitching.drop_duplicates(subset = ['playerId', 'gamesPlayedPitching', 'year'], keep = 'last', inplace = True)
gc.collect()
stat_gamesPlayedPitching

  after removing the cwd from sys.path.
  if __name__ == '__main__':
  


Unnamed: 0,year,playerId,gameDate,gamesPlayedPitching,mean_gamesPlayedPitching_target1,mean_gamesPlayedPitching_target2,mean_gamesPlayedPitching_target3,mean_gamesPlayedPitching_target4,std_gamesPlayedPitching_target1,std_gamesPlayedPitching_target2,...,med_gamesPlayedPitching_target3,med_gamesPlayedPitching_target4,min_gamesPlayedPitching_target1,min_gamesPlayedPitching_target2,min_gamesPlayedPitching_target3,min_gamesPlayedPitching_target4,max_gamesPlayedPitching_target1,max_gamesPlayedPitching_target2,max_gamesPlayedPitching_target3,max_gamesPlayedPitching_target4
182,2018,593590,NaT,,,,,,,,...,,,,,,,,,,
376,2019,593590,NaT,,,,,,,,...,,,,,,,,,,
443,2020,593590,NaT,,,,,,,,...,,,,,,,,,,
551,2021,593590,2021-07-16,0.0,0.673155,2.053064,0.23141,0.556248,1.217589,2.107119,...,0.002832,0.291463,0.000369,0.174956,0.000307,0.052601,4.145526,9.246246,2.800154,3.099156
552,2021,593590,NaT,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656040,2018,680430,NaT,,,,,,,,...,,,,,,,,,,
656234,2019,680430,NaT,,,,,,,,...,,,,,,,,,,
656301,2020,680430,NaT,,,,,,,,...,,,,,,,,,,
656395,2021,680430,2021-07-02,1.0,0.041593,1.577498,0.00000,0.394730,,,...,0.000000,0.394730,0.041593,1.577498,0.000000,0.394730,0.041593,1.577498,0.000000,0.394730


In [24]:
stat_players = stat_feature(targets, col = ['playerId'])
stat_players.drop_duplicates(subset = ['playerId', 'year'], keep = 'last', inplace = True)
stat_players.drop(columns = ['engagementMetricsDate', 'date', 'target1', 'target2', 'target3',
       'target4'], inplace = True)



# Train Data preparing

In [25]:
targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date', 'year']
players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status', 'gameDate', 'statusCode']
scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances', 'gameDate']

feature_cols = ['label_playerId', 'label_primaryPositionName', 'label_teamId',
       'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',
                                 'mean_homeRuns_target1',
       'mean_homeRuns_target2', 'mean_homeRuns_target3',
       'mean_homeRuns_target4', 'std_homeRuns_target1', 'std_homeRuns_target2',
       'std_homeRuns_target3', 'std_homeRuns_target4', 'med_homeRuns_target1',
       'med_homeRuns_target2', 'med_homeRuns_target3', 'med_homeRuns_target4',
       'min_homeRuns_target1', 'min_homeRuns_target2', 'min_homeRuns_target3',
       'min_homeRuns_target4', 'max_homeRuns_target1', 'max_homeRuns_target2',
       'max_homeRuns_target3', 'max_homeRuns_target4',
                 'mean_gamesPlayedPitching_target1',
       'mean_gamesPlayedPitching_target2', 'mean_gamesPlayedPitching_target3',
       'mean_gamesPlayedPitching_target4', 'std_gamesPlayedPitching_target1',
       'std_gamesPlayedPitching_target2', 'std_gamesPlayedPitching_target3',
       'std_gamesPlayedPitching_target4', 'med_gamesPlayedPitching_target1',
       'med_gamesPlayedPitching_target2', 'med_gamesPlayedPitching_target3',
       'med_gamesPlayedPitching_target4', 'min_gamesPlayedPitching_target1',
       'min_gamesPlayedPitching_target2', 'min_gamesPlayedPitching_target3',
       'min_gamesPlayedPitching_target4', 'max_gamesPlayedPitching_target1',
       'max_gamesPlayedPitching_target2', 'max_gamesPlayedPitching_target3',
       'max_gamesPlayedPitching_target4',
                 'mean_gamesPlayedBatting_target1',
       'mean_gamesPlayedBatting_target2', 'mean_gamesPlayedBatting_target3',
       'mean_gamesPlayedBatting_target4', 'std_gamesPlayedBatting_target1',
       'std_gamesPlayedBatting_target2', 'std_gamesPlayedBatting_target3',
       'std_gamesPlayedBatting_target4', 'med_gamesPlayedBatting_target1',
       'med_gamesPlayedBatting_target2', 'med_gamesPlayedBatting_target3',
       'med_gamesPlayedBatting_target4', 'min_gamesPlayedBatting_target1',
       'min_gamesPlayedBatting_target2', 'min_gamesPlayedBatting_target3',
       'min_gamesPlayedBatting_target4', 'max_gamesPlayedBatting_target1',
       'max_gamesPlayedBatting_target2', 'max_gamesPlayedBatting_target3',
       'max_gamesPlayedBatting_target4',
    #    'player_target1', 'player_target2', 'player_target3', 'player_target4',
     'tier_target1', 'tier_target2', 'tier_target3', 'tier_target4',
     'mean_playerId_target1',
       'mean_playerId_target2', 'mean_playerId_target3',
       'mean_playerId_target4', 'std_playerId_target1', 'std_playerId_target2',
       'std_playerId_target3', 'std_playerId_target4', 'med_playerId_target1',
       'med_playerId_target2', 'med_playerId_target3', 'med_playerId_target4',
       'min_playerId_target1', 'min_playerId_target2', 'min_playerId_target3',
       'min_playerId_target4', 'max_playerId_target1', 'max_playerId_target2',
       'max_playerId_target3', 'max_playerId_target4',
                       'mean_bat/pitch_target1',
       'mean_bat/pitch_target2', 'mean_bat/pitch_target3',
       'mean_bat/pitch_target4', 'std_bat/pitch_target1',
       'std_bat/pitch_target2', 'std_bat/pitch_target3',
       'std_bat/pitch_target4', 'med_bat/pitch_target1',
       'med_bat/pitch_target2', 'med_bat/pitch_target3',
       'med_bat/pitch_target4', 'min_bat/pitch_target1',
       'min_bat/pitch_target2', 'min_bat/pitch_target3',
       'min_bat/pitch_target4', 'max_bat/pitch_target1',
       'max_bat/pitch_target2', 'max_bat/pitch_target3',
       'max_bat/pitch_target4',
       #'home',
                'teamId',
       'flyOuts_team', 'groundOuts_team', 'runsScored_team', 'doubles_team',
       'triples_team', 'homeRuns_team', 'strikeOuts_team', 'baseOnBalls_team',
       'intentionalWalks_team', 'hits_team', 'hitByPitch_team', 'atBats_team',
       'caughtStealing_team', 'stolenBases_team', 'groundIntoDoublePlay_team',
       'groundIntoTriplePlay_team', 'plateAppearances_team', 'totalBases_team',
       'rbi_team', 'leftOnBase_team', 'sacBunts_team', 'sacFlies_team',
       'catchersInterference_team', 'pickoffs_team', 'airOutsPitching_team',
       'groundOutsPitching_team', 'runsPitching_team', 'doublesPitching_team',
       'triplesPitching_team', 'homeRunsPitching_team',
       'strikeOutsPitching_team', 'baseOnBallsPitching_team',
       'intentionalWalksPitching_team', 'hitsPitching_team',
       'hitByPitchPitching_team', 'atBatsPitching_team',
       'caughtStealingPitching_team', 'stolenBasesPitching_team',
       'inningsPitched_team', 'earnedRuns_team', 'battersFaced_team',
       'outsPitching_team', 'hitBatsmen_team', 'balks_team',
       'wildPitches_team', 'pickoffsPitching_team', 'rbiPitching_team',
       'inheritedRunners_team', 'inheritedRunnersScored_team',
       'catchersInterferencePitching_team', 'sacBuntsPitching_team',
       'sacFliesPitching_team',

               'debut_year', 'year']

In [26]:
# creat dataset
train = targets[targets_cols].merge(players[players_cols], on=['playerId'], how='right')
train = train.merge(id, how = 'right', on = ['playerId'])
train = train.merge(rosters[rosters_cols], left_on=['playerId', 'date'], right_on = ['playerId', 'gameDate'], how='left')
train = train.merge(scores[scores_cols], left_on=['playerId', 'date'], right_on = ['playerId', 'gameDate'], how='left')
train = train.merge(my_players, how='inner', on=["playerId", 'year'])
train = train.merge(team_scores, how = 'left', left_on = ['teamId', 'date'], right_on = ['teamId', 'gameDate'])

#train = train.merge(debut_year, how = 'left', on = ['playerId'])
# train = train.merge(stat_statusCode, how = 'left', on = ['playerId', 'statusCode'])
train['bat/pitch'] = np.where((train['gamesPlayedBatting'] == 1) & (train['gamesPlayedPitching'] == 1), 3, -1)
train['bat/pitch'] = np.where((train['gamesPlayedBatting'] == 0) & (train['gamesPlayedPitching'] == 1), 2, train['bat/pitch'])
train['bat/pitch'] = np.where((train['gamesPlayedBatting'] == 1) & (train['gamesPlayedPitching'] == 0), 1, train['bat/pitch'])
train['bat/pitch'] = np.where((train['gamesPlayedBatting'] != train['gamesPlayedBatting']) & (train['gamesPlayedPitching'] != train['gamesPlayedPitching']), 0, train['bat/pitch'])


train = train.merge(stat_bp, how = 'left', on = ['playerId', 'year', 'bat/pitch'])
train = train.merge(stat_players, how='inner', on=["playerId", 'year'])
train = train.merge(stat_homeRuns, how = 'left', on = ['playerId', 'homeRuns', 'year'])
train = train.merge(stat_gamesPlayedBatting, how = 'left', on = ['playerId', 'gamesPlayedBatting', 'year'])
train = train.merge(stat_gamesPlayedPitching, how = 'left', on = ['playerId', 'gamesPlayedPitching', 'year'])

# label encoding
player2num = {c: i for i, c in enumerate(train['playerId'].unique())}
position2num = {c: i for i, c in enumerate(train['primaryPositionName'].unique())}
teamid2num = {c: i for i, c in enumerate(train['teamId'].unique())}
status2num = {c: i for i, c in enumerate(train['status'].unique())}



train['label_playerId'] = train['playerId'].map(player2num)
train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)
train['label_teamId'] = train['teamId'].map(teamid2num)
train['label_status'] = train['status'].map(status2num)


In [27]:
gc.collect()

20

## Select interval

In [28]:
train_X = train[feature_cols]
train_y = train[['target1', 'target2', 'target3', 'target4']]


x_train1 = train_X.reset_index(drop=True)
y_train1 = train_y.reset_index(drop=True)
#index = ((train['date'] >= '2021-04-30') & (train['date'] < '2021-05-31'))
index = ((train['date'] >= '2021-06-14'))# & (train['date'] < '2021-06-30'))
x_valid1 = train_X.loc[index].reset_index(drop=True)
y_valid1 = train_y.loc[index].reset_index(drop=True)


# Train

## stage 1

In [29]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, target, pos, params: dict=None, verbose=100):
    oof_pred = np.zeros(len(y_valid), dtype=np.float64)
    if (False):#os.path.isfile(f'../input/model-ckpt/{pos}_lgbm{target}.pkl'):
        with open(f'../input/model-ckpt/{pos}_lgbm{target}.pkl', 'rb') as fin:
            model = pickle.load(fin)
    else:
        model = lgbm.LGBMRegressor(**params)
        model.fit(x_train, y_train, 
            eval_set=[(x_valid, y_valid)],  
            early_stopping_rounds=100, 
            verbose=200)
        with open(f'{pos}_lgbm{target}.pkl', 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    oof_pred = model.predict(x_valid)
    score = mean_absolute_error(oof_pred, y_valid)
    print('mae:', score)
    return oof_pred, model, score


# training lightgbm

params1 = {'objective':'mae',
           'reg_alpha': 0.14947461820098767, 
           'reg_lambda': 0.10185644384043743, 
           'n_estimators': 3633, 
           'learning_rate': 0.08046301304430488, 
           'num_leaves': 674, 
           'feature_fraction': 0.9101240539122566, 
           'bagging_fraction': 0.9884451442950513, 
           'bagging_freq': 8, 
           'min_child_samples': 51}

# params2 = {
#  'objective':'mae',
#  'reg_alpha': 0.1,
#  'reg_lambda': 0.1, 
#  'n_estimators': 80,
#  'learning_rate': 0.1,
#  'random_state': 42,
#  "num_leaves": 22
# }
params2 = {
    'objective':'mae',
    'reg_alpha': 0.14947461820098767, 
    'reg_lambda': 0.10185644384043743, 
    'n_estimators': 3633, 
    'learning_rate': 0.08046301304430488, 
    'num_leaves': 570, 
    'feature_fraction': 0.9101240539122566, 
    'bagging_fraction': 0.9884443999999999, 
    'bagging_freq': 8, 
    'min_child_samples': 23
}
params3 = {
    'objective':'mae',
    'reg_alpha': 0.14947461820098767,
    'reg_lambda': 0.10185644384043743,
    'n_estimators': 500,
    'learning_rate': 0.08046301304430488,
    'num_leaves': 680,
    'feature_fraction': 0.9101240539122566, 
    'bagging_fraction': 0.9884451442950513,
    'bagging_freq': 8,
    'min_child_samples': 27} 

params4 = {'objective':'mae',
           'reg_alpha': 0.016468100279441976, 
           'reg_lambda': 0.09128335764019105, 
           'n_estimators': 9868, 
           'learning_rate': 0.10528150510326864, 
           'num_leaves': 157, 
           'feature_fraction': 0.5419185713426886, 
           'bagging_fraction': 0.2637405128936662, 
           'bagging_freq': 19, 
           'min_child_samples': 71}




oof3, model3, score3 = fit_lgbm(
    x_train1, y_train1['target3'],
    x_valid1, y_valid1['target3'], 3, 'be',
   params3,
)

pred3 = model3.predict(x_train1)
pred3 = np.clip(pred3, 0, 100)
x_train1['pred_target3'] = pred3
x_valid1['pred_target3'] = oof3


oof1, model1, score1 = fit_lgbm( 
    x_train1, y_train1['target1'],  
    x_valid1, y_valid1['target1'], 1, 'be',
    params1,
)

pred1 = model1.predict(x_train1)
pred1 = np.clip(pred1, 0, 100)
x_train1['pred_target1'] = pred1
x_valid1['pred_target1'] = oof1


oof4, model4, score4 = fit_lgbm(
    x_train1, y_train1['target4'],
    x_valid1, y_valid1['target4'], 4, 'be',
    params4
)

pred4 = model4.predict(x_train1)
pred4 = np.clip(pred4,0,100)
x_train1['pred_target4'] = pred4
x_valid1['pred_target4'] = oof4

oof2, model2, score2 = fit_lgbm(
    x_train1, y_train1['target2'],
    x_valid1, y_valid1['target2'], 2, 'be',
    params2,
)

pred2 = model2.predict(x_train1)
pred2 = np.clip(pred2,0,100)
x_train1['pred_target2'] = pred2
x_valid1['pred_target2'] = oof2

print(f'tg1: {score1}')
print(f'tg2: {score2}')
print(f'tg3: {score3}')
print(f'tg4: {score4}')

score = (score1+score2+score3+score4) / 4
print(f'score: {score}')

Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 0.686328
[400]	valid_0's l1: 0.665675
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 0.657318
mae: 0.657318438529979
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 0.802971
[400]	valid_0's l1: 0.775581
[600]	valid_0's l1: 0.757258
[800]	valid_0's l1: 0.742619
[1000]	valid_0's l1: 0.731272
[1200]	valid_0's l1: 0.719658
[1400]	valid_0's l1: 0.711194
[1600]	valid_0's l1: 0.703792
[1800]	valid_0's l1: 0.697493
[2000]	valid_0's l1: 0.692324
[2200]	valid_0's l1: 0.690114
[2400]	valid_0's l1: 0.687496
[2600]	valid_0's l1: 0.684978
[2800]	valid_0's l1: 0.684021
[3000]	valid_0's l1: 0.682078
[3200]	valid_0's l1: 0.679316
[3400]	valid_0's l1: 0.677637
[3600]	valid_0's l1: 0.676287
Did not meet early stopping. Best iteration is:
[3633]	valid_0's l1: 0.676168
mae: 0.6761681387674613
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1

In [30]:
del x_train1
del y_train1
del x_valid1
del y_valid1
gc.collect()

32

### stage 2

# Submission

In [31]:
players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status']

scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances']

null = np.nan
true = True
false = False

In [32]:
import copy

env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

for (test_df, sample_prediction_df) in iter_test: # make predictions here
    
    sub = copy.deepcopy(sample_prediction_df.reset_index())
    sample_prediction_df = copy.deepcopy(sample_prediction_df.reset_index(drop=True))
    
    # LGBM summit
    # creat dataset
    sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[1]))
    # Dealing with missing values
    if test_df['rosters'].iloc[0] == test_df['rosters'].iloc[0]:
        test_rosters = pd.DataFrame(eval(test_df['rosters'].iloc[0]))
    else:
        test_rosters = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in rosters.columns:
            if col == 'playerId': continue
            test_rosters[col] = np.nan
            
    if test_df['playerBoxScores'].iloc[0] == test_df['playerBoxScores'].iloc[0]:
        test_scores = pd.DataFrame(eval(test_df['playerBoxScores'].iloc[0]))
        
        test_scores = test_scores.groupby('playerId').sum().reset_index()
        for i in bat_cols:
            test_scores[i] = np.where(test_scores['gamesPlayedBatting'] > 1, test_scores[i] // 2, test_scores[i])
        for i in pitch_cols:
            test_scores[i] = np.where(test_scores['gamesPlayedPitching'] > 1, test_scores[i] // 2, test_scores[i])
    else:
        test_scores = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in scores.columns:
            if col == 'playerId': continue
            test_scores[col] = np.nan
            
            
    if test_df['teamBoxScores'].iloc[0] == test_df['teamBoxScores'].iloc[0]:
        test_team_scores = pd.DataFrame(eval(test_df['teamBoxScores'].iloc[0]))
        test_team_scores = test_team_scores.groupby('teamId').sum().reset_index()
        
        test_team_scores = test_team_scores.add_suffix('_team')
        test_team_scores.rename(columns={'gamePk_team': 'gamePk', 'home_team':'home','teamId_team' : 'teamId', 'gameDate_team':'gameDate'}, inplace = True)
    else:
        test_team_scores = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in team_scores.columns:
            if col == 'playerId': continue
            test_team_scores[col] = np.nan
        
    
        
    test_scores['year'] = np.ones(len(test_scores))*2021
    test = sample_prediction_df[['playerId']].copy()
    test['year'] =  np.ones(len(test))*2021
    
    test = test.merge(test_rosters[rosters_cols], on='playerId', how='left')
    test = test.merge(test_scores[scores_cols], on='playerId', how='left')
    test = test.merge(players[players_cols], how = 'left', on = ['playerId'])
    test = test.merge(my_players, how = 'left', on = ['playerId', 'year'])
    test = test.merge(test_team_scores, how = 'left', left_on = ['teamId'], right_on = ['teamId'])
   
    # train = train.merge(debut_year, how = 'left', on = ['playerId'])
    # train = train.merge(stat_statusCode, how = 'left', on = ['playerId', 'statusCode'])
    test['bat/pitch'] = np.where((test['gamesPlayedBatting'] == 1) & (test['gamesPlayedPitching'] == 1), 3, -1)
    test['bat/pitch'] = np.where((test['gamesPlayedBatting'] == 0) & (test['gamesPlayedPitching'] == 1), 2, test['bat/pitch'])
    test['bat/pitch'] = np.where((test['gamesPlayedBatting'] == 1) & (test['gamesPlayedPitching'] == 0), 1, test['bat/pitch'])
    test['bat/pitch'] = np.where((test['gamesPlayedBatting'] != test['gamesPlayedBatting']) & (test['gamesPlayedPitching'] != test['gamesPlayedPitching']), 0, test['bat/pitch'])

    
    test = test.merge(stat_bp, how ='left', on = ['playerId', 'year', 'bat/pitch'])
    test = test.merge(stat_players, how='inner', on=["playerId", 'year'])
    test = test.merge(stat_homeRuns, how = 'left', on = ['playerId', 'homeRuns', 'year'])
    test = test.merge(stat_gamesPlayedBatting, how = 'left', on = ['playerId', 'gamesPlayedBatting', 'year'])
    test = test.merge(stat_gamesPlayedPitching, how = 'left', on = ['playerId', 'gamesPlayedPitching', 'year'])
    test['label_playerId'] = test['playerId'].map(player2num)
    test['label_primaryPositionName'] = test['primaryPositionName'].map(position2num)
    test['label_teamId'] = test['teamId'].map(teamid2num)
    test['label_status'] = test['status'].map(status2num)

    test_X = test[feature_cols]
    # predict
    
    pred3 = model3.predict(test_X)
    pred3 = np.clip(pred3,0,100)
#     pred_lgd3 = model_lgb3.predict(test_X)
#     pred_cat3 = model_cb3.predict(test_X)
#     pred_lgd3 = np.clip(pred_lgd3,0,100)
#     pred_cat3 = np.clip(pred_cat3,0,100)
#     test_X['pred_target3'] = 0.65*pred3 + 0.25*pred_lgd3 + 0.1*pred_cat3
    test_X['pred_target3'] = pred3
    
    pred1 = model1.predict(test_X)
    pred1 = np.clip(pred1,0,100)
#     pred_lgd1 = model_lgb1.predict(test_X)
#     pred_cat1 = model_cb1.predict(test_X)
#     pred_lgd1 = np.clip(pred_lgd1,0,100)
#     pred_cat1 = np.clip(pred_cat1,0,100)
#     test_X['pred_target1'] = 1*pred1 + 0.*pred_lgd1 + 0.*pred_cat1
    test_X['pred_target1'] = pred1 
    
    
    pred4 = model4.predict(test_X)
    pred4 = np.clip(pred4,0,100)
#     pred_lgd4 = model_lgb4.predict(test_X)
#     pred_cat4 = model_cb4.predict(test_X)
#     pred_lgd4 = np.clip(pred_lgd4,0,100)
#     pred_cat4 = np.clip(pred_cat4,0,100)
    
#     test_X['pred_target4'] = 0.65*pred4 + 0.25*pred_lgd4 + 0.1*pred_cat4
    test_X['pred_target4'] = pred4
    
    
    pred2 = model2.predict(test_X)
    pred2 = np.clip(pred2,0,100)
#     pred_lgd2 = model_lgb2.predict(test_X)
#     pred_cat2 = model_cb2.predict(test_X)
#     pred_lgd2 = np.clip(pred_lgd2,0,100)
#     pred_cat2 = np.clip(pred_cat2,0,100)
#     test_X['pred_target2'] = 0.1*pred2 + 0.65*pred_lgd2 + 0.25*pred_cat2
    test_X['pred_target2'] = pred2
    
    
#     af3 = model3_af.predict(test_X)
#     af3 = np.clip(af3,0,100)
    
#     test_X['pred_target3'] = af3
    
#     af1 = model1_af.predict(test_X)
#     af1 = np.clip(af1,0,100)
    
#     test_X['pred_target3'] = af1
    
#     af4 = model4_af.predict(test_X)
#     af4 = np.clip(af4,0,100)
    
#     test_X['pred_target4'] = af4
    
#     af2 = model2_af.predict(test_X)
#     af2 = np.clip(af2,0,100)
    
#     test_X['pred_target2'] = af2
    # merge submission
#     sample_prediction_df['target1'] = 0.5*af1 + 0.5*pred1
#     sample_prediction_df['target2'] = 0.5*af2 + 0.5*pred2
#     sample_prediction_df['target3'] = 0.5*af3 + 0.5*pred3
#     sample_prediction_df['target4'] = 0.5*af4 + 0.5*pred4
    sample_prediction_df['target1'] = pred1
    sample_prediction_df['target2'] = pred2
    sample_prediction_df['target3'] = pred3
    sample_prediction_df['target4'] = pred4
    sample_prediction_df = sample_prediction_df.fillna(0.)
    del sample_prediction_df['playerId']
    
    env.predict(sample_prediction_df)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [33]:
sample_prediction_df

Unnamed: 0,date_playerId,target1,target2,target3,target4
0,20210501_488726,1.768755e+00,6.589195,1.744935e-01,2.966626
1,20210501_605218,8.955438e-04,0.266450,4.501128e-03,0.960824
2,20210501_621563,1.156469e-02,1.841456,1.010965e-02,0.797355
3,20210501_670084,3.845766e-03,0.515023,5.169058e-04,0.344201
4,20210501_670970,2.916639e-03,0.183618,9.754550e-03,0.183883
...,...,...,...,...,...
1182,20210501_596049,1.644371e-11,0.012169,1.942793e-09,0.034888
1183,20210501_642851,5.028681e-07,0.030417,1.278659e-08,0.079219
1184,20210501_596071,2.149994e-04,0.059647,1.418573e-09,0.081072
1185,20210501_664901,6.568635e-03,0.223338,0.000000e+00,0.185793


# End of notebook