In [106]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

data_dir = '/kaggle/input/scrabble-player-rating/'

# Read every competition table in one pass; the order of reads matches the
# order of the names on the left-hand side.
games, submission, test, train, turns = (
    pd.read_csv(f'{data_dir}{table}.csv')
    for table in ('games', 'sample_submission', 'test', 'train', 'turns')
)

In [107]:
# Face value of each tile letter. The blank ('?') and any character that is
# not listed here (lowercase letters, digits, ...) contribute 0.
_TILE_VALUES = {
    **dict.fromkeys('EAIONRTLSU', 1),
    **dict.fromkeys('DG', 2),
    **dict.fromkeys('BCMP', 3),
    **dict.fromkeys('FHVWY', 4),
    **dict.fromkeys('K', 5),
    **dict.fromkeys('JX', 8),
    **dict.fromkeys('QZ', 10),
}


def racks_score(racks):
    """Return the summed tile value of a rack.

    Parameters
    ----------
    racks : str or missing value
        The tiles on the rack, one character per tile.

    Returns
    -------
    int
        Sum of the per-tile values from ``_TILE_VALUES``; 0 for an empty or
        missing rack.
    """
    # A missing rack arrives as float NaN (or None), not as a string.
    # Stringifying it would score the letters of 'None'/'nan', so bail out
    # before looking at characters.
    if not isinstance(racks, str):
        return 0
    return sum(_TILE_VALUES.get(tile, 0) for tile in racks)

In [108]:
# Score the rack held on every turn (element-wise over the 'rack' column).
turns['racks_score'] = turns['rack'].map(racks_score)

In [109]:
def pre_turns(game_id, nickname, win):
    """Build one row of per-game features from the global ``turns`` frame.

    Parameters
    ----------
    game_id
        Identifier used to select this game's rows from ``turns``.
    nickname
        Nickname of one of the two participants (the caller passes the
        ``first`` column of ``games``).
    win
        Outcome flag from ``games``. ``win == 1`` is treated as "the
        participant passed as ``nickname`` won"; every other value is treated
        as "did not win".
        NOTE(review): confirm how ties are encoded, since they currently
        count as a win for the other side.

    Returns
    -------
    dict
        Keys: ``game_id``, ``Bot_Wins``, ``Bot_Name``, total / max / mean
        scores for bot and player, ``turns`` (row count) and mean rack scores.

    Raises
    ------
    KeyError
        If no second participant can be found and the game has fewer than two
        turn rows.
    IndexError
        If either side has no turn rows (no last score to read).
    """
    df = turns[turns['game_id']==game_id].reset_index()
    bots = ('BetterBot', 'STEEBot', 'HastyBot')

    # Identify the opponent as the first nickname in this game that differs
    # from `nickname`. Reading row 1 blindly assumes the two sides strictly
    # alternate from the very first turn; when they do not, the same person
    # ends up as both bot and player. Row 1 is kept only as a fallback for
    # games in which nobody else appears.
    others = df.loc[df['nickname'] != nickname, 'nickname']
    opponent = others.iloc[0] if len(others) else df['nickname'][1]

    if nickname in bots:
        bot_name, player_name = nickname, opponent
        bot_wins = 1 if win == 1 else 0
    else:
        bot_name, player_name = opponent, nickname
        bot_wins = 1 if win != 1 else 0

    df_bot = df[df['nickname']==bot_name]
    df_player = df[df['nickname']==player_name]

    row = {}
    row['game_id'] = game_id
    row['Bot_Wins'] = bot_wins
    row['Bot_Name'] = bot_name

    # 'score' on the last row of each side is used as that side's total.
    row['Bot_Tot_Score'] = df_bot['score'].iloc[-1]
    row['Player_Tot_Score'] = df_player['score'].iloc[-1]

    row['turns'] = len(df)
    row['Bot_Max_Score'] = df_bot['points'].max()
    row['Player_Max_Score'] = df_player['points'].max()
    row['Bot_Mean_Score'] = df_bot['points'].mean()
    row['Player_Mean_Score'] = df_player['points'].mean()

    row['racks_mean_Bot'] = df_bot['racks_score'].mean()
    row['racks_mean_Player'] = df_player['racks_score'].mean()

    return row

In [5]:
feats = ['game_id', 'first', 'winner']

# One feature dict per game, collected first and turned into a frame in a
# single constructor call.
rows = [
    pre_turns(game_id, first_player, winner)
    for game_id, first_player, winner in tqdm(games[feats].itertuples(index=False, name=None))
]

df = pd.DataFrame(rows)

72773it [10:47, 112.46it/s]


In [110]:
# Attach the per-game turn features to the games table, keeping every game.
# Re-running this cell on an already merged `games` adds suffixed duplicates.
games = pd.merge(games, df, how='left', on='game_id')

In [111]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, MinMaxScaler

# Transformer instances are kept at module level.
# `oh_en` is created here but not used by any cell in view.
oh_en = OneHotEncoder()
or_en = OrdinalEncoder()
mm_sc = MinMaxScaler()

# Columns that are not used as features.
games = games.drop(columns=['created_at', 'first', 'winner'])

# Ordinal-encode the time control name; it is min-max scaled further down.
ords = ['time_control_name']
games[ords] = or_en.fit_transform(games[ords])

# Integer-code each categorical column with its own freshly fitted encoder.
# `cats` is also passed to LightGBM later as `categorical_feature`.
cats = ['game_end_reason', 'rating_mode', 'lexicon', 'Bot_Name', 'Bot_Wins']
for c in cats:
    l_en = LabelEncoder()
    games[c] = l_en.fit_transform(games[c])

# Rescale the numeric columns with a single scaler fitted on all games.
mms = [
    'time_control_name',
    'Bot_Tot_Score', 'Player_Tot_Score',
    'turns',
    'Bot_Max_Score', 'Player_Max_Score',
    'Bot_Mean_Score', 'Player_Mean_Score',
    'initial_time_seconds', 'increment_seconds',
    'max_overtime_minutes', 'game_duration_seconds',
    'racks_mean_Bot', 'racks_mean_Player',
]
games[mms] = mm_sc.fit_transform(games[mms])

In [112]:
# Flag the rows that belong to one of the three bots.
train['dummy'] = train['nickname'].isin(['BetterBot', 'STEEBot', 'HastyBot'])

# Target: rating on the non-bot rows.
# NOTE(review): y is matched to X purely by row position; this assumes bot and
# non-bot rows list the games in the same order. Confirm.
y = train.loc[~train['dummy'], 'rating'].to_numpy()

# Features: the bot rows (including the bot's own rating) joined with the
# per-game features, minus identifiers and the raw score.
X = (
    train[train['dummy']]
    .merge(games, on='game_id', how='left')
    .drop(columns=['game_id', 'nickname', 'dummy', 'score'])
)
X['rating'] = X['rating'].astype('float64')

In [120]:
# Display the assembled training feature frame.
X

Unnamed: 0,rating,time_control_name,game_end_reason,lexicon,initial_time_seconds,increment_seconds,rating_mode,max_overtime_minutes,game_duration_seconds,Bot_Wins,Bot_Name,Bot_Tot_Score,Player_Tot_Score,turns,Bot_Max_Score,Player_Max_Score,Bot_Mean_Score,Player_Mean_Score,racks_mean_Bot,racks_mean_Player
0,1637.0,0.666667,2,3,0.330544,0.0,0,0.1,0.150305,1,15,0.286080,0.628827,0.111111,0.184669,0.315113,0.306777,0.463760,0.423880,0.147177
1,2071.0,0.666667,2,0,0.246862,0.0,1,0.5,0.109155,1,15,0.271563,0.642857,0.111111,0.212544,0.331190,0.291209,0.474278,0.439303,0.177874
2,1936.0,0.666667,1,0,1.000000,0.0,0,0.1,0.077284,0,15,0.408198,0.233418,0.148148,0.205575,0.083601,0.379365,0.167336,0.275132,0.197408
3,1844.0,0.666667,2,3,0.330544,0.0,0,0.1,0.143057,0,77,0.364646,0.496173,0.185185,0.229965,0.163987,0.338889,0.325469,0.145920,0.239617
4,2143.0,0.666667,2,0,0.246862,0.0,1,0.1,0.094433,1,77,0.364646,0.563776,0.074074,0.195122,0.237942,0.391026,0.475234,0.223379,0.245315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50405,1952.0,0.666667,3,0,1.000000,0.0,0,0.1,0.825364,0,15,0.353544,0.515306,0.203704,0.205575,0.273312,0.308036,0.338019,0.285714,0.115084
50406,2237.0,0.666667,3,0,1.000000,0.0,1,0.1,0.823713,0,32,0.503843,0.205357,0.203704,0.317073,0.077170,0.438988,0.134705,0.348371,0.391010
50407,1614.0,0.666667,2,1,0.246862,0.0,0,1.0,0.051426,0,32,0.335611,0.579082,0.185185,0.271777,0.266881,0.311905,0.379853,0.333055,0.181013
50408,1674.0,0.666667,2,3,0.246862,0.0,1,0.1,0.151776,1,15,0.377455,0.576531,0.166667,0.229965,0.215434,0.375850,0.378179,0.329872,0.078457


In [113]:
# Flag the rows that belong to one of the three bots.
test['dummy'] = test['nickname'].isin(['BetterBot', 'STEEBot', 'HastyBot'])

# Rating column of the non-bot rows of the test table.
y_test = test.loc[~test['dummy'], 'rating'].to_numpy()

# Bot rows joined with the per-game features, minus identifiers and the raw
# score. This mirrors how the training matrix is built.
X_test = (
    test[test['dummy']]
    .merge(games, on='game_id', how='left')
    .drop(columns=['game_id', 'nickname', 'dummy', 'score'])
)

In [121]:
# Display the assembled test feature frame.
X_test

Unnamed: 0,rating,time_control_name,game_end_reason,lexicon,initial_time_seconds,increment_seconds,rating_mode,max_overtime_minutes,game_duration_seconds,Bot_Wins,Bot_Name,Bot_Tot_Score,Player_Tot_Score,turns,Bot_Max_Score,Player_Max_Score,Bot_Mean_Score,Player_Mean_Score,racks_mean_Bot,racks_mean_Player
0,2000.0,0.666667,2,0,1.000000,0.0,1,0.1,0.080294,0,15,0.342442,0.704082,0.092593,0.243902,0.302251,0.397817,0.520176,0.438875,0.194618
1,2082.0,0.666667,2,0,0.347280,0.0,1,0.1,0.085114,0,77,0.321947,0.565051,0.111111,0.163763,0.218650,0.320578,0.443827,0.277360,0.234545
2,1829.0,0.666667,2,3,0.330544,0.0,0,0.1,0.111290,0,77,0.285226,0.670918,0.148148,0.212544,0.212219,0.265079,0.495315,0.168198,0.272756
3,2136.0,0.666667,2,0,0.246862,0.0,1,0.1,0.064300,0,77,0.344150,0.539541,0.148148,0.247387,0.315113,0.342687,0.373940,0.282134,0.146991
4,2258.0,0.666667,2,0,0.330544,0.0,1,0.1,0.044175,0,32,0.386849,0.663265,0.037037,0.233449,0.289389,0.449405,0.608495,0.299638,0.350387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22358,2030.0,0.666667,2,0,0.330544,0.0,1,0.3,0.028433,1,77,0.329633,0.711735,0.018519,0.163763,0.279743,0.382937,0.714859,0.271791,0.238152
22359,2044.0,0.666667,2,0,0.330544,0.0,1,0.1,0.057956,1,15,0.400512,0.377551,0.407407,0.219512,0.237942,0.253788,0.201441,0.268246,0.122061
22360,1710.0,0.666667,2,3,1.000000,0.0,0,0.1,0.085223,0,15,0.255337,0.549745,0.148148,0.097561,0.215434,0.237302,0.404475,0.226121,0.152758
22361,2356.0,0.666667,2,0,1.000000,0.0,1,0.1,0.484117,0,32,0.447481,0.536990,0.037037,0.233449,0.266881,0.567100,0.451807,0.608491,0.108573


In [114]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. The seed is fixed so that the split,
# and every validation score derived from it, is the same on each run.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=42)

In [115]:
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error

# (low, high) bounds for each hyper-parameter explored by the optimiser.
params_bounds = dict(
    num_leaves=(30, 40),
    lambda_l1=(0.2, 0.6),
    lambda_l2=(0.6, 1.5),
    feature_fraction=(0.2, 1.0),
    bagging_fraction=(0.6, 1.0),
    min_child_samples=(3, 10),
    min_child_weight=(10, 60),
)

# Settings shared by every training run; these are not searched over.
fixed_params = dict(
    metric='rmse',
    force_col_wise=True,
    bagging_freq=5,
    learning_rate=0.005,
)

# LightGBM datasets for the train / validation split.
dtrain = lgbm.Dataset(data=X_train, label=y_train)
dvalid = lgbm.Dataset(data=X_valid, label=y_valid)


def eval_function(num_leaves, lambda_l1, lambda_l2, feature_fraction, bagging_fraction, min_child_samples, min_child_weight):
    """Train LightGBM with the given hyper-parameters and return negative validation RMSE.

    The arguments are the continuous values proposed by the optimiser.
    ``num_leaves`` and ``min_child_samples`` are rounded to integers before
    use. ``fixed_params`` is merged on top of them.

    Returns
    -------
    float
        ``-RMSE`` on ``X_valid`` / ``y_valid``. It is negated because the
        optimiser maximises the objective.
    """
    params = {
        'num_leaves': int(round(num_leaves)),
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'feature_fraction': feature_fraction,
        'bagging_fraction': bagging_fraction,
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': min_child_weight,
        # The same Dataset is reused with varying min_child_samples, so
        # LightGBM's feature pre-filtering is switched off.
        'feature_pre_filter': False,
    }
    params.update(fixed_params)

    # Early stopping and periodic logging are passed as callbacks. The
    # `early_stopping_rounds` / `verbose_eval` keyword arguments are deprecated
    # and rejected by newer LightGBM releases.
    model = lgbm.train(
        params=params,
        train_set=dtrain,
        num_boost_round=3000,
        valid_sets=[dvalid],
        categorical_feature=cats,
        callbacks=[
            lgbm.early_stopping(stopping_rounds=150),
            lgbm.log_evaluation(period=500),
        ],
    )
    preds = model.predict(X_valid)

    # RMSE computed explicitly (y_true first) instead of through the deprecated
    # `squared=False` switch.
    score = np.sqrt(mean_squared_error(y_valid, preds))

    return -score

In [None]:
from bayes_opt import BayesianOptimization

# Search the box defined by `params_bounds`, maximising `eval_function`
# (which returns negative validation RMSE).
optimizer = BayesianOptimization(
    f=eval_function,
    pbounds=params_bounds,
)

# 3 random starting points followed by 10 guided iterations.
optimizer.maximize(
    init_points=3,
    n_iter=10,
)

In [40]:
# Parameters of the best-scoring point found by the optimiser.
max_params = optimizer.max['params']

In [43]:
# Add the non-searched settings (metric, learning rate, ...) in place.
max_params.update(fixed_params)

In [84]:
# Display the parameter dict that will be used for the final model.
max_params

{'bagging_fraction': 0.9520389426775255,
 'feature_fraction': 0.8291485880973118,
 'lambda_l1': 0.2,
 'lambda_l2': 0.8550443556016436,
 'min_child_samples': 3,
 'min_child_weight': 20.363861041909683,
 'num_leaves': 33,
 'metric': 'rmse',
 'force_col_wise': True,
 'bagging_freq': 5,
 'learning_rate': 0.005}

In [46]:
# The optimiser proposes floats; these two parameters are rounded to integers.
for int_param in ('num_leaves', 'min_child_samples'):
    max_params[int_param] = int(round(max_params[int_param]))

In [85]:
# Dataset over the full (unsplit) training data, keeping the raw data around.
# NOTE(review): the training call in view uses `dtrain`, not `drtrain`.
# Confirm which one is intended.
drtrain = lgbm.Dataset(data=X, label=y, free_raw_data=False)

In [116]:
# Final model: up to 20000 rounds, stopped once the validation RMSE has not
# improved for 150 rounds, logging every 500 rounds. Early stopping and logging
# are passed as callbacks because the `early_stopping_rounds` / `verbose_eval`
# keyword arguments are deprecated and rejected by newer LightGBM releases.
model = lgbm.train(
    params=max_params,
    train_set=dtrain,
    num_boost_round=20000,
    valid_sets=[dvalid],
    categorical_feature=cats,
    callbacks=[
        lgbm.early_stopping(stopping_rounds=150),
        lgbm.log_evaluation(period=500),
    ],
)

New categorical_feature is ['Bot_Name', 'Bot_Wins', 'game_end_reason', 'lexicon', 'rating_mode']


[LightGBM] [Info] Total Bins 2447
[LightGBM] [Info] Number of data points in the train set: 45369, number of used features: 20
[LightGBM] [Info] Start training from score 1790.714761
Training until validation scores don't improve for 150 rounds




[500]	valid_0's rmse: 120.168
[1000]	valid_0's rmse: 112.893
[1500]	valid_0's rmse: 110.997
[2000]	valid_0's rmse: 110.041
[2500]	valid_0's rmse: 109.284
[3000]	valid_0's rmse: 108.715
[3500]	valid_0's rmse: 108.333
[4000]	valid_0's rmse: 108.065
[4500]	valid_0's rmse: 107.727
[5000]	valid_0's rmse: 107.488
[5500]	valid_0's rmse: 107.261
[6000]	valid_0's rmse: 107.006
[6500]	valid_0's rmse: 106.792
[7000]	valid_0's rmse: 106.69
[7500]	valid_0's rmse: 106.627
[8000]	valid_0's rmse: 106.554
[8500]	valid_0's rmse: 106.454
[9000]	valid_0's rmse: 106.364
[9500]	valid_0's rmse: 106.296
[10000]	valid_0's rmse: 106.214
[10500]	valid_0's rmse: 106.157
Early stopping, best iteration is:
[10499]	valid_0's rmse: 106.157


In [122]:
# Predict a rating for every row of the test feature frame.
preds = model.predict(X_test)

In [123]:
# Fill in the predicted ratings and write the submission file.
# NOTE(review): assumes `preds` is in the same row order as the sample
# submission. Confirm.
submission = submission.assign(rating=preds)
submission.to_csv(path_or_buf='submission.csv', index=False)