In [1]:
# https://www.kaggle.com/c/ncaaw-march-mania-2021

import lightgbm as lgb
import xgboost as xgb
import pandas as pd
import numpy as np
import os
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import gc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn import metrics
from datetime import datetime
from sklearn.metrics import log_loss

# Root folder of the competition workspace; the CSV inputs are read from its `data` sub-folder.
base_path = 'C:\\Users\\koki2\\Desktop\\DS\\ML_Competition\\NCAA_2022_W'

# Training pairs: the `Pred` column is discarded right after loading.
train_01 = (
    pd.read_csv(os.path.join(base_path, 'data', 'train_df_02.csv'))
    .drop(columns=['Pred'])
)

# Test pairs: both `Pred` and `target` are discarded right after loading.
test_01 = (
    pd.read_csv(os.path.join(base_path, 'data', 'test_df_staget2.csv'))
    .drop(columns=['Pred', 'target'])
)

C:\Users\koki2\anaconda3\envs\py38-gpu\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\koki2\anaconda3\envs\py38-gpu\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll


In [2]:
# Display the (rows, columns) dimensions of the test frame.
test_01.shape

(2278, 11)

In [3]:
# Display the (rows, columns) dimensions of the training frame.
train_01.shape

(1270, 12)

In [4]:
# Preview the first rows, transposed so that every column is shown as a row.
train_01.head().T

Unnamed: 0,0,1,2,3,4
Season,2016,2016,2016,2016,2016
teamId_01,3106,3106,3107,3107,3107
teamId_02,3400,3407,3119,3196,3393
TeamName_01,Alabama St,Alabama St,SUNY Albany,SUNY Albany,SUNY Albany
TeamName_02,Texas,Troy,Army,Florida,Syracuse
target,0,0,0,1,0
ConfAbbrev_01,swac,swac,aec,aec,aec
ConfAbbrev_02,big_twelve,sun_belt,patriot,sec,acc
Description_01,Southwest Athletic Conference,Southwest Athletic Conference,America East Conference,America East Conference,America East Conference
Description_02,Big 12 Conference,Sun Belt Conference,Patriot League,Southeastern Conference,Atlantic Coast Conference


In [5]:
# Categorical columns to encode. Every attribute exists once per side of the
# matchup (suffix `_01` / `_02`); team identity columns come first, followed by
# the conference / seed columns of each side.
cat_features = (
    [f'{stem}_{side}' for side in ('01', '02') for stem in ('teamId', 'TeamName')]
    + [f'{stem}_{side}' for side in ('01', '02')
       for stem in ('ConfAbbrev', 'Description', 'Seed')]
)

In [6]:
# Count Encoding
def create_features_categorical(base_df, input_df, categorical_cols):
    """Append count-encoded (`CE_<col>`) columns to a copy of `input_df`.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Frame whose value frequencies (missing values included) define the counts.
    input_df : pandas.DataFrame
        Frame that receives the new columns; it is not modified in place.
    categorical_cols : iterable of str
        Columns to encode; each must exist in both frames.

    Returns
    -------
    pandas.DataFrame
        Result of left-merging one `CE_<col>` column per entry of
        `categorical_cols` onto `input_df`. Values that never occur in
        `base_df` end up as NaN.
    """
    encoded = input_df
    for col in tqdm(categorical_cols):
        # Frequency table with two columns: the category and its count in base_df.
        frequency = (
            base_df[col]
            .value_counts(dropna=False)
            .rename_axis(col)
            .reset_index(name='CE_' + col)
        )
        encoded = encoded.merge(frequency, on=col, how='left')
    return encoded

# Count features for both frames are derived from the training frame's
# frequencies (base_df is train_01 in both calls).
train_02, test_02 = (
    create_features_categorical(base_df=train_01,
                                input_df=frame,
                                categorical_cols=cat_features)
    for frame in (train_01, test_01)
)

# Label Encoding
# Each categorical column is cast to string and mapped to integer codes stored
# in a new `LE_<col>` column. The encoder is fitted on the train and test values
# together so that every category present in either frame receives a code.
for col in tqdm(cat_features):
    # Cast first: the encoder then sees one homogeneous string column, and the
    # string-typed originals are what the later cells read.
    train_02[col] = train_02[col].astype(str)
    test_02[col] = test_02[col].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_02[col], test_02[col]], axis=0, sort=False))
    train_02['LE_' + str(col)] = le.transform(train_02[col])
    test_02['LE_' + str(col)] = le.transform(test_02[col])

100%|██████████| 10/10 [00:00<00:00, 280.08it/s]
100%|██████████| 10/10 [00:00<00:00, 303.58it/s]
100%|██████████| 10/10 [00:00<00:00, 35.43it/s]


In [7]:
# Target Encoding
# Test rows receive the per-category target mean computed on all training rows.
# Training rows receive out-of-fold means: the mean for a row is computed only
# from the folds that do not contain that row.
split_num = 5
train_01['target'] = np.where(train_01['target'] > 0.5, 1, 0)
target = train_01['target']

# The splitter is seeded with an integer, so splitting once and reusing the
# index pairs yields the same folds for every column.
kf = KFold(n_splits=split_num, shuffle=True, random_state=72)
fold_indices = list(kf.split(train_02))

for col in tqdm(cat_features):
    labelled = pd.DataFrame({col: train_02[col], 'target': target})

    # Test side: statistics from the complete training data.
    full_mean = labelled.groupby(col)['target'].mean()
    test_02['TE_' + col] = test_02[col].map(full_mean)

    # Train side: start from NaN and fill each held-out fold in turn;
    # categories absent from the fitting part stay NaN.
    oof_values = np.full(train_02.shape[0], np.nan)
    for fit_idx, holdout_idx in fold_indices:
        fold_mean = labelled.iloc[fit_idx].groupby(col)['target'].mean()
        oof_values[holdout_idx] = train_02[col].iloc[holdout_idx].map(fold_mean)

    train_02['TE_' + col] = oof_values

100%|██████████| 10/10 [00:00<00:00, 119.33it/s]


In [9]:
# Select Features to use
# The model is trained on the engineered columns only: count-encoded (CE_),
# label-encoded (LE_) and target-encoded (TE_), concatenated in that order.
base_cols_01 = []
col_names_CE = train_02.columns[train_02.columns.str.startswith('CE_')]
col_names_LE = train_02.columns[train_02.columns.str.startswith('LE_')]
col_names_TE = train_02.columns[train_02.columns.str.startswith('TE_')]
features = [*col_names_CE, *col_names_LE, *col_names_TE]

# Binarise the label at 0.5 and keep it as the training target.
train_01['target'] = np.where(train_01['target'] > 0.5, 1, 0)
target = train_01['target']
len(features)
# 

30

In [10]:
# Display the dimensions of the encoded training and test frames side by side.
train_02.shape, test_02.shape

((1270, 42), (2278, 41))

In [11]:
# Modelling aliases: these names refer to the same objects as train_02 / test_02
# (no copy is made).
train_df, test_df = train_02, test_02

In [12]:
%%time
# Cross-validated LightGBM training.
# For every fold a binary classifier is fitted on the training part, scored
# with log-loss on the held-out part, and its test-set predictions are added
# to `y_preds_lgb` with weight 1 / fold_num (i.e. averaged over folds).
n_round = 500 #5000
fold_num = 2

# Early stopping is configured through the `early_stopping_rounds` argument of
# `lgb.train` below, so it is intentionally not part of this dict.
params_lgb = {'num_leaves': 20,
              'min_data_in_leaf': 10,
              'objective': 'binary',
              'max_depth': -1,
              'learning_rate': 0.01,
              "boosting_type": "gbdt",
              "bagging_seed": 11,
              "metric": 'binary_logloss',
              'random_state': 42,
              }

scores=[]
y_preds_lgb = np.zeros(test_df.shape[0])

# One row per feature; one `fold_<k>` importance column is added per fold.
feature_importances = pd.DataFrame()
feature_importances['feature'] = train_df[features].columns

gc.collect()
kf = KFold(fold_num, shuffle=True, random_state=71)
for fold_n, (tr_idx, va_idx) in enumerate(kf.split(train_df)):
    X_train, X_valid = train_df[features].iloc[tr_idx], train_df[features].iloc[va_idx]
    y_train, y_valid = target.iloc[tr_idx], target.iloc[va_idx]

    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)
    # Both sets are passed as validation sets so that the training and the
    # held-out log-loss are reported every 50 rounds.
    model = lgb.train(params_lgb, dtrain, n_round, 
                      verbose_eval = 50,
                      early_stopping_rounds =50,
                      valid_sets = [dtrain, dvalid])
    gc.collect()
    feature_importances[f'fold_{fold_n + 1}'] = model.feature_importance()
    y_pred_valid = model.predict(X_valid)
    y_preds_lgb += model.predict(test_df[features])/fold_num
    score = log_loss(y_valid.values, y_pred_valid)
    scores.append(score)
    print(score)

# Mean held-out log-loss over all folds.
print('------------------')
print(np.mean(scores))

Training until validation scores don't improve for 50 rounds
[50]	training's binary_logloss: 0.522609	valid_1's binary_logloss: 0.606013
[100]	training's binary_logloss: 0.428199	valid_1's binary_logloss: 0.569398
[150]	training's binary_logloss: 0.364179	valid_1's binary_logloss: 0.554701
[200]	training's binary_logloss: 0.313807	valid_1's binary_logloss: 0.550602
[250]	training's binary_logloss: 0.274212	valid_1's binary_logloss: 0.548422
Early stopping, best iteration is:
[249]	training's binary_logloss: 0.274921	valid_1's binary_logloss: 0.548291
0.5482908050532543
Training until validation scores don't improve for 50 rounds
[50]	training's binary_logloss: 0.524732	valid_1's binary_logloss: 0.601882
[100]	training's binary_logloss: 0.432163	valid_1's binary_logloss: 0.570698
[150]	training's binary_logloss: 0.368229	valid_1's binary_logloss: 0.559265
[200]	training's binary_logloss: 0.31987	valid_1's binary_logloss: 0.562307
Early stopping, best iteration is:
[162]	training's binar

In [13]:
# -- this year --
# 0.553581227126699
# NOTE(review): presumably the mean cross-validation log-loss printed by the
# training cell for this year's run — confirm.

# Number of feature columns passed to the model.
len(features)

30

In [16]:
# Build a timestamped relative path for the submission file: the fractional
# separator is removed from the timestamp first, then '-', ':' and ' ' are
# stripped from the whole relative path.
now = datetime.now()
now_time = ''.join(str(now).split('.'))
sub_file_path = os.path.join('data', 'submission', f'submission{now_time}.csv')
PATH = sub_file_path.translate(str.maketrans('', '', '-: '))

# Predictions are written positionally into the sample submission.
# NOTE(review): assumes the sample file's rows are in the same order as
# test_df — confirm.
sample_submission = pd.read_csv(
    os.path.join(base_path, 'WDataFiles_Stage2', 'WSampleSubmissionStage2.csv')
)
sample_submission['Pred'] = y_preds_lgb
sample_submission.to_csv(os.path.join(base_path, PATH), index=False)
sample_submission.head()

Unnamed: 0,ID,Pred
0,2022_3107_3110,0.639602
1,2022_3107_3112,0.10802
2,2022_3107_3116,0.10495
3,2022_3107_3124,0.1453
4,2022_3107_3125,0.273923
