In [1]:
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier

#import lightgbm as lgb
#import xgboost as xgb
#import catboost as ctb

#import matplotlib.pyplot as plt
#import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters

In [2]:
target = 'claim'

DEBUG = False

if DEBUG:
    N_ESTIMATORS = 1
    N_SPLITS = 2
    SEED = 2017
    CVSEED = 2017
    EARLY_STOPPING_ROUNDS = 1
    VERBOSE = 100
    #N_ITERS = 2
else:
    N_SPLITS = 10
    N_ESTIMATORS = 20000
    EARLY_STOPPING_ROUNDS = 300
    VERBOSE = 1000
    SEED = 2017
    CVSEED = 2017
    #N_ITERS = 10

In [3]:
def set_seed(seed=2017):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
set_seed(SEED)

# Datasets

In [4]:
INPUT = Path("../input/tabular-playground-series-sep-2021")

train = pd.read_csv(INPUT / "train.csv")
#test = pd.read_csv(INPUT / "test.csv")
#submission = pd.read_csv(INPUT / "sample_solution.csv")

# Preprocessing

In [5]:
features = [col for col in train.columns if 'f' in col]

In [6]:
train['n_missing'] = train[features].isna().sum(axis=1)
#test['n_missing'] = test[features].isna().sum(axis=1)

features += ['n_missing']

In [7]:
train[features] = train[features].fillna(train[features].median())
#test[features] = test[features].fillna(test[features].median())

In [8]:
ss = StandardScaler()


In [9]:
train[features]

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,n_missing
0,0.108590,0.004314,-37.566,0.017364,0.289150,-10.25100,135.12,168900.0,3.992400e+14,86.489,...,-12.2280,1.7482,1.90960,-7.11570,4378.80,1.2096,8.613400e+14,140.10,1.01770,1
1,0.100900,0.299610,11822.000,0.276500,0.459700,-0.83733,1721.90,119810.0,3.874100e+15,9953.600,...,-56.7580,4.1684,0.34808,4.14200,913.23,1.2464,7.575100e+15,1861.00,0.28359,0
2,0.178030,-0.006980,907.270,0.272140,0.459480,0.17327,2298.00,360650.0,1.224500e+13,15827.000,...,-5.7688,1.2042,0.26290,8.13120,45119.00,1.1764,3.218100e+14,3838.20,0.40690,5
3,0.152360,0.007259,780.100,0.025179,0.519470,7.49140,112.51,259490.0,7.781400e+13,-36.837,...,-34.8580,2.0694,0.79631,-16.33600,4952.40,1.1784,4.533000e+12,4889.10,0.51486,2
4,0.116230,0.502900,-109.150,0.297910,0.344900,-0.40932,2538.90,65332.0,1.907200e+15,144.120,...,-13.6410,1.5298,1.14640,-0.43124,3856.50,1.4830,-8.991300e+12,3228.00,0.23049,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
957914,0.098979,0.202530,2192.100,0.348010,0.276430,-0.84065,2214.20,26246.0,1.061600e+15,10882.000,...,-51.2970,1.7323,20.76300,14.74400,27415.00,1.2160,2.139400e+13,2612.00,0.42597,0
957915,0.134930,0.413600,1050.100,0.000521,-0.001246,1.76990,3519.90,862290.0,7.321400e+13,2869.000,...,-41.4180,2.6986,-1.09760,-1.09980,13826.00,1.3320,1.829400e+17,3300.50,0.92974,4
957916,0.104660,0.473510,2415.400,0.252230,0.196970,-0.82323,3044.90,13832.0,2.693300e+12,2294.700,...,-15.9670,1.6843,1.07590,16.64600,8655.10,1.1113,1.925800e+16,297.61,0.91446,0
957917,0.118190,0.320870,657.860,0.237290,0.260250,-0.71237,173.20,20811.0,9.711000e+14,14315.000,...,-2.8375,1.3006,132.50000,8.53050,3842.10,1.1941,1.248300e+16,5212.10,0.20942,1


In [10]:
train[target]

0         1
1         0
2         1
3         1
4         1
         ..
957914    0
957915    1
957916    0
957917    1
957918    0
Name: claim, Length: 957919, dtype: int64

# Optuna

In [11]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle



In [12]:
def objective(trial, X=train[features], y=train[target]):
 
  param_space = {   'n_jobs' : -1,
                 
                    'n_neighbors':trial.suggest_int('n_neighbors', 5, 500),
                    'weights':trial.suggest_categorical('weights', ['uniform', 'distance']),
                  'algorithm':trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
                }
            

  seed_list=[SEED]
  kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)
  #ctb_oof = np.zeros(train.shape[0])
  #ctb_pred = np.zeros(test.shape[0])
  
  #X_tr, X_va, y_tr, y_va = train_test_split(X,y,test_size=.2,random_state=CVSEED)
  
  for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
      print(f"===== fold {fold} =====")

      X_train = train[features].iloc[trn_idx]
      y_train = train[target].iloc[trn_idx]
      X_valid = train[features].iloc[val_idx]
      y_valid = train[target].iloc[val_idx]
      #X_test = test[features]

      X_train[features] = ss.fit_transform(X_train[features])
      X_valid[features] = ss.transform(X_valid[features])
      #X_test[features] = ss.transform(X_test[features])

      pred=np.zeros(y_valid.shape[0])

      start = time.time()
      for inseed in seed_list:
            #param_space['random_state'] = inseed

            model = KNeighborsClassifier(**param_space)
            model.fit(
                X_train, 
                y_train,
                
            )

            #ctb_oof[val_idx] = model.predict_proba(X_valid)
            pred += model.predict_proba(X_valid)[:,-1] / len(seed_list)

      elapsed = time.time() - start
      auc = roc_auc_score(y_valid, pred)

      print(f"fold {fold} - ctb rmse: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")            

      #print(f"oof ctb_rmse = {mean_squared_error(train[target], ctb_oof, squared=False)}")
  
  return auc

In [13]:
study = optuna.create_study(direction='maximize')
study.optimize(objective,n_trials= 30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-09-14 15:01:01,264][0m A new study created in memory with name: no-name-a32afa10-78ff-48d8-adaf-990c8dce5bd6[0m


===== fold 0 =====


fold 0 - ctb rmse: 0.768225, elapsed time: 11844.06sec

===== fold 1 =====


fold 1 - ctb rmse: 0.766618, elapsed time: 14887.28sec

===== fold 2 =====


In [None]:
study.best_params

# Log

