# import

In [2]:
import warnings
warnings.simplefilter('ignore')

import os
import gc 
import sys
import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb 

from tqdm import tqdm
from collections import defaultdict
from scipy.spatial import distance
from multiprocessing import cpu_count

from sklearn.model_selection import StratifiedKFold, KFold 
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA, TruncatedSVD



In [3]:
# Fold count and random seed constants for the notebook.
NFOLDS, SEED = 6, 2019

# Do not truncate the column list when displaying wide frames.
pd.options.display.max_columns = None

# read

In [6]:
# Show the entries of ../feature in sorted order.
sorted(os.listdir('../feature'))

['ss_101.csv', 'ss_105.csv', 'ss_108.csv', 'ss_201.csv', 'ss_202.csv']

In [7]:
# Load the train and test tables from ../input.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [8]:
# Give test a placeholder `target` column (all zeros) at position 1.
# The guard keeps this cell re-runnable: DataFrame.insert raises ValueError
# when the column already exists.
if 'target' not in test.columns:
    test.insert(1, 'target', 0)

In [11]:
# Column names var_0 ... var_199.
var_cols = ['var_' + str(i) for i in range(200)]

In [12]:
# Load the ss_101 feature table from ../feature.
ss_101 = pd.read_csv('../feature/ss_101.csv')

In [None]:
# Column labels of ss_101 as a plain list (iterating a DataFrame yields its column labels).
cols = list(ss_101)

In [13]:
# Preview the first rows of the ss_101 feature table.
ss_101.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49,var_50,var_51,var_52,var_53,var_54,var_55,var_56,var_57,var_58,var_59,var_60,var_61,var_62,var_63,var_64,var_65,var_66,var_67,var_68,var_69,var_70,var_71,var_72,var_73,var_74,var_75,var_76,var_77,var_78,var_79,var_80,var_81,var_82,var_83,var_84,var_85,var_86,var_87,var_88,var_89,var_90,var_91,var_92,var_93,var_94,var_95,var_96,var_97,var_98,var_99,var_100,var_101,var_102,var_103,var_104,var_105,var_106,var_107,var_108,var_109,var_110,var_111,var_112,var_113,var_114,var_115,var_116,var_117,var_118,var_119,var_120,var_121,var_122,var_123,var_124,var_125,var_126,var_127,var_128,var_129,var_130,var_131,var_132,var_133,var_134,var_135,var_136,var_137,var_138,var_139,var_140,var_141,var_142,var_143,var_144,var_145,var_146,var_147,var_148,var_149,var_150,var_151,var_152,var_153,var_154,var_155,var_156,var_157,var_158,var_159,var_160,var_161,var_162,var_163,var_164,var_165,var_166,var_167,var_168,var_169,var_170,var_171,var_172,var_173,var_174,var_175,var_176,var_177,var_178,var_179,var_180,var_181,var_182,var_183,var_184,var_185,var_186,var_187,var_188,var_189,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,kmeans_2,kmeans_2_1_distance,kmeans_2_2_distance,kmeans_3,kmeans_3_1_distance,kmeans_3_2_distance,kmeans_3_3_distance,kmeans_4,kmeans_4_1_distance,kmeans_4_2_distance,kmeans_4_3_distance,kmeans_4_4_distance,kmeans_5,kmeans_5_1_distance,kmeans_5_2_distance,kmeans_5_3_distance,kmeans_5_4_distance,kmeans_5_5_distance,kmeans_6,kmeans_6_1_distance,kmeans_6_2_distance,kmeans_6_3_distance,kmeans_6_4_distance,kmeans_6_5_distance,kmeans_6_6_distance,kmeans_7,kmeans_7_1_distance,kmeans_7_2_distan
ce,kmeans_7_3_distance,kmeans_7_4_distance,kmeans_7_5_distance,kmeans_7_6_distance,kmeans_7_7_distance,kmeans_8,kmeans_8_1_distance,kmeans_8_2_distance,kmeans_8_3_distance,kmeans_8_4_distance,kmeans_8_5_distance,kmeans_8_6_distance,kmeans_8_7_distance,kmeans_8_8_distance,kmeans_9,kmeans_9_1_distance,kmeans_9_2_distance,kmeans_9_3_distance,kmeans_9_4_distance,kmeans_9_5_distance,kmeans_9_6_distance,kmeans_9_7_distance,kmeans_9_8_distance,kmeans_9_9_distance
0,test_0,0.12858,2.325169,0.848823,1.287888,0.217705,0.340433,0.503601,0.506387,0.555762,1.006657,-0.437413,-0.184036,-0.280522,-1.767294,0.002152,-0.178331,-0.636761,-1.991131,0.041417,0.46569,-1.63015,0.190986,-0.952999,0.084145,-1.093209,1.585839,-0.921882,-0.693643,0.227132,-0.483567,0.739136,-0.524988,0.998684,-0.462166,-0.182081,0.829683,-0.724681,-1.037938,-0.202575,1.987908,-1.073613,1.614942,0.097712,0.860384,0.748175,-0.901576,0.303024,2.011251,0.220044,0.019716,0.451018,0.122059,-0.467509,1.710803,-0.197378,0.682818,0.756926,0.177326,-1.995002,0.468909,-0.054255,0.720462,-0.813118,0.065357,0.378351,0.504334,0.162672,0.755654,-1.189402,0.482595,0.744832,0.495208,-1.375551,1.426702,0.521288,0.799789,0.157048,-1.306747,-1.117209,2.304397,0.495811,0.946973,2.124585,1.088519,0.258075,0.530753,-0.107838,1.602077,1.781303,1.180855,0.667686,0.615405,0.595435,-0.767916,-1.836058,-0.793109,-1.013328,0.928904,0.522019,1.395744,-0.281291,0.789303,1.00068,-0.413429,0.688778,-0.880724,-0.174007,-0.76696,-1.218676,-0.774156,-0.454305,-0.249821,0.697997,1.352302,-1.007196,-0.250807,0.498041,-1.209127,-0.153947,1.300144,1.4448,0.059794,-0.142733,0.382467,-1.210443,0.004281,1.184604,1.014846,-1.022837,0.295146,0.339913,0.82005,1.923244,1.869645,0.103766,0.49913,-1.201971,0.013437,-1.70024,-1.341942,-1.749104,-0.745144,1.380443,0.253768,0.632322,-0.469404,0.527775,-1.132086,0.548332,0.001039,0.796029,-0.06523,-0.035879,-0.866173,2.417335,0.544385,-1.358209,0.056555,-0.225939,0.188639,-1.680658,-1.029156,-1.076678,-1.001671,-1.052682,0.721767,-1.107793,0.49745,-0.191761,-1.226493,0.966445,0.464179,1.069802,1.529537,-0.296593,0.222204,0.278211,-0.629337,-0.533907,-0.104954,-0.343829,0.223838,-1.804728,-1.073903,1.998589,0.59433,0.098637,-1.146779,-0.534844,0.930228,-1.178346,1.455463,-2.26941,-0.219313,-1.36569,1.822189,0.379771,1.964356,-0.132886,-0.521536,0,13.890304,13.995981,0,13.883361,14.06603,13.934303,2,13.948049,13.872803,13.861605,14.21921,0,13.863517,14.086442,13.9
79325,14.025543,13.975279,2,14.109965,13.899351,13.866446,14.200058,13.962286,13.951669,2,14.111944,13.971087,13.850846,14.121081,13.946957,14.152569,13.88675,5,13.885663,13.96664,14.121457,13.990285,13.897668,13.877304,14.197634,14.183681,0,13.878474,13.915786,13.908583,14.134586,14.098575,14.145941,14.061245,14.080816,13.938438
1,test_1,-0.704096,0.711926,0.225103,-0.783376,-1.16209,0.13322,0.702294,0.608687,-1.408535,-1.29337,-0.320423,0.49047,0.455986,-1.288174,-0.903874,-0.931375,-0.904482,1.538972,1.771537,-1.545884,-0.194663,-1.720524,0.335835,-1.133109,1.490146,-0.585723,-0.204675,-1.660025,0.223308,-0.786282,-0.349167,0.505545,-0.172758,0.132187,-1.486249,-0.571935,1.455523,-1.566453,0.521101,-0.545401,2.022043,-1.03552,-0.937639,0.634426,-1.146488,-1.332342,0.774929,-0.466623,-0.730467,-0.524367,-1.524253,1.4627,0.823648,1.093469,0.902867,0.644188,1.433969,0.830117,-0.352519,0.691416,-1.616922,-0.580388,0.226701,-0.773491,-0.356579,0.561093,-1.211499,0.554782,1.556533,-1.011468,-1.018583,0.108741,-0.359906,0.255382,-0.90113,0.218259,0.110268,0.804192,0.514126,-0.951236,-0.514937,-0.297959,1.199451,1.082476,-1.444063,0.530753,0.497687,-0.307088,-0.238717,-0.315231,-0.074326,1.28229,0.623102,0.171628,-1.80859,0.68734,-0.4758,0.889405,0.347148,-0.359665,0.534897,0.417731,1.539529,0.477702,0.792715,-1.715313,-1.119705,-1.726732,-2.479537,-0.168907,-0.984213,-0.687242,0.21683,0.185656,1.049503,-0.433844,-0.226605,-0.587552,-0.176718,-0.662523,-0.617385,0.209368,0.373988,0.237611,0.497971,-1.505533,1.324802,-2.229445,-1.400224,-1.578609,-1.437738,-0.192195,1.373621,0.734935,1.216215,1.678395,-0.695122,-1.434247,0.523442,-0.131296,1.567259,1.855192,-1.717009,-1.397084,0.459014,-2.551122,-0.630054,0.132247,0.674037,-1.790647,0.783852,-0.195045,-2.027347,-0.334904,0.161047,0.604662,1.741989,1.468799,-0.804861,0.961379,0.622085,0.163813,-1.176479,0.384717,-1.428287,-0.755658,0.691352,1.692765,0.685114,-0.666111,-0.171695,-1.368758,1.16662,-0.829489,-1.363947,-0.702156,0.304487,-1.409472,-0.393967,0.936304,0.599008,1.398411,-1.234589,-0.653516,1.337473,-1.030041,0.892831,0.138607,0.136747,0.235729,1.6261,0.458849,-0.666913,1.702162,-0.769749,0.430332,-0.693922,1.047147,1.079813,-1.697342,0,14.303008,14.445357,0,14.296884,14.447116,14.434061,2,14.401691,14.378849,14.271288,14.574022,0,14.30301
8,14.41554,14.501825,14.441648,14.423774,4,14.334217,14.533153,14.329389,14.633661,14.25645,14.465219,2,14.324623,14.540309,14.303231,14.602641,14.325474,14.463159,14.479506,0,14.319173,14.33039,14.329648,14.468236,14.627126,14.48039,14.371395,14.580727,8,14.327616,14.672258,14.483171,14.302703,14.64807,14.519317,14.472483,14.425985,14.183149
2,test_2,-1.706599,-2.159083,-0.21662,0.124328,-0.5009,1.889452,-0.597301,1.085117,0.372744,0.625616,-0.924631,0.035882,-1.858752,-1.018004,1.472224,-0.882791,0.216577,1.241478,0.759965,1.046181,-0.140244,-0.594549,0.155933,-0.42968,-1.419109,-0.340547,-1.921186,0.635216,2.202925,-0.619425,-0.183512,-0.308131,1.400111,1.744109,0.970611,1.171706,0.599209,-0.390496,-0.603056,0.052185,-1.489826,-0.683068,-2.332488,1.970809,-0.099954,1.298972,-0.427553,-0.793407,-0.528418,-1.087519,-1.234836,-0.332147,0.377539,-0.470792,0.979029,2.307054,0.920545,-0.990166,-1.677574,-0.484358,-0.75378,0.541064,0.953612,0.621053,1.841471,1.620248,-0.752848,0.260261,-1.463996,2.049175,-0.439151,-0.885568,0.050888,2.043821,0.086405,-0.06015,-1.983243,0.149434,1.15814,0.898208,1.394613,-0.171725,0.566053,-0.177046,0.368513,-0.885091,-1.228774,0.58822,-0.561424,-2.492634,-1.359628,0.210043,-0.061432,-1.644824,0.818916,-0.76872,-1.0282,-0.709844,-0.420885,1.355008,0.341033,0.153996,0.662306,-0.942708,-0.348054,0.061251,-0.216273,1.112858,-1.195327,0.227714,0.83168,1.65245,-1.324664,0.173116,-2.139142,0.065696,0.464139,1.642579,0.21869,-1.305259,0.435171,0.519034,0.69375,0.143862,-0.756068,0.507553,1.455997,0.469678,1.087197,-0.422,0.532092,0.447878,-0.018987,-0.586231,0.829241,2.257892,0.667397,-1.217713,2.034344,-1.796975,-1.621395,-0.222251,-1.218287,0.21973,0.805631,-1.272924,0.523801,2.709505,1.352844,0.902132,-0.963549,1.01211,0.615543,-0.071752,0.058973,-0.024608,-0.649233,2.202462,0.662756,0.055408,1.395181,-0.510673,-1.437929,-1.269072,0.426288,-1.19507,0.437419,1.9e-05,-1.195242,1.406761,-0.005927,0.430153,-0.33521,-0.385959,1.003283,1.468532,-1.085812,1.433264,-0.982882,0.668582,-0.067102,-1.114884,-0.394665,1.053987,1.259126,0.10395,-1.268377,0.712406,-0.973258,0.44162,-0.869387,1.171094,0.035904,-0.287168,-1.597055,1.585747,-1.725967,-2.006236,1.335643,-1.908508,0,15.298543,15.310238,2,15.312185,15.392328,15.254424,0,15.239756,15.264452,15.304866,15.517937,4,15.339315,15.427082,15
.413444,15.39197,15.134957,5,15.370819,15.44903,15.345212,15.47127,15.262792,15.20512,5,15.381527,15.363002,15.340107,15.59638,15.238485,15.172554,15.410098,4,15.367018,15.281626,15.390448,15.420399,15.190149,15.336755,15.433792,15.52425,2,15.389344,15.33793,15.164895,15.386467,15.336219,15.560115,15.418313,15.408549,15.320121
3,test_3,-0.700805,0.075133,0.496203,-0.106626,-1.381252,1.046486,-0.545317,1.175727,0.928399,-0.090254,-0.067821,-0.303215,0.166644,1.071282,0.546656,0.390096,0.279294,-1.433887,-1.672522,1.130001,0.211783,-0.571983,-0.468105,2.313881,0.848944,1.65589,0.846869,-0.14058,-0.349034,0.58914,1.744738,-0.391074,1.113432,-1.054206,-1.162978,-1.239653,1.240163,-1.158196,1.385296,2.005092,0.229072,-1.40201,1.657929,-0.482454,-0.382664,-0.013882,0.846239,1.655672,1.466603,-1.376722,-0.192935,-1.464244,-0.320252,0.31657,0.034552,0.13848,-0.340449,0.75605,-0.333915,1.300383,0.742492,0.672043,-0.042069,0.598561,1.200935,0.420911,0.177812,0.158468,-1.463996,-1.629004,-1.299088,-0.936597,-0.807831,-2.231431,-0.622321,-0.768529,0.815986,0.091816,0.079605,-0.187003,1.587021,0.311448,-1.161417,1.578734,1.061939,1.006452,-0.486339,0.912017,0.837638,-0.955612,-0.382738,-0.456843,-0.666063,-0.3442,0.016569,0.44539,0.588716,1.719681,-0.478242,1.80257,0.908248,-2.193973,-2.136583,-0.273009,0.960028,-1.574075,-0.805881,0.501161,-0.14461,1.048468,1.262606,-0.257219,1.073006,1.011937,-0.971648,0.706708,-0.296224,-0.335994,1.46275,1.231946,-0.367248,-0.506369,1.411229,-0.327975,0.012383,-1.505533,0.663683,0.243213,-0.031386,1.755999,0.488851,0.564414,-2.084884,-0.064157,1.729965,-0.03397,-0.382918,-0.555738,-0.91243,0.654114,1.761067,1.528607,-0.750681,-0.229574,1.28223,-0.781084,-0.658264,0.68318,-0.829396,-2.079675,-1.261886,-0.090437,-0.591629,-0.637777,-0.389184,-1.161727,0.410036,0.237773,-0.500165,1.060696,-0.412081,-1.134687,-1.183507,-0.671786,-1.505241,1.105334,-0.872769,0.824057,0.756814,0.013965,1.632094,0.79384,-2.003936,1.980261,-0.796652,-0.352492,-0.398797,0.786323,0.943627,-0.530096,2.325226,-0.184709,0.59209,-0.806546,-0.49562,1.616362,-1.771158,-0.818301,-0.488107,0.573759,1.395615,0.53953,-0.181783,0.063876,-0.894036,2.311161,0.304547,0.347029,-0.948218,-0.088647,0,14.278438,14.399775,0,14.289305,14.352119,14.431494,2,14.335787,14.401905,14.262833,14.482489,0,14.244527,14.
367529,14.63327,14.303022,14.360124,2,14.383037,14.445004,14.287358,14.554416,14.306194,14.372009,4,14.380155,14.479225,14.285348,14.579002,14.259411,14.396066,14.416925,0,14.259273,14.359183,14.387904,14.569857,14.27085,14.492205,14.417736,14.520668,4,14.283361,14.523026,14.362924,14.396909,14.277542,14.641054,14.396783,14.54048,14.296138
4,test_4,0.339218,0.369131,1.296233,0.467585,-1.220739,-0.448505,1.672659,-1.735465,0.812288,-0.3432,0.85693,-0.004347,0.403378,0.09375,-1.247187,0.016003,0.992695,1.314886,0.352278,0.16919,-1.9883,-0.113958,0.876598,1.104211,0.817149,0.745235,2.130797,-0.742847,-0.38855,0.75217,-1.004885,0.10473,-1.640924,0.643284,0.804357,-0.162377,0.658177,0.394067,-1.097217,-0.073253,-0.93005,-0.697711,0.586628,1.486612,1.696184,1.15395,1.708251,0.972457,-1.19281,-0.411228,0.06754,-1.102589,0.475837,-0.434171,-0.037332,1.121801,-1.116228,0.207455,0.269778,-0.268877,-0.023428,0.12654,-0.449761,1.604303,-0.077441,0.14345,-1.117987,1.11668,1.144643,0.371418,1.285746,1.425731,-0.393747,2.528412,-1.224632,-1.21003,0.262679,-1.086748,0.404368,0.852354,0.80797,0.890385,-1.326333,-0.177747,0.418873,1.062718,0.173718,-1.611831,-2.061507,-0.646196,1.937948,0.210043,1.679717,-0.933718,0.175592,0.551667,0.267839,2.132054,-0.241817,0.461384,-0.843278,0.90037,0.106074,-0.707773,1.504555,0.158134,1.096609,0.077577,-0.027863,-0.54948,0.232302,0.514971,-0.370496,1.009698,0.156743,0.463421,-1.793947,-1.638523,1.147722,1.742833,1.254101,-0.045376,-0.482914,0.884415,0.815232,0.727734,0.129901,-0.896105,-0.032621,-0.936316,0.460025,-0.146413,-0.412655,0.881435,0.747437,-1.589317,0.401445,0.671475,-1.282093,0.525295,1.867152,-0.966734,1.781249,-0.946078,0.67565,-1.660203,1.024441,1.356737,1.126575,1.620849,1.311523,-0.794715,0.580995,1.899406,-0.711712,-2.27962,-0.124842,-1.661606,-0.782008,1.038895,-0.121596,0.154636,0.530675,0.422701,-1.670141,1.367669,0.731874,-0.180455,0.248453,1.455726,-0.359877,-0.31005,-0.291412,0.507794,-0.873692,0.441696,-1.226479,-0.667392,-0.907081,1.250431,0.261309,0.201951,-0.912365,0.154595,-0.939727,2.081133,-0.189024,-1.254831,-0.985958,0.902571,0.222779,0.56929,-0.4353,0.012546,0.49543,-0.10362,-1.368191,-1.758752,-0.644545,-0.566648,0,13.951828,14.011402,0,13.990273,14.01036,13.999253,3,14.061164,14.000031,14.012283,13.978344,1,14.021784,13.879293,14.119128,14.00629
,14.088497,1,14.222322,13.831246,13.984859,13.920968,14.120495,14.136126,3,14.241616,13.911256,14.025616,13.890589,14.205633,14.080122,13.943173,7,14.01731,14.037936,14.249299,13.969593,14.166945,14.094775,13.996391,13.848572,2,14.052254,14.205952,13.873094,14.253243,14.017574,13.9125,14.059155,14.027514,14.082892


# Preprocess

In [None]:
# Stack train and test row-wise into one frame, train rows first.
# Row labels are kept as-is (no ignore_index), so label values repeat across
# the two parts; use positional indexing (.iloc) when splitting again.
df = pd.concat([train, test], axis=0)

In [None]:
# Number of distinct values per column, as a frame with columns 'index'
# (the column name) and 'count'.
# nunique() returns an unnamed Series, whose value column after reset_index()
# is labelled with the integer 0; renaming the string key '0' therefore never
# matched. Naming the value column while resetting the index avoids that.
df_unique = df.nunique().reset_index(name='count')

In [None]:
# Two independent working copies of the combined frame, one per scaling scheme.
ss_df = df.copy()
mms_df = df.copy()

## MinMax

In [None]:
# Min-max scale the var_* columns of the working copy in place.
# The column list is `var_cols`; `var_columns` is not defined anywhere in the
# notebook and raised NameError.
mms = MinMaxScaler()
mms.fit(mms_df[var_cols])
mms_df[var_cols] = mms.transform(mms_df[var_cols])

## Standard

In [None]:
# Standardise the var_* columns of the working copy in place.
# The column list is `var_cols`; `var_columns` is not defined anywhere in the
# notebook and raised NameError.
ss = StandardScaler()
ss.fit(ss_df[var_cols])
ss_df[var_cols] = ss.transform(ss_df[var_cols])

## PCA

## TruncatedSVD

# prepare dataset

In [None]:
# Split the combined frame back into its train and test parts by position.
# Train rows were concatenated first, so the first len(train) rows are train
# and every row after them is test. (`train_index` / `test_index` only exist
# inside the CV functions and are undefined here.)
n_train = len(train)
train = df.iloc[:n_train]
test = df.iloc[n_train:]

y = train['target']

# Identifier and label columns are excluded from the model inputs.
not_use_cols = ['ID_code', 'target']
use_cols = [c for c in train.columns if c not in not_use_cols]

In [None]:
# Preview the first rows of the train part.
train.head()

In [None]:
# Preview the first rows of the test part.
test.head()

In [None]:
# Model inputs: the selected feature columns of the train and test parts.
X, X_test = train[use_cols], test[use_cols]

# pseudo labeling

In [None]:
def cv_lightgbm(X, y, X_test, NFOLDS=5, SEED=6):
    """Train LightGBM with stratified K-fold CV and predict the test set.

    One model is trained per fold on the training part of ``X`` and
    early-stopped on the held-out part; every fold model also predicts
    ``X_test``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features (rows are selected positionally with ``.iloc``).
    y : pandas.Series
        Binary labels aligned with ``X`` (also selected with ``.iloc``).
    X_test : pandas.DataFrame
        Test features.
    NFOLDS : int
        Number of stratified folds.
    SEED : int
        ``random_state`` of the fold splitter. The per-fold LightGBM seeds
        are derived from the fold number, not from this value.

    Returns
    -------
    oof : numpy.ndarray
        Out-of-fold prediction for every row of ``X``.
    predictions : numpy.ndarray
        Test predictions averaged over the folds.
    scores : dict
        ``{'train': [...], 'valid': [...]}`` with the AUC taken from
        ``model.best_score`` for each fold.
    feature_importance_df : pandas.DataFrame
        Columns ``fold``, ``feature``, ``importance``; one row per feature
        per fold.
    """
    # Keyword arguments forwarded to lgb.train for every fold.
    # NOTE(review): recent LightGBM releases configure early stopping and
    # logging through callbacks instead of these keywords — confirm against
    # the installed version.
    params_in_train = {
        'num_boost_round': 20000,
        'early_stopping_rounds': 200,
        'verbose_eval': 500,
    }

    skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    oof = np.zeros(len(X))
    predictions = np.zeros(len(X_test))
    scores = {'train': [], 'valid': []}
    features = X.columns
    feature_importance_df = pd.DataFrame(columns=['fold', 'feature', 'importance'])

    for fold, (train_index, valid_index) in enumerate(skf.split(X, y)):
        print('fold: {}/{}'.format(fold+1, skf.n_splits))

        params = {
            'boosting': 'gbdt',
            'metric': 'auc',
            'objective': 'binary',
            'max_depth': -1,
            'num_leaves': 2,
            'min_data_in_leaf': 64,
            'bagging_freq': 5,
            'learning_rate': 0.01,
            'feature_fraction': 0.01,
            'bagging_fraction': 0.4,
            'min_gain_to_split': 0.01,
            # A stray 'min' literal used to sit on the line above this key;
            # Python joined the adjacent strings into 'minnum_threads', so the
            # thread count was never set and an unknown parameter was passed.
            'num_threads': cpu_count(),
            'verbose': -1,
            # Each fold uses its own seed (1, 2, 4, ...).
            'seed': int(2**fold),
            'bagging_seed': int(2**fold),
            'drop_seed': int(2**fold),
        }

        dtrain = lgb.Dataset(X.iloc[train_index], label=y.iloc[train_index])
        dvalid = lgb.Dataset(X.iloc[valid_index], label=y.iloc[valid_index])

        # valid_sets order fixes the best_score keys: 'training' then 'valid_1'.
        model = lgb.train(params, dtrain, valid_sets=[dtrain, dvalid], **params_in_train)
        scores['train'].append(model.best_score['training']['auc'])
        scores['valid'].append(model.best_score['valid_1']['auc'])
        oof[valid_index] = model.predict(X.iloc[valid_index], num_iteration=model.best_iteration)

        fold_feature_importance_df = pd.DataFrame(columns=['fold', 'feature', 'importance'])
        fold_feature_importance_df['feature'] = features
        fold_feature_importance_df['importance'] = model.feature_importance()
        fold_feature_importance_df['fold'] = fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_feature_importance_df], axis=0)

        # Each fold contributes 1/NFOLDS of the final test prediction.
        predictions += model.predict(X_test, num_iteration=model.best_iteration) / NFOLDS

        del model

    return oof, predictions, scores, feature_importance_df

In [None]:
# Run the baseline CV with the notebook-level NFOLDS / SEED constants; without
# passing them the function silently falls back to its own defaults (5 folds,
# seed 6) and the configured values never take effect.
oof, predictions, scores, feature_importance_df = cv_lightgbm(X, y, X_test, NFOLDS=NFOLDS, SEED=SEED)

In [None]:
# Overall CV metric: ROC AUC of the out-of-fold predictions against the labels.
# AUC is reported as-is; taking a square root (an RMSE-style step) inflated it.
cv_score = roc_auc_score(y, oof)
# One score is stored per fold, so this is the number of folds actually run.
print('Num folds: {}'.format(len(scores['valid'])))
print('Train Scores: mean {:.5f}, max {:.5f}, min {:.5f}, std {:.5f}'.format(
    np.mean(scores['train']), np.max(scores['train']), np.min(scores['train']), np.std(scores['train'])))
print('Valid Scores: mean {:.5f}, max {:.5f}, min {:.5f}, std {:.5f}'.format(
    np.mean(scores['valid']), np.max(scores['valid']), np.min(scores['valid']), np.std(scores['valid'])))
print('CV Score: {:<8.5f}'.format(cv_score))

In [None]:
# Hard pseudo-labels for the test rows: 1 where the prediction is >= 0.5, else 0.
y_test = np.where(predictions >= 0.5, 1, 0)

In [None]:
# y_test = np.array(predictions)

In [None]:
# Display the pseudo-labels assigned to the test rows.
y_test

In [None]:
def cv_pseudo_labeling_lightgbm(X, y, X_test, y_test, NFOLDS=5, SEED=6):
    """Stratified K-fold LightGBM that also trains on the pseudo-labelled test set.

    In every fold the training part of ``X`` is stacked with all rows of
    ``X_test`` (labelled with ``y_test``); early stopping and scoring use only
    the held-out rows of ``X``.

    Returns ``(oof, predictions, scores, feature_importance_df)``:
    out-of-fold predictions for ``X``, fold-averaged predictions for
    ``X_test``, the per-fold ``binary_logloss`` values from
    ``model.best_score`` (keys ``'train'`` / ``'valid'``), and a frame with
    columns ``fold``, ``feature``, ``importance``.
    """
    # Keyword arguments passed to lgb.train in every fold.
    train_kwargs = dict(
        num_boost_round=20000,
        early_stopping_rounds=200,
        verbose_eval=500,
    )
    importance_columns = ['fold', 'feature', 'importance']

    splitter = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    features = X.columns
    scores = {'train': [], 'valid': []}
    oof = np.zeros(len(X))
    predictions = np.zeros(len(X_test))
    feature_importance_df = pd.DataFrame(columns=importance_columns)

    for fold, (train_index, valid_index) in enumerate(splitter.split(X, y)):
        fold_no = fold + 1
        print('fold: {}/{}'.format(fold_no, splitter.n_splits))

        # Every fold gets its own seed: 1, 2, 4, ...
        fold_seed = int(2**fold)
        params = {
            'boosting': 'gbdt',
            'metric': 'binary',
            'objective': 'binary',
            'max_depth': 6,
            'num_leaves': 12,
            'min_data_in_leaf': 64,
            'bagging_freq': 5,
            'learning_rate': 0.01,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.4,
            'reg_alpha': 2,
            'reg_lambda': 5,
            'min_gain_to_split': 0.01,
            'min_child_weight': 19,
            'num_threads': cpu_count(),
            'verbose': -1,
            'seed': fold_seed,
            'bagging_seed': fold_seed,
            'drop_seed': fold_seed,
        }

        # Training data = this fold's training rows + the whole pseudo-labelled test set.
        X_train = pd.concat([X.iloc[train_index], X_test], axis=0)
        y_train = np.concatenate([y.iloc[train_index], y_test], axis=0)
        X_valid = X.iloc[valid_index]

        dtrain = lgb.Dataset(X_train, label=y_train)
        dvalid = lgb.Dataset(X_valid, label=y.iloc[valid_index])

        # valid_sets order determines the best_score keys: 'training', 'valid_1'.
        model = lgb.train(params, dtrain, valid_sets=[dtrain, dvalid], **train_kwargs)
        for split_name, eval_name in (('train', 'training'), ('valid', 'valid_1')):
            scores[split_name].append(model.best_score[eval_name]['binary_logloss'])
        oof[valid_index] = model.predict(X_valid, num_iteration=model.best_iteration)

        fold_importance = pd.DataFrame(columns=importance_columns)
        fold_importance['feature'] = features
        fold_importance['importance'] = model.feature_importance()
        fold_importance['fold'] = fold_no
        feature_importance_df = pd.concat([feature_importance_df, fold_importance], axis=0)

        # Accumulate the fold's share of the averaged test prediction.
        predictions += model.predict(X_test, num_iteration=model.best_iteration) / NFOLDS

        del model

    return oof, predictions, scores, feature_importance_df

In [None]:
# Run the pseudo-labelling CV with the notebook-level NFOLDS / SEED constants;
# without passing them the function silently falls back to its own defaults
# (5 folds, seed 6) and the configured values never take effect.
oof, psudeo_predictions, scores, feature_importance_df = cv_pseudo_labeling_lightgbm(
    X, y, X_test, y_test, NFOLDS=NFOLDS, SEED=SEED)

In [None]:
# Overall CV metric: ROC AUC of the out-of-fold predictions against the labels.
# AUC is reported as-is; taking a square root (an RMSE-style step) inflated it.
cv_score = roc_auc_score(y, oof)
# One score is stored per fold, so this is the number of folds actually run.
print('Num folds: {}'.format(len(scores['valid'])))
# The per-fold scores of the pseudo-labelling run are binary log-loss values
# (lower is better), unlike the AUC-based CV score printed last.
print('Train Scores: mean {:.5f}, max {:.5f}, min {:.5f}, std {:.5f}'.format(
    np.mean(scores['train']), np.max(scores['train']), np.min(scores['train']), np.std(scores['train'])))
print('Valid Scores: mean {:.5f}, max {:.5f}, min {:.5f}, std {:.5f}'.format(
    np.mean(scores['valid']), np.max(scores['valid']), np.min(scores['valid']), np.std(scores['valid'])))
print('CV Score: {:<8.5f}'.format(cv_score))

In [None]:
# Fill the sample-submission layout with the pseudo-labelling predictions and
# save it under ../submission, prefixed with today's date (YYYYMMDD).
# NOTE(review): the sample file is read from ../data while train/test come
# from ../input — confirm the directory.
submission = pd.read_csv(
    os.path.join('..', 'data', 'sample_submission.csv')
)
submission['target'] = psudeo_predictions
submission.to_csv(
    os.path.join(
        '..',
        'submission',
        str(datetime.datetime.today().date()).replace('-', '') + '_lightgbm.csv',
    ),
    index=False,
)

In [None]:
# Bar plot of feature importances, ordered by their mean over folds (highest first).
feature_importance_df['importance'] = feature_importance_df['importance'].astype('int')
ordered_feature = (
    feature_importance_df.groupby(['feature'])['importance']
    .mean()
    .sort_values(ascending=False)
    .index
)
plt.figure(figsize=(12, 48))
sns.barplot(x='importance', y='feature', data=feature_importance_df, order=ordered_feature)
plt.title('LightGBM Features (avg over folds)')
# tight_layout only accounts for what is already drawn, so it has to run after
# the bars and their feature-name tick labels exist, not on the empty figure.
plt.tight_layout()
plt.show()

In [None]:
# Plot the distribution of the target values in the submission just written.
sns.distplot(submission['target'])
plt.show()

In [None]:
# Load a saved submission from ../submission; the file name is hard-coded
# (20190317_lightgbm.csv) and replaces the `submission` frame built above.
submission = pd.read_csv(os.path.join('..', 'submission', '20190317_lightgbm.csv'))

In [None]:
# Plot the distribution of the target values in the reloaded submission.
sns.distplot(submission['target'])
plt.show()