In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import xgboost as xgb
from utils.common import gini_normalized

In [None]:
# scikit-learn utilities: data splitters plus two classifiers.
from sklearn.model_selection import train_test_split, StratifiedKFold
# LogisticRegression is defined in sklearn.linear_model; it is not exported
# from the top-level sklearn package, so the previous import raised ImportError.
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Read the training and test tables from the local input/ directory.
train, test = (
    pd.read_csv('input/train.csv'),
    pd.read_csv('input/test.csv'),
)

In [None]:
# Raise both display limits to 200 so wide/long frames are not truncated.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 200)

# Preview the first rows of the training frame.
train.head()

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the training frame.
train.info()

In [None]:
# Names of the training columns whose label contains '_calc'.
calc_col = list(filter(lambda name: '_calc' in name, train.columns))
print(calc_col)
# Names of the training columns whose label contains '_cat'.
cat_col = list(filter(lambda name: '_cat' in name, train.columns))
print(cat_col)

In [None]:
# Remove every '_calc' column from both frames, modifying them in place.
train.drop(columns=calc_col, inplace=True)
test.drop(columns=calc_col, inplace=True)

In [None]:
# Remove the 'ps_car_03_cat' column from both frames, modifying them in place.
train.drop(columns='ps_car_03_cat', inplace=True)
test.drop(columns='ps_car_03_cat', inplace=True)

In [None]:
def fill_missing_value(df):
    """Fill missing values in each column of ``df`` with that column's mode.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns may contain missing values.

    Notes
    -----
    Columns with no missing values are left untouched. A column that is
    entirely missing has no mode and is also left untouched.
    """
    for col in df.columns:
        if not df[col].isnull().any():
            continue
        modes = df[col].mode()
        if modes.empty:
            # Every value is missing, so there is nothing to fill with.
            continue
        # Assign back instead of a chained in-place fillna: the chained form
        # operates on a temporary and may not update ``df`` itself.
        df[col] = df[col].fillna(modes.iloc[0])

## ~~~~~~~~~~ ###

# Turn the -1 sentinel into NaN in every training column that contains it,
# and apply the same replacement to the matching test column.
for c in train.columns:
    l = int((train[c] == -1).sum())
    if l == 0:
        continue

    # Report "<count> of <-1> replaced in <column>".
    print('{} of <-1> replaced in {}'.format(l, c))
    # Assign the result back: a chained in-place replace acts on a temporary
    # and may not update the frame. np.nan is used because the np.NaN alias
    # is not available in NumPy 2.x.
    train[c] = train[c].replace(-1, np.nan)
    # Columns that exist only in train have no counterpart in test.
    if c in test.columns:
        test[c] = test[c].replace(-1, np.nan)

    
   

In [None]:
# Convert low-cardinality int/float variables into categorical data.
# A column qualifies when the test frame has at most 30 distinct values in it.
# The category set is built from train and test together so that both frames
# get the same categorical dtype instead of two independently inferred ones.
for i in test.columns:
    if test[i].nunique() <= 30:
        levels = np.sort(
            pd.concat([train[i], test[i]], ignore_index=True).dropna().unique()
        )
        shared_dtype = pd.api.types.CategoricalDtype(categories=levels)
        train[i] = train[i].astype(shared_dtype)
        test[i] = test[i].astype(shared_dtype)

In [None]:
# Re-inspect the training frame's dtypes after the category conversion above.
train.info()

In [None]:
def OHE(df):
    """One-hot encode the multi-level categorical columns of ``df``.

    Only columns of ``category`` dtype with more than two distinct observed
    values are expanded. Each expanded column gets the prefix ``ohe_<name>``,
    its first level is dropped, and an extra indicator column for missing
    values is added. The number of categorical columns and the shape of the
    encoded frame are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns replaced by dummy columns.
    """
    categorical = df.select_dtypes(include=['category']).columns
    print('Categorical feature', len(categorical))

    # Columns with one or two levels are kept as they are.
    multi_level = [name for name in categorical if df[name].nunique() > 2]
    prefixes = {name: 'ohe_' + name for name in multi_level}

    encoded = pd.get_dummies(
        df,
        prefix=prefixes,
        columns=multi_level,
        drop_first=True,
        dummy_na=True,
    )
    print(encoded.shape)
    return encoded

In [None]:
# One-hot encode the training frame, then the test frame.
train2, test2 = OHE(train), OHE(test)

In [None]:

# Features and labels taken from the one-hot encoded frames.
# NOTE(review): X and y are reassigned from `train` in a later cell, and
# `x_test` (lowercase) is not referenced again in the visible notebook —
# confirm which version is meant to feed the model.
y = train2['target']
X = train2.drop(columns=['target', 'id'])
x_test = test2.drop(columns='id')


In [None]:
# Number of stratified cross-validation folds.
kfold = 5
# random_state only takes effect when shuffling is enabled; recent
# scikit-learn versions raise a ValueError if it is set while shuffle=False.
skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=11264)

In [None]:
# Parameter dictionary passed to xgb.train in the cross-validation loop.
params = {
    'min_child_weight': 10.0,
    'objective': 'binary:logistic',
    'max_depth': 4,
    'max_delta_step': 1.8,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    # NOTE(review): 'eta' and 'learning_rate' (below) appear to be aliases of
    # the same XGBoost setting but carry different values — confirm which one
    # is intended and drop the other.
    'eta': 0.025,
    'gamma': 0.65,
    # NOTE(review): 'num_boost_round' is normally an argument of xgb.train,
    # not a booster parameter; the training call passes 1600 rounds
    # explicitly — verify whether this entry has any effect.
    'num_boost_round' : 700,
    'learning_rate': 0.07
    }

In [None]:
# Label vector and feature matrix as NumPy arrays, so the fold indices
# produced by the splitter can index them positionally.
y = train['target'].values
X = train.drop(columns=['id', 'target']).values
# Test features stay a DataFrame here; they are converted with .values later.
X_test = test.drop(columns='id')
# Submission frame: test ids plus a zero-initialised prediction column.
sub = test[['id']].assign(target=0)

In [None]:
def gini_xgb(preds, dtrain):
    """Evaluation callback for XGBoost that reports the normalized Gini score.

    Parameters
    ----------
    preds : array-like
        Predictions for the rows of ``dtrain``.
    dtrain : object exposing ``get_label()``
        Matrix whose labels are compared against ``preds``.

    Returns
    -------
    list of (str, float)
        A single ``('gini', score)`` pair.
    """
    score = gini_normalized(dtrain.get_label(), preds)
    return [('gini', score)]

In [None]:
# The test matrix is the same for every fold, so build it once up front.
d_test = xgb.DMatrix(X_test.values)
# Accumulate the fold average in a fresh array; adding directly onto
# sub['target'] would stack a second set of predictions if this cell is re-run.
test_pred = np.zeros(X_test.shape[0])

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    # Convert this fold's data into XGBoost format
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # Train the model! We pass in a max of 1,600 rounds (with early stopping after 70)
    # and the custom metric (maximize=True tells xgb that higher metric is better)
    mdl = xgb.train(params, d_train, 1600, watchlist, early_stopping_rounds=70, maximize=True, verbose_eval=100, feval=gini_xgb)

    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    # Predict on our test data using the trees up to the best iteration.
    # NOTE(review): ntree_limit / best_ntree_limit are believed to be
    # deprecated in newer XGBoost releases — confirm against the installed version.
    p_test = mdl.predict(d_test, ntree_limit=mdl.best_ntree_limit)
    # Each fold contributes an equal share of the final averaged prediction.
    test_pred += p_test / kfold

# Store the fold-averaged predictions in the submission frame.
sub['target'] = test_pred


In [None]:
# sub.to_csv('StratifiedKFold.csv', index=False)

# TODO: combine _bin into single value