In [1]:
import pandas as pd 
import numpy as np
from tqdm import *
import math 
import matplotlib.pyplot as plt
import pickle
import operator
%matplotlib inline
import os
import sys
from datetime import *
import lightgbm as lgb
import xgboost as xgb

import time
from sklearn.metrics import roc_auc_score,f1_score
from sklearn.model_selection import StratifiedKFold,KFold
pd.set_option('display.max_columns', 500)

In [2]:
def save_variable(v,filename):
    """Pickle ``v`` to ``filename`` and return the path that was written.

    Parameters
    ----------
    v : object
        Any picklable Python object.
    filename : str
        Destination path; an existing file is overwritten.

    Returns
    -------
    str
        ``filename``, unchanged, so the call can be chained or logged.
    """
    # The context manager closes the handle even when pickle.dump raises
    # (e.g. on an unpicklable object), which the manual open/close did not.
    with open(filename, 'wb') as f:
        pickle.dump(v, f)
    return filename

def load_variavle(filename):
    """Load and return the object pickled in ``filename``.

    The (misspelled) function name is kept as-is because other cells call it.

    Parameters
    ----------
    filename : str
        Path of a file previously written with ``pickle.dump``.

    Returns
    -------
    object
        The unpickled object.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    # NOTE: unpickling executes arbitrary code embedded in the file; only use
    # this on cache files produced by this notebook.
    # The context manager closes the handle even when pickle.load raises.
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [3]:
def auc(y,pred):
    """Return the ROC AUC of scores ``pred`` against true labels ``y``."""
    score = roc_auc_score(y_true=y, y_score=pred)
    return score

def f1(y,pred):
    """Return the macro-averaged F1 score of predictions ``pred`` against ``y``."""
    score = f1_score(y_true=y, y_pred=pred, average='macro')
    return score

In [4]:
def create_feature(df):
    """Collapse one DataFrame into a flat list of summary statistics.

    The result starts with the row count and the de-duplicated row count.
    After that, in column order:

    * for the column named '设备类型' only its maximum is emitted;
    * for every other column: number of unique values, max, min, sum,
      mean and standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise.

    Returns
    -------
    tuple(list, list)
        ``(values, names)`` where ``names[k]`` labels ``values[k]``.
    """
    stat_suffixes = ('_unique_len', '_max', '_min', '_sum', '_mean', '_std')

    values = [len(df), len(df.drop_duplicates())]
    names = ['data_len', 'data_drop_dup_len']

    for column in df.columns:
        series = df[column]

        # The device-type column contributes a single value, not statistics.
        if column == '设备类型':
            values.append(series.max())
            names.append(column + '_')
            continue

        values.extend([
            len(series.unique()),
            series.max(),
            series.min(),
            series.sum(),
            series.mean(),
            series.std(),
        ])
        names.extend(column + suffix for suffix in stat_suffixes)

    return values, names

In [5]:
# Names of the entries in the train / test data directories.
# NOTE(review): os.listdir returns entries in arbitrary order, and later cells
# pair these lists positionally with feature rows (including rows loaded from
# the pickle cache) — confirm the listing order is stable between runs.
train_uid = os.listdir('../data/data_train/')
test_uid = os.listdir('../data/data_test/')

In [6]:
def _build_features(data_dir, uids, shebei):
    """Read each CSV named in ``uids`` from ``data_dir`` and stack one feature row per file."""
    rows = []
    # Stays None when ``uids`` is empty, so an empty frame is returned instead
    # of hitting an unbound ``col``.
    col = None
    for uid in tqdm(uids):
        df = pd.read_csv(os.path.join(data_dir, uid))
        # Encode the device-type string as an integer; unknown codes become NaN.
        df['设备类型'] = df['设备类型'].map(shebei)
        fe, col = create_feature(df)
        rows.append(fe)
    return pd.DataFrame(rows, columns=col)


def get_data():
    """Return ``(train_all_fe, test_all_fe)``, the per-file feature tables.

    Both tables are loaded from the pickle caches under ``../data/`` when
    possible. If either cache cannot be loaded, both tables are rebuilt from
    the raw CSV files listed in the module-level ``train_uid`` / ``test_uid``
    and the caches are rewritten.

    Returns
    -------
    tuple(pandas.DataFrame, pandas.DataFrame)
        Train and test feature frames, one row per input file, in the order of
        ``train_uid`` / ``test_uid`` at the time the features were built.
    """
    shebei = {'ZV41153':0, 'ZV55eec':1, 'ZV75a42':2,
              'ZV7e8e3':3, 'ZV90b78':4, 'ZVc1d93':5, 'ZVe0672':6}
    try:
        train_all_fe = load_variavle('../data/train_fe_v3.pkl')
        test_all_fe = load_variavle('../data/test_fe_v3.pkl')
    except Exception:
        # Any cache failure (missing file, corrupt or incompatible pickle)
        # falls back to a rebuild. ``Exception`` rather than a bare ``except``
        # so Ctrl-C / SystemExit are not swallowed into a rebuild.
        train_all_fe = _build_features('../data/data_train/', train_uid, shebei)
        save_variable(train_all_fe, '../data/train_fe_v3.pkl')
        test_all_fe = _build_features('../data/data_test/', test_uid, shebei)
        save_variable(test_all_fe, '../data/test_fe_v3.pkl')
    return train_all_fe,test_all_fe

In [7]:
# Load the per-file feature tables from the pickle cache, or rebuild them from
# the raw CSVs when the cache cannot be read (see get_data).
train_all_fe,test_all_fe = get_data()

100%|██████████| 69900/69900 [10:22<00:00, 112.33it/s]
100%|██████████| 69973/69973 [10:23<00:00, 112.22it/s]


In [9]:
# Read the training labels and rename the two columns to uid / label.
label = pd.read_csv('../data/train_labels.csv')
label.columns = ['uid','label']

# Tag each feature row with its source file name. This is a positional
# assignment.
# NOTE(review): assumes train_all_fe rows are in the same order as train_uid —
# verify when the features came from the pickle cache.
train_all_fe['uid'] = train_uid
# Left join keeps every feature row; a uid missing from the label file yields a NaN label.
train = train_all_fe.merge(label,on = ['uid'],how='left')

In [10]:
# Model inputs: every column except the identifier and the target.
X_train = train.drop(['uid','label'],axis=1)
# Target vector.
y_train = train['label']
# Test features are used as returned by get_data.
X_test = test_all_fe

In [11]:
# Number of cross-validation folds; also the divisor used when averaging test predictions.
K = 5
# Stratified, shuffled splitter with a fixed seed so the fold assignment is repeatable.
skf = StratifiedKFold(n_splits = K, shuffle = True ,random_state=267)

In [13]:
# Booster settings are the same for every fold, so they are built once.
# The original dict carried a "thread" key, which is not an XGBoost parameter
# (the real key is "nthread") and therefore had no effect. It is dropped here:
# per the XGBoost docs, leaving nthread unset uses the maximum number of
# available threads.
xgb_params = {"objective": 'binary:logistic',
              "booster": "gbtree",
              "eta": 0.05,
              "subsample": 0.85,
              'eval_metric': 'auc',
              "colsample_bytree": 0.86,
              'gpu_id': 0,
              "seed": 666
              }

# The test matrix does not depend on the fold, so it is built once as well.
xgb_te = xgb.DMatrix(X_test)

# Running mean of the K fold models' test predictions.
xgb_pred_te_all = 0
for i, (train_index, test_index) in enumerate(skf.split(X_train,y_train)):

    y_tr, y_val = y_train.iloc[train_index].copy(), y_train.iloc[test_index].copy()
    X_tr, X_val = X_train.iloc[train_index,:].copy(), X_train.iloc[test_index,:].copy()
    print( "\nFold ", i)

    xgb_tr = xgb.DMatrix(X_tr, y_tr)
    xgb_val = xgb.DMatrix(X_val, y_val)

    # Negative/positive ratio in this fold's training part (class-balance check).
    print(np.sum(y_tr==0)/np.sum(y_tr==1))
    watchlist = [(xgb_tr, 'train'), (xgb_val, 'eval')]
    # NOTE(review): num_boost_round=1 trains a single tree, so the 200-round
    # early-stopping window can never trigger — confirm this is intentional
    # and not a leftover from a quick smoke run.
    xgb_model = xgb.train(xgb_params,
                          xgb_tr,
                          num_boost_round = 1,
                          evals = watchlist,
                          verbose_eval = 200,
                          early_stopping_rounds = 200)

    # NOTE(review): ntree_limit / best_ntree_limit are deprecated in newer
    # XGBoost releases in favour of iteration_range — revisit when upgrading.
    pred_te = xgb_model.predict(xgb_te,ntree_limit=xgb_model.best_ntree_limit)
    xgb_pred_te_all = xgb_pred_te_all + pred_te / K


Fold  0
1.0379387003899558
[0]	train-auc:0.643731	eval-auc:0.629722
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.

Fold  1
1.0379751448667955
[0]	train-auc:0.643726	eval-auc:0.631077
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.

Fold  2
1.0379751448667955
[0]	train-auc:0.638461	eval-auc:0.635575
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.

Fold  3
1.0379751448667955
[0]	train-auc:0.640716	eval-auc:0.638054
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.

Fold  4
1.0379373177842566
[0]	train-auc:0.640286	eval-auc:0.63377
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Wil

In [14]:
# Pair each test file name with its fold-averaged prediction.
# NOTE(review): the pairing is positional — assumes xgb_pred_te_all is in test_uid order.
sub = pd.DataFrame({'ID':test_uid})
sub['Label'] = xgb_pred_te_all
# Rank by descending score and renumber so the index equals the rank (0 = highest score).
sub = sub.sort_values('Label',ascending=False).reset_index(drop = True)

In [15]:
# Binarise by index: rows with index below 34001 become 1, all others 0.
# After the preceding sort/reset_index the index is the score rank, so these
# are the highest-scored rows.
# NOTE(review): 34001 is an unexplained cut-off — presumably an expected count
# of positives; confirm and consider naming it.
sub['Label'] = (sub.index < 34001).astype(int)

In [16]:
# Create the output directory first: to_csv does not create missing parent
# directories and would fail on a fresh checkout.
os.makedirs('../submit', exist_ok=True)
# Write the submission file without the index column.
sub.to_csv('../submit/DCIC_sub.csv', index = False)