In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('../data/train_featureV2.csv')
test = pd.read_csv('../data/test_featureV2.csv')

In [3]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [4]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 64,
    'learning_rate': 0.08,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
}    

In [5]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True
    

    

### 本地CV

In [6]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

  'precision', 'predicted', average, warn_for)


[5]	cv_agg's res: 0.533356 + 0.00380381
[10]	cv_agg's res: 0.658673 + 0.00772869
[15]	cv_agg's res: 0.699018 + 0.00493054
[20]	cv_agg's res: 0.72848 + 0.00686544
[25]	cv_agg's res: 0.738663 + 0.0072745
[30]	cv_agg's res: 0.744802 + 0.0107534
[35]	cv_agg's res: 0.751126 + 0.0141155
[40]	cv_agg's res: 0.756827 + 0.0103796
[45]	cv_agg's res: 0.760631 + 0.0129415
[50]	cv_agg's res: 0.763868 + 0.0108678
[55]	cv_agg's res: 0.759019 + 0.0101366
[60]	cv_agg's res: 0.761163 + 0.00869351
[65]	cv_agg's res: 0.765461 + 0.00990956
[70]	cv_agg's res: 0.764619 + 0.0129452
[75]	cv_agg's res: 0.765903 + 0.012782
[80]	cv_agg's res: 0.766888 + 0.010533
[85]	cv_agg's res: 0.767988 + 0.0094948
[90]	cv_agg's res: 0.766355 + 0.0106398
[95]	cv_agg's res: 0.76567 + 0.00902801
[100]	cv_agg's res: 0.764527 + 0.00916072
[105]	cv_agg's res: 0.765081 + 0.00644796
[110]	cv_agg's res: 0.765972 + 0.00664617
[115]	cv_agg's res: 0.763173 + 0.00823929
[120]	cv_agg's res: 0.765192 + 0.00844099
[125]	cv_agg's res: 0.76695 

{'res-mean': [0.48399853872729676,
  0.5124584244174276,
  0.5248909614945895,
  0.5313633768930407,
  0.5333562099448658,
  0.5557240287245855,
  0.5933639558651062,
  0.6244358532781856,
  0.6401429144509222,
  0.6586732475676963,
  0.6691180439715999,
  0.6842804570769978,
  0.6946677578862825,
  0.6976433249427361,
  0.6990178404107121,
  0.705384380969635,
  0.7132776737867159,
  0.71718199639606,
  0.7237009645836238,
  0.7284795918009852,
  0.7340184400290702,
  0.7353213505763252,
  0.7358860685661744,
  0.7425416419600563,
  0.7386626799966794,
  0.7426287974992697,
  0.7423368086356223,
  0.7430453684880872,
  0.7438677037964759,
  0.7448020229739217,
  0.7476336040370696,
  0.7485587990832697,
  0.7488615292875466,
  0.7505283527960485,
  0.7511261341639122,
  0.7522797985739219,
  0.7529106266956557,
  0.7537703955529778,
  0.7558840152839453,
  0.7568267056011427,
  0.7593518743672308,
  0.7591908544038386,
  0.7593125090364296,
  0.7596933003573095,
  0.760630678176888,
 

## 训练

In [7]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.579071
[10]	training's res: 0.806963
[15]	training's res: 0.895965
[20]	training's res: 0.931418
[25]	training's res: 0.951106
[30]	training's res: 0.962383
[35]	training's res: 0.973728
[40]	training's res: 0.982108
[45]	training's res: 0.988122
[50]	training's res: 0.992647
[55]	training's res: 0.996573
[60]	training's res: 0.998194
[65]	training's res: 0.998882
[70]	training's res: 0.999555
[75]	training's res: 0.999778
[80]	training's res: 1
[85]	training's res: 1
[90]	training's res: 1
[95]	training's res: 1
[100]	training's res: 1
[105]	training's res: 1
[110]	training's res: 1
[115]	training's res: 1
[120]	training's res: 1
[125]	training's res: 1
[130]	training's res: 1
[135]	training's res: 1
[140]	training's res: 1
[145]	training's res: 1
[150]	training's res: 1
[155]	training's res: 1
[160]	training's res: 1
[165]	training's res: 1
[170]	training's res: 1
[175]	training's res: 1
[180]	training's res: 1
[185]	training's res: 1
[190]	training's res: 1
[19

### 预测

In [8]:
pred=model.predict(test.drop(['uid'],axis=1))

In [9]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [10]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [11]:
res.to_csv('../result/testB.csv',index=False,header=False,sep=',',columns=['uid','label'])