In [31]:
import os
import time

import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series

In [32]:
# Load the three competition CSV files (comma separated, header on the first row).
csv_paths = ("data/train_xy.csv", "data/train_x.csv", "data/test_all.csv")
train_xy, train_x, test_all = [pd.read_csv(path, header=0, sep=",") for path in csv_paths]

# Show the (rows, columns) of each frame as a quick sanity check.
for frame in (train_xy, train_x, test_all):
    print(frame.shape)

(15000, 160)
(10000, 159)
(10000, 159)


In [45]:
# Stack the labelled training rows and the test rows into one frame so that
# feature engineering is applied to both consistently.
train = train_xy.copy()
test = test_all.copy()
test['y'] = -1  # sentinel label: rows with y == -1 are test rows and are split off again later

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it the
# test rows reuse the training rows' labels, and any later label-based alignment
# (assignment, join, concat on axis=1 with a differently ordered frame) would
# silently mis-pair rows.
data = pd.concat([train, test], axis=0, ignore_index=True)
print(train.shape)
print(test.shape)
print(data.shape)

(15000, 160)
(10000, 160)
(25000, 160)


In [46]:
# Partition the feature names x_1 .. x_157 by type:
# x_1 .. x_95 are treated as numerical, x_96 .. x_157 as categorical.
all_feature_names = ["x_" + str(number) for number in range(1, 158)]
numerical_features = all_feature_names[:95]
categorical_features = all_feature_names[95:]
print("有用的数值型特征：",len(numerical_features))
print("有用的类别型特征：",len(categorical_features))

有用的数值型特征： 95
有用的类别型特征： 62


In [47]:
def get_nan_count(data):
    """Append one-hot flags for each row's binned missing-value count.

    The value -99 is treated as the missing-value marker. For every row the
    number of missing entries is counted, the counts are cut into 7
    equal-width intervals with ``pd.cut``, and the interval membership is
    one-hot encoded into columns prefixed with ``nan_``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``data`` (still containing its original -99 markers) with the 7
        indicator columns appended on the right.
    """
    # Only the temporary frame has -99 replaced; the returned data keeps it.
    masked = data.replace(-99, np.nan)
    nan_count = masked.isna().sum(axis=1)  # missing entries per row

    # dtype is pinned so the flags stay numeric on every pandas version: newer
    # releases default to bool, and bool columns mixed with floats turn
    # DataFrame.values into an object array.
    dummy = pd.get_dummies(pd.cut(nan_count, 7), prefix='nan', dtype=np.uint8)
    print(dummy.shape)

    res = pd.concat([data, dummy], axis=1)  # attach the flags to the original data
    print(res.shape)
    return res
# Rebinds `data` to the augmented frame. Not idempotent: running this cell a
# second time would append another set of nan_* bucket columns.
data = get_nan_count(data)

(25000, 7)
(25000, 167)


In [48]:
# The 24 features listed by the author as most important, in the given order.
imp_feat = [
    'x_80', 'x_2', 'x_81', 'x_95', 'x_1', 'x_52',
    'x_63', 'x_54', 'x_43', 'x_40', 'x_93', 'x_42',
    'x_157', 'x_62', 'x_29', 'x_61', 'x_55', 'x_79',
    'x_59', 'x_69', 'x_48', 'x_56', 'x_7', 'x_64',
]
print("重要的特征个数：",len(imp_feat))

# Impute missing values (-99) for the first ten features only, not all 24.
for feat in imp_feat[:10]:
    if feat in numerical_features:
        # Numerical: fill with the mean of the observed (non-missing) values.
        observed = data[feat].replace(-99, np.nan)
        data[feat] = observed.fillna(observed.mean())
    if feat in categorical_features:
        # Categorical: left untouched (median / mode were the alternatives considered).
        print("这是类别特征：",feat)

重要的特征个数： 24


In [37]:
# Disabled experiment, kept inside a bare string so it never executes: one-hot
# encoding of the categorical features (the author's note says it had no effect).
# The trailing `pass` stops the notebook from echoing the string as cell output.
'''
# 对类别型的特征，进行one-hot --------------- 没效果
def set_one_hot(data,categorical_feature):
    rest_feat = list(set(data.columns.values.tolist()) - set(categorical_feature))
    df = data[rest_feat].copy()
    dummies = [df]
    for feat in categorical_feature:
        dummy = pd.get_dummies(data[feat], prefix = feat)
        dummies.append(dummy)
    res = pd.concat(dummies,axis = 1) # 横向合并
    print("data shape:",res.shape)
    return res

da = set_one_hot(data,['x_157','x_140']) # 测试一下167-2+7+3
data = set_one_hot(data,categorical_features)
'''
pass

In [38]:
# Disabled experiment, kept inside a bare string so it never executes: replace
# each numerical feature (except the ten mean-imputed ones) with its rank
# divided by the row count. The author's note says skipping it makes no
# difference because ranking is monotonic.
'''
# 对数值型的特征，处理为rank特征（鲁棒性好一点）----其实不处理不影响，因为排序不影响大小关系，是单调的
for feat in numerical_features:
    if feat not in imp_feat[:10]: #对填充均值的不rank
        data[feat] = data[feat].rank() / float(data.shape[0]) # 排序，并且进行归一化
'''
pass

In [24]:
# Disabled step, kept inside a bare string so it never executes: read
# pre-built features from feature/most_feature30.csv and merge their first
# 11 columns into `data` on cust_id.
'''
# 读取构造的特征
most_feature = pd.read_csv("feature/most_feature30.csv",header=0,sep=",")
print(most_feature.shape)

# 与data合并
data = pd.merge(data,most_feature.iloc[:,:11],on='cust_id')
print(data.shape)
'''
pass

In [49]:
# Undo the earlier stacking: rows carrying the y == -1 sentinel are the test set,
# everything else is the training set.
is_test_row = data['y'] == -1
train = data.loc[~is_test_row, :]
test = data.loc[is_test_row, :]
for subset in (train, test):
    print(subset.shape)

(15000, 167)
(10000, 167)


In [50]:
# Model inputs are all columns except the customer id, customer group and label.
no_features = ['cust_id','cust_group','y']
excluded_columns = set(no_features)
features = [column for column in train.columns.values if column not in excluded_columns]
print("所有特征的维度：",len(features))

所有特征的维度： 164


In [51]:
# Convert the frames into plain arrays: feature matrix X and label vector y for
# training, the feature matrix for the test rows, and the matching customer ids.
X, y = train[features].values, train['y'].values
train_id = train['cust_id'].values
test_data, test_id = test[features].values, test['cust_id'].values

print("X shape:",X.shape)
print("y shape:",y.shape)
print("test shape",test_data.shape)

X shape: (15000, 164)
y shape: (15000,)
test shape (10000, 164)


In [None]:
# --------------- XGBoost model, 5-fold cross-validation ---------------

In [52]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import time

print("start：********************************")
start = time.time()


def _predict_best(booster, dmatrix):
    """Predict using only the trees up to the early-stopping optimum.

    Older XGBoost releases expose ``best_ntree_limit`` and accept
    ``ntree_limit``; releases that removed both expect ``iteration_range``
    built from ``best_iteration``. Supporting both keeps the cell runnable
    across versions without changing results on the old API.
    """
    if hasattr(booster, 'best_ntree_limit'):
        return booster.predict(dmatrix, ntree_limit=booster.best_ntree_limit)
    return booster.predict(dmatrix, iteration_range=(0, booster.best_iteration + 1))


# Fold-independent set-up, built once instead of once per fold.
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eta': 0.02,
          'max_depth': 4,
          'min_child_weight': 6,
          'colsample_bytree': 0.7,
          'subsample': 0.7,
          # Optional regularisation knobs, currently disabled:
          # 'gamma': 1, 'lambda': 1, 'alpha': 0,
          # NOTE(review): newer XGBoost releases replaced `silent` with
          # `verbosity` - confirm against the installed version.
          'silent': 1,
          'eval_metric': ['auc']}
dtest = xgb.DMatrix(test_data)  # the real test set; identical for every fold

auc_list = []   # validation AUC of each fold
pred_list = []  # test-set probabilities predicted by each fold's model

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
for train_index, test_index in skf.split(X, y):
    # NB: X_test / y_test are the held-out validation fold, not the real test set.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvali = xgb.DMatrix(X_test, label=y_test)
    # The last entry ('valid') is the one monitored for early stopping.
    evallist = [(dtrain, 'train'), (dvali, 'valid')]

    # Train with early stopping on the validation AUC.
    model = xgb.train(params, dtrain,
                      num_boost_round=2000,
                      evals=evallist,
                      early_stopping_rounds=100,
                      verbose_eval=100)

    # Score the held-out fold.
    pred = _predict_best(model, dvali)
    auc = roc_auc_score(y_test, pred)
    print('...........................auc value:',auc)
    auc_list.append(auc)

    # Predict the real test set with this fold's model.
    pre = _predict_best(model, dtest)
    pred_list.append(pre)

print('......................validate result mean :',np.mean(auc_list))

end = time.time()
print("......................run with time: ",(end - start) / 60.0)

print("over:*********************************")

start：********************************
[0]	train-auc:0.766066	valid-auc:0.7183
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 100 rounds.
[100]	train-auc:0.854692	valid-auc:0.802412
[200]	train-auc:0.872319	valid-auc:0.810094
[300]	train-auc:0.888624	valid-auc:0.815506
[400]	train-auc:0.903926	valid-auc:0.815928
Stopping. Best iteration:
[360]	train-auc:0.897529	valid-auc:0.817648

...........................auc value: 0.817647768578
[0]	train-auc:0.780705	valid-auc:0.733678
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 100 rounds.
[100]	train-auc:0.858107	valid-auc:0.788498
[200]	train-auc:0.877001	valid-auc:0.791706
[300]	train-auc:0.89248	valid-auc:0.790802
Stopping. Best iteration:
[209]	train-auc:0.878676	valid-auc:0.792405

...........................auc value: 0.792404723564
[0]	train-auc:0.762538	valid-auc:0.7

In [53]:
# Echo the per-fold validation AUC values collected by the cross-validation loop.
auc_list

[0.81764776857759069,
 0.79240472356414382,
 0.85816394737641666,
 0.84063034869706998,
 0.78983734277565865]

In [None]:
# Experiment log, kept inside a bare string so it never executes. Each entry has
# the mean 5-fold AUC, a second score after "=" (presumably the online /
# leaderboard score - TODO confirm), the feature set used, and the per-fold AUCs.
# Entries, in order: raw 157 features (baseline); + 7 missing-count bucket
# flags; + mean imputation of the top-10 features.
'''
mean auc: 0.819351205629 = 0.75264 -----------157维度原始特征，什么都不做。baseline
[0.81658949569903838,
 0.78953858151287737,
 0.85962740153333539,
 0.83786295182248138,
 0.79313759757660496]

mean auc: 0.820033862075 = 0.7537  -----------157维度的特征，+ 7个缺失值个数离散的特征
[0.81625517196305841,
 0.79132612240350819,
 0.85825003291505886,
 0.84018979329343024,
 0.79414818979884405]
 
 mean auc: 0.820585352584 = 0.75435 ------------157维度的特征，+ 7个缺失值个数离散的特征 + top10特征均值填充
 [0.8175095144010579,
 0.79445052107070158,
 0.85869058831869871,
 0.84057464628971335,
 0.7917014928397389]
'''
pass

In [44]:
# Fail loudly if the cross-validation cell has not produced predictions yet;
# otherwise a file full of NaN probabilities would be written silently.
if not pred_list:
    raise RuntimeError("pred_list is empty - run the cross-validation cell first")

mean_auc = np.mean(auc_list)
print("mean auc:",mean_auc)
filepath = 'result/xgb_'+ str(mean_auc)+'.csv'  # the offline mean AUC is embedded in the file name

# Stack the per-fold test predictions into one (n_folds, n_test_rows) array.
res = np.array(pred_list)
print("5折结果：",res.shape)

# Final prediction: average over folds (max / min were the alternatives considered).
r = res.mean(axis = 0)
print('result shape:',r.shape)

result = DataFrame({'cust_id': test_id, 'pred_prob': r})

# Create the output directory if needed so to_csv cannot fail after a long training run.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
result.to_csv(filepath,index=False,sep=",")

mean auc: 0.820483062709
5折结果： (5, 10000)
result shape: (10000,)
