# 数据质量分析
* 命令行运行时，切换工作目录

```
# When run from the command line: switch into the code directory and make
# it importable. `sys` must be imported before sys.path can be extended.
import os
import sys

os.chdir('./code')
sys.path.append(os.getcwd())

```

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import thinkstats2
import math
import seaborn as sns
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from scipy.stats import entropy
from scipy.stats import ks_2samp
from sklearn import metrics
from eda_kit import conditional_entropy,roc_auc_score,auc


In [2]:
# Read the three CSV tables used throughout the notebook.
A_train, B_train, B_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/A_train.csv', '../data/B_train.csv', '../data/B_test.csv')
)

# Working aliases; later cells rebind these names as columns are filtered.
data_at, data_bt, data_test = A_train, B_train, B_test

# Print the columns of each training table whose dtype is int64.
for frame in (data_at, data_bt):
    int64_mask = frame.dtypes == np.int64
    print(frame.dtypes[int64_mask])

In [None]:
# UserInfo_170 is a special-case field: list its distinct values in B_train.
pd.unique(data_bt['UserInfo_170'])


In [None]:
# Remove the UserInfo_170 column from all three tables.
data_bt, data_at, data_test = (
    frame.drop(columns=['UserInfo_170'])
    for frame in (data_bt, data_at, data_test)
)


* 维度情况
    * 3组数据维度一样，维度之间可比较？

In [8]:
# Number of columns in B_train, followed by how many of them also appear
# in A_train and in B_test.
bt_column_index = data_bt.columns
print(bt_column_index.shape)
for other_frame in (data_at, data_test):
    print(bt_column_index.intersection(other_frame.columns).shape)

(491L,)
(491L,)
(490L,)


* 缺失值分析
    * https://blog.csdn.net/weixin_40159138/article/details/89421014
    * https://www.jianshu.com/p/9c867fb9cf17
    * https://scikit-learn.org/stable/modules/impute.html#impute
    * https://blog.csdn.net/qq_38958113/article/details/98220246
    * 从下图可以看到缺失值非常接近
    * B_test和A_train缺失值情况几乎一致
    * B_train的缺失值情况比A_train严重很多
    * A_train中20%的用户缺失维度在100个以内，40%的缺失维度在450个以上，60%用户的缺失维度在150个以内
    * B_train 和 B_test 38%的用户缺失维度在186左右，60%用户维度缺失在460以上
    * 缺失值在学习过程中，feature_importance 会降低，并不一定会影响学习效果，*可以对比是否填充缺失值对结果的影响*
    * 填充策略：
        * 离散值填充
        * 连续值填充
        * A的情况是正确的情况下，可以用于填充B的值，或者AB的分布，可以填充B_test的值
        * 对待缺失值的不同态度，会决定不同的填充策略：比如用户主动不填收入，那么这个null值应该赋予一个已有值之外的值，比如特殊的-999.9
        * 对A值采取固定填充，但是可以对B采取transform的填充，
        * 填充时机应该延后

In [None]:
# Missing-value analysis: count nulls per column and per row for the three
# tables, plot them, then pick the columns that are not almost entirely null
# and fill the remaining nulls with a constant.

# Per-column null counts (one entry per column).
fea_null = np.sum(data_at.isnull(), axis=0)
feb_null = np.sum(data_bt.isnull(), axis=0)
fet_null = np.sum(data_test.isnull(), axis=0)

# One subplot per table, null counts in original column order.
plt.subplot(311).plot(fea_null.values)
plt.subplot(312).plot(feb_null.values)
plt.subplot(313).plot(fet_null.values)

# Same subplots again, this time with the counts sorted ascending.
plt.subplot(311).plot(np.sort(fea_null))
plt.subplot(312).plot(np.sort(feb_null))
plt.subplot(313).plot(np.sort(fet_null))
plt.show()

# Null counts normalised by the number of rows, so the three tables share
# one axis (green = A_train, blue = B_train, red = B_test).
plt.plot(np.sort(fea_null/data_at.shape[0]), color='green')
plt.plot(np.sort(feb_null/data_bt.shape[0]), color='blue')
plt.plot(np.sort(fet_null/data_test.shape[0]),color='red')
plt.show()

# Per-row null counts (one entry per row), shown as cumulative histograms.
u_fea_null = np.sum(data_at.isnull(), axis=1)
u_feb_null = np.sum(data_bt.isnull(), axis=1)
u_fet_null = np.sum(data_test.isnull(), axis=1)
u_fea_null.hist(cumulative=True, density=1, bins=100, alpha=.2, color="r")
u_feb_null.hist(cumulative=True, density=1, bins=100, alpha=.2, color="b")
u_fet_null.hist(cumulative=True, density=1, bins=100, alpha=.2, color="g")
plt.show()
# Fraction of rows whose null count is below a fixed cut-off
# (156 for A_train, 186 for B_train and B_test).
print( "用户的维度缺失情况")
print( (u_fea_null[u_fea_null<156].count()-0.0)/u_fea_null.count())
print( (u_feb_null[u_feb_null<186].count()-0.0)/u_feb_null.count())
print( (u_fet_null[u_fet_null<186].count()-0.0)/u_fet_null.count())

# Keep the columns whose null count is below 99% of the row count,
# ordered by ascending null count.
threshold_column_null = 0.99
at_target_columns = fea_null[fea_null<data_at.shape[0]*threshold_column_null].sort_values(axis=0).index
bt_target_columns = feb_null[feb_null<data_bt.shape[0]*threshold_column_null].sort_values(axis=0).index
test_target_columns = fet_null[fet_null<data_test.shape[0]*threshold_column_null].sort_values(axis=0).index
# Only the A_train / B_train selections are intersected; test_target_columns
# is computed but not used in this cell.
target_columns = at_target_columns.intersection(bt_target_columns)
# Replace every remaining null with the constant 10.
# NOTE(review): the reason for choosing 10 is not visible here — confirm.
data_at = data_at.fillna(10)
data_bt = data_bt.fillna(10)
data_test = data_test.fillna(10)
#data_at = data_at[target_columns]
#data_bt = data_bt[target_columns]
#data_test = data_test[target_columns.drop(['flag'])]

* 连续与离散
    * 知识：https://blog.csdn.net/ztf312/article/details/53991329
    * https://blog.csdn.net/banbuduoyujian/article/details/53957653
    * https://scikit-learn.org/stable/auto_examples/preprocessing/plot_discretization.html
    * https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.cut.html
    * https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
    * https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-discretization
    * 489维数据中，383维的取值在100个以内，推测大部分应该是离散值
    * 取值在100个以上的可能为连续值
    * 取值大于300的维度81个，大于500的69个，这些极可能是连续值
    * 需要对连续值进行离散化：分类

In [None]:
# Columns with at most this many distinct values are treated as discrete.
discrete_threshold = 120

# Stack the feature columns of all three tables; 'flag'/'no' are removed
# and the columns are ordered by name before concatenation.
at_features = data_at.sort_index(axis=1).drop(['flag', 'no'], axis=1)
bt_features = data_bt.sort_index(axis=1).drop(['flag', 'no'], axis=1)
test_features = data_test.sort_index(axis=1).drop(['no'], axis=1)
data_tmp_all = pd.concat([at_features, bt_features])
data_tmp_all = pd.concat([data_tmp_all, test_features])

# Distinct-value count per column decides discrete vs. continuous.
data_all_n_unique = data_tmp_all.nunique()
#plt.hist(data_all_n_unique, cumulative=False, bins=100)
#plt.show()
is_discrete = data_all_n_unique <= discrete_threshold
discrete_columns = data_all_n_unique[is_discrete].index
continuous_columns = data_all_n_unique[~is_discrete].index

# Null count per column over the stacked table, ascending.
null_columns_all = data_tmp_all.isnull().sum(axis=0).sort_values(ascending=True)


* 同分布检验
    * https://blog.csdn.net/qq_41679006/article/details/80977113
    * https://www.cnblogs.com/arkenstone/p/5496761.html
    * https://blog.csdn.net/t15600624671/article/details/78770239
    * B_test 和 B_train只有2维数据的分布差异较大，显著性α=0.05
    * B_train 和 A_train的数据差异较大：有199维数据的分布相差大，所以考虑剔除掉199维数据
    * 缺失值少，且同分布的维度 极有可能是最重要的维度，可以尝试只取这部分数据进行分析， *可以做对比分析*

In [None]:
# Go back to the tables as loaded (the constant-filled copies are dropped).
data_at = A_train
data_bt = B_train
data_test = B_test
#data_at[continuous_columns] = data_at[continuous_columns].fillna(-999)
#data_bt[continuous_columns] = data_bt[continuous_columns].fillna(-999)
#data_test[continuous_columns] = data_test[continuous_columns].fillna(-999)

# For each pair of tables: columns whose distributions differ / agree,
# mapped to the (statistic, p-value) of the two-sample KS test.
dis_b_diff, dis_b_same = {}, {}
dis_ab_diff, dis_ab_same = {}, {}
dis_abt_diff, dis_abt_same = {}, {}
ks_comparisons = (
    (data_bt, data_test, dis_b_diff, dis_b_same),      # B_train vs B_test
    (data_bt, data_at, dis_ab_diff, dis_ab_same),      # B_train vs A_train
    (data_test, data_at, dis_abt_diff, dis_abt_same),  # B_test  vs A_train
)
for column in target_columns.intersection(continuous_columns):
    for left, right, diff_bucket, same_bucket in ks_comparisons:
        # Nulls are removed before testing; p <= 0.05 counts as "different".
        d, p = ks_2samp(left[column].dropna(), right[column].dropna())
        if p <= 0.05:
            diff_bucket[column] = (d, p)
        else:
            same_bucket[column] = (d, p)

dis_b_diff = pd.DataFrame.from_dict(dis_b_diff, orient='index')
dis_b_same = pd.DataFrame.from_dict(dis_b_same, orient='index')
dis_ab_diff = pd.DataFrame.from_dict(dis_ab_diff, orient='index')
dis_ab_same = pd.DataFrame.from_dict(dis_ab_same, orient='index')

# Keep the columns with few nulls whose A_train / B_train distributions
# agree, plus the id and label columns.
same_distribution_columns = target_columns.intersection(dis_ab_same.index)
target_columns = same_distribution_columns.append(pd.Index(['no', 'flag']))
data_at = data_at[target_columns]
data_bt = data_bt[target_columns]
data_test = data_test[target_columns.drop(['flag'])]

* 线性相关性
    * 剔除掉线性相关性大的维度
    * https://towardsdatascience.com/feature-selection-with-pandas-e3690ad8504b

In [None]:
# Linear-correlation filter computed on B_train: walk the correlation matrix
# and, for every kept column, mark the later columns that correlate with it
# above the threshold.
threshold_corr_bt = 0.98
corr_bt = data_bt.corr()
#sns.heatmap(corr_bt, annot=False, cmap=plt.cm.Reds)
#plt.show()
corr_length = corr_bt.shape[0]
final_cols_bt = []   # columns that survive the filter
del_cols_bt =[]      # columns marked as redundant
for i in range(corr_length):
    # A column already marked as redundant is neither kept nor used as a reference.
    if corr_bt.columns[i] not in del_cols_bt:
        final_cols_bt.append(corr_bt.columns[i])
        # Only the upper triangle (j > i) is scanned.
        for j in range(i+1,corr_length):
            # NOTE(review): the raw coefficient is compared, not its absolute
            # value, so strongly negative correlations are never marked.
            if (corr_bt.iloc[i,j] > threshold_corr_bt) and (corr_bt.columns[j] not in del_cols_bt):
                del_cols_bt.append(corr_bt.columns[j])
target_no_corr_columns = target_columns.intersection(pd.Index(final_cols_bt))
target_no_corr_continuous_columns = target_no_corr_columns.intersection(continuous_columns)
target_no_corr_discrete_columns = target_no_corr_columns.intersection(discrete_columns)
# NOTE(review): the frames are re-selected with target_columns, not with
# target_no_corr_columns, so the columns marked above are not actually
# removed here — confirm whether that is intended.
data_at = data_at[target_columns]
data_bt = data_bt[target_columns]
data_test = data_test[target_columns.drop(['flag'])]
sns.heatmap(data_bt.corr(), annot=False, cmap=plt.cm.Reds)
plt.show()

* 再次离散化
    * 首先填充null值

In [None]:
# Fill nulls, then discretise the continuous columns.
#
# The frames were produced by column selection; take explicit copies so the
# column assignments below write into real tables.
data_at = data_at.copy()
data_bt = data_bt.copy()
data_test = data_test.copy()
# B_test has no label; add an empty one so it has the same columns as the
# training tables.
data_test['flag'] = np.nan

# Earlier cells narrowed the frames to target_columns, so only the
# continuous / discrete columns that are still present can be indexed.
cont_cols = continuous_columns.intersection(data_at.columns)
disc_cols = discrete_columns.intersection(data_at.columns)

# A_train: continuous nulls -> column mean.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(data_at[cont_cols])
data_at[cont_cols] = pd.DataFrame(
    imp_mean.transform(data_at[cont_cols]), columns=cont_cols, index=data_at.index)

# B_train / B_test: continuous nulls -> iterative imputer fitted on the
# (already mean-filled) A_train columns. The original index is passed
# explicitly so the assignment cannot misalign rows.
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(data_at[cont_cols])
data_bt[cont_cols] = pd.DataFrame(
    data=imp.transform(data_bt[cont_cols]), columns=cont_cols, index=data_bt.index)
data_test[cont_cols] = pd.DataFrame(
    data=imp.transform(data_test[cont_cols]), columns=cont_cols, index=data_test.index)

# Discrete nulls -> constant 1. (The original followed this with a second
# fill using the A_train column means, which could never change anything
# because no nulls are left after the constant fill.)
data_at[disc_cols] = data_at[disc_cols].fillna(1)
data_bt[disc_cols] = data_bt[disc_cols].fillna(1)
data_test[disc_cols] = data_test[disc_cols].fillna(1)

# Uniform-width binning: one bin per 1/threshold_k_bins units of the A_train
# value range. KBinsDiscretizer requires integer bin counts >= 2, so narrow
# columns are clamped to 2 bins instead of raising ValueError.
threshold_k_bins = 0.1
value_range = data_at[cont_cols].max() - data_at[cont_cols].min()
n_bins = np.maximum(np.round(value_range * threshold_k_bins).values, 2).astype(int)
est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform').fit(data_at[cont_cols])
for frame in (data_at, data_bt, data_test):
    frame.update(pd.DataFrame(est.transform(frame[cont_cols]), columns=cont_cols, index=frame.index))



* 异常值分析
    * https://www.cnblogs.com/tinglele527/p/11955103.html
    * https://scikit-learn.org/stable/modules/outlier_detection.html
    * https://scikit-learn.org/0.20/auto_examples/plot_anomaly_comparison.html
    * https://blog.csdn.net/PbGc396Dwxjb77F2je/article/details/99687952
    * 离散值中取值比例很小的这部分，可能有两种情况：对预测结果有强作用，对预测情况无影响，*可以做对比分析*
    * 离散值中，取值比例很小的部分，如果熵很大，说明本身对结果没有区分度，这部分异常值可能性很大，
    * 离散值中，条件熵大的维度，区分度小，这部分维度可能需要去除掉
    * 经过前面的同分布处理后，条件熵减少了很多，
    * 连续值中，box plot可以很方便地观测出异常值
    * 连续值在A_train中异常值偏少，整体少于4%，是否需要处理？连续值在B_train中，整体异常值少于2.5%
    *

In [None]:
# Standard-deviation rule (value > mean + 2 * std): share of flagged values
# per continuous column, shown as a cumulative histogram.
data_at_continuous = data_at[continuous_columns]
at_upper_bound = data_at_continuous.mean() + 2 * data_at_continuous.std()
data_at_cont_abnormal = data_at_continuous > at_upper_bound
data_at_cont_ab_proportion = data_at_cont_abnormal.sum(axis=0) / data_at_cont_abnormal.shape[0]
plt.hist(data_at_cont_ab_proportion, cumulative=True, bins=100, density=True)
plt.show()

# Same rule applied to B_train.
data_bt_continuous = data_bt[continuous_columns]
bt_upper_bound = data_bt_continuous.mean() + 2 * data_bt_continuous.std()
data_bt_cont_abnormal = data_bt_continuous > bt_upper_bound
data_bt_cont_ab_proportion = data_bt_cont_abnormal.sum(axis=0) / data_bt_cont_abnormal.shape[0]
plt.hist(data_bt_cont_ab_proportion, cumulative=True, bins=100, density=True)
plt.show()

# Conditional entropy per discrete column.
entropy_dis = {}
count_threshold = data_at.shape[0] * 0.05
entropy_threshold = 0.5
# Per column: the values that are both rare and high-entropy.
least_count_entropy = {}
for column in discrete_columns:
    # conditional_entropy comes from eda_kit; presumably `s` is the column's
    # overall score and `d` maps each value to (entropy, count) — TODO confirm.
    s, d = conditional_entropy(data_at, cond=column)
    entropy_dis[column] = s
    for v in d:
        # Skip values unless element [1] is at most 5% of the rows and
        # element [0] is at least the entropy threshold.
        if not (d[v][1] <= count_threshold and d[v][0] >= entropy_threshold):
            continue
        least_count_entropy.setdefault(column, {})[v] = d[v]

entropy_dis = pd.DataFrame.from_dict(entropy_dis, orient='index')
print (entropy_dis.max(),entropy_dis.min())
plt.hist(np.sort(entropy_dis, axis=0), bins=100, cumulative=True)
plt.show()


* 数据规范化
    * https://blog.csdn.net/weixin_38706928/article/details/80329563
    * https://scikit-learn.org/stable/modules/preprocessing.html

In [None]:
# stds = StandardScaler().fit(data_at.drop(['no','flag']))



* 经过以上分析，可以分步验证数据的处理情况
    * 排除null值多的维度
    * 取A B同分布维度，B_test + B_train = B
    * 剔除线性相关性强的维度
    * 数据离散化：sklearn
    * 排除条件熵大的维度
    * 填充null值：固定填充，根据分布填充
    * 数据规范化



In [None]:
# XGB_B: models trained on B_train only. A GBDT selects the features with
# non-zero importance, then GBDT and XGBoost are trained on those features
# and the XGBoost predictions for B_test are stored in XGB_B.
target_bb = data_bt['flag']

bb_data = data_bt.drop(['flag','no'],axis=1)
bbt_data = data_test.drop(['no','flag'], axis=1)

# GBDT on all features, evaluated on a 70/30 hold-out split.

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(bb_data, target_bb, test_size=0.3, random_state=0)
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1,  max_depth=1, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
y_pro = clf.predict_proba(X_test)
y_prd = pd.DataFrame(y_pro).iloc[:,1]
roc_auc_score(y_test,y_prd)

# Refit on all of B_train and score B_test.

clf.fit(bb_data,target_bb)
b = clf.predict_proba(bbt_data)
b=pd.DataFrame(b)
pro_b = b.iloc[:,1]
# Take the id column by name rather than by position.
no = data_test['no']
pro = pd.DataFrame(pro_b)
no = pd.DataFrame(no)


# GBDT feature importances. The model fitted just above is reused: fitting
# again on the same data with the same random_state gives the same model.

clf_importance = clf.feature_importances_
clf_importance_ = pd.DataFrame(clf_importance)
# Column labels are given as lists; a set is unordered and not a proper
# label container.
clf_importance_.columns = ['importance']
bb_columns = pd.DataFrame(bb_data.columns)
bb_columns.columns = ['feature']

# Sort features by importance.
clf_feature_values = pd.concat([bb_columns,clf_importance_],axis=1)
clf_feature_values = clf_feature_values.sort_values(by='importance')


# Keep the features whose importance is non-zero.
clf_feature_well = clf_feature_values[clf_feature_values['importance']!=0]
clf_feature_well_columns = clf_feature_well['feature'].values
clf_feature_well.index = clf_feature_well_columns
columns_GBDT = clf_feature_well.index

# Restrict train and test tables to the selected features.

C_feature = data_bt[columns_GBDT]
new_test = data_test[columns_GBDT]
C_flag = pd.DataFrame(data_bt['flag'])
C_train = pd.concat([C_feature,C_flag],axis=1)

#CC = C_feature.fillna(0)
#new_test_  = new_test.fillna(0)
CC = C_feature
new_test_ = new_test

# GBDT on the selected features: hold-out evaluation, then refit on all rows.

X_train, X_test, y_train, y_test = train_test_split(CC, C_flag, test_size=0.3, random_state=0)
clf = GradientBoostingClassifier(n_estimators=110, learning_rate=1,  max_depth=1, random_state=0)#loss='exponential' mse
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
y_pro = clf.predict_proba(X_test)
y_prd = pd.DataFrame(y_pro).iloc[:,1]
roc_auc_score(y_test,y_prd)
clf.fit(CC,C_flag)
b = clf.predict_proba(new_test_)
b=pd.DataFrame(b)
pro_b = b.iloc[:,1]
# The first column of data_test is a feature at this point, not the id;
# select 'no' by name so XGB_B['no'] really holds the ids.
no = data_test['no']
pro = pd.DataFrame(pro_b)
no = pd.DataFrame(no)

import xgboost as xgb

xg_train = xgb.DMatrix(X_train,label=y_train)
xg_test = xgb.DMatrix(X_test,label=y_test)


# NOTE(review): 'silent' is kept as written; newer XGBoost releases use
# 'verbosity' instead — confirm against the installed version.
param = {'booster':'gbtree',
         'max_depth':10,
         'eta':0.1,
         'silent':1,
         'objective':'binary:logistic',
         'eval_metric':'auc',
         'subsample': 1,
         "colsample_bytree": 0.7,
         "min_child_weight":2,
         'gamma':3.1,
         'lambda':1,
         # XGBoost's thread-count parameter is 'nthread'; 'thread' is not a
         # recognised key.
         "nthread":-1,}
num_boost_round = 1500
watchlist = [(xg_train, 'train'), (xg_test, 'eval')]
num_round=15
# Hold-out AUC of a short XGBoost run.
bst = xgb.train(param, xg_train, num_round)
preds = bst.predict(xg_test)
roc_auc_score(y_test,preds)

# Train on all selected-feature rows and predict B_test.
xg_train = xgb.DMatrix(CC,label=C_flag)
xg_test = xgb.DMatrix(new_test_)
bst = xgb.train(param, xg_train, num_round)
pro = bst.predict(xg_test)

# 5-fold cross-validation with early stopping, for reference.
xgb_model =xgb.cv(param,xg_train,num_boost_round,nfold=5, early_stopping_rounds=300)#, verbose_eval=True

pd.DataFrame(xgb_model)
pd.DataFrame(xgb_model)['test-auc-mean'].mean()


# Result table: id plus the XGBoost probability.
XGB_B=[]
XGB_B=pd.DataFrame(XGB_B)
XGB_B['no'] = no
XGB_B['pred'] = pro


 GBDT_B_0.587

In [None]:

import lightgbm as lgb

# Reload the raw tables and tag every row with its origin:
# -1 = A_train, 0 = B_train, 1 = B_test.
A_train = pd.read_csv('../data/A_train.csv')
B_train = pd.read_csv('../data/B_train.csv')
B_test = pd.read_csv('../data/B_test.csv')

A_train['label'] = -1
B_train['label'] = 0
B_test['label'] = 1
# B_test has no target; add an empty one so the three tables stack cleanly.
B_test['flag'] = np.nan

# Stack the three tables with a fresh 0..n-1 index. pd.concat replaces the
# chained DataFrame.append calls, which no longer exist in pandas 2.x.
all_data = pd.concat([A_train, B_train, B_test], ignore_index=True)

# Column-name groups by prefix.
user_infos = [i for i in all_data.columns if 'UserInfo' in i]
product_infos = [i for i in all_data.columns if 'ProductInfo' in i]
web_infos = [i for i in all_data.columns if 'WebInfo' in i]

# Every null (including the empty B_test 'flag') becomes the constant 10.
all_data = all_data.fillna(10)

temp_data = all_data

# Training matrix / target: the B_train rows without id, origin tag and target.
drop_cols_l = ['flag', 'label', 'no']
train_x = temp_data[temp_data.label==0].drop(drop_cols_l, axis=1)
train_y = temp_data[temp_data.label==0]['flag']




def lgb_feature_selection(tr_x, tr_y, model_seed =666,num_rounds = 500):
    """Train a LightGBM binary classifier whose importances rank the features.

    Args:
        tr_x: training feature matrix passed to ``lgb.Dataset``.
        tr_y: training labels passed to ``lgb.Dataset``.
        model_seed: value used for the ``random_state`` parameter.
        num_rounds: number of boosting rounds.

    Returns:
        The trained booster returned by ``lgb.train``.
    """
    lgb_tr = lgb.Dataset(tr_x, tr_y)
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'random_state': model_seed}
    # No validation set is supplied, so there is nothing to log per round;
    # the former verbose_eval argument had no effect and is rejected by
    # LightGBM 4.x, so it is omitted.
    model = lgb.train(lgb_params, lgb_tr, num_boost_round=num_rounds)
    return model




# Rank the features with LightGBM and keep those whose importance is at
# least 10% of the mean importance.
f_model = lgb_feature_selection(train_x.values, train_y.values)
lgb.plot_importance(f_model, figsize=(16,8))
features_names_im = pd.DataFrame({'feature_name':train_x.columns, 'f_value': f_model.feature_importance()})
importance_cutoff = 0.1*features_names_im['f_value'].mean()
features_used = features_names_im[features_names_im['f_value'] >= importance_cutoff]

# Train on the B_train rows (label 0), predict the B_test rows (label 1).
selected_names = features_used['feature_name'].values
is_b_train = all_data['label'] == 0
is_b_test = all_data['label'] == 1
tr_x = all_data.loc[is_b_train, selected_names].values
test_x = all_data.loc[is_b_test, selected_names].values
tr_y = all_data.loc[is_b_train, 'flag'].values
test_y = all_data[is_b_test][['no']]

from sklearn.ensemble import GradientBoostingClassifier
gbdt = GradientBoostingClassifier(n_estimators=400, random_state=666)
gbdt.fit(tr_x, tr_y)
gbdt_pred = gbdt.predict_proba(test_x)
# Second probability column = probability of the positive class.
positive_proba = gbdt_pred[:, 1]
test_y['pred'] = positive_proba
# Result table: B_test id plus the GBDT probability.
GBDT_B = test_y

 A_B_GBDT


In [None]:
# Reload the raw tables and keep the B_test ids for the result table.
A_train = pd.read_csv('../data/A_train.csv')
B_train = pd.read_csv('../data/B_train.csv')
B_test = pd.read_csv('../data/B_test.csv')#//
NO = B_test['no']#//


# In[67]:

# Split the B_train columns by null ratio: [name, ratio] pairs with a ratio
# of at most 0.63 go to the "less" list, the others to the "large" list.
B_train_columns = B_train.columns
B_null_count_less = []
B_null_count_large = []

# threshold = 0.63
for i in B_train_columns:
    null_ratio = (B_train[i].isnull().sum()) / len(B_train[i])
    bucket = B_null_count_less if null_ratio <= 0.63 else B_null_count_large
    bucket.append([i, null_ratio])

# len(B_null_count_less) 327

# len(B_null_count_large) 164


# Same split for the B_test columns.
B_test_columns = B_test.columns
B_test_count_less = []
B_test_count_large = []

for i in B_test_columns:
    null_ratio = (B_test[i].isnull().sum()) / len(B_test[i])
    bucket = B_test_count_less if null_ratio <= 0.63 else B_test_count_large
    bucket.append([i, null_ratio])


# Names of the columns that passed the null-ratio filter (first element of
# each [name, ratio] pair). A_train reuses the B_train selection.
A_feature = pd.DataFrame(B_null_count_less).values[:,0]
B_feature = pd.DataFrame(B_null_count_less).values[:,0]
BT_feature = pd.DataFrame(B_test_count_less).values[:,0]

a_data = A_train[A_feature]
b_data = B_train[B_feature]
# The B_test selection must be taken from B_test (it was taken from B_train).
bt_data = B_test[BT_feature]

# Order the selected columns by name (Index.sort_values sorts the labels,
# not the null counts).
a_columns = a_data.columns.sort_values()
b_columns = b_data.columns.sort_values()    # B_train columns, includes 'flag'
bt_columns = bt_data.columns.sort_values()  # B_test columns

a_data = A_train[a_columns]
b_data = B_train[b_columns]
bt_data = B_test[bt_columns]

b_target = b_data['flag']
a_target = a_data['flag']

# Rebind instead of dropping in place: the frames are selections of the raw
# tables, and in-place edits on them trigger SettingWithCopyWarning.
b_data = b_data.drop('flag', axis=1)
a_data = a_data.drop('flag', axis=1)

# Constant-fill the nulls, then remove the id column.
aa_data = a_data.fillna(1).drop('no', axis=1)
bb_data = b_data.fillna(1).drop('no', axis=1)
bt_data = bt_data.fillna(1).drop('no', axis=1)

#-----
# Replace the matrices built above with the tables prepared by the earlier
# analysis cells (id and label removed, remaining nulls set to 1).
a_target = data_at['flag']
b_target = data_bt['flag']
aa_data = data_at.drop(['no','flag'],axis=1).fillna(1)
bb_data = data_bt.drop(['no','flag'],axis=1).fillna(1)
bt_data = bt_data.fillna(1)

from sklearn.ensemble import GradientBoostingClassifier


# Train on A_train, measure AUC on B_train.
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1,  max_depth=2,random_state=0)#
clf.fit(aa_data, a_target)
b_train_proba = clf.predict_proba(bb_data)
y_pred = pd.DataFrame(b_train_proba).iloc[:,1]
roc_auc_score(b_target,y_pred)

# clf.fit(aa_data,a_target)
# Pair each feature name with its importance.
clf_importance = clf.feature_importances_
clf_importance_ = pd.DataFrame(clf_importance)
clf_importance_.columns = ['importance']
bb_columns = pd.DataFrame(bb_data.columns)
bb_columns.columns = ['feature']

# Sort by importance.
clf_feature_values = pd.concat([bb_columns,clf_importance_],axis=1)
# feature_values.columns = {'importance','feature'}
clf_feature_values = clf_feature_values.sort_values(by='importance')


# Keep only the features with non-zero importance, indexed by feature name.
has_importance = clf_feature_values['importance'] != 0
clf_feature_well = clf_feature_values[has_importance]
clf_feature_well_columns = clf_feature_well['feature'].values
clf_feature_well.index = clf_feature_well_columns

columns_GBDT = clf_feature_well.index


# Feature tables restricted to the GBDT-selected columns, taken from the
# prepared tables. (The original first built them from the raw tables and
# immediately overwrote them; that dead step is removed.)
C_feature = data_at[columns_GBDT]
D_feature = data_bt[columns_GBDT]
E_feature = data_test[columns_GBDT]


C_flag = pd.DataFrame(A_train['flag'])
D_flag = pd.DataFrame(B_train['flag'])

# 'new_82': UserInfo_82 with nulls replaced by a median. Column labels are
# given as lists rather than sets, which are unordered.
C_82 = pd.DataFrame(C_feature['UserInfo_82'].fillna(C_feature['UserInfo_82'].median()))
C_82.columns = ['new_82']

# B_train uses the B_test median; the original author noted a small gain.
D_82 = pd.DataFrame(D_feature['UserInfo_82'].fillna(E_feature['UserInfo_82'].median()))
D_82.columns = ['new_82']

E_82 = pd.DataFrame(E_feature['UserInfo_82'].fillna(E_feature['UserInfo_82'].median()))
E_82.columns = ['new_82']

# 'new_feature_1': product of UserInfo_82 and UserInfo_222.
newC_feature = pd.DataFrame(C_feature['UserInfo_82']*C_feature['UserInfo_222'])
newC_feature.columns = ['new_feature_1']

newD_feature = pd.DataFrame(D_feature['UserInfo_82']*D_feature['UserInfo_222'])
newD_feature.columns = ['new_feature_1']

newE_feature = pd.DataFrame(E_feature['UserInfo_82']*E_feature['UserInfo_222'])
newE_feature.columns = ['new_feature_1']

# Append both derived columns, then constant-fill what is still null.
C_feature = pd.concat([C_feature, C_82, newC_feature], axis=1).fillna(1)
D_feature = pd.concat([D_feature, D_82, newD_feature], axis=1).fillna(1)
E_feature = pd.concat([E_feature, E_82, newE_feature], axis=1).fillna(1)

from sklearn.ensemble import GradientBoostingClassifier


# Train on A_train, check AUC on B_train, then score B_test.
clf = GradientBoostingClassifier(n_estimators=149, learning_rate=0.66,  max_depth=2, random_state=0,max_features=14,min_weight_fraction_leaf=0.11)
clf.fit(C_feature, C_flag)

y_pred = clf.predict_proba(D_feature)
y_pred = pd.DataFrame(y_pred).iloc[:,1]
roc_auc_score(D_flag,y_pred)




y_pred = clf.predict_proba(E_feature)
y_pred = pd.DataFrame(y_pred).iloc[:,1]
b=pd.DataFrame(y_pred)

no = pd.DataFrame(NO)

# Result table: B_test id plus the cross-domain GBDT probability.
A_B_GBDT=[]
A_B_GBDT=pd.DataFrame(A_B_GBDT)
A_B_GBDT['no'] = no
A_B_GBDT['pred'] = y_pred

A_B_lgb


In [None]:

# Reload the raw tables and keep the B_test ids for the result table.
A_train = pd.read_csv('../data/A_train.csv')
B_train = pd.read_csv('../data/B_train.csv')
B_test = pd.read_csv('../data/B_test.csv')#//
NO = B_test['no']#//

# Split the B_train columns by null ratio: [name, ratio] pairs with a ratio
# of at most 0.63 go to the "less" list, the others to the "large" list.
B_train_columns = B_train.columns
B_null_count_less = []
B_null_count_large = []

# threshold = 0.63
for i in B_train_columns:
    null_ratio = (B_train[i].isnull().sum()) / len(B_train[i])
    bucket = B_null_count_less if null_ratio <= 0.63 else B_null_count_large
    bucket.append([i, null_ratio])

# len(B_null_count_less) 327

# len(B_null_count_large) 164


# Same split for the B_test columns.
B_test_columns = B_test.columns
B_test_count_less = []
B_test_count_large = []

for i in B_test_columns:
    null_ratio = (B_test[i].isnull().sum()) / len(B_test[i])
    bucket = B_test_count_less if null_ratio <= 0.63 else B_test_count_large
    bucket.append([i, null_ratio])

# Names of the columns that passed the null-ratio filter (first element of
# each [name, ratio] pair). A_train reuses the B_train selection.
A_feature = pd.DataFrame(B_null_count_less).values[:,0]
B_feature = pd.DataFrame(B_null_count_less).values[:,0]
BT_feature = pd.DataFrame(B_test_count_less).values[:,0]

a_data = A_train[A_feature]
b_data = B_train[B_feature]
# The B_test selection must be taken from B_test (it was taken from B_train).
bt_data = B_test[BT_feature]

# Order the selected columns by name (Index.sort_values sorts the labels,
# not the null counts).
a_columns = a_data.columns.sort_values()
b_columns = b_data.columns.sort_values()    # B_train columns, includes 'flag'
bt_columns = bt_data.columns.sort_values()  # B_test columns

a_data = A_train[a_columns]
b_data = B_train[b_columns]
bt_data = B_test[bt_columns]

b_target = b_data['flag']
a_target = a_data['flag']


# Rebind instead of dropping in place: the frames are selections of the raw
# tables, and in-place edits on them trigger SettingWithCopyWarning.
b_data = b_data.drop('flag', axis=1)
a_data = a_data.drop('flag', axis=1)


# Constant-fill the nulls, then remove the id column.
aa_data = a_data.fillna(1).drop('no', axis=1)
bb_data = b_data.fillna(1).drop('no', axis=1)
bt_data = bt_data.fillna(1).drop('no', axis=1)


import lightgbm as lgb

# LightGBM trained on A_train; B_train is used to measure AUC.
lgb_train = lgb.Dataset(aa_data,label=a_target)
lgb_vd = lgb.Dataset(bb_data,label=b_target)
# lgb_test = lgb.Dataset(D_feature)
# lgb_vd = lgb.Dataset(vd_x, vd_y, reference=lgb_tr)
lgb_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
#     'max_depth':18,
#     'feature_fraction':0.85,
#     'lambda_l1':1.2,
    'random_state': 0}#18     0.85

# No validation set is passed, so there is nothing to log per round; the
# former verbose_eval argument had no effect and is rejected by LightGBM 4.x.
lgb_model = lgb.train(lgb_params, lgb_train, num_boost_round=200)

lgb_pred = lgb_model.predict(bb_data)

roc_auc_score(b_target,lgb_pred)



# Split-count importance per feature.
lgb_importance = pd.DataFrame(lgb_model.feature_importance(importance_type="split"))
lgb_importance.columns = ['importance']
# Take the feature names from the training matrix itself instead of
# trimming the last two entries of b_columns, which only works while
# 'flag' and 'no' happen to sort last.
columns = pd.DataFrame(bb_data.columns)
columns.columns = ['feature']
lgb_importance = pd.concat([columns,lgb_importance],axis=1)
lgb_importance = lgb_importance.sort_values(by='importance')
# Keep the features that were used in more than 29 splits.
lgb_importance = lgb_importance[lgb_importance['importance']>29].reset_index().drop('index',axis=1)
lgb_importance_columns = lgb_importance['feature'].values

# Feature tables restricted to the LightGBM-selected columns.
C_feature = A_train[lgb_importance_columns]
D_feature = B_train[lgb_importance_columns]
E_feature = B_test[lgb_importance_columns]



# 'new_82': UserInfo_82 with nulls replaced by a median.
# NOTE(review): C_82 / D_82 / E_82 are built but never joined to the feature
# tables in this cell — confirm whether they were meant to be concatenated.
C_82 = pd.DataFrame(C_feature['UserInfo_82'].fillna(C_feature['UserInfo_82'].median()))
C_82.columns = ['new_82']

# B_train uses the B_test median; the original author noted a small gain.
D_82 = pd.DataFrame(D_feature['UserInfo_82'].fillna(E_feature['UserInfo_82'].median()))
D_82.columns = ['new_82']

E_82 = pd.DataFrame(E_feature['UserInfo_82'].fillna(E_feature['UserInfo_82'].median()))
E_82.columns = ['new_82']

# 'new_feature_1': product of UserInfo_253 and UserInfo_242.
newC_feature = pd.DataFrame(C_feature['UserInfo_253']*C_feature['UserInfo_242'])
newC_feature.columns = ['new_feature_1']

newD_feature = pd.DataFrame(D_feature['UserInfo_253']*D_feature['UserInfo_242'])
newD_feature.columns = ['new_feature_1']

newE_feature = pd.DataFrame(E_feature['UserInfo_253']*E_feature['UserInfo_242'])
newE_feature.columns = ['new_feature_1']


C_feature = pd.concat([C_feature,newC_feature],axis = 1)
D_feature = pd.concat([D_feature,newD_feature],axis = 1)
E_feature = pd.concat([E_feature,newE_feature],axis = 1)

# UserInfo_134 is excluded from the final model (rebinding, not in place).
C_feature = C_feature.drop('UserInfo_134', axis=1)
D_feature = D_feature.drop('UserInfo_134', axis=1)
E_feature = E_feature.drop('UserInfo_134', axis=1)


C_feature  = C_feature.fillna(1)
D_feature = D_feature.fillna(1)
E_feature = E_feature.fillna(1)

columns = D_feature.columns



lgb_params_new = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth':17,
    'feature_fraction':0.80,
    'lambda_l1':0.6,
#     'scale_pos_weight':1.1,
    'random_state': 0}
                                                   #17 0.80 0.6 1550 0.591  drop134  new82       253X242    1162
lgb_train = lgb.Dataset(C_feature,label=a_target)#17 0.80 0.6 1550 0.589  drop134  new82            1550
lgb_vd = lgb.Dataset(D_feature,label=b_target)   #17 0.80 0.6 1469 0.588 drop134
# Train on A_train with B_train as the validation set; stop when the
# validation AUC has not improved for 500 rounds. Early stopping is passed
# as a callback because the early_stopping_rounds / verbose_eval keyword
# arguments of lgb.train no longer exist in LightGBM 4.x.
lgb_model = lgb.train(lgb_params_new, lgb_train, num_boost_round=2000, valid_sets=lgb_vd,
                      callbacks=[lgb.early_stopping(stopping_rounds=500)])#1256

preds = lgb_model.predict(E_feature)
no = pd.DataFrame(NO)

# Result table: B_test id plus the cross-domain LightGBM probability.
A_B_LGB=[]
A_B_LGB=pd.DataFrame(A_B_LGB)
A_B_LGB['no'] = no
A_B_LGB['pred'] = preds

# 融合

In [None]:
# Weighted blend of the four prediction tables, keyed by the B_test id.
D = pd.DataFrame([])

D['no'] = NO
# Cross-domain models (trained on A_train) are mixed first ...
cross_domain_blend = A_B_GBDT.iloc[:,1]*0.89 + A_B_LGB.iloc[:,1]*0.113
# ... then combined with the two B-only models. GBDT_B keeps the row index
# of all_data, so its values are used positionally.
D['pred'] = cross_domain_blend*0.181+ XGB_B.iloc[:,1]*0.415 + GBDT_B.iloc[:,1].values*0.415

# XGB_B (earlier variant of the B-only pipeline).
# NOTE(review): this re-creates XGB_B with different preprocessing than the
# first XGB_B cell (the 'no' column stays among the features here) —
# confirm whether this cell is still wanted.
target_bb = data_bt['flag']

# drop(..., inplace=True) returns None; keep the returned copy so bb_data is
# a real feature table and data_bt keeps its 'flag' column.
bb_data = data_bt.drop('flag', axis=1)
# Score the test rows on exactly the training columns, in the same order.
bbt_data = data_test[bb_data.columns].fillna(0)

# GBDT on all features, evaluated on a 70/30 hold-out split.

from sklearn.ensemble import GradientBoostingClassifier
# train_test_split is a function of sklearn.model_selection; it is not an
# attribute of cross_validate.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(bb_data, target_bb, test_size=0.3, random_state=0)
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1,  max_depth=1, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
y_pro = clf.predict_proba(X_test)
y_prd = pd.DataFrame(y_pro).iloc[:,1]
roc_auc_score(y_test,y_prd)

# Refit on all of B_train and score B_test.

clf.fit(bb_data,target_bb)
b = clf.predict_proba(bbt_data)
b=pd.DataFrame(b)
pro_b = b.iloc[:,1]
no = data_test.iloc[:,0]
pro = pd.DataFrame(pro_b)
no = pd.DataFrame(no)


# GBDT feature importances (the model fitted just above is reused).

clf_importance = clf.feature_importances_
clf_importance_ = pd.DataFrame(clf_importance)
# Column labels are given as lists; a set is unordered.
clf_importance_.columns = ['importance']
bb_columns = pd.DataFrame(bb_data.columns)
bb_columns.columns = ['feature']

# Sort features by importance.
clf_feature_values = pd.concat([bb_columns,clf_importance_],axis=1)
clf_feature_values = clf_feature_values.sort_values(by='importance')


# Keep the features whose importance is non-zero.
clf_feature_well = clf_feature_values[clf_feature_values['importance']!=0]
clf_feature_well_columns = clf_feature_well['feature'].values
clf_feature_well.index = clf_feature_well_columns
columns_GBDT = clf_feature_well.index

# Restrict the raw train and test tables to the selected features.

C_feature = B_train[columns_GBDT]
new_test = B_test[columns_GBDT]
C_flag = pd.DataFrame(B_train['flag'])
C_train = pd.concat([C_feature,C_flag],axis=1)

CC = C_feature.fillna(0)
new_test_  = new_test.fillna(0)

# GBDT on the selected features: hold-out evaluation, then refit on all rows.

X_train, X_test, y_train, y_test = train_test_split(CC, C_flag, test_size=0.3, random_state=0)
clf = GradientBoostingClassifier(n_estimators=110, learning_rate=1,  max_depth=1, random_state=0)#loss='exponential' mse
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
y_pro = clf.predict_proba(X_test)
y_prd = pd.DataFrame(y_pro).iloc[:,1]
roc_auc_score(y_test,y_prd)
clf.fit(CC,C_flag)
b = clf.predict_proba(new_test_)
b=pd.DataFrame(b)
pro_b = b.iloc[:,1]
no = B_test.iloc[:,0]
pro = pd.DataFrame(pro_b)
no = pd.DataFrame(no)

import xgboost as xgb

xg_train = xgb.DMatrix(X_train,label=y_train)
xg_test = xgb.DMatrix(X_test,label=y_test)


param = {'booster':'gbtree',
         'max_depth':10,
         'eta':0.1,
         'silent':1,
         'objective':'binary:logistic',
         'eval_metric':'auc',
         'subsample': 1,
         "colsample_bytree": 0.7,
         "min_child_weight":2,
         'gamma':3.1,
         'lambda':1,
         # XGBoost's thread-count parameter is 'nthread'; 'thread' is not a
         # recognised key.
         "nthread":-1,}
num_boost_round = 1500
watchlist = [(xg_train, 'train'), (xg_test, 'eval')]
num_round=15
# Hold-out AUC of a short XGBoost run.
bst = xgb.train(param, xg_train, num_round)
preds = bst.predict(xg_test)
roc_auc_score(y_test,preds)

# Train on all selected-feature rows and predict B_test.
xg_train = xgb.DMatrix(CC,label=C_flag)
xg_test = xgb.DMatrix(new_test_)
bst = xgb.train(param, xg_train, num_round)
pro = bst.predict(xg_test)

# 5-fold cross-validation with early stopping, for reference.
xgb_model =xgb.cv(param,xg_train,num_boost_round,nfold=5, early_stopping_rounds=300)#, verbose_eval=True

pd.DataFrame(xgb_model)
pd.DataFrame(xgb_model)['test-auc-mean'].mean()



no = pd.DataFrame(NO)

# Result table: B_test id plus the XGBoost probability.
XGB_B=[]
XGB_B=pd.DataFrame(XGB_B)
XGB_B['no'] = no
XGB_B['pred'] = pro

In [None]:
# XGB_B (earlier variant of the B-only pipeline).
# NOTE(review): this re-creates XGB_B with different preprocessing than the
# first XGB_B cell (the 'no' column stays among the features here) —
# confirm whether this cell is still wanted.
target_bb = data_bt['flag']

# drop(..., inplace=True) returns None; keep the returned copy so bb_data is
# a real feature table and data_bt keeps its 'flag' column.
bb_data = data_bt.drop('flag', axis=1)
# Score the test rows on exactly the training columns, in the same order.
bbt_data = data_test[bb_data.columns].fillna(0)

# GBDT on all features, evaluated on a 70/30 hold-out split.

from sklearn.ensemble import GradientBoostingClassifier
# train_test_split is a function of sklearn.model_selection; it is not an
# attribute of cross_validate.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(bb_data, target_bb, test_size=0.3, random_state=0)
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1,  max_depth=1, random_state=0)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
y_pro = clf.predict_proba(X_test)
y_prd = pd.DataFrame(y_pro).iloc[:,1]
roc_auc_score(y_test,y_prd)

# Refit on all of B_train and score B_test.

clf.fit(bb_data,target_bb)
b = clf.predict_proba(bbt_data)
b=pd.DataFrame(b)
pro_b = b.iloc[:,1]
no = data_test.iloc[:,0]
pro = pd.DataFrame(pro_b)
no = pd.DataFrame(no)


# GBDT feature importances (the model fitted just above is reused).

clf_importance = clf.feature_importances_
clf_importance_ = pd.DataFrame(clf_importance)
# Column labels are given as lists; a set is unordered.
clf_importance_.columns = ['importance']
bb_columns = pd.DataFrame(bb_data.columns)
bb_columns.columns = ['feature']

# Sort features by importance.
clf_feature_values = pd.concat([bb_columns,clf_importance_],axis=1)
clf_feature_values = clf_feature_values.sort_values(by='importance')


# Keep the features whose importance is non-zero.
clf_feature_well = clf_feature_values[clf_feature_values['importance']!=0]
clf_feature_well_columns = clf_feature_well['feature'].values
clf_feature_well.index = clf_feature_well_columns
columns_GBDT = clf_feature_well.index

# Restrict the raw train and test tables to the selected features.

C_feature = B_train[columns_GBDT]
new_test = B_test[columns_GBDT]
C_flag = pd.DataFrame(B_train['flag'])
C_train = pd.concat([C_feature,C_flag],axis=1)

CC = C_feature.fillna(0)
new_test_  = new_test.fillna(0)

# GBDT on the selected features: hold-out evaluation, then refit on all rows.

X_train, X_test, y_train, y_test = train_test_split(CC, C_flag, test_size=0.3, random_state=0)
clf = GradientBoostingClassifier(n_estimators=110, learning_rate=1,  max_depth=1, random_state=0)#loss='exponential' mse
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
y_pred = clf.predict(X_test)
y_pro = clf.predict_proba(X_test)
y_prd = pd.DataFrame(y_pro).iloc[:,1]
roc_auc_score(y_test,y_prd)
clf.fit(CC,C_flag)
b = clf.predict_proba(new_test_)
b=pd.DataFrame(b)
pro_b = b.iloc[:,1]
no = B_test.iloc[:,0]
pro = pd.DataFrame(pro_b)
no = pd.DataFrame(no)

import xgboost as xgb

xg_train = xgb.DMatrix(X_train,label=y_train)
xg_test = xgb.DMatrix(X_test,label=y_test)


param = {'booster':'gbtree',
         'max_depth':10,
         'eta':0.1,
         'silent':1,
         'objective':'binary:logistic',
         'eval_metric':'auc',
         'subsample': 1,
         "colsample_bytree": 0.7,
         "min_child_weight":2,
         'gamma':3.1,
         'lambda':1,
         # XGBoost's thread-count parameter is 'nthread'; 'thread' is not a
         # recognised key.
         "nthread":-1,}
num_boost_round = 1500
watchlist = [(xg_train, 'train'), (xg_test, 'eval')]
num_round=15
# Hold-out AUC of a short XGBoost run.
bst = xgb.train(param, xg_train, num_round)
preds = bst.predict(xg_test)
roc_auc_score(y_test,preds)

# Train on all selected-feature rows and predict B_test.
xg_train = xgb.DMatrix(CC,label=C_flag)
xg_test = xgb.DMatrix(new_test_)
bst = xgb.train(param, xg_train, num_round)
pro = bst.predict(xg_test)

# 5-fold cross-validation with early stopping, for reference.
xgb_model =xgb.cv(param,xg_train,num_boost_round,nfold=5, early_stopping_rounds=300)#, verbose_eval=True

pd.DataFrame(xgb_model)
pd.DataFrame(xgb_model)['test-auc-mean'].mean()



no = pd.DataFrame(NO)

# Result table: B_test id plus the XGBoost probability.
XGB_B=[]
XGB_B=pd.DataFrame(XGB_B)
XGB_B['no'] = no
XGB_B['pred'] = pro