In [1]:
# 导入库
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# 导入数据
train  = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

In [3]:
# 数据预处理

In [4]:
# drop columns startwiths 'pscalc' because it's not relevant to the target data.
# the reason ->  https://www.kaggle.com/lsjsj92/porto-simple-eda-with-python-unbalanced-data

In [5]:
id_test = test['id'].values
train_target = train['target'].values

train = train.drop(['target', 'id'], axis = 1)
test = test.drop(['id'], axis = 1)
print(train.shape, test.shape)

(595212, 57) (892816, 57)


In [6]:
# 找到ps_calc_特征
col_to_drop = train.columns[train.columns.str.startswith("ps_calc_")]
print(col_to_drop)

Index(['ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05',
       'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10',
       'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14',
       'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin',
       'ps_calc_19_bin', 'ps_calc_20_bin'],
      dtype='object')


In [7]:
# 删除特征对应的列
train = train.drop(col_to_drop, axis=1)
test = test.drop(col_to_drop, axis=1)

# 将缺失值(-1)替换为nan
train = train.replace(-1, np.nan)
test = test.replace(-1, np.nan)

# 查找类别特征
cat_features = [a for a in train.columns if a.endswith("cat")]
print(cat_features)

['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat']


In [8]:
# pd.get_dummies -> onehotencode的实现
for column in cat_features:
    temp = pd.get_dummies(pd.Series(train[column]))
    train = pd.concat([train, temp], axis=1) # 列连接
    train = train.drop([column], axis=1) # 删除原来的列

In [9]:
for column in cat_features:
    temp = pd.get_dummies(pd.Series(test[column]))
    test = pd.concat([test, temp], axis = 1)
    test = test.drop([column], axis=1)

In [10]:
print(train.shape, test.shape)

(595212, 198) (892816, 198)


In [11]:
# Ensemble集成学习模型 h ttps://blog.csdn.net/a1628864705/article/details/63309077
# kfold划分， stacker学习器(逻辑回归), 上采样，多个基学习器

class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models
        
    def fit_predict(self, X, y, T):
        increase = True
        print(X.shape)
        if increase:
            pos = pd.Series(y == 1)
            y = pd.Series(y)
            X = pd.concat([X, X.loc[pos]], axis = 0)
            y = pd.concat([y, y.loc[pos]], axis = 0)
            idx = np.arange(len(X))
            np.random.shuffle(idx)
            X = X.iloc[idx]
            y = y.iloc[idx]
        print(X.shape)
        print(T.shape)
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)
        
        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle = True, random_state=17).split(X, y))
        
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        
        
        
        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], self.n_splits))
            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                

                print("fit %s fold %d " %(str(clf).split('(')[0], j+1))
                
                clf.fit(X_train, y_train)
                y_pred = clf.predict_proba(X_holdout)[:, 1]
                
                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict_proba(T)[:, 1]
            S_test[:, i] = S_test_i.mean(axis=1)
        result = cross_val_score(self.stacker, S_train, y, cv=3)
        print("Stacker score : %.5f "%(result.mean()))
        self.stacker.fit(S_train, y)
        
        res = self.stacker.predict_proba(S_test)[:, 1]
        return res

In [12]:
# lgbm参数， 3个lgbm学习器

lgb_params = {
    'learning_rate' : 0.02,
    'n_estimators' : 650,
    'max_bin' : 10,
    'subsample' : 0.8,
    'subsample_freq' : 10,
    'colsample_bytree' : 0.8,
    'min_child_samples' : 500,
    'seed' : 99
}

lgb_params2 = {
    'n_estimators' : 1090,
    'learning_rate' : 0.02,
    'colsample_bytree' : 0.3,
    'subsample' : 0.7,
    'subsample_freq' : 2,
    'num_leaves' : 16,
    'seed' : 99
}

lgb_params3 = {
    'n_estimators' : 110,
    'max_depth' : 4,
    'learning_rate' : 0.02,
    'seed' : 99
}

# 3个lgbm学习器
lgb_model = LGBMClassifier(**lgb_params)
lgb_model2 = LGBMClassifier(**lgb_params2)
lgb_model3 = LGBMClassifier(**lgb_params3)

# stacker
log_model = LogisticRegression()

In [13]:
stack = Ensemble(n_splits=3, stacker=log_model, base_models=(lgb_model, lgb_model2, lgb_model3))

y_pred = stack.fit_predict(train, train_target, test)

(595212, 198)
(616906, 198)
(892816, 198)
fit LGBMClassifier fold 1 
fit LGBMClassifier fold 2 
fit LGBMClassifier fold 3 
fit LGBMClassifier fold 1 
fit LGBMClassifier fold 2 
fit LGBMClassifier fold 3 
fit LGBMClassifier fold 1 
fit LGBMClassifier fold 2 
fit LGBMClassifier fold 3 
Stacker score : 0.92943 


In [14]:
y_pred

array([0.05565732, 0.05290177, 0.04360995, ..., 0.06824   , 0.04578818,
       0.05950708])

In [15]:
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = y_pred

In [16]:
sub.head(20)

Unnamed: 0,id,target
0,0,0.055657
1,1,0.052902
2,2,0.04361
3,3,0.041569
4,4,0.063904
5,5,0.079925
6,6,0.037248
7,8,0.059155
8,10,0.075098
9,11,0.089516
