In [1]:
import numpy as np
import pandas as pd
# 显示所有列
pd.set_option("display.max_columns",  None)

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# 方差阈值法，用于特征选择，过滤器法的一种，去掉那些方差没有达到阈值的特征。默认情况下，删除零方差的特征
from sklearn.feature_selection import VarianceThreshold  # https://zhuanlan.zhihu.com/p/331853738

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings("ignore")

### Step 1 and 2 - Build first QDA model and predict test

In [2]:
# data
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

cols = [c for c in train.columns if c not in ["id", "target", "wheezy-copper-turtle-magic"]]

In [11]:
oof = np.zeros(len(train))
preds = np.zeros(len(test))

# 512个模型
for i in tqdm_notebook(range(512)):
    train2 = train[train["wheezy-copper-turtle-magic"] == i]
    test2 = test[test["wheezy-copper-turtle-magic"] == i]
    if (len(train2) == 0)&(len(test2) == 0):continue
    # 数据索引
    idx1 = train2.index; idx2 = test2.index
    train2.reset_index(drop=True, inplace=True)
    
#     data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
    # fit_transform（X [，y]）适合数据，然后对其进行转换
#     data2 = VarianceThreshold(threshold=2).fit_transform(data[cols]) # 得到符合方差阈值的特征

    # 特征选择(方差阈值法)
    sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
    train3 = sel.transform(train2[cols])
    if len(test2) > 0: test3 = sel.transform(test2[cols])  
#     print(data2)
#     print(data2.shape)
#     print(train2.shape)
#     print(data.shape)
    # 与train2差在特征数量
#     train3 = data2[:train2.shape[0]];test3 = data2[train2.shape[0]:]

    skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True) # KFold
    # train_index训练集索引, test_index测试集索引
    for train_index, test_index in skf.split(train3, train2["target"]):
#         print(train_index, test_index)
        clf = QuadraticDiscriminantAnalysis(0.1) # QDA
#         print(train3[train_index,:])
        clf.fit(train3[train_index, :], train2.loc[train_index]["target"])
        oof[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:, 1] # train中划分测试集的预测结果
        # test金额
        if len(test2) > 0: preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits # 相加的结果除划分的次数

auc = roc_auc_score(train["target"], oof) # 训练集预测结果和实际结果比较
print(f"AUC: {auc: .5}")

HBox(children=(FloatProgress(value=0.0, max=512.0), HTML(value='')))


AUC:  0.96497


### Step 3 & 4 - Add pseudo label data and build second model

In [12]:
test["target"] = preds
oof = np.zeros(len(train))
preds = np.zeros(len(test))

# 512 model
for k in tqdm_notebook(range(512)):
    train2 = train[train["wheezy-copper-turtle-magic"] == k]
    train2p = train2.copy();idx1 = train2.index
    test2 = test[test["wheezy-copper-turtle-magic"] == k]
    
    # 伪标签data
    test2p = test2[(test2["target"]<=0.01) | (test2["target"]>=0.99)].copy()
    test2p.loc[test2p["target"]>=0.5, "target"] = 1 # 大于等于0.5是为1
    test2p.loc[test2p["target"]<0.5, "target"] = 0 # 小于0.5是为0
    train2p = pd.concat([train2p, test2p], axis=0) # 连接
    train2p.reset_index(drop=True, inplace=True)
    
    # 特征选择(方差阈值法)
    sel = VarianceThreshold(threshold=1.5).fit(train2p[cols])
    train3p = sel.transform(train2p[cols]) # test2p和train2p组合
    train3 = sel.transform(train2[cols]) # train2
    if len(test2) > 0: test3 = sel.transform(test2[cols])
        
    skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
    for train_index, test_index in skf.split(train3p, train2p["target"]):
        test_index3 = test_index[test_index<len(train3)] # 排除掉伪标签，oof没有
        
        clf = QuadraticDiscriminantAnalysis(0.5) # QDA
        clf.fit(train3p[train_index, :], train2p.loc[train_index]["target"])
        oof[idx1[test_index3]] = clf.predict_proba(train3[test_index3, :])[:, 1] # train中划分测试集的预测结果
        # test金额
        if len(test2) > 0: preds[test2.index] += clf.predict_proba(test3)[:, 1] / skf.n_splits # 相加的结果除划分的次数

auc = roc_auc_score(train["target"], oof)
print('Pseudo Labeled QDA scores CV =',round(auc,5))

HBox(children=(FloatProgress(value=0.0, max=512.0), HTML(value='')))


Pseudo Labeled QDA scores CV = 0.96774
