## 导入包

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from gensim.models import Word2Vec
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier as cat
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss, mean_squared_log_error, precision_recall_curve, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pickle
from sklearn.pipeline import Pipeline

from datetime import datetime
from tqdm import tqdm
import sys
import os
import gc
import argparse
import warnings
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [2]:
from sklearn.manifold import TSNE # 导入tsne包
from sklearn.decomposition import PCA, KernelPCA # PCA
from sklearn.manifold import Isomap # Isomap
from sklearn.impute import KNNImputer

## 数据读取和基本处理

### 读取数据

In [3]:
# Load the three disease feature tables (training set).
disease_feature1 = pd.read_csv("./data/train/disease_feature1.csv")
disease_feature2 = pd.read_csv("./data/train/disease_feature2.csv")
disease_feature3 = pd.read_csv("./data/train/disease_feature3.csv")

# Load the food feature table and the labelled food/disease pairs (training set).
train_food = pd.read_csv("./data/train/train_food.csv")
train_answer = pd.read_csv("./data/train/train_answer.csv")

In [4]:
# Work on a renamed copy of the answers table so the label column is "is_related".
data = train_answer.rename(columns={"related": "is_related"})
data.head()

Unnamed: 0,food_id,disease_id,is_related
0,food_0,disease_998,0
1,food_0,disease_861,0
2,food_0,disease_559,0
3,food_0,disease_841,0
4,food_0,disease_81,0


In [5]:
# `food` is just another name for the food feature table (no copy is made).
food = train_food
food.head()

Unnamed: 0,food_id,N_0,N_1,N_2,N_3,N_4,N_5,N_6,N_7,N_8,...,N_202,N_203,N_204,N_205,N_206,N_207,N_208,N_209,N_210,N_211
0,food_0,,,,,0.0,,,,,...,,,0.02,0.0,,,30.5,92.82,,0.92
1,food_1,,,,,0.0,,,,,...,,,23.9,0.0,,,0.0,2.41,,3.31
2,food_4,,,,,0.0,,,,,...,,,0.12,0.0,,,3.5,15.46,,0.36
3,food_5,,,,0.068,0.0,0.045,0.75,0.314,,...,,,0.89,0.0,,,3.3,86.35,,0.2
4,food_6,,,,0.115,0.0,0.091,0.58,0.508,,...,,,1.13,0.0,0.0,,41.6,93.22,,0.54


### 对food_id和disease_id进行编码

In [6]:
def _numeric_suffix(identifier):
    """Return the integer after the last underscore, e.g. "food_12" -> 12."""
    return int(identifier.split("_")[-1])


# Numeric encodings of the string ids, used later as ordinary model features.
data["food"] = data["food_id"].map(_numeric_suffix)
data["disease"] = data["disease_id"].map(_numeric_suffix)

In [7]:
# Target encoding: columns whose per-category label mean is added as a feature.
cat_list = ['disease']

def stat(df, df_merge, group_by, agg):
    """Aggregate ``df`` per group and left-join the statistics onto ``df_merge``.

    Args:
        df: frame the statistics are computed from.
        df_merge: frame that receives the statistics.
        group_by: list of key column names, used for grouping and for the join.
        agg: mapping ``{column: [method, ...]}`` passed to ``DataFrameGroupBy.agg``.

    Returns:
        A new frame: ``df_merge`` plus one column per (column, method) pair,
        named ``"<keys joined by _>_<column>_<method>"``. Keys that do not
        occur in ``df`` get NaN.
    """
    aggregated = df.groupby(group_by).agg(agg)

    # Flatten the (column, method) MultiIndex into plain names, in agg order.
    key_prefix = '_'.join(group_by)
    aggregated.columns = [
        '{}_{}_{}'.format(key_prefix, source_col, func)
        for source_col, funcs in agg.items()
        for func in funcs
    ]
    aggregated = aggregated.reset_index()
    merged = df_merge.merge(aggregated, on=group_by, how='left')

    del aggregated
    gc.collect()
    return merged

def statis_feat(df_know, df_unknow, cat_list):
    """Add the per-category mean of ``is_related`` for every column in ``cat_list``.

    The means are computed on ``df_know`` (rows whose label may be used) and
    joined onto ``df_unknow``; the enriched copy of ``df_unknow`` is returned.
    """
    enriched = df_unknow
    for key in tqdm(cat_list):
        enriched = stat(df_know, enriched, [key], {'is_related': ['mean']})
    return enriched

# Rows with a label form the training part; rows whose label is missing form the test part.
df_train = data[~data['is_related'].isnull()]
df_train = df_train.reset_index(drop=True)
df_test = data[data['is_related'].isnull()]

# Out-of-fold target encoding: each validation fold gets statistics computed
# only from the other folds, so a row never sees its own label.
df_stas_feat = None  # accumulator; pd.concat ignores the initial None entry
kf = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
for train_index, val_index in kf.split(df_train, df_train['is_related']):
    df_fold_train = df_train.iloc[train_index]
    df_fold_val = df_train.iloc[val_index]

    df_fold_val = statis_feat(df_fold_train, df_fold_val, cat_list)
    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)

    del (df_fold_train)
    del (df_fold_val)
    gc.collect()

# Unlabelled rows are encoded with statistics from the whole labelled set.
df_test = statis_feat(df_train, df_test, cat_list)
# NOTE: after this concat the labelled rows are ordered fold by fold, not in
# their original order.
data = pd.concat([df_stas_feat, df_test], axis=0)
data = data.reset_index(drop=True)

del (df_stas_feat)
del (df_train)
del (df_test)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 18.08it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 18.96it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 18.93it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 18.63it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 19.51it/s]


## 疾病特征处理

### 缺失值填0

In [8]:
# Replace every missing value in the three disease feature tables with 0.
disease_feature1 = disease_feature1.fillna(0)
disease_feature2 = disease_feature2.fillna(0)
disease_feature3 = disease_feature3.fillna(0)

### 降维前归一化

In [9]:
def standard_disease(df):
    """Min-max scale every feature column before dimensionality reduction.

    Despite the name, the scaling is ``MinMaxScaler`` (each column mapped to
    [0, 1]), not standardisation.

    Args:
        df: disease feature table with a ``disease_id`` column; all other
            columns are treated as numeric features.

    Returns:
        A new frame with ``disease_id`` first, followed by the scaled feature
        columns under their original names, on the same index as ``df``.
    """
    feature_cols = [c for c in df.columns if c != "disease_id"]
    scaled = MinMaxScaler().fit_transform(df[feature_cols])
    # Re-attach the input index: the scaler returns a bare array, and a fresh
    # 0..n-1 index would misalign with the ids if ``df`` is not default-indexed.
    scaled_df = pd.DataFrame(scaled, columns=feature_cols, index=df.index)
    return pd.concat([df[["disease_id"]], scaled_df], axis=1)
# Scale each disease feature table independently.
disease_feature1 = standard_disease(disease_feature1)
disease_feature2 = standard_disease(disease_feature2)
disease_feature3 = standard_disease(disease_feature3)

### PCA降维

In [10]:
# Reduce the dimensionality of a disease feature table with PCA.
def pca(df, n):
    """Project the feature columns of ``df`` onto ``n`` principal components.

    Args:
        df: disease feature table with a ``disease_id`` column; every other
            column is used as PCA input.
        n: value passed to ``PCA(n_components=...)``.

    Returns:
        A frame with ``disease_id`` first, followed by the components named
        ``F_0``, ``F_1``, ..., on the same index as ``df``. Its shape is
        printed as a side effect.
    """
    # Select the features by name so the result does not depend on
    # ``disease_id`` being the first column.
    features = df.drop(columns=["disease_id"])
    components = PCA(n_components=n).fit_transform(features)
    component_cols = ["F_" + str(item) for item in range(components.shape[1])]
    # Keep the input index so ids and components stay row-aligned.
    reduced = pd.DataFrame(components, columns=component_cols, index=df.index)
    df_disease = pd.concat([df[["disease_id"]], reduced], axis=1)
    print(df_disease.shape)
    return df_disease

# Reduce each disease feature table to a fixed number of components.
# All three results use the same column names F_0, F_1, ...
df_disease1 = pca(disease_feature1, 128)
df_disease2 = pca(disease_feature2, 144)
df_disease3 = pca(disease_feature3, 256)

(220, 129)
(301, 145)
(392, 257)


## 数据合并

In [11]:
# Attach food features by food_id, then the three PCA-reduced disease tables by
# disease_id. Left joins keep every labelled pair even without a feature row.
# NOTE(review): the three disease tables share F_* column names, so pandas adds
# its default "_x"/"_y" suffixes to the names that collide at each merge; which
# source a given F_* / F_*_x / F_*_y column comes from depends on merge order —
# confirm before reordering these lines (later cells use e.g. "F_82_x").
data = pd.merge(data, food, on="food_id", how="left")
data = pd.merge(data, df_disease1, on="disease_id", how="left")
data = pd.merge(data, df_disease2, on="disease_id", how="left")
data = pd.merge(data, df_disease3, on="disease_id", how="left")
data.head()

Unnamed: 0,food_id,disease_id,is_related,food,disease,disease_is_related_mean,N_0,N_1,N_2,N_3,...,F_246,F_247,F_248,F_249,F_250,F_251,F_252,F_253,F_254,F_255
0,food_0,disease_861,0,0,861,0.003521,,,,,...,-0.013086,0.007805,-0.038002,-0.022234,-0.030239,-0.047222,0.020007,-0.07189,-0.018778,-0.045336
1,food_0,disease_839,0,0,839,0.007299,,,,,...,0.046618,0.052718,-0.036865,-0.044638,-0.076461,-0.092308,0.01053,-0.032527,0.08329,0.035846
2,food_0,disease_50,0,0,50,0.018382,,,,,...,0.008846,-0.010672,-0.013317,0.007108,0.032617,0.014577,-0.044105,0.017589,0.009023,-0.009265
3,food_0,disease_1370,0,0,1370,0.214286,,,,,...,0.002464,0.018879,-0.005077,0.055209,0.005366,-0.050008,-0.008424,0.015456,0.020397,0.011667
4,food_0,disease_1015,0,0,1015,0.202749,,,,,...,-0.038134,0.048754,-0.045703,-0.042401,-0.021547,-0.003382,-0.05551,0.035909,-0.019242,-0.002197


## 长尾特征对数变换处理（log1p）

In [12]:
# Long-tailed food features that are compressed with log(1 + x).
cols = ["N_14", "N_59", "N_60", "N_61", "N_85", "N_165", "N_198", "N_193", "N_204", "N_211"]


def log1p(df, col):
    """Overwrite ``df[col]`` in place with ``log(1 + value)``; returns None."""
    transformed = np.log1p(df[col])
    df[col] = transformed


for c in tqdm(cols):
    log1p(data, c)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 18.17it/s]


## 特征工程

### 重要特征交叉

In [13]:
# Pairwise arithmetic crosses (+, -, *, /) between the most important features.
topn = ["N_33", "N_42", "N_43", "N_74", "N_106", "N_111", "N_209", "disease", "food"]
for pos, left in enumerate(topn):
    for right in topn[pos + 1:]:
        lhs = data[left]
        rhs = data[right]
        data[f"{left}+{right}"] = lhs + rhs
        data[f"{left}-{right}"] = lhs - rhs
        data[f"{left}*{right}"] = lhs * rhs
        # The small constant keeps the division finite when the divisor is 0.
        data[f"{left}/{right}"] = lhs / (rhs + 1e-5)

In [14]:
# Pairwise arithmetic crosses between an important food feature and two
# important disease components.
topn = ["N_33", "F_82_x", "F_39_x"]
for pos, left in enumerate(topn):
    for right in topn[pos + 1:]:
        lhs = data[left]
        rhs = data[right]
        data[f"{left}+{right}"] = lhs + rhs
        data[f"{left}-{right}"] = lhs - rhs
        data[f"{left}*{right}"] = lhs * rhs
        # The small constant keeps the division finite when the divisor is 0.
        data[f"{left}/{right}"] = lhs / (rhs + 1e-5)

### 重要特征分箱处理

In [15]:
## Binning of the important features
def _segment(x, upper_edges):
    """Return the index of the bin that ``x`` falls into.

    Bins are ``[0, e0]``, ``(e0, e1]``, ..., ``(e_last, +inf)`` for the
    ascending ``upper_edges``. NaN and negative values belong to no bin and
    yield ``None`` (which becomes NaN in the resulting column).

    The last bin is open-ended. Previously each function bounded it by the
    column maximum, recomputed on every call; for values taken from the column
    that bound is always satisfied, so only the cost differed.
    """
    if not x >= 0:  # False for NaN as well as for negative numbers
        return None
    for idx, edge in enumerate(upper_edges):
        if x <= edge:
            return idx
    return len(upper_edges)


### N_33
def get_N_33_seg(x):
    """Bin N_33 into [0, 0.125], (0.125, 1.25], (1.25, inf) -> 0, 1, 2."""
    return _segment(x, (0.125, 1.25))

data["N33_seg"] = data["N_33"].apply(get_N_33_seg)

### N_42
def get_N_42_seg(x):
    """Bin N_42 into [0, 500], (500, 1000], (1000, inf) -> 0, 1, 2."""
    return _segment(x, (500, 1000))

data["N42_seg"] = data["N_42"].apply(get_N_42_seg)

### N_43
def get_N_43_seg(x):
    """Bin N_43 into [0, 20], (20, 38], (38, 50], (50, inf) -> 0..3."""
    return _segment(x, (20, 38, 50))

data["N43_seg"] = data["N_43"].apply(get_N_43_seg)

### N_74
def get_N_74_seg(x):
    """Bin N_74 into [0, 2], (2, 10], (10, inf) -> 0, 1, 2."""
    return _segment(x, (2, 10))

data["N74_seg"] = data["N_74"].apply(get_N_74_seg)

### N_106
def get_N_106_seg(x):
    """Bin N_106 into [0, 50], (50, 150], (150, 300], (300, inf) -> 0..3.

    The top bin now includes the column maximum; it used to be excluded by a
    strict ``<`` that none of the sibling functions had.
    """
    return _segment(x, (50, 150, 300))

data["N106_seg"] = data["N_106"].apply(get_N_106_seg)

### N_111
def get_N_111_seg(x):
    """Bin N_111 into [0, 500], (500, 2000], (2000, inf) -> 0, 1, 2."""
    return _segment(x, (500, 2000))

data["N111_seg"] = data["N_111"].apply(get_N_111_seg)

# Quantile binning: (source column, new column, requested number of bins).
# duplicates='drop' merges bins whose edges coincide, so fewer bins may result.
for source_col, target_col, n_bins in (
    ('food', 'food_qcut', 10),
    ('disease', 'disease_qcut', 14),
    ('N_209', 'N_209_qcut', 10),
):
    data[target_col] = pd.qcut(data[source_col], n_bins, labels=False, duplicates='drop')

In [16]:
def static_feature(df, features, groups):
    """Add per-group mean/std/max/min of every feature as new columns.

    For each statistic, feature and group column, the group-wise statistic is
    broadcast back to the rows and stored in ``"<group>_<feature>_<method>"``.
    ``df`` is modified in place and also returned.
    """
    for agg_name in tqdm(['mean', 'std', 'max', 'min']):
        for value_col in features:
            for key in groups:
                target = f'{key}_{value_col}_{agg_name}'
                df[target] = df.groupby(key)[value_col].transform(agg_name)
    return df

# Cross features whose per-food statistics (mean/std/max/min) are added.
dense_feats = ["N_33*disease", 
"N_42+disease", "N_42-disease", "N_42*disease", 
"N_43+disease", "N_43*disease", 
"N_74*disease", 
"N_111+disease", "N_111-disease", "N_111*disease",
"N_33+F_82_x", "N_33+F_39_x",
]
# Grouping key: the numeric food id.
cat_feats = ['food']
data = static_feature(data, dense_feats, cat_feats)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 20.16it/s]


## 特征选择

### 根据特征重要性筛选特征

In [17]:
# Drop every feature whose recorded importance ("imp") is below 100.
feat_imp = pd.read_csv("./features_importance.csv")
low_importance_mask = feat_imp["imp"] < 100
feat_no_imp = feat_imp[low_importance_mask].reset_index(drop=True)
no_imp_cols = feat_no_imp["feats"].tolist()
data = data.drop(columns=no_imp_cols)

In [18]:
# Columns that must never be used as model inputs: the raw ids and the label.
drop_cols = ["disease_id", "food_id", "is_related"]

### 去除只有单一取值的特征（剔除低方差特征）

In [19]:
# Columns with fewer than two distinct non-null values carry no signal.
drop_cols.extend(col for col in data.columns if data[col].nunique() < 2)

### 删除缺失率过高的特征

In [20]:
def dropNaN(df, p, col):
    """Return ``df`` without ``col`` when at least a fraction ``p`` of it is missing.

    Args:
        df: input frame (never modified).
        p: threshold on the share of missing values, e.g. 0.95.
        col: name of the column to check.

    Returns:
        ``df`` itself when the column is kept, otherwise a new frame without it.
    """
    column = df[col]
    missing_ratio = column.isna().sum() / len(column)
    if missing_ratio >= p:
        return df.drop(columns=[col])
    return df
# Check every column against the 95 % missing threshold; `data` is rebound
# whenever a column is dropped.
for c in data.columns:
    data = dropNaN(data, 0.95, c)

## 保存特征以及分割数据集

In [21]:
# Model inputs: every remaining column except those collected in drop_cols
# (ids, label and constant columns).
features_name = [f for f in data.columns if f not in drop_cols]

In [22]:
# Feature matrix and label vector, both re-indexed from 0 so they stay row-aligned.
X = data[features_name].reset_index(drop=True)
label = data["is_related"].reset_index(drop=True)

## 模型训练

In [None]:
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, creating the parent directory and closing the file."""
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "wb") as fh:
        pickle.dump(obj, fh)


def _fit_lgb_fold(clf, trn_x, trn_y, val_x, val_y, test_x, fold):
    """Train LightGBM on one fold; return ``(model, val_pred, test_pred)``."""
    train_matrix = clf.Dataset(trn_x, label=trn_y)
    valid_matrix = clf.Dataset(val_x, label=val_y)

    params = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 63,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 2022,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'verbose': -1,
        'n_jobs': 8,
    }
    # Training stops once the validation metric has not improved for 500
    # rounds; progress is logged every 100 rounds.
    # NOTE(review): `verbose_eval` / `early_stopping_rounds` as train() arguments
    # are, as far as I know, gone from recent LightGBM releases (callbacks
    # replace them) — confirm against the installed version.
    model = clf.train(params, train_matrix, 100000, valid_sets=[train_matrix, valid_matrix],
                      categorical_feature=[], verbose_eval=100, early_stopping_rounds=500)
    val_pred = model.predict(val_x, num_iteration=model.best_iteration)
    test_pred = model.predict(test_x, num_iteration=model.best_iteration)
    return model, val_pred, test_pred


def _fit_xgb_fold(clf, trn_x, trn_y, val_x, val_y, test_x, fold):
    """Train XGBoost on one fold, save the model; return ``(model, val_pred, test_pred)``."""
    train_matrix = clf.DMatrix(trn_x, label=trn_y)
    valid_matrix = clf.DMatrix(val_x, label=val_y)
    test_matrix = clf.DMatrix(test_x)

    # Hyper-parameters found with optuna.
    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'gamma': 1.6102,
              'min_child_weight': 1.331,
              'max_depth': 8,
              'subsample': 0.6538,
              'colsample_bytree': 0.5433,
              'colsample_bylevel': 0.7,
              'reg_alpha': 0.0118,
              'reg_lambda': 1.79e-05,
              'eta': 0.0554,
              'seed': 2020,
              'nthread': 8,
              'gpu_id': 0,
              'tree_method': 'gpu_hist'
              }

    watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
    model = clf.train(params, train_matrix, num_boost_round=10000, evals=watchlist,
                      verbose_eval=100, early_stopping_rounds=500)
    # NOTE(review): `best_ntree_limit` / `ntree_limit` look like the pre-2.0
    # XGBoost API (`best_iteration` / `iteration_range` afterwards) — confirm
    # against the installed version before upgrading.
    _dump_pickle(model, f"./model/xgb_model{fold}")
    _dump_pickle(model.best_ntree_limit, f"./model/xgb_model_best_ntree_limit{fold}")

    # Predict with the number of trees of the best iteration.
    val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
    test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
    return model, val_pred, test_pred


def _fit_cat_fold(clf, trn_x, trn_y, val_x, val_y, test_x, fold):
    """Train CatBoost on one fold, save the model; return ``(model, val_pred, test_pred)``."""
    model = clf(
        n_estimators=10000,
        random_seed=1024,
        eval_metric='AUC',
        learning_rate=0.05,
        max_depth=5,
        early_stopping_rounds=500,
        metric_period=500,
        task_type='GPU'
    )
    model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
              use_best_model=True,
              verbose=1)

    _dump_pickle(model, f"./model/cat_model{fold}")

    val_pred = model.predict_proba(val_x)[:, 1]
    test_pred = model.predict_proba(test_x)[:, 1]
    return model, val_pred, test_pred


def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Train one model per fold of a 10-fold stratified CV and average test predictions.

    Args:
        clf: the library object matching ``clf_name``: the ``lightgbm`` module
            for "lgb", the ``xgboost`` module for "xgb", or a CatBoost
            classifier class for "cat".
        train_x: training features (DataFrame; rows are selected with ``.iloc``).
        train_y: binary labels aligned positionally with ``train_x`` (Series or array).
        test_x: features to predict with every fold model.
        clf_name: one of "lgb", "xgb", "cat".

    Returns:
        ``(model, oof, predict)``: the model of the last fold, the out-of-fold
        predictions for ``train_x``, and the test predictions averaged over folds.

    Raises:
        ValueError: if ``clf_name`` is not one of the supported names.

    Side effects:
        Prints a banner and the per-fold validation AUCs; the "xgb" and "cat"
        branches pickle each fold model under ``./model/``.
    """
    fold_trainers = {"lgb": _fit_lgb_fold, "xgb": _fit_xgb_fold, "cat": _fit_cat_fold}
    # Fail fast instead of training nothing and hitting an unbound `val_pred`.
    if clf_name not in fold_trainers:
        raise ValueError(
            "unknown clf_name {!r}; expected one of {}".format(clf_name, sorted(fold_trainers))
        )
    fit_fold = fold_trainers[clf_name]

    folds = 10  # 10 folds scored better than 5
    seed = 2023
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

    # Index labels by position, like train_x.iloc, whatever index train_y carries.
    y_all = np.asarray(train_y)

    oof = np.zeros(train_x.shape[0])     # out-of-fold probability per training row
    predict = np.zeros(test_x.shape[0])  # running average of the test probabilities

    cv_scores = []
    model = None

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, y_all)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = y_all[train_index], y_all[valid_index]

        model, val_pred, test_pred = fit_fold(clf, trn_x, trn_y, val_x, val_y, test_x, i)

        # Store this fold's validation predictions at their original positions.
        oof[valid_index] = val_pred
        # Each fold contributes 1/folds, so `predict` ends up as the mean.
        predict += test_pred / folds

        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)
    return model, oof, predict

In [None]:
start_time = datetime.now()
# cv_model takes (clf, train_x, train_y, test_x, clf_name) and returns
# (model, oof, predict); the previous four-argument call raised a TypeError.
# TODO: no separate test set is loaded in this notebook, so X is passed as a
# stand-in for test_x. `xgb_test_pred` is therefore an in-sample prediction and
# must not be used for evaluation — replace X with the real test features.
xgb_model, xgb_oof, xgb_test_pred = cv_model(xgb, X, label, X, 'xgb')
end_time = datetime.now()
print(end_time - start_time)