In [1]:
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.decomposition import PCA,TruncatedSVD

import lightgbm as lgb
import xgboost as xgb
import catboost as ctb

import numpy as np
import pandas as pd

In [2]:
test_food = pd.read_csv('./test_A/初赛A榜测试集/preliminary_a_food.csv')
test_sub = pd.read_csv('./test_A/初赛A榜测试集/preliminary_a_submit_sample.csv')
train_food = pd.read_csv('./train/训练集/train_food.csv')
train_answer = pd.read_csv('./train/训练集/train_answer.csv')

disease_feature1 = pd.read_csv('./train/训练集/disease_feature1.csv')
disease_feature2 = pd.read_csv('./train/训练集/disease_feature2.csv')
disease_feature3 = pd.read_csv('./train/训练集/disease_feature3.csv')

In [3]:
def 降维(feats, nfeats=64):
#     pca = PCA(n_components=nfeats,random_state=2023)
    tsvd = TruncatedSVD(n_components=nfeats,algorithm='randomized', n_iter=50, random_state=2023, tol=0.0)
    new_feats = tsvd.fit_transform(feats)
    
    return new_feats
nfeats = 50
new_feat1 = 降维(disease_feature1.iloc[:,1:], nfeats=nfeats)
new_feat2 = 降维(disease_feature2.iloc[:,1:], nfeats=nfeats)
new_feat3 = 降维(disease_feature3.iloc[:,1:], nfeats=nfeats)

feat1 = pd.DataFrame(new_feat1)
feat1.columns = [f"pca_1_{i}" for i in range(nfeats)]
feat1['disease_id'] = disease_feature1.disease_id

feat2 = pd.DataFrame(new_feat2)
feat2.columns = [f"pca_2_{i}" for i in range(nfeats)]
feat2['disease_id'] = disease_feature2.disease_id

feat3 = pd.DataFrame(new_feat3)
feat3.columns = [f"pca_3_{i}" for i in range(nfeats)]
feat3['disease_id'] = disease_feature3.disease_id

In [12]:
del disease_feature1,disease_feature2,disease_feature3

In [4]:
train = train_answer.merge(train_food, on='food_id', how='left').merge(feat1, on='disease_id', how='left').merge(feat2, on='disease_id', how='left').merge(feat3, on='disease_id', how='left')
test = test_sub.merge(test_food, on='food_id', how='left').merge(feat1, on='disease_id', how='left').merge(feat2, on='disease_id', how='left').merge(feat3, on='disease_id', how='left')

In [5]:
train = train.fillna(0)
test = test.fillna(0)
feat_col = train.columns.tolist()[3:]
x_train = train[feat_col]
y_train = train['related']
x_test = test[feat_col]

In [6]:
del train,test

In [8]:
params_xgb = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': ['logloss','auc'],
    'gamma': 0.1,
    'max_depth': 5, # 树的最大深度
    'alpha': 0, # 关于权重的L1正则化项
    'lambda': 0,
    'subsample': 0.7,
    'colsample_bytree': 0.5,
    'min_child_weight': 3,
    'silent': 0,
    'eta': 0.03, # 学习率
    'nthread': -1,
    'seed': 2023,
}

params_lgb = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # 'num_class': 2,
    'metric': {'binary_logloss', 'auc'},
    'num_leaves': 30,
    'min_data_in_leaf': 20,
    'learning_rate': 0.01,
    'max_depth':5,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.4,
    'lambda_l2': 0.5,
    'min_gain_to_split': 0.2,
    'verbose': -1,
    'num_threads': -1,
    'feature_fraction_seed':2023,
    'bagging_seed':2023,
    'seed':2023
}

params_ctb = {
    'learning_rate': 0.01,
    'loss_function': "Logloss",
    'eval_metric': "AUC",
    'depth': 8,
    'random_seed':2023,
    'min_data_in_leaf': 100,
    'logging_level': 'Verbose',
    'use_best_model': True,
    'one_hot_max_size': 5,   #类别数量多于此数将使用ordered target statistics编码方法,默认值为2。
    'boosting_type':"Ordered", #Ordered 或者Plain,数据量较少时建议使用Ordered,训练更慢但能够缓解梯度估计偏差。
    'max_ctr_complexity': 4, #特征组合的最大特征数量，设置为1取消特征组合，设置为2只做两个特征的组合,默认为4。
    'nan_mode': 'Min' 
}

In [9]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)
oof_lgb = np.zeros([len(x_train),])
oof_xgb = np.zeros([len(x_train),])
oof_ctb = np.zeros([len(x_train),])
predictions_lgb = np.zeros([len(x_test),])
predictions_xgb = np.zeros([len(x_test),])
predictions_ctb = np.zeros([len(x_test),])
for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
    print("fold n°{}".format(fold_+1))
    lgb_trn_data = lgb.Dataset(x_train.iloc[trn_idx], y_train.iloc[trn_idx])
    lgb_val_data = lgb.Dataset(x_train.iloc[val_idx], y_train.iloc[val_idx])
    
    cat_trn_data = ctb.Pool(data=x_train.iloc[trn_idx],label=y_train.iloc[trn_idx])
    cat_val_data = ctb.Pool(data=x_train.iloc[val_idx],label=y_train.iloc[val_idx])
    
    xgb_trn_data = xgb.DMatrix(x_train.iloc[trn_idx], label=y_train.iloc[trn_idx])
    xgb_val_data = xgb.DMatrix(x_train.iloc[val_idx], label=y_train.iloc[val_idx])
 
    num_round = 100000
    clf_lgb = lgb.train(params_lgb,
                        lgb_trn_data,
                        num_round,
                        valid_sets = [lgb_trn_data, lgb_val_data], 
                        verbose_eval = 200,
                        early_stopping_rounds = 500)
    clf_lgb.save_model(filename=f'./saved_model/{fold_+1}_fold_lgb.txt',num_iteration=clf_lgb.best_iteration)
    
    clf_ctb = ctb.CatBoostClassifier(iterations = 100000,early_stopping_rounds = 100, **params_ctb)
    clf_ctb.fit(cat_trn_data, eval_set=cat_val_data,verbose_eval=100)
    clf_ctb.save_model(f'./saved_model/{fold_+1}_fold_ctb.txt')
    
    watchlist = [(xgb_trn_data, 'train'), (xgb_val_data, 'valid')]
    clf_xgb = xgb.train(params_xgb, xgb_trn_data, num_round, watchlist, verbose_eval=100, early_stopping_rounds=100)
    clf_xgb.save_model(f'./saved_model/{fold_+1}_fold_xgb.txt')
    
    oof_lgb[val_idx] = clf_lgb.predict(x_train.iloc[val_idx], num_iteration=clf_lgb.best_iteration)
    oof_ctb[val_idx] = clf_ctb.predict(x_train.iloc[val_idx])
    oof_xgb[val_idx] = clf_xgb.predict(xgb.DMatrix(x_train.iloc[val_idx]))
    
    predictions_lgb += clf_lgb.predict(x_test, num_iteration=clf_lgb.best_iteration) / folds.n_splits
    predictions_ctb += clf_ctb.predict(x_test) / folds.n_splits
    predictions_xgb += clf_xgb.predict(xgb.DMatrix(x_test)) / folds.n_splits

fold n°1




Training until validation scores don't improve for 500 rounds
[200]	training's auc: 0.919702	training's binary_logloss: 0.219252	valid_1's auc: 0.909082	valid_1's binary_logloss: 0.225392
[400]	training's auc: 0.940321	training's binary_logloss: 0.187647	valid_1's auc: 0.929966	valid_1's binary_logloss: 0.19586
[600]	training's auc: 0.948802	training's binary_logloss: 0.172947	valid_1's auc: 0.937834	valid_1's binary_logloss: 0.182647
[800]	training's auc: 0.954543	training's binary_logloss: 0.163403	valid_1's auc: 0.942835	valid_1's binary_logloss: 0.174404
[1000]	training's auc: 0.959029	training's binary_logloss: 0.155504	valid_1's auc: 0.946847	valid_1's binary_logloss: 0.167637
[1200]	training's auc: 0.962622	training's binary_logloss: 0.149034	valid_1's auc: 0.949679	valid_1's binary_logloss: 0.162396
[1400]	training's auc: 0.965349	training's binary_logloss: 0.143989	valid_1's auc: 0.951823	valid_1's binary_logloss: 0.158464
[1600]	training's auc: 0.967636	training's binary_logl



Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-logloss:0.67261	train-auc:0.71013	valid-logloss:0.67267	valid-auc:0.70552
[100]	train-logloss:0.22463	train-auc:0.91689	valid-logloss:0.23107	valid-auc:0.90633
[200]	train-logloss:0.18160	train-auc:0.94388	valid-logloss:0.19120	valid-auc:0.93263
[300]	train-logloss:0.16421	train-auc:0.95384	valid-logloss:0.17559	valid-auc:0.94191
[400]	train-logloss:0.15363	train-auc:0.95971	valid-logloss:0.16665	valid-auc:0.94695
[500]	train-logloss:0.14517	train-auc:0.96411	valid-logloss:0.15977	valid-auc:0.95083
[600]	train-logloss:0.13872	train-auc:0.96737	valid-logloss:0.15481	valid-auc:0.95351
[700]	train-logloss:0.13305	train-auc:0.97021	valid-logloss:0.15095	valid-auc:0.95542
[

In [12]:
y_valid = y_train
y_pre = [int(x>0.2) for x in oof_ctb]
acc = [int(i==j) for i,j in zip(y_valid, y_pre)]
acc = sum(acc)/len(acc)
f1 = f1_score(y_valid, y_pre,average='binary')
precision = precision_score(y_valid, y_pre,average='binary')
recall = recall_score(y_valid, y_pre,average='binary')
roc_auc = roc_auc_score(y_valid, oof_ctb)
score = (f1+roc_auc)/2
print(f"roc_auc:{roc_auc}")
print(f"acc:{acc}")
print(f"F1 score: {f1}")
print(f"Precision score: {precision}")
print(f"Recall score: {recall}")
print(f"(F1+auc)/2: {score}")

roc_auc:0.7943847782260391
acc:0.9498926826513033
F1 score: 0.7044271375619509
Precision score: 0.8523483168715985
Recall score: 0.6002555184896018
(F1+auc)/2: 0.749405957893995


In [13]:
train_prob = pd.DataFrame()
train_prob['lgb'] = oof_lgb
train_prob['xgb'] = oof_xgb
train_prob['ctb'] = oof_ctb
train_prob['label'] = y_train

test_prob = pd.DataFrame()
test_prob['lgb'] = predictions_lgb
test_prob['xgb'] = predictions_xgb
test_prob['ctb'] = predictions_ctb

In [14]:
from sklearn.linear_model import LinearRegression

lrg = LinearRegression()

lrg.fit(train_prob[['lgb','xgb','ctb']], train_prob.label)
prob_y = lrg.predict(test_prob)

In [15]:
def get_prob(x):
    if x[0] == 1:
        return 0.5 + 0.1 * x[1]
    else:
        return 0.4 + 0.1 * x[1]
    
test_sub['related_p'] = prob_y
test_sub['rank'] = 1
test_sub['rank'] = test_sub.groupby('rank')['related_p'].rank(method='first', ascending=False)
test_sub['related_f1'] = 0

# 阈值换分，根据提交全1测试大概4570-4590之间
test_sub.loc[test_sub['rank'] < int(4584), 'related_f1'] = 1

test_sub['related_prob'] = test_sub[['related_f1', 'related_p']].apply(lambda x: get_prob(x), axis=1)
test_sub[['food_id', 'disease_id', 'related_prob']].to_csv('./results/submit_xgb_lgb_ctbv2.csv', index=False)