In [1]:
"""
Author: librauee
WeChat official account: 老肥码码码
Date: 2020.12.18
Online (leaderboard) score: 0.8003
Rank as of that date: 1
"""
# NOTE(review): LogisticRegression, cross_val_score, preprocessing and tqdm are
# not referenced in the cells visible here — confirm against the rest of the
# notebook before pruning them.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from tqdm import tqdm
import pandas as pd
import numpy as np
import warnings

# Installs a global "ignore" filter with no category or module restriction, so
# every Python warning raised after this point is suppressed — including
# deprecation warnings emitted by the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
# Read the raw data, then strip the columns that are not model inputs
# (the row identifier and, for the training set, the target).
test = pd.read_csv('test_noLabel.csv')
train = pd.read_csv('train.csv')

# The target column has to be label-encoded (turned into integer codes).
# `lb` is kept around so predicted codes can be mapped back to the original
# label values when the submission file is written.
lb = LabelEncoder()
y = lb.fit_transform(train['Label'])
train = train.assign(Label=y)

# Feature matrices: everything except the identifier / target columns.
X_test = test.drop(columns=['ID'])
X_train = train.drop(columns=['ID', 'Label'])

In [3]:
# Cast the categorical features to the pandas `category` dtype.
cat_cols = [
    'Contract',
    'Dependents',
    'DeviceProtection',
    'InternetService',
    'MultipleLines',
    'Partner',
    'PaymentMethod',
    'PhoneService',
    'TVProgram',
    'gender',
]

# Same conversion for both feature matrices, applied in place on each frame.
for _frame in (X_train, X_test):
    _frame[cat_cols] = _frame[cat_cols].astype('category')

In [4]:
# LightGBM model, stratified 5-fold cross-validation.
#
# For every seed the training rows are split into N_SPLITS stratified folds.
# Each fold trains a booster on the remaining folds, early-stops on the held-out
# fold, writes out-of-fold (OOF) probabilities into `oof_lgb`, and adds its
# share of the test-set probabilities to `predictions_lgb`, which therefore
# ends up as the mean over all folds and seeds.
features = X_train.columns

N_SPLITS = 5                 # folds per seed; also the divisor for the test average
num_round = 10000            # upper bound on boosting rounds (early stopping ends sooner)
EARLY_STOPPING_ROUNDS = 200  # stop after this many rounds without validation improvement
LOG_PERIOD = 500             # print evaluation results every LOG_PERIOD rounds

# The round budget is passed once, via `num_boost_round`; keeping a duplicate
# `num_iterations` entry in `params` only makes LightGBM warn about the alias.
params = {
    'objective': 'binary',
    'metric': 'binary_error',
}

# `lgb.train(..., verbose_eval=, early_stopping_rounds=)` was removed in
# LightGBM 4.0; the callbacks below are the equivalent that works on both old
# and new releases. `log_evaluation` was called `print_evaluation` before 3.3.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

predictions_lgb = np.zeros(len(X_test))

seeds = [2028]
for seed in seeds:
    KF = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)
    oof_lgb = np.zeros(len(X_train))  # reset per seed: ACC below is reported per seed
    for fold_, (trn_idx, val_idx) in enumerate(KF.split(X_train.values, y)):
        print("fold n°{}".format(fold_))
        X_trn = X_train.iloc[trn_idx][features]
        X_val = X_train.iloc[val_idx][features]

        # Categorical columns are declared on the Dataset (the supported place
        # for it) rather than through the deprecated `train()` argument.
        trn_data = lgb.Dataset(X_trn, label=y[trn_idx], categorical_feature=cat_cols)
        val_data = lgb.Dataset(
            X_val,
            label=y[val_idx],
            categorical_feature=cat_cols,
            reference=trn_data,  # validation data must share the training set's binning
        )

        clf = lgb.train(
            params,
            trn_data,
            num_boost_round=num_round,
            valid_sets=[trn_data, val_data],
            callbacks=[
                lgb.early_stopping(EARLY_STOPPING_ROUNDS),
                _log_evaluation(LOG_PERIOD),
            ],
        )

        # Predict with the best iteration found by early stopping.
        oof_lgb[val_idx] = clf.predict(X_val, num_iteration=clf.best_iteration)
        predictions_lgb += (
            clf.predict(X_test[features], num_iteration=clf.best_iteration)
            / N_SPLITS / len(seeds)
        )
    # OOF accuracy at a 0.5 probability threshold.
    print("ACC: {}".format(accuracy_score(y, (oof_lgb >= 0.5).astype(int))))

fold n°0
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[38]	training's binary_error: 0.187515	valid_1's binary_error: 0.231358
fold n°1
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[20]	training's binary_error: 0.19469	valid_1's binary_error: 0.243786
fold n°2
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[23]	training's binary_error: 0.198231	valid_1's binary_error: 0.21244
fold n°3
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[28]	training's binary_error: 0.193926	valid_1's binary_error: 0.208612
fold n°4
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[12]	training's binary_error: 0.206839	valid_1's binary_error: 0.209569
ACC: 0.7788406351635737


In [5]:
# Submission: threshold the averaged test probabilities at 0.5, map the
# resulting 0/1 codes back to the original label values, and write them into
# the provided submission template.
submit = pd.read_csv('submit_example.csv')
pred_codes = (predictions_lgb >= 0.5).astype(int)
submit['Label'] = lb.inverse_transform(pred_codes)
submit.to_csv('submit_LGB.csv', index=False)