In [2]:

import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error


In [38]:
# 1. Read the prepared training and test tables from disk.
train_data = pd.read_csv('./dataset/train_final.csv')
test_data = pd.read_csv('./dataset/test_final.csv')


# 2. Separate every table into model inputs (features, X) and the
#    prediction target (y, the `loan_status` column).
train_X = train_data.drop(columns=['loan_status'])
train_y = train_data['loan_status']
test_X = test_data.drop(columns=['loan_status'])
test_y = test_data['loan_status']


# 3. LightGBM hyper-parameters for the baseline model.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # boosting algorithm
    # NOTE(review): an AUC metric is requested below, which suggests
    # loan_status is a 0/1 label -- confirm whether 'binary' was intended.
    'objective': 'regression',  # objective function
    # Evaluation metrics. A list (rather than a set) keeps the metric order
    # identical from run to run; set iteration order depends on string
    # hashing and is not stable across interpreter sessions.
    'metric': ['l2', 'auc'],
    'num_leaves': 31,  # number of leaves per tree
    'learning_rate': 0.8,  # shrinkage rate
    'force_row_wise': True,  # real boolean instead of the string 'true'
    'feature_fraction': 0.9,  # fraction of features sampled per tree
    'bagging_fraction': 0.8,  # fraction of rows sampled per tree
    'bagging_freq': 5,  # k means bagging is performed every k iterations
    'verbose': 1,  # <0 fatal only, =0 errors (warnings), >0 info
}


# 4. Wrap the feature/target pairs in LightGBM Dataset objects. The
#    evaluation set is created with the training Dataset as its reference.
lgb_train = lgb.Dataset(train_X, train_y)
lgb_eval = lgb.Dataset(test_X, test_y, reference=lgb_train)


# 5. Fit the baseline model on the training data.
# Early stopping and per-iteration logging are passed as callbacks: the
# `early_stopping_rounds` keyword of `lgb.train` was removed in LightGBM 4.0,
# while the callbacks are available from LightGBM 3.3 onwards.
# NOTE(review): the test split is also used as the early-stopping validation
# set, so the error reported below is likely optimistic -- consider a
# separate validation split.
my_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=20,
    valid_sets=[lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=5),
        lgb.log_evaluation(period=1),
    ],
)

# 6. Predict on the test features using the best iteration found.
predictions = my_model.predict(test_X, num_iteration=my_model.best_iteration)


# 7. Score the predictions with mean absolute error.
# scikit-learn's signature is (y_true, y_pred); the value is the same either
# way because MAE is symmetric, but the documented order is used here.
defaultMeanError = mean_absolute_error(test_y, predictions)
print("Mean Absolute Error : " + str(defaultMeanError))





# 8. Add derived columns.
# `added2` is a binary flag: 1 when `continuous_installment` is strictly
# greater than the threshold, otherwise 0. The comparison is vectorised
# instead of mapping a Python lambda over every row.
INSTALLMENT_FLAG_THRESHOLD = 380

train_X['added2'] = (
    train_X['continuous_installment'] > INSTALLMENT_FLAG_THRESHOLD
).astype('int64')
test_X['added2'] = (
    test_X['continuous_installment'] > INSTALLMENT_FLAG_THRESHOLD
).astype('int64')

# Other candidates that were tried and are currently disabled:
#   added1: 1 if continuous_dti > 20 else 0
#   added3: continuous_installment * continuous_installment
#   added4: continuous_installment * continuous_int_rate
#   added5: 1 if continuous_inq_last_6mths > 2 else 0


# LightGBM hyper-parameters for the model trained with the derived column.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # boosting algorithm
    # NOTE(review): an AUC metric is requested below, which suggests
    # loan_status is a 0/1 label -- confirm whether 'binary' was intended.
    'objective': 'regression',  # objective function
    # Evaluation metrics. A list (rather than a set) keeps the metric order
    # identical from run to run; set iteration order depends on string
    # hashing and is not stable across interpreter sessions.
    'metric': ['l1', 'auc'],
    'num_leaves': 5,  # number of leaves per tree
    'learning_rate': 0.55,  # shrinkage rate
    'force_row_wise': True,  # real boolean instead of the string 'true'
    'feature_fraction': 0.8,  # fraction of features sampled per tree
    'bagging_fraction': 0.8,  # fraction of rows sampled per tree
    'bagging_freq': 4,  # k means bagging is performed every k iterations
    'verbose': 1,  # <0 fatal only, =0 errors (warnings), >0 info
}


# 9. Rebuild the LightGBM Dataset objects so they include the derived column.
lgb_train = lgb.Dataset(train_X, train_y)
lgb_eval = lgb.Dataset(test_X, test_y, reference=lgb_train)


# 10. Fit the model on the training data that now carries the derived column.
# Early stopping and per-iteration logging are passed as callbacks: the
# `early_stopping_rounds` keyword of `lgb.train` was removed in LightGBM 4.0,
# while the callbacks are available from LightGBM 3.3 onwards.
my_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=20,
    valid_sets=[lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=5),
        lgb.log_evaluation(period=1),
    ],
)

# 11. Predict on the test features (with the derived column) using the best
#     iteration found.
predictions = my_model.predict(test_X, num_iteration=my_model.best_iteration)


# 12. Score the predictions with mean absolute error.
# scikit-learn's signature is (y_true, y_pred); the value is the same either
# way because MAE is symmetric, but the documented order is used here.
newMeanError = mean_absolute_error(test_y, predictions)
print("Mean Absolute Error : " + str(newMeanError))

# 13. Report how much the error dropped relative to the baseline run.
# NOTE(review): `params` is redefined with different hyper-parameters before
# this run, so the difference mixes the hyper-parameter change with the
# effect of the derived column -- confirm this is the intended comparison.
print("Improve : " + str(defaultMeanError - newMeanError))

[LightGBM] [Info] Total Bins 2611
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 141
[LightGBM] [Info] Start training from score 0.795760
[1]	valid_0's auc: 0.948928	valid_0's l2: 0.0648067
Training until validation scores don't improve for 5 rounds
[2]	valid_0's auc: 0.95041	valid_0's l2: 0.0617646
[3]	valid_0's auc: 0.952782	valid_0's l2: 0.0617944
[4]	valid_0's auc: 0.953968	valid_0's l2: 0.06205
[5]	valid_0's auc: 0.953989	valid_0's l2: 0.0620907
[6]	valid_0's auc: 0.953212	valid_0's l2: 0.0623673
[7]	valid_0's auc: 0.952315	valid_0's l2: 0.0627473
Early stopping, best iteration is:
[2]	valid_0's auc: 0.95041	valid_0's l2: 0.0617646
Mean Absolute Error : 0.12698311017879485
[LightGBM] [Info] Total Bins 2613
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 142
[LightGBM] [Info] Start training from score 0.795760
[1]	valid_0's auc: 0.932236	valid_0's l1: 0.215988
Training until validation scores don't