# LightGBM

In [146]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [147]:
train = pd.read_csv('dataset/train.csv', index_col=0)
test = pd.read_csv('dataset/test.csv', index_col=0)
# print(train['Attrition'].value_counts())

# 处理 Attrition 字段
train['Attrition'] = train['Attrition'].map(lambda x:1 if x=='Yes' else 0)

# 查看数据中每列是否有空值
# print(train.isna().sum())

In [148]:
# 去掉没用的列 员工号码，标准工时（=80）
train = train.drop(['EmployeeNumber', 'StandardHours'], axis=1)
test = test.drop(['EmployeeNumber', 'StandardHours'], axis=1)

In [149]:
# 对于分类特征进行特征值编码
attr=['Age','BusinessTravel','Department','Education','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime']

In [156]:
label_encoder = LabelEncoder()

for col in attr:
    train[col] = label_encoder.fit_transform(train[col])
    test[col] = label_encoder.transform(test[col])

X_train, X_valid, y_train, y_valid = train_test_split(train.drop('Attrition', axis=1), train['Attrition'], test_size=0.2, random_state=2023)

In [185]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary', 
    'eta': 0.01,
    'max_depth': 6,
    'num_leaves': 8,  # 根据需要调整
    'colsample_bytree': 0.8,
    'subsample': 0.9,
    'subsample_freq': 8,
    'alpha': 0.6,
    'lambda': 0,
    'device_type': 'cpu',
    'force_row_wise': True
}

trn_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_valid, label=y_valid)

model = lgb.train(params, trn_data, num_boost_round=100, valid_sets=[val_data], feature_name='auto', categorical_feature='auto', keep_training_booster=False)

# 使用训练好的模型进行预测
y_pred = model.predict(X_valid, num_iteration=model.best_iteration)

# 将概率值转换为类别标签
threshold = 0.5  # 设置阈值
y_pred_binary = np.where(y_pred > threshold, 1, 0)

# 计算准确率
accuracy = accuracy_score(y_valid, y_pred_binary)
print(f"准确率：{accuracy}")

[LightGBM] [Info] Number of positive: 153, number of negative: 787
[LightGBM] [Info] Total Bins 1128
[LightGBM] [Info] Number of data points in the train set: 940, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.162766 -> initscore=-1.637790
[LightGBM] [Info] Start training from score -1.637790
准确率：0.864406779661017


In [178]:
2**15

32768