In [1]:
from sklearn.feature_extraction import DictVectorizer

import pandas as pd

In [2]:
train = pd.read_csv("train.csv", index_col=0)
test = pd.read_csv("test.csv", index_col=0)

In [3]:
# 查看数据情况
train.head(2)

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1374,58,No,Travel_Rarely,605,Sales,21,3,Life Sciences,1,1938,...,3,80,1,29,2,2,1,0,0,0
1092,45,No,Travel_Rarely,950,Research & Development,28,3,Technical Degree,1,1546,...,4,80,1,8,3,3,5,4,0,3


In [4]:
# 查看离职人数情况(label)
train["Attrition"].value_counts()

No     988
Yes    188
Name: Attrition, dtype: int64

In [5]:
# 处理label, 分类数据(string) ==> number
train["Attrition"] = train["Attrition"].map(lambda x: 1 if x == "Yes" else 0)
train.head(1)

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1374,58,0,Travel_Rarely,605,Sales,21,3,Life Sciences,1,1938,...,3,80,1,29,2,2,1,0,0,0


In [6]:
# 检查是否有空值
train.isna().sum()

# 去掉没用的列 员工号码，标准工时（=80）
train = train.drop(["EmployeeNumber", "StandardHours"], axis=1)
test = test.drop(["EmployeeNumber", "StandardHours"], axis=1)

# 对于分类特征进行特征值编码

In [7]:
from sklearn.preprocessing import LabelEncoder

# 对于分类特征进行特征值编码
attr = [
    "Age",
    "BusinessTravel",
    "Department",
    "Education",
    "EducationField",
    "Gender",
    "JobRole",
    "MaritalStatus",
    "Over18",
    "OverTime",
]
lbe_list = []
for feature in attr:
    lbe = LabelEncoder()
    train[feature] = lbe.fit_transform(train[feature])
    test[feature] = lbe.transform(test[feature])
    lbe_list.append(lbe)

In [8]:
train.head()

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1374,40,0,2,605,2,21,2,1,1,4,...,3,3,1,29,2,2,1,0,0,0
1092,27,0,2,950,1,28,2,5,1,4,...,4,4,1,8,3,3,5,4,0,3
768,22,0,2,300,2,26,2,2,1,3,...,3,2,1,8,3,2,7,7,7,5
569,18,0,0,1434,2,8,3,1,1,1,...,3,2,0,10,1,3,10,7,0,9
911,7,1,1,599,2,24,0,1,1,3,...,3,4,0,1,4,3,1,0,1,0


# 划分训练集/测试集

In [9]:
from sklearn.model_selection import train_test_split

# 将原始数据按照比例分割为“测试集”和“训练集”
# X_train 训练样本(不包含label)
# y_train 训练样本标签
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop("Attrition", axis=1), train["Attrition"], test_size=0.2, random_state=42
)

# 使用LR模型

In [10]:
from sklearn.linear_model.logistic import LogisticRegression



In [15]:
model = LogisticRegression(max_iter=100, verbose=True, random_state=33, tol=1e-4)

# 训练模型
model.fit(X_train, y_train)

# 预测, 结果只取lable为1的概率. 默认会返回每行(每个样本)预测为 0 & 1的概率.
predict = model.predict_proba(test)[:, 1]
predict

In [23]:
test["Attrition"] = predict
test.head()

Unnamed: 0_level_0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
442,18,0,635,2,10,3,3,1,2,1,...,4,0,10,3,2,10,3,9,7,0.107286
1091,15,2,575,1,25,2,1,1,4,1,...,4,0,5,2,3,5,3,0,2,0.328308
981,17,1,662,2,18,3,2,1,4,0,...,3,1,5,0,2,4,2,3,2,0.240434
785,22,2,1492,1,20,3,5,1,1,1,...,4,1,14,6,3,11,10,11,1,0.072648
1332,11,1,459,1,24,1,1,1,4,1,...,2,0,1,3,2,1,0,1,0,0.350869


In [24]:
test["Attrition"].to_csv("submition_lr.csv")

In [None]:
# 转化为二分类输出
# test['Attrition']=test['Attrition'].map(lambda x:1 if x>=0.5 else 0)
# test[['Attrition']].to_csv('submit_lr.csv')