# Project One: 员工离职预测

#### step 1, 对数据进行探索

In [308]:
import pandas as pd
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import LinearSVC
from sklearn.preprocessing import MinMaxScaler


In [265]:

train = pd.read_csv('dataset/train.csv', index_col=0)
test = pd.read_csv('dataset/test.csv', index_col=0)
# print(train['Attrition'].value_counts())

# 处理 Attrition 字段
train['Attrition'] = train['Attrition'].map(lambda x:1 if x=='Yes' else 0)

# 查看数据中每列是否有空值
# print(train.isna().sum())

#### step2, 去掉无用特征，处理分类特征

In [266]:
print(train['StandardHours'].value_counts())

StandardHours
80    1176
Name: count, dtype: int64


In [267]:
# 去掉无用的列，员工号码，标准工时（=80)
train = train.drop(['EmployeeNumber', 'StandardHours'], axis=1)
test = test.drop(['EmployeeNumber', 'StandardHours'], axis=1)

In [268]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 294 entries, 442 to 1229
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       294 non-null    int64 
 1   BusinessTravel            294 non-null    object
 2   DailyRate                 294 non-null    int64 
 3   Department                294 non-null    object
 4   DistanceFromHome          294 non-null    int64 
 5   Education                 294 non-null    int64 
 6   EducationField            294 non-null    object
 7   EmployeeCount             294 non-null    int64 
 8   EnvironmentSatisfaction   294 non-null    int64 
 9   Gender                    294 non-null    object
 10  HourlyRate                294 non-null    int64 
 11  JobInvolvement            294 non-null    int64 
 12  JobLevel                  294 non-null    int64 
 13  JobRole                   294 non-null    object
 14  JobSatisfaction           29

In [269]:
# 对于分类特征进行特征值编码
attr=['Age','BusinessTravel','Department','Education','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime']
lbe_list=[]
for feature in attr:
    lbe=LabelEncoder()
    train[feature]=lbe.fit_transform(train[feature])
    test[feature]=lbe.transform(test[feature])
    lbe_list.append(lbe)

In [270]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 294 entries, 442 to 1229
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Age                       294 non-null    int64
 1   BusinessTravel            294 non-null    int64
 2   DailyRate                 294 non-null    int64
 3   Department                294 non-null    int64
 4   DistanceFromHome          294 non-null    int64
 5   Education                 294 non-null    int64
 6   EducationField            294 non-null    int64
 7   EmployeeCount             294 non-null    int64
 8   EnvironmentSatisfaction   294 non-null    int64
 9   Gender                    294 non-null    int64
 10  HourlyRate                294 non-null    int64
 11  JobInvolvement            294 non-null    int64
 12  JobLevel                  294 non-null    int64
 13  JobRole                   294 non-null    int64
 14  JobSatisfaction           294 non-null    in

In [271]:
train.to_csv('dataset/train_label_encoder.csv')
test.to_csv('dataset/test_label_encoder.csv')

In [306]:
train_load = pd.read_csv('dataset/train_label_encoder.csv', index_col=0)
test_load = pd.read_csv('dataset/test_label_encoder.csv', index_col=0)

X_train, X_valid, y_train, y_valid = train_test_split(train_load.drop('Attrition',axis=1), train_load['Attrition'], test_size=0.2, random_state=2023)

## LR

In [301]:

model = LogisticRegression(max_iter=100, 
                           verbose=True, 
                           random_state=33,
                           tol=1e-4
                          )

model.fit(X_train, y_train)

# 二分类结果， 0 或者 1
# predict = model.predict(test_load)
# print(predict)

# 二分类任务，有 2 个概率值，label=0 的概率，label=1 的概率
predict = model.predict_proba(test_load)[:, 1]
print('\nPredict: \n{}'.format(predict))

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           33     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.51558D+02    |proj g|=  4.40316D+06

At iterate   50    f=  3.80176D+02    |proj g|=  5.79065D+04


 This problem is unconstrained.



At iterate  100    f=  3.65940D+02    |proj g|=  5.36072D+03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
   33    100    116      1     0     0   5.361D+03   3.659D+02
  F =   365.93961248786218     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 

Predict: 
[0.1188237  0.35698911 0.26744238 0.09076416 0.36492531 0.24040345
 0.09114984 0.08645963 0.0472586  0.21236341 0.03074305 0.42434077
 0.06909767 0.17419313 0.05966228 0.10779347 0.07809651 0.11039349
 0.10721182 0.32849862 0.17779864 0.02407476 0.15552488 0.16607751
 0.35725175 0.14760679 0.22492627 0.05034543 0.37735218 0.12043144
 0.05058955 0.0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [302]:
test_load['Attrition']=predict
test_load[['Attrition']].to_csv('dataset/submit_lr.csv')
print('submit_lr.csv saved')

submit_lr.csv saved


In [None]:
# 转化为二分类输出
#test_load['Attrition']=test_load['Attrition'].map(lambda x:1 if x>=0.5 else 0)
#test_load[['Attrition']].to_csv('dataset/submit_lr.csv')

## SVM 支持向量机

In [111]:
# 三种 SVM 方法

# sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=1, decision_function_shape='ovr', random_state=None)

# sklearn.svm.NuSVC(nu=0.5, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=1, decision_function_shape='ovr', random_state=None)

# sklearn.svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)

In [309]:
mms = MinMaxScaler(feature_range=(0, 1))
X_train = mms.fit_transform(X_train)
X_valid = mms.fit_transform(X_valid)
test_load = mms.fit_transform(test_load)

In [310]:

model = LinearSVC(max_iter=1000,
                  random_state=33,
                  verbose=True)

model.fit(X_train, y_train)
predict = model.predict(test_load)
print('\nPredict\n:{}'.format(predict))

[LibLinear]...............................*
optimization finished, #iter = 319
Objective value = -380.794014
nSV = 637

Predict
:[0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]




In [305]:
test_load['Attrition'] = predict
test_load[['Attrition']].to_csv('dataset/submit_svc.csv')
print('submit_svc.csv saved')

submit_svc.csv saved


课程地址： 待更新...

欢迎关注「坍缩的奇点」， 获取更多免费教程。

![欢迎订阅：坍缩的奇点](../assets/Capture-2023-11-02-164446.png)