# Project One: Employee Turnover Forecasting

#### step 1, explore the data

In [44]:
import pandas as pd
from sklearn.linear_model._logistic import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import LinearSVC
from sklearn.preprocessing import MinMaxScaler

import os
import warnings

dp = os.environ.get('DATA_PATH') + 'AI_Cheats/'
warnings.filterwarnings('ignore')

In [45]:
train = pd.read_csv(dp + 'employee_turnover/train.csv', index_col=0)
test = pd.read_csv(dp + 'employee_turnover/test.csv', index_col=0)

# print(train['Attrition'].value_counts())

# Process the 'Attrition' field
train['Attrition'] = train['Attrition'].map(lambda x: 1 if x == 'Yes' else 0)

# Check if there are any missing values in each column
# print(train.isna().sum())

#### step2, remove useless features, process classification features

In [46]:
print(train['StandardHours'].value_counts())

80    1176
Name: StandardHours, dtype: int64


In [47]:
# Drop useless columns, EmployeeNumber, StandardHours (=80)
train = train.drop(['EmployeeNumber', 'StandardHours'], axis=1)
test = test.drop(['EmployeeNumber', 'StandardHours'], axis=1)

In [49]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 294 entries, 442 to 1229
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       294 non-null    int64 
 1   BusinessTravel            294 non-null    object
 2   DailyRate                 294 non-null    int64 
 3   Department                294 non-null    object
 4   DistanceFromHome          294 non-null    int64 
 5   Education                 294 non-null    int64 
 6   EducationField            294 non-null    object
 7   EmployeeCount             294 non-null    int64 
 8   EnvironmentSatisfaction   294 non-null    int64 
 9   Gender                    294 non-null    object
 10  HourlyRate                294 non-null    int64 
 11  JobInvolvement            294 non-null    int64 
 12  JobLevel                  294 non-null    int64 
 13  JobRole                   294 non-null    object
 14  JobSatisfaction        

(294, 32)

In [50]:
# Eigenvalue encoding for categorical features
attr=['Age','BusinessTravel','Department','Education','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime']
lbe_list=[]
for feature in attr:
    lbe=LabelEncoder()
    train[feature]=lbe.fit_transform(train[feature])
    test[feature]=lbe.transform(test[feature])
    lbe_list.append(lbe)

In [51]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 294 entries, 442 to 1229
Data columns (total 32 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Age                       294 non-null    int64
 1   BusinessTravel            294 non-null    int64
 2   DailyRate                 294 non-null    int64
 3   Department                294 non-null    int64
 4   DistanceFromHome          294 non-null    int64
 5   Education                 294 non-null    int64
 6   EducationField            294 non-null    int64
 7   EmployeeCount             294 non-null    int64
 8   EnvironmentSatisfaction   294 non-null    int64
 9   Gender                    294 non-null    int64
 10  HourlyRate                294 non-null    int64
 11  JobInvolvement            294 non-null    int64
 12  JobLevel                  294 non-null    int64
 13  JobRole                   294 non-null    int64
 14  JobSatisfaction           294 non-null 

In [52]:
train.to_csv(dp + 'employee_turnover/train_label_encoder.csv')
test.to_csv(dp + 'employee_turnover/test_label_encoder.csv')

In [60]:
train_load = pd.read_csv(dp + 'employee_turnover/train_label_encoder.csv', index_col=0)
test_load = pd.read_csv(dp + 'employee_turnover/test_label_encoder.csv', index_col=0)

X_train, X_valid, y_train, y_valid = train_test_split(train_load.drop('Attrition',axis=1), train_load['Attrition'], test_size=0.2, random_state=2024)

## LR

In [54]:
model = LogisticRegression(max_iter=100, 
                           verbose=True, 
                           random_state=2024,
                           tol=1e-4
                          )

model.fit(X_train, y_train)

# Binary classification result, 0 or 1
# predict = model.predict(test_load)
# print(predict)

# Binary classification task, with 2 probability values: the probability for label=0 and label=1
predict = model.predict_proba(test_load)[:, 1]
print('\nPredict: \n{}'.format(predict))

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           33     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.93147D-01    |proj g|=  4.79184D+03

At iterate   50    f=  4.07559D-01    |proj g|=  1.23471D+00

At iterate  100    f=  3.95693D-01    |proj g|=  4.22680D+00

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
   33    100    122      1     0     0   4.227D+00   3.957D-01
  F =  0.39569270566385484     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 

Predict: 
[0.09569216 0.33722098 0.26765036 0.08178544 0.38732371 0.2

 This problem is unconstrained.


In [56]:
test_load['Attrition'] = predict
test_load[['Attrition']].to_csv(dp + 'employee_turnover/submit_lr.csv')
print('submit_lr.csv saved')

submit_lr.csv saved


In [37]:
# Convert to binary classification output
#test_load['Attrition'] = test_load['Attrition'].map(lambda x: 1 if x >= 0.5 else 0)
#test_load[['Attrition']].to_csv('dataset/submit_lr.csv')

## SVM (Support Vector Machine)

In [61]:
# Three SVM Methods

# sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=1, decision_function_shape='ovr', random_state=None)

# sklearn.svm.NuSVC(nu=0.5, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=1, decision_function_shape='ovr', random_state=None)

# sklearn.svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)

In [62]:
mms = MinMaxScaler(feature_range=(0, 1))
X_train = mms.fit_transform(X_train)
X_valid = mms.fit_transform(X_valid)
Y_test = mms.fit_transform(test_load)

In [63]:
X_train.shape

(940, 32)

In [64]:

model = LinearSVC(max_iter=1000,
                  random_state=33,
                  verbose=True)

model.fit(X_train, y_train)
predict = model.predict(Y_test)
print('\nPredict\n:{}'.format(predict))

[LibLinear]iter  1 act 4.358e+02 pre 4.358e+02 delta 2.541e-01 f 9.400e+02 |g| 3.430e+03 CG   1
cg reaches trust region boundary
iter  2 act 4.791e+01 pre 4.791e+01 delta 5.731e-01 f 5.042e+02 |g| 2.423e+02 CG   1
cg reaches trust region boundary
iter  3 act 5.192e+01 pre 4.635e+01 delta 6.917e-01 f 4.563e+02 |g| 2.785e+02 CG   3
cg reaches trust region boundary
iter  4 act 1.604e+01 pre 1.391e+01 delta 8.292e-01 f 4.044e+02 |g| 1.273e+02 CG   5
cg reaches trust region boundary
iter  5 act 4.172e+00 pre 3.797e+00 delta 9.209e-01 f 3.884e+02 |g| 6.714e+01 CG   8
iter  6 act 8.779e-01 pre 8.635e-01 delta 9.209e-01 f 3.842e+02 |g| 2.235e+01 CG  10
iter  7 act 2.629e-02 pre 2.624e-02 delta 9.209e-01 f 3.833e+02 |g| 2.757e+00 CG   8
iter  8 act 1.439e-03 pre 1.440e-03 delta 9.209e-01 f 3.833e+02 |g| 2.317e-01 CG  12

Predict
:[0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 

In [65]:
# predict_series = pd.Series(predict, index = test_load.index)
# submission_df = pd.DataFrame({'Attrition':predict})

test_load['Attrition'] = predict
test_load[['Attrition']].to_csv(dp + 'employee_turnover/submit_svc.csv')
# print('submit_svc.csv saved')