In [1]:
# dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# read in csv
file_path = "resources/IBM_employee_data.csv"
attrition_df = pd.read_csv(file_path)
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# determine if any missing values and data types for columns
attrition_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [4]:
# column names
attrition_df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [5]:
# ambiguous/unnecessary columns - only going to keep MonthlyIncome
attrition_df[["DailyRate", "HourlyRate", "MonthlyIncome", "MonthlyRate", "StandardHours", "Over18"]]

Unnamed: 0,DailyRate,HourlyRate,MonthlyIncome,MonthlyRate,StandardHours,Over18
0,1102,94,5993,19479,80,Y
1,279,61,5130,24907,80,Y
2,1373,92,2090,2396,80,Y
3,1392,56,2909,23159,80,Y
4,591,40,3468,16632,80,Y
...,...,...,...,...,...,...
1465,884,41,2571,12290,80,Y
1466,613,42,9991,21457,80,Y
1467,155,87,6142,5174,80,Y
1468,1023,63,5390,13243,80,Y


In [6]:
# count of attrition column values
attrition_df["Attrition"].value_counts()

No     1233
Yes     237
Name: Attrition, dtype: int64

Drop unnecessary columns

In [7]:
columns_to_drop = ["DailyRate", "HourlyRate", "MonthlyRate", "StandardHours", "EmployeeCount", "EmployeeNumber", "Over18"]

In [8]:
attrition_df.drop(columns=columns_to_drop, inplace=True)
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,Female,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,Male,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,Male,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,Female,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,Male,3,...,3,4,1,6,3,3,2,2,2,2


Encode categorical data

In [9]:
attrition_cat = attrition_df.dtypes[attrition_df.dtypes == "object"].index.tolist()
attrition_cat

['Attrition',
 'BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'OverTime']

In [10]:
attrition_df[attrition_cat].nunique()

Attrition         2
BusinessTravel    3
Department        3
EducationField    6
Gender            2
JobRole           9
MaritalStatus     3
OverTime          2
dtype: int64

In [11]:
attrition_cat.remove("Attrition")
attrition_cat

['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'OverTime']

In [12]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(attrition_df[attrition_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(attrition_cat)
encode_df.head()

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [13]:
# Merge one-hot encoded features and drop the originals
attrition_df = attrition_df.merge(encode_df,left_index=True, right_index=True)
attrition_df = attrition_df.drop(attrition_cat,1)
attrition_df.head()
#attrition_df.Attrition.loc('Yes')

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Age,Attrition,DistanceFromHome,Education,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,41,Yes,1,2,2,3,2,4,5993,8,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,49,No,8,1,3,2,2,2,5130,1,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,37,Yes,2,2,4,2,1,3,2090,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,33,No,3,4,4,3,1,3,2909,1,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,27,No,2,1,1,3,1,2,3468,9,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [14]:
# Create our target
y = attrition_df["Attrition"]

y.value_counts()

No     1233
Yes     237
Name: Attrition, dtype: int64

Split data in training/test groups

In [15]:
# Split our preprocessed data into our features and target arrays
X = attrition_df.drop("Attrition",1)
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78, test_size= 0.20, train_size= 0.80)

  


In [16]:
type(X)

pandas.core.frame.DataFrame

Standardize

In [17]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

Model

In [18]:
Forest = RandomForestClassifier(random_state=40, n_estimators=100)
Forest.fit(X_train_scaled, y_train)
print(f'Training Score: {Forest.score(X_train_scaled, y_train)}')
print(f'Testing Score: {Forest.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.8673469387755102


In [19]:
y_true = y_test
y_pred_test = Forest.predict(X_test)
y_pred_test


array(['No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', '

In [20]:
accuracy_score(y_true, y_pred_test)

0.8537414965986394

In [21]:
confusion_matrix(y_true, y_pred_test)

array([[251,   1],
       [ 42,   0]])

In [22]:
print(classification_report(y_true, y_pred_test))

              precision    recall  f1-score   support

          No       0.86      1.00      0.92       252
         Yes       0.00      0.00      0.00        42

    accuracy                           0.85       294
   macro avg       0.43      0.50      0.46       294
weighted avg       0.73      0.85      0.79       294



In [23]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [24]:
# check version number
import imblearn
print(imblearn.__version__)

0.7.0


In [25]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler# define oversampling strategy
# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')
# fit and apply the transform
X_over, y_over = oversample.fit_resample(X, y)
# summarize class distribution
print(Counter(y_over))

Counter({'Yes': 1233, 'No': 1233})


In [26]:
# Split the preprocessed data into a training and testing dataset
X_traino, X_testo, y_traino, y_testo = train_test_split(X_over, y_over, random_state=78, test_size= 0.20, train_size= 0.80)

In [27]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scalero = scaler.fit(X_traino)

# Scale the data
X_train_scaledo = X_scaler.transform(X_traino)
X_test_scaledo = X_scaler.transform(X_testo)

In [29]:
Foresto = RandomForestClassifier(random_state=67, n_estimators=2000)
Foresto.fit(X_train_scaledo, y_traino)
print(f'Training Score: {Foresto.score(X_train_scaledo, y_traino)}')
print(f'Testing Score: {Foresto.score(X_test_scaledo, y_testo)}')

Training Score: 1.0
Testing Score: 0.951417004048583


In [30]:
y_testo

313      No
1034     No
357     Yes
129      No
368     Yes
       ... 
2237    Yes
19       No
76       No
1779    Yes
2379    Yes
Name: Attrition, Length: 494, dtype: object

In [31]:
y_trueo = y_testo
y_pred_testo = Foresto.predict(X_testo)
#y_pred_testo

In [32]:
confusion_matrix(y_trueo, y_pred_testo)

array([[260,   0],
       [234,   0]])

In [33]:
accuracy_score(y_trueo, y_pred_testo)

0.5263157894736842

In [34]:
print(classification_report(y_trueo, y_pred_testo))

              precision    recall  f1-score   support

          No       0.53      1.00      0.69       260
         Yes       0.00      0.00      0.00       234

    accuracy                           0.53       494
   macro avg       0.26      0.50      0.34       494
weighted avg       0.28      0.53      0.36       494



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
never =0
ever=0
for k in y_trueo:
    if k == 'No':
        never=never+1
    if k== 'Yes':
        ever=ever+1
print(f'yes {ever}, no {never}')

yes 234, no 260


In [36]:
hever =0
lever=0
for k in y_pred_testo:
    if k == 'No':
        hever=hever+1
    if k== 'Yes':
        lever=lever+1
print(f'yes {lever}, no {hever}')

yes 0, no 494


In [37]:
feature_importances = Forest.feature_importances_

In [None]:
features = sorted(zip(X.columns, Forest.feature_importances_), key = lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

fig, ax = plt.subplots()

fig.set_size_inches(10,200)
plt.margins(y=0.001)

ax.barh(y=cols, width=width)

plt.show()

In [None]:
dic = {'data': [X], 'target': y, 'feature_names': X.columns, 'target_name': 'attrition'}

In [None]:
y_test

In [None]:
i=0
nope=0
for g in y_test:
    if g == 'Yes':
        i=i+1
    else:
        nope= nope+1


In [None]:
print(f'i={i} n={nope}')