In [1]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
%matplotlib inline

In [2]:
# Read the IBM HR attrition CSV from the local data folder and preview it.
csv_path = 'data/IBM HR Analytics Employee Attrition & Performance.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# (number of columns, number of rows)
df.shape[1], df.shape[0]

(35, 1470)

In [52]:
# Hold the column labels in a Series with a positional 0..n-1 index.
cols = pd.Series(data=df.columns)
cols.head(n=5)

0               Age
1         Attrition
2    BusinessTravel
3         DailyRate
4        Department
dtype: object

In [113]:
import re

# Describe every column by the type name of its first element, e.g. "int64"
# or "str": the "<class '...'>" wrapper and the "numpy." prefix are stripped
# from the type's string form.
# `.iloc[0]` picks the first row by position, so this does not rely on the
# frame having a row labelled 0.
# The names are gathered in a list and wrapped in a Series once, which avoids
# the private `Series._append` API and re-copying the Series on every pass.
col_dtypes = pd.Series(
    [
        re.sub(r"<class '|'>", "", str(type(df[col].iloc[0]))).replace('numpy.', '')
        for col in cols
    ],
    dtype=object,
)
col_dtypes.head()

0    int64
1      str
2      str
3    int64
4      str
dtype: object

In [138]:
# Collect up to three example values per column:
#   * non-text columns: three values taken from randomly chosen rows
#   * text columns:     the first three distinct values, in order of appearance
# Row positions are drawn over the number of ROWS (`len(df)`); drawing over
# the number of columns would only ever look at the first few dozen rows.
# A seeded generator makes the chosen examples identical on every run.
rng = np.random.default_rng(42)
col_examples = []

for col, dtype_name in zip(cols, col_dtypes):
    if 'str' in dtype_name:
        col_examples.append(list(df[col].unique())[:3])
    else:
        row_positions = rng.integers(len(df), size=3)
        # .iloc selects by position; .tolist() yields plain Python scalars.
        col_examples.append(df[col].iloc[row_positions].tolist())

col_examples[:5]

[[34, 44, 32],
 ['Yes', 'No'],
 ['Travel_Rarely', 'Travel_Frequently', 'Non-Travel'],
 [1005, 125, 591],
 ['Sales', 'Research & Development', 'Human Resources']]

In [163]:
# One row per dataset column: its name, element type and a few sample values.
feature_columns = {
    'Feature': cols,
    'Datatype': col_dtypes,
    'Example Values': col_examples,
}
feature_df = pd.DataFrame(feature_columns)
feature_df.head()

Unnamed: 0,Feature,Datatype,Example Values
0,Age,int64,"[34, 44, 32]"
1,Attrition,str,"[Yes, No]"
2,BusinessTravel,str,"[Travel_Rarely, Travel_Frequently, Non-Travel]"
3,DailyRate,int64,"[1005, 125, 591]"
4,Department,str,"[Sales, Research & Development, Human Resources]"


In [164]:
# Save the feature overview next to the notebook, without the row index.
feature_df.to_csv(path_or_buf='Description of Data Features.csv', index=False)

In [None]:
# Create the ydata-profiling report object for the dataset.
profile = ProfileReport(
    df,
    title='Profiling Report',
    dark_mode=True,
)

In [None]:
# Optional: uncomment the next line to write the profiling report to an HTML file.
# profile.to_file('my_report.html')

In [None]:
# Scatter plot of the DistanceFromHome column against the YearsAtCompany column.
fig, ax = plt.subplots()
ax.scatter(df['DistanceFromHome'], df['YearsAtCompany'])
# Label the axes and title the figure so it can be read on its own.
ax.set(
    xlabel='DistanceFromHome',
    ylabel='YearsAtCompany',
    title='YearsAtCompany vs. DistanceFromHome',
)
plt.show()

In [None]:
# Example DataFrame
data = {
    'A': ['dog', 'cat', 'dog', 'bird'],
    'B': [1, 2, 3, 4]
}
df2 = pd.DataFrame(data)
# The replacement fields below use double quotes inside the single-quoted
# f-strings: reusing the outer quote character inside `{...}` is a
# SyntaxError on Python versions before 3.12.
print(f'str column before factorization: \n{df2["A"]}')
# Compare the column dtype printed above with the type of a single element
# printed below: each element is a plain Python str.
print(f'first element of str columne type: {type(df2["A"].iloc[0])}')

# Dictionary to store the mappings: column name -> {integer code: original label}
mappings = {}

# Encoding string columns to numeric
for column in df2.select_dtypes(include=['object']).columns:
    # factorize returns the integer codes plus the distinct labels; the label
    # at position k is the one that was encoded as k.
    df2[column], mapping = pd.factorize(df2[column])
    mappings[column] = dict(enumerate(mapping))

print('\nOriginal DataFrame:')
print(pd.DataFrame(data))
print('\nEncoded DataFrame:')
print(df2)
print('\nMappings:')
print(mappings)

## Let's apply our knowledge to the original data.

In [None]:
# Show the first five rows of the dataset.
df.head(n=5)

In [None]:
# We already have the DataFrame.

# Encode every object-dtype column of `df` as integer codes, in place, and
# record per column which code stands for which original label.
encoded_mappings = {}

for column in df.select_dtypes(include='object').columns:
    codes, labels = df[column].factorize()
    df[column] = codes
    encoded_mappings[column] = dict(enumerate(labels))

# This cell overwrites `df`, so a second run finds no object columns left.
# Only replace `mappings` when something was actually encoded; otherwise the
# lookup built by the first run would be silently wiped by an empty dict.
if encoded_mappings:
    mappings = encoded_mappings

In [None]:
# Show the first five rows of the dataset.
df.head(n=5)

In [None]:
# Display the code -> label lookup stored for each encoded column.
mappings

In [None]:
# Separate the target column (y) from the remaining feature columns (X)
target = 'Attrition'
y = df[target]
X = df.drop(columns=[target])

In [None]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so every run produces the same split (and the
# scores below stay comparable between runs); stratify=y keeps the class
# proportions of the target the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [None]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Instantiate a Random Forest classifier with default hyperparameters.
# A fixed random_state makes the fitted forest reproducible between runs.
clf = RandomForestClassifier(random_state=42)

In [None]:
# Train the classifier on the training split
clf.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted classifier on the held-out test split
clf.score(X=X_test, y=y_test)

In [None]:
# Class probabilities (one column per class) and hard class labels for the test split
y_probs = clf.predict_proba(X=X_test)
y_preds = clf.predict(X=X_test)

### Different classification metrics

In [None]:
# Accuracy: share of test rows whose predicted label equals the true label
from sklearn.metrics import accuracy_score
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_preds)
print(test_accuracy)

In [None]:
# Receiver Operating Characteristic (ROC curve)/Area under curve (AUC)
from sklearn.metrics import roc_curve, roc_auc_score

# Predicted probability of the class in the second predict_proba column.
positive_scores = y_probs[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, positive_scores)
# The AUC is computed from the same continuous scores as the curve. Passing
# the hard 0/1 predictions instead collapses the curve to a single operating
# point and reports a different (usually lower) area.
print(f'Area under ROC: {roc_auc_score(y_test, positive_scores)}')
print(f'FPR: {false_positive_rate}, \nTPR: {true_positive_rate}, \nThresholds: {thresholds}')

In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_preds)
print(conf_matrix)

In [None]:
# Text report of the per-class classification metrics on the test split
from sklearn.metrics import classification_report
report_text = classification_report(y_true=y_test, y_pred=y_preds)
print(report_text)

### Let's try to improve the model through hyperparameters.

In [None]:
# Fresh classifier with default settings; list every hyperparameter it exposes
clf = RandomForestClassifier()
clf.get_params(deep=True)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search settings, named once so the code and its description cannot drift apart.
SEARCH_SEED = 42   # fixed seed: same split, same forests, same sampled candidates every run
N_ITER = 10000     # number of hyperparameter combinations sampled from the grid
CV_FOLDS = 3       # every combination is scored with 3-fold cross-validation

# Define a grid of hyperparameters
grid = {'n_estimators': list(range(10, 200, 10)),
        'max_depth': [None] + list(range(5, 50, 5)),
        'max_features': (['sqrt', 'log2'] + list(range(1, 20, 2))),
        'min_samples_split': list(range(2, 20, 2)),
        'min_samples_leaf': list(range(1, 10, 1))}

# Split into train (80%) and test (20%) sets; stratify=y keeps the class
# proportions of the target the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEARCH_SEED, stratify=y
)

# Use all cores
clf = RandomForestClassifier(n_jobs=-1, random_state=SEARCH_SEED)

# Setup RandomizedSearchCV
# NOTE: this fits N_ITER * CV_FOLDS = 30,000 forests plus one final refit on
# the whole training set, so expect a long run; lower N_ITER for a quick pass.
rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid,
                            n_iter=N_ITER,
                            cv=CV_FOLDS,
                            random_state=SEARCH_SEED,
                            verbose=2) # Print out the results

# Fit the RandomizedSearchCV version of clf
rs_clf.fit(X_train, y_train)

# Find the best hyperparameters
print(rs_clf.best_params_)

# Scoring automatically uses the best hyperparameters
rs_clf.score(X_test, y_test)

In [None]:
import pickle

# Serialize the fitted search object to disk. The `with` block closes the
# file handle (flushing the data) even if pickling raises; a bare `open()`
# passed straight to `pickle.dump` is never explicitly closed.
with open('random_forest_model_1.pkl', 'wb') as model_file:
    pickle.dump(rs_clf, model_file)