In [None]:
import pandas as pd

#Model imports
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

#model dumping
from joblib import dump
import os

In [None]:
# Load the dataset from the project's data folder into a DataFrame
data = pd.read_csv(filepath_or_buffer='../data/data.csv')

1. Random Forest classification: chosen because it is expected to cope with the class imbalance (relatively few employees who left) without the need for additional techniques such as oversampling or undersampling
2. Metrics: 
- accuracy
- ROC/AUC
- precision/recall
- f1

In [None]:
# Preview the first rows only instead of rendering the whole DataFrame
data.head()

In [None]:
# Split the dataset, build the random-forest pipeline and tune it with a
# cross-validated grid search.

# Separate the target column from the feature columns
X = data.drop(columns='Attrition')
y = data['Attrition']

# Hold out 30% of the rows for testing. Stratifying on the target keeps the
# class proportions of the full dataset in both the train and the test split,
# which matters because the target classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Model pipeline: a single step named 'rf' (the name prefixes the grid keys below)
pipeline = Pipeline([
    ('rf', RandomForestClassifier(random_state=42))
])

# Grid of hyperparameters to search: 3 x 3 x 3 x 3 = 81 candidate combinations
param_grid = {
    'rf__n_estimators': [100, 200, 300],
    'rf__max_depth': [None, 5, 10],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated grid search (81 candidates x 5 folds = 405 fits).
# n_jobs=-1 runs the fits in parallel on all available cores; the fixed
# random_state on the forest keeps the outcome reproducible.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score (accuracy) — confirm this is the intended
# selection metric for an imbalanced target.
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the grid search on the training split only
grid_search.fit(X_train, y_train)

# Best hyperparameter combination found by the search
best_params = grid_search.best_params_

print(f"Best hyperparameters: {best_params}")


In [None]:
# Show the selected hyperparameters (the dict taken from grid_search.best_params_) as cell output
best_params

In [None]:
#TODO: the model above should be built as a pipeline, saved, and imported
#later for use in evaluation

In [12]:
# Best pipeline found by the grid search. GridSearchCV refits the best
# estimator on the whole training set by default (refit=True), so it is
# already fitted here and a second fit on X_train/y_train would only repeat
# that work.
best_model = grid_search.best_estimator_

# Predictions on the held-out test set
y_pred = best_model.predict(X_test)


To make the model reusable, we save the fitted model (trained on this dataset) to disk so it can be loaded for future tasks

In [None]:
# Destination for the serialised model: <folder>/best_model.joblib
folder = '../data'
model_file = os.path.join(folder, 'best_model.joblib')

# Write the fitted model to disk with joblib
dump(best_model, filename=model_file)

print(f"Pipeline saved as '{model_file}'")

In [None]:
# Evaluate the test-set predictions
accuracy = accuracy_score(y_test, y_pred)

# Per-class precision / recall / f1 text report. It is stored under its own
# name: assigning it to `classification_report` would replace the imported
# function with a string and make this cell raise a TypeError when re-run.
report = classification_report(y_test, y_pred)

print(accuracy)
print(report)