In [2]:
import numpy as np
import pandas as pd
from joblib import load
from processed_feature_mapping import mapping
from train_model import data_cleaning

In [3]:
# 1a. Score the training split and export it in cleaned form.
# Per the original note, the required processing was already done in
# train_model.py; this cell only loads the persisted artefacts and applies them.
column_transformer = load('./preprocessor/column_transformer.pkl')
label_encoder = load('./preprocessor/label_encoder.pkl')
RF_clf = load('./model/RF_clf.joblib')

# Read the raw training split and run it through the shared cleaning step.
df = data_cleaning(pd.read_csv('../data/raw_split_data/employee_train.csv'))

# Separate the features from the 'Attrition' label column.
X_train = df.drop(columns=['Attrition'])
y_train = df['Attrition']

# Transform the features, predict, and map the encoded predictions back
# through the label encoder.
X_train_processed = column_transformer.transform(X_train)
y_train_pred = RF_clf.predict(X_train_processed)
y_train_pred_inverse = label_encoder.inverse_transform(y_train_pred)

# Append the decoded predictions, expose the label column as 'target',
# and write the result next to the other data files.
df = (
    df
    .assign(prediction=y_train_pred_inverse)
    .rename(columns={'Attrition': 'target'})
)
df.to_csv('../data/cleaned_employee_train.csv', index=False)

# 1b. Output the required cleaned/processed data for test dataset
# Relies on column_transformer, label_encoder and RF_clf loaded in step 1a.
test_df = pd.read_csv("../data/raw_split_data/employee_test.csv")
test_df = data_cleaning(test_df)

# Separate the features from the 'Attrition' label column.
X_test = test_df.drop(columns=['Attrition'])
y_test = test_df['Attrition']

# Transform the features, wrap the result in a DataFrame and pass it through
# `mapping` (imported from processed_feature_mapping) before exporting it.
# NOTE(review): `mapping` presumably restores readable feature names on the
# transformed columns -- confirm against processed_feature_mapping.py.
X_test_processed = column_transformer.transform(X_test)
X_test_processed = pd.DataFrame.from_records(X_test_processed)
X_test_processed = mapping(X_test_processed, column_transformer)
X_test_processed.to_csv('../data/X_test_processed.csv', index=False)

# Predict and map the encoded predictions back through the label encoder.
y_test_pred = RF_clf.predict(X_test_processed)
y_test_pred_inverse = label_encoder.inverse_transform(y_test_pred)

# predict_proba returns one row per sample and one column per class.
# The previous slice `[:1]` kept only the first sample's row; `[:, 1]`
# selects the second class column for every sample instead.
# NOTE(review): column 1 is presumably the positive (attrition) class --
# confirm the column order against RF_clf.classes_.
y_test_pred_prob = RF_clf.predict_proba(X_test_processed)[:, 1]

# Append the decoded predictions, expose the label column as 'target',
# and write the cleaned test split.
test_df['prediction'] = y_test_pred_inverse
test_df.rename(columns={'Attrition' : 'target'}, inplace=True)
test_df.to_csv('../data/cleaned_employee_test.csv', index=False)

In [9]:
# Inspect the column -> dtype mapping of the training frame after its
# 'Attrition' column has been renamed to 'target'.
train_dtypes = df.dtypes
train_dtypes.to_dict()

{'Age': dtype('int64'),
 'target': dtype('O'),
 'BusinessTravel': dtype('int64'),
 'DailyRate': dtype('int64'),
 'Department': dtype('O'),
 'DistanceFromHome': dtype('int64'),
 'Education': dtype('int64'),
 'EducationField': dtype('O'),
 'EnvironmentSatisfaction': dtype('int64'),
 'Gender': dtype('O'),
 'HourlyRate': dtype('int64'),
 'JobInvolvement': dtype('int64'),
 'JobLevel': dtype('int64'),
 'JobRole': dtype('O'),
 'JobSatisfaction': dtype('int64'),
 'MaritalStatus': dtype('O'),
 'MonthlyIncome': dtype('int64'),
 'MonthlyRate': dtype('int64'),
 'NumCompaniesWorked': dtype('int64'),
 'OverTime': dtype('O'),
 'PercentSalaryHike': dtype('int64'),
 'PerformanceRating': dtype('int64'),
 'RelationshipSatisfaction': dtype('int64'),
 'StockOptionLevel': dtype('int64'),
 'TotalWorkingYears': dtype('int64'),
 'TrainingTimesLastYear': dtype('int64'),
 'WorkLifeBalance': dtype('int64'),
 'YearsAtCompany': dtype('int64'),
 'YearsInCurrentRole': dtype('int64'),
 'YearsSinceLastPromotion': dtype