In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#SK-Learn
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import time
from sklearn.feature_selection import RFE, f_regression
from pprint import pprint
from sklearn.model_selection import GridSearchCV,cross_val_score,train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,recall_score,precision_score,f1_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_data = pd.read_csv('Train_Dataset_(12.csv')
test_data = pd.read_csv('Test_Dataset_(1)_(1).csv')

In [3]:
train_data.head()

Unnamed: 0,EmployeeID,Age,TravelProfile,Department,HomeToWork,EducationField,Gender,HourlnWeek,Involvement,WorkLifeBalance,...,ESOPs,NumCompaniesWorked,OverTime,SalaryHikelastYear,WorkExperience,LastPromotion,CurrentProfile,MaritalStatus,MonthlyIncome,Attrition
0,5110001,35.0,Rarely,Analytics,5.0,CA,Male,69.0,1,1,...,1,1,1,20.0,7.0,2.0,,M,18932.0,1
1,5110002,32.0,Yes,Sales,5.0,Statistics,Female,62.0,4,3,...,0,8,0,20.0,4.0,1.0,,Single,18785.0,1
2,5110003,31.0,Rarely,Analytics,5.0,Statistics,F,45.0,5,3,...,1,3,0,26.0,12.0,1.0,3.0,Single,22091.0,0
3,5110004,34.0,Yes,Sales,10.0,Statistics,Female,32.0,3,2,...,1,1,0,23.0,5.0,1.0,3.0,Divorsed,20302.0,1
4,5110005,37.0,No,Analytics,27.0,Statistics,Female,49.0,3,4,...,1,8,0,21.0,12.0,1.0,9.0,Divorsed,21674.0,0


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5180 entries, 0 to 5179
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EmployeeID          5180 non-null   int64  
 1   Age                 4864 non-null   float64
 2   TravelProfile       5180 non-null   object 
 3   Department          5056 non-null   object 
 4   HomeToWork          4925 non-null   float64
 5   EducationField      5180 non-null   object 
 6   Gender              5134 non-null   object 
 7   HourlnWeek          4893 non-null   float64
 8   Involvement         5180 non-null   int64  
 9   WorkLifeBalance     5180 non-null   int64  
 10  Designation         5142 non-null   object 
 11  JobSatisfaction     5180 non-null   int64  
 12  ESOPs               5180 non-null   int64  
 13  NumCompaniesWorked  5180 non-null   int64  
 14  OverTime            5180 non-null   int64  
 15  SalaryHikelastYear  5011 non-null   float64
 16  WorkEx

In [5]:
train_data.dropna(subset=['Attrition'],inplace=True)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5180 entries, 0 to 5179
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EmployeeID          5180 non-null   int64  
 1   Age                 4864 non-null   float64
 2   TravelProfile       5180 non-null   object 
 3   Department          5056 non-null   object 
 4   HomeToWork          4925 non-null   float64
 5   EducationField      5180 non-null   object 
 6   Gender              5134 non-null   object 
 7   HourlnWeek          4893 non-null   float64
 8   Involvement         5180 non-null   int64  
 9   WorkLifeBalance     5180 non-null   int64  
 10  Designation         5142 non-null   object 
 11  JobSatisfaction     5180 non-null   int64  
 12  ESOPs               5180 non-null   int64  
 13  NumCompaniesWorked  5180 non-null   int64  
 14  OverTime            5180 non-null   int64  
 15  SalaryHikelastYear  5011 non-null   float64
 16  WorkEx

In [6]:
X_df = train_data.drop(columns = ['EmployeeID','Attrition'])

In [7]:
X_df.head()

Unnamed: 0,Age,TravelProfile,Department,HomeToWork,EducationField,Gender,HourlnWeek,Involvement,WorkLifeBalance,Designation,JobSatisfaction,ESOPs,NumCompaniesWorked,OverTime,SalaryHikelastYear,WorkExperience,LastPromotion,CurrentProfile,MaritalStatus,MonthlyIncome
0,35.0,Rarely,Analytics,5.0,CA,Male,69.0,1,1,Executive,1,1,1,1,20.0,7.0,2.0,,M,18932.0
1,32.0,Yes,Sales,5.0,Statistics,Female,62.0,4,3,Executive,2,0,8,0,20.0,4.0,1.0,,Single,18785.0
2,31.0,Rarely,Analytics,5.0,Statistics,F,45.0,5,3,Manager,2,1,3,0,26.0,12.0,1.0,3.0,Single,22091.0
3,34.0,Yes,Sales,10.0,Statistics,Female,32.0,3,2,Manager,4,1,1,0,23.0,5.0,1.0,3.0,Divorsed,20302.0
4,37.0,No,Analytics,27.0,Statistics,Female,49.0,3,4,Manager,4,1,8,0,21.0,12.0,1.0,9.0,Divorsed,21674.0


In [8]:
X_df.isna().sum()

Age                   316
TravelProfile           0
Department            124
HomeToWork            255
EducationField          0
Gender                 46
HourlnWeek            287
Involvement             0
WorkLifeBalance         0
Designation            38
JobSatisfaction         0
ESOPs                   0
NumCompaniesWorked      0
OverTime                0
SalaryHikelastYear    169
WorkExperience        187
LastPromotion          70
CurrentProfile        311
MaritalStatus           0
MonthlyIncome          93
dtype: int64

In [9]:
X_df.select_dtypes(include=np.number)

Unnamed: 0,Age,HomeToWork,HourlnWeek,Involvement,WorkLifeBalance,JobSatisfaction,ESOPs,NumCompaniesWorked,OverTime,SalaryHikelastYear,WorkExperience,LastPromotion,CurrentProfile,MonthlyIncome
0,35.0,5.0,69.0,1,1,1,1,1,1,20.0,7.0,2.0,,18932.0
1,32.0,5.0,62.0,4,3,2,0,8,0,20.0,4.0,1.0,,18785.0
2,31.0,5.0,45.0,5,3,2,1,3,0,26.0,12.0,1.0,3.0,22091.0
3,34.0,10.0,32.0,3,2,4,1,1,0,23.0,5.0,1.0,3.0,20302.0
4,37.0,27.0,49.0,3,4,4,1,8,0,21.0,12.0,1.0,9.0,21674.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5175,36.0,13.0,53.0,3,3,3,0,4,1,22.0,11.0,1.0,5.0,22142.0
5176,,9.0,66.0,1,4,4,1,9,0,23.0,8.0,0.0,2.0,17109.0
5177,29.0,,62.0,4,2,1,1,6,0,19.0,8.0,0.0,3.0,17532.0
5178,26.0,26.0,61.0,3,4,3,0,1,0,28.0,8.0,5.0,7.0,17942.0


In [10]:
X_df.select_dtypes(exclude=np.number)

Unnamed: 0,TravelProfile,Department,EducationField,Gender,Designation,MaritalStatus
0,Rarely,Analytics,CA,Male,Executive,M
1,Yes,Sales,Statistics,Female,Executive,Single
2,Rarely,Analytics,Statistics,F,Manager,Single
3,Yes,Sales,Statistics,Female,Manager,Divorsed
4,No,Analytics,Statistics,Female,Manager,Divorsed
...,...,...,...,...,...,...
5175,Rarely,Analytics,CA,F,Manager,Single
5176,Rarely,Marketing,CA,Female,Executive,Single
5177,Rarely,Analytics,CA,Female,Executive,M
5178,Rarely,Marketing,Statistics,Female,Executive,Divorsed


In [11]:
binary_num_cols = ['ESOPs','OverTime']
ordinal_num_cols = ['Involvement', 'WorkLifeBalance', 'JobSatisfaction']

In [12]:
X = train_data.drop(columns=['Attrition', 'EmployeeID'])
y = train_data['Attrition']

In [13]:
for col in ordinal_num_cols:
    X[col] = X[col].astype('category')
    test_data[col] = test_data[col].astype('category')

In [14]:
for col in binary_num_cols:
    X[col] = X[col].astype('object')
    test_data[col] = test_data[col].astype('object')

In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5180 entries, 0 to 5179
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   Age                 4864 non-null   float64 
 1   TravelProfile       5180 non-null   object  
 2   Department          5056 non-null   object  
 3   HomeToWork          4925 non-null   float64 
 4   EducationField      5180 non-null   object  
 5   Gender              5134 non-null   object  
 6   HourlnWeek          4893 non-null   float64 
 7   Involvement         5180 non-null   category
 8   WorkLifeBalance     5180 non-null   category
 9   Designation         5142 non-null   object  
 10  JobSatisfaction     5180 non-null   category
 11  ESOPs               5180 non-null   object  
 12  NumCompaniesWorked  5180 non-null   int64   
 13  OverTime            5180 non-null   object  
 14  SalaryHikelastYear  5011 non-null   float64 
 15  WorkExperience      4993 non-null   fl

In [16]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2630 entries, 0 to 2629
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   EmployeeID          2630 non-null   int64   
 1   Age                 2488 non-null   float64 
 2   TravelProfile       2630 non-null   object  
 3   Department          2572 non-null   object  
 4   HomeToWork          2504 non-null   float64 
 5   EducationField      2630 non-null   object  
 6   Gender              2600 non-null   object  
 7   HourlnWeek          2494 non-null   float64 
 8   Involvement         2630 non-null   category
 9   WorkLifeBalance     2630 non-null   category
 10  Designation         2600 non-null   object  
 11  JobSatisfaction     2630 non-null   category
 12  ESOPs               2630 non-null   object  
 13  NumCompaniesWorked  2630 non-null   int64   
 14  OverTime            2630 non-null   object  
 15  SalaryHikelastYear  2536 non-null   fl

In [17]:
cat_cols = [col for col in X.select_dtypes(include=['object']).columns.tolist() if col not in ordinal_num_cols]
num_cols = X.select_dtypes(exclude=['object', 'category']).columns.tolist()

In [18]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [19]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='N.A')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [20]:
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())])

In [21]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
        ('ord', ordinal_transformer, ordinal_num_cols)])

In [22]:
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(random_state=42,))])
# model = Pipeline(steps=[('preprocessor', preprocessor),
#                         ('classifier', GaussianNB())])

In [23]:
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__criterion' : ["gini", "entropy", "log_loss"],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

In [24]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [26]:
print("Best parameters found: ", grid_search.best_params_)
best_model = grid_search.best_estimator_

Best parameters found:  {'classifier__criterion': 'gini', 'classifier__max_depth': 20, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 300}


In [27]:
y_pred = best_model.predict(X_val)
print('Validation Accuracy:', accuracy_score(y_val, y_pred))#0.9884169884169884

Validation Accuracy: 0.9874517374517374


In [28]:
X_test = test_data.drop(columns=['EmployeeID'])
test_data['Attrition'] = best_model.predict(X_test)

In [29]:
submission = test_data[['EmployeeID', 'Attrition']]
submission.to_csv('submission.csv', index=False)
print("Submission file created.")

Submission file created.


In [30]:
from google.colab import files
files.download('submission.csv')

ModuleNotFoundError: No module named 'google'

In [None]:
submission = test_data[['EmployeeID', 'Attrition']]
submission.to_csv('submission.csv', index=False)
print("Submission file created.")