In [1]:
%pip install import-ipynb

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Importing the needed Libraries
import import_ipynb
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB


import warnings
warnings.filterwarnings('ignore')

In [3]:
# Reading Stored values from other .ipynb file
%store -r X
%store -r y

In [4]:
X.head()

Unnamed: 0,Age,Gender,EducationBackground,MaritalStatus,EmpDepartment,EmpJobRole,BusinessTravelFrequency,DistanceFromHome,EmpEducationLevel,EmpEnvironmentSatisfaction,...,EmpLastSalaryHikePercent,EmpRelationshipSatisfaction,TotalWorkExperienceInYears,TrainingTimesLastYear,EmpWorkLifeBalance,ExperienceYearsAtThisCompany,ExperienceYearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,32,1,2,2,5,13,2,10,3,4,...,12,4,10,2,2,10,7,0,8,0
1,47,1,2,2,5,13,2,14,4,4,...,12,4,20,2,3,7,7,1,7,0
2,40,1,1,1,5,13,1,5,4,4,...,21,3,20,2,3,18,13,1,12,0
3,41,1,0,0,3,8,2,10,4,2,...,15,2,23,2,2,21,6,12,6,0
4,60,1,2,2,5,13,2,16,4,1,...,14,4,10,1,3,2,2,2,2,0


In [5]:
y.head()

Unnamed: 0,PerformanceRating
0,3
1,3
2,4
3,3
4,3


# Logistic Regression

In [6]:
# Splitting the X and y into Test and Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


feature_selector = SelectKBest(score_func=chi2, k=10)
X_train = feature_selector.fit_transform(X_train, y_train)
X_test = feature_selector.transform(X_test)

In [7]:
# Fine tuning the parameters
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg_model = LogisticRegression()
parameters = {'solver':['liblinear', 'lbfgs', 'sag'],
             'C':[0.1,1,5,2]}

grid_model = GridSearchCV(reg_model, parameters, cv=5, scoring='accuracy', error_score=0)
grid_model.fit(X_train,y_train)

GridSearchCV(cv=5, error_score=0, estimator=LogisticRegression(),
             param_grid={'C': [0.1, 1, 5, 2],
                         'solver': ['liblinear', 'lbfgs', 'sag']},
             scoring='accuracy')

In [8]:
# Viewing the best parameters from the Search CV for fine tuning parameters
grid_model.best_params_

{'C': 5, 'solver': 'liblinear'}

In [9]:
# Viewing the best Score from the Search CV for fine tuning parameters
grid_model.best_score_

0.8421559268098647

In [10]:
# Logistic Regression modeling and evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
reg_model = LogisticRegression(solver='liblinear' ,C=5)
reg_model.fit(X_train, y_train)

LogisticRegression(C=5, solver='liblinear')

In [11]:
reg_df = pd.concat([X_test, y_test], axis=1)
%store reg_model
%store reg_df

Stored 'reg_model' (LogisticRegression)
Stored 'reg_df' (DataFrame)


# Random Forest Classifier

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Random Forest Classifier
rfc_model = RandomForestClassifier(random_state=42)

# Train the model
rfc_model.fit(X_train, y_train)

# Calculate feature importances
feature_importances = rfc_model.feature_importances_

# Create a DataFrame to store feature importance scores along with corresponding feature names
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Sort the DataFrame by feature importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the top 10 important features
top_k_features = 10
top_features = feature_importance_df.head(top_k_features)

print("Top", top_k_features, "features:")
print(top_features)

# Select the top K features for further analysis
selected_features = top_features['Feature'].tolist()
X_train = X_train[selected_features]
X_test= X_test[selected_features]

Top 10 features:
                         Feature  Importance
16      EmpLastSalaryHikePercent    0.241547
9     EmpEnvironmentSatisfaction    0.199806
23       YearsSinceLastPromotion    0.106140
22  ExperienceYearsInCurrentRole    0.043889
5                     EmpJobRole    0.036006
10                 EmpHourlyRate    0.032357
4                  EmpDepartment    0.031001
21  ExperienceYearsAtThisCompany    0.026876
7               DistanceFromHome    0.026859
20            EmpWorkLifeBalance    0.024379


In [13]:
# Spliiting Train and Test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rfc_model = RandomForestClassifier(n_estimators=50)
parameters = {"max_depth": [3,10,8],
              "max_features": [6,9,12,15],
              "criterion": ["gini","log_loss"]}

rscv_model = RandomizedSearchCV (rfc_model, param_distributions=parameters,cv=3)
rscv_model.fit(X_train,y_train)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(n_estimators=50),
                   param_distributions={'criterion': ['gini', 'log_loss'],
                                        'max_depth': [3, 10, 8],
                                        'max_features': [6, 9, 12, 15]})

In [14]:
# Viewing the best parameters from the Search CV for fine tuning parameters
rscv_model.best_params_

{'max_features': 6, 'max_depth': 10, 'criterion': 'gini'}

In [15]:
# Viewing the best Score from the Search CV for fine tuning parameters
rscv_model.best_score_

0.9456366237482117

In [16]:
# Spliiting Train and Test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Modle Training and evaluation
rfc_model = RandomForestClassifier(max_features=6,max_depth= 10,criterion='gini')
rfc_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, max_features=6)

In [17]:
rfc_df = pd.concat([X_test, y_test], axis=1)
%store rfc_model
%store rfc_df

Stored 'rfc_model' (RandomForestClassifier)
Stored 'rfc_df' (DataFrame)


# Decision Tree Classifier

In [18]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Random Forest Classifier
dtc_model = DecisionTreeClassifier(random_state=42)

# Train the model
dtc_model.fit(X_train, y_train)

# Calculate feature importances
feature_importances = dtc_model.feature_importances_

# Create a DataFrame to store feature importance scores along with corresponding feature names
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Sort the DataFrame by feature importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the top 10 important features
top_k_features = 10
top_features = feature_importance_df.head(top_k_features)

print("Top", top_k_features, "features:")
print(top_features)

# Select the top K features for further analysis (optional)
selected_features = top_features['Feature'].tolist()
X_train = X_train[selected_features]
X_test= X_test[selected_features]

Top 10 features:
                         Feature  Importance
16      EmpLastSalaryHikePercent    0.299059
9     EmpEnvironmentSatisfaction    0.245824
23       YearsSinceLastPromotion    0.138450
22  ExperienceYearsInCurrentRole    0.060815
5                     EmpJobRole    0.047253
20            EmpWorkLifeBalance    0.032761
7               DistanceFromHome    0.026996
10                 EmpHourlyRate    0.022690
17   EmpRelationshipSatisfaction    0.016596
6        BusinessTravelFrequency    0.014188


In [19]:
# Splitting the X and y into Test and Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dtc_model = DecisionTreeClassifier()
parameters = {'max_depth':[10,15,5,3],
             'criterion': ['gini','entropy']}

rscv_model = RandomizedSearchCV (dtc_model, param_distributions=parameters,cv=5)
rscv_model.fit(X_train,y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [10, 15, 5, 3]})

In [20]:
# Viewing the best parameters from the Search CV for fine tuning parameters
rscv_model.best_params_

{'max_depth': 10, 'criterion': 'entropy'}

In [21]:
# Viewing the best Score from the Search CV for fine tuning parameters
rscv_model.best_score_

0.9213115126718947

In [22]:
# Splitting the X and y into Test and Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Modle Training and evaluation
dtc_model = DecisionTreeClassifier(criterion='entropy',max_depth=10)
dtc_model.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=10)

In [23]:
dtc_df = pd.concat([X_test, y_test], axis=1)
%store dtc_model
%store dtc_df

Stored 'dtc_model' (DecisionTreeClassifier)
Stored 'dtc_df' (DataFrame)


# K Nearest Neighbors Classifier

In [24]:
# Splitting the X and y into Test and Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection
feature_selector = SelectKBest(score_func=chi2, k=10)
X_train = feature_selector.fit_transform(X_train, y_train)
X_test = feature_selector.transform(X_test)

In [25]:
# Splitting the X and y into Test and Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
knn_model = KNeighborsClassifier()
parameters = {"n_neighbors": [3, 5, 7, 10, 15],
              "weights": ['uniform', 'distance'],
              "algorithm": ['auto','ball_tree','kd_tree','brute']}
rscv_model = RandomizedSearchCV (knn_model, param_distributions=parameters,cv=3)
rscv_model.fit(X_train,y_train)

RandomizedSearchCV(cv=3, estimator=KNeighborsClassifier(),
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'n_neighbors': [3, 5, 7, 10, 15],
                                        'weights': ['uniform', 'distance']})

In [26]:
# Viewing the best parameters from the Search CV for fine tuning parameters
rscv_model.best_params_

{'weights': 'distance', 'n_neighbors': 3, 'algorithm': 'brute'}

In [27]:
# Viewing the best Score from the Search CV for fine tuning parameters
rscv_model.best_score_

0.8140200286123033

In [28]:
# Splitting the X and y into Test and Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=37)

# Modle Training and evaluation
knn_model = KNeighborsClassifier(n_neighbors=3,algorithm = 'auto', weights = 'uniform')
knn_model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [29]:
knn_df = pd.concat([X_test, y_test], axis=1)
%store knn_model
%store knn_df

Stored 'knn_model' (KNeighborsClassifier)
Stored 'knn_df' (DataFrame)


# Multinomial Naive Bayes

In [30]:
# Splitting the X and y into Test and Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection
feature_selector = SelectKBest(score_func=chi2, k=10)
X_train = feature_selector.fit_transform(X_train, y_train)
X_test = feature_selector.transform(X_test)

In [31]:
# Splitting the X and y into Test and Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
mnb_model = MultinomialNB()
parameters = {"alpha": [0.1,1,10,12,0]}
rscv_model = RandomizedSearchCV (mnb_model, param_distributions=parameters,cv=3)
rscv_model.fit(X_train,y_train)

RandomizedSearchCV(cv=3, estimator=MultinomialNB(),
                   param_distributions={'alpha': [0.1, 1, 10, 12, 0]})

In [32]:
# Viewing the best parameters from the Search CV for fine tuning parameters
rscv_model.best_params_

{'alpha': 0.1}

In [33]:
# Viewing the best Score from the Search CV for fine tuning parameters
rscv_model.best_score_

0.6003814973772056

In [34]:
# Multinomial Naive Bayes modeling and evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
nb_model_Multi = MultinomialNB(alpha=0.1)
nb_model_Multi.fit(X_train, y_train)

MultinomialNB(alpha=0.1)

In [35]:
nb_Multi_df = pd.concat([X_test, y_test], axis=1)
%store nb_model_Multi
%store nb_Multi_df

Stored 'nb_model_Multi' (MultinomialNB)
Stored 'nb_Multi_df' (DataFrame)


# Gaussian Naive Bayes

In [36]:
# Splitting the X and y into Test and Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10, random_state=42)

# Feature Selection
feature_selector = SelectKBest(score_func=chi2, k=10)
X_train = feature_selector.fit_transform(X_train, y_train)
X_test = feature_selector.transform(X_test)

In [37]:
# Splitting the X and y into Test and Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10, random_state=42)
mnb_model = GaussianNB()
parameters = {"var_smoothing": [0.1,1,10,12,0]}
rscv_model = RandomizedSearchCV (mnb_model, param_distributions=parameters,cv=3)
rscv_model.fit(X_train,y_train)

RandomizedSearchCV(cv=3, estimator=GaussianNB(),
                   param_distributions={'var_smoothing': [0.1, 1, 10, 12, 0]})

In [38]:
# Viewing the best parameters from the Search CV for fine tuning parameters
rscv_model.best_params_

{'var_smoothing': 0}

In [39]:
# Viewing the best Score from the Search CV for fine tuning parameters
rscv_model.best_score_

0.8032102968798801

In [40]:
# Multinomial Naive Bayes modeling and evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=12, random_state=30)
nb_model_Gauss = GaussianNB(var_smoothing=0)
nb_model_Gauss.fit(X_train, y_train)

GaussianNB(var_smoothing=0)

In [41]:
nb_Gauss_df = pd.concat([X_test, y_test], axis=1)
%store nb_model_Gauss
%store nb_Gauss_df

Stored 'nb_model_Gauss' (GaussianNB)
Stored 'nb_Gauss_df' (DataFrame)


# Bernoulli Naive Bayes

In [42]:
# Splitting the X and y into Test and Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10, random_state=42)

# Feature selection
feature_selector = SelectKBest(score_func=chi2, k=10)
X_train = feature_selector.fit_transform(X_train, y_train)
X_test = feature_selector.transform(X_test)

In [43]:
# Splitting the X and y into Test and Train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10, random_state=42)
bnb_model = BernoulliNB()
parameters = {'alpha': [0.1, 0.5, 1.0, 1.5]}
rscv_model = RandomizedSearchCV (bnb_model, param_distributions=parameters,cv=3)
rscv_model.fit(X_train,y_train)

RandomizedSearchCV(cv=3, estimator=BernoulliNB(),
                   param_distributions={'alpha': [0.1, 0.5, 1.0, 1.5]})

In [44]:
# Viewing the best parameters from the Search CV for fine tuning parameters
rscv_model.best_params_

{'alpha': 1.0}

In [45]:
# Viewing the best Score from the Search CV for fine tuning parameters
rscv_model.best_score_

0.6018343296778705

In [46]:
# Multinomial Naive Bayes modeling and evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=15, random_state=15)
nb_model_Bern = BernoulliNB(alpha=1.0)
nb_model_Bern.fit(X_train, y_train)

BernoulliNB()

In [47]:
nb_Bern_df = pd.concat([X_test, y_test], axis=1)
%store nb_model_Bern
%store nb_Bern_df

Stored 'nb_model_Bern' (BernoulliNB)
Stored 'nb_Bern_df' (DataFrame)
