In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report,f1_score,r2_score
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
data = pd.read_csv('H:\DATA\MY\practice\Practice-28\watson_healthcare_modified.csv')
data.head()

Unnamed: 0,EmployeeID,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,...,RelationshipSatisfaction,StandardHours,Shift,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1313919,41,No,Travel_Rarely,1102,Cardiology,1,2,Life Sciences,1,...,1,80,0,8,0,1,6,4,0,5
1,1200302,49,No,Travel_Frequently,279,Maternity,8,1,Life Sciences,1,...,4,80,1,10,3,3,10,7,1,7
2,1060315,37,Yes,Travel_Rarely,1373,Maternity,2,2,Other,1,...,2,80,0,7,3,3,0,0,0,0
3,1272912,33,No,Travel_Frequently,1392,Maternity,3,4,Life Sciences,1,...,3,80,0,8,3,3,8,7,3,0
4,1414939,27,No,Travel_Rarely,591,Maternity,2,1,Medical,1,...,4,80,1,6,3,3,2,2,2,2


In [3]:
print(data.info())
print(data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1676 entries, 0 to 1675
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   EmployeeID                1676 non-null   int64 
 1   Age                       1676 non-null   int64 
 2   Attrition                 1676 non-null   object
 3   BusinessTravel            1676 non-null   object
 4   DailyRate                 1676 non-null   int64 
 5   Department                1676 non-null   object
 6   DistanceFromHome          1676 non-null   int64 
 7   Education                 1676 non-null   int64 
 8   EducationField            1676 non-null   object
 9   EmployeeCount             1676 non-null   int64 
 10  EnvironmentSatisfaction   1676 non-null   int64 
 11  Gender                    1676 non-null   object
 12  HourlyRate                1676 non-null   int64 
 13  JobInvolvement            1676 non-null   int64 
 14  JobLevel                

In [4]:
print(data['EmployeeID'].nunique())

1676


In [5]:
data = data.drop('EmployeeID', axis = 1)

In [6]:
print(data.isnull().sum())

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
Shift                       0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithC

In [7]:
# we are evaluating classification model 
# there are many models to test , here is some of them : random forest , logistic regression, KNeighborsClassifier, SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

encoder = LabelEncoder()
X = data.drop('Attrition', axis = 1).apply(lambda column: encoder.fit_transform(column) if  column.dtype == 'object' else column )
Y = data['Attrition']

model = RandomForestClassifier(random_state= 42)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2, random_state= 42)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
cfx = confusion_matrix(y_test, y_pred)

print('the accuracy score : ',accuracy)
print(report)
print(cfx)


cv_score = cross_val_score(model, X, Y, cv=5, scoring='accuracy')

print('the cross validation score : ',cv_score)
print('the mean cross validation score : ',cv_score.mean())


the accuracy score :  0.8928571428571429
              precision    recall  f1-score   support

          No       0.90      0.99      0.94       289
         Yes       0.82      0.30      0.44        47

    accuracy                           0.89       336
   macro avg       0.86      0.64      0.69       336
weighted avg       0.89      0.89      0.87       336

[[286   3]
 [ 33  14]]
the cross validation score :  [0.89285714 0.93134328 0.92238806 0.89552239 0.91343284]
the mean cross validation score :  0.9111087420042644


In [8]:
from sklearn.model_selection import GridSearchCV
param_grid = {
  'n_estimators': [100, 200, 150],  # Number of trees in the forest
  'max_depth': [None, 10, 20, 15],  # Maximum depth of the tree
  'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
  'min_samples_leaf': [1, 2, 4],    # Minimum number of samples required to be at a leaf node
}


grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Best parameters and model
best_params = grid_search.best_params_
print('Best hyperparameters:', best_params)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)
cfx = confusion_matrix(y_test, y_pred)

print('Accuracy with best hyperparameters:', accuracy)
print(report)
print(cfx)


KeyboardInterrupt: 

In [None]:
model = KNeighborsClassifier(n_neighbors=3)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2, random_state= 42)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
cfx = confusion_matrix(y_test, y_pred)

print('the accuracy score : ',accuracy)
print(report)
print(cfx)

cv_score = cross_val_score(model, X, Y, cv=5, scoring='accuracy')

print('the cross validation score : ',cv_score)
print('the mean cross validation score : ',cv_score.mean())

the accuracy score :  0.8392857142857143
              precision    recall  f1-score   support

          No       0.89      0.93      0.91       289
         Yes       0.39      0.26      0.31        47

    accuracy                           0.84       336
   macro avg       0.64      0.59      0.61       336
weighted avg       0.82      0.84      0.82       336

[[270  19]
 [ 35  12]]
the cross validation score :  [0.84821429 0.86865672 0.85074627 0.85373134 0.89850746]
the mean cross validation score :  0.8639712153518124


In [None]:
model = LogisticRegression(random_state=42)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2, random_state= 42)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)
cfx = confusion_matrix(y_test, y_pred)

print('the accuracy score : ',accuracy)
print(report)
print(cfx)

cv_score = cross_val_score(model, X, Y, cv=5, scoring='accuracy')

print('the cross validation score : ',cv_score)
print('the mean cross validation score : ',cv_score.mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


the accuracy score :  0.8660714285714286
              precision    recall  f1-score   support

          No       0.87      0.99      0.93       289
         Yes       0.62      0.11      0.18        47

    accuracy                           0.87       336
   macro avg       0.75      0.55      0.55       336
weighted avg       0.84      0.87      0.82       336

[[286   3]
 [ 42   5]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


the cross validation score :  [0.88392857 0.88059701 0.89552239 0.89253731 0.89850746]
the mean cross validation score :  0.8902185501066097


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
model = SVC(random_state=42, )

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2, random_state= 42)

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)
cfx = confusion_matrix(y_test, y_pred)

print('the accuracy score : ',accuracy)
print(report)
print(cfx)

cv_score = cross_val_score(model, X, Y, cv=5, scoring='accuracy')

print('the cross validation score : ',cv_score)
print('the mean cross validation score : ',cv_score.mean())

the accuracy score :  0.8601190476190477
              precision    recall  f1-score   support

          No       0.86      1.00      0.92       289
         Yes       0.00      0.00      0.00        47

    accuracy                           0.86       336
   macro avg       0.43      0.50      0.46       336
weighted avg       0.74      0.86      0.80       336

[[289   0]
 [ 47   0]]
the cross validation score :  [0.88095238 0.88358209 0.88059701 0.88059701 0.88059701]
the mean cross validation score :  0.8812651030561478


In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
  'C': [0.1, 1, 10],
  'kernel': ['linear', 'poly', 'rbf'],
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Best parameters and model
best_params = grid_search.best_params_
print('Best hyperparameters:', best_params)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)
cfx = confusion_matrix(y_test, y_pred)

print('Accuracy with best hyperparameters:', accuracy)
print(report)
print(cfx)


Best hyperparameters: {'C': 0.1, 'kernel': 'linear'}
Accuracy with best hyperparameters: 0.8779761904761905
              precision    recall  f1-score   support

          No       0.89      0.99      0.93       289
         Yes       0.71      0.21      0.33        47

    accuracy                           0.88       336
   macro avg       0.80      0.60      0.63       336
weighted avg       0.86      0.88      0.85       336

[[285   4]
 [ 37  10]]
