In [2]:
#import packages
import numpy as np #for array and matrices
import pandas as pd #for data manipulation and analysis
from sklearn.model_selection  import train_test_split #for split the data into training and test
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score #for confusion matrix
import warnings
warnings.filterwarnings("ignore") #to ignore all the warnings

In [3]:
#load the dataset
#read_csv is from pandas and it is use for loading the data
m_health = pd.read_csv('D:\\Semester 3\\healthcare Analytics\\project\\m_health.csv', index_col=0)

In [4]:
m_health.head()

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,19,0,44,0,0,1,2,4,0,1,...,2,2,1,1,1,2,1,0,2,0
1,26,1,44,0,0,0,3,5,0,0,...,0,0,0,1,0,0,1,1,0,0
2,14,1,6,0,0,0,3,4,0,1,...,0,1,1,1,2,2,2,2,1,0
3,13,1,43,0,1,1,2,2,0,1,...,1,1,2,2,1,0,0,0,1,1
4,13,1,44,0,0,0,1,1,1,1,...,0,0,1,1,1,2,2,2,0,0


# Machine Learning approches

In [5]:
# drop country column because after encoding it has series of numbers which will affect the result
m_health = m_health.drop('Country', axis =1)

In [6]:
# select features and target
X = m_health.drop('treatment', axis =1)
y = m_health.treatment

In [7]:
# Split the data with train_test_split which by default split 75% as training and 25% as test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 14)

## K-nearest Neighbors

In [8]:
#KNeighborsClassifier from sklearn
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

#print the accuracy score
print('training accuracy is ',knn.score(X_train, y_train))
print('test accuracy is ',knn.score(X_test, y_test))

training accuracy is  0.8560767590618337
test accuracy is  0.7412140575079872


In [9]:
# Predicting the Test set results
y_pred = knn.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Making the Confusion Matrix
print('accuracy_score:',accuracy_score(y_test, y_pred))
print('recall_score:',recall_score(y_test, y_pred)) # tp / (tp + fn)
print('precision_score:',precision_score(y_test, y_pred)) # tp / (tp + fp)
print('f1_score:',f1_score(y_test, y_pred))

df_cm = pd.DataFrame(cm, index = ('Actual: No','Actual: Yes'), columns = ('Predicted: No', 'Predicted: Yes'))
print("Test Data Accuracy: %.4f" %accuracy_score(y_test, y_pred))
df_cm

accuracy_score: 0.7412140575079872
recall_score: 0.7298850574712644
precision_score: 0.7888198757763976
f1_score: 0.7582089552238808
Test Data Accuracy: 0.7412


Unnamed: 0,Predicted: No,Predicted: Yes
Actual: No,105,34
Actual: Yes,47,127


## Decision Tree Classifier

In [10]:
# Find Best Max Depth

# Loop through a few different max depths and check the performance
# Try different max depths. We want to optimize our ML models to make the best predictions possible.
# For regular decision trees, max_depth, which is a hyperparameter, limits the number of splits in a tree.
# You can find the best value of max_depth based on the R-squared score of the model on the test set.

from sklearn.tree import DecisionTreeClassifier 
for d in [2,3,4,5,6,7,8,9,10]:
    # Create the tree and fit it
    decision_tree = DecisionTreeClassifier(max_depth=d, min_samples_split=6)
    decision_tree.fit(X_train, y_train)

    # Print out the scores on train and test
    print('max_depth=', str(d))
    print(decision_tree.score(X_train, y_train))
    print(decision_tree.score(X_test, y_test), '\n')  # You want the test score to be positive
    
# R-square for train and test scores are below. 
# Always remember that best accuracy that doesn't mean our model is perfect always check for confusion matrix.

max_depth= 2
0.8304904051172708
0.8274760383386581 

max_depth= 3
0.8304904051172708
0.8274760383386581 

max_depth= 4
0.8326226012793176
0.8146964856230032 

max_depth= 5
0.8464818763326226
0.7667731629392971 

max_depth= 6
0.8710021321961621
0.7699680511182109 

max_depth= 7
0.8891257995735607
0.7731629392971247 

max_depth= 8
0.9040511727078892
0.7763578274760383 

max_depth= 9
0.9168443496801706
0.7667731629392971 

max_depth= 10
0.9264392324093816
0.7763578274760383 



In [11]:
# DecisionTreeClassifier from sklearn.tree
tree = DecisionTreeClassifier(random_state=0, max_depth = 9, min_samples_split = 6)
tree.fit(X_train, y_train) 

#print the accuracy score
print('training accuracy is ',tree.score(X_train, y_train))
print('test accuracy is ',tree.score(X_test, y_test))

training accuracy is  0.9168443496801706
test accuracy is  0.7795527156549521


In [12]:
# Predicting the Test set results
y_pred = tree.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Making the Confusion Matrix
print('accuracy_score:',accuracy_score(y_test, y_pred))
print('recall_score:',recall_score(y_test, y_pred)) # tp / (tp + fn)
print('precision_score:',precision_score(y_test, y_pred)) # tp / (tp + fp)
print('f1_score:',f1_score(y_test, y_pred))

df_cm = pd.DataFrame(cm, index = ('Actual: No','Actual: Yes'), columns = ('Predicted: No', 'Predicted: Yes'))
print("Test Data Accuracy: %.4f" %accuracy_score(y_test, y_pred))
df_cm

accuracy_score: 0.7795527156549521
recall_score: 0.8103448275862069
precision_score: 0.7966101694915254
f1_score: 0.8034188034188032
Test Data Accuracy: 0.7796


Unnamed: 0,Predicted: No,Predicted: Yes
Actual: No,103,36
Actual: Yes,33,141


## Random Forest Classifier

In [13]:
# Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Create the random forest model and fit to the training data
cfr = RandomForestClassifier()
cfr.fit(X_train, y_train)

# Use ParameterGrid Search
from sklearn.model_selection import ParameterGrid

# Create a dictionary of hyperparameters to search
# n_estimators is the number of trees in the forest. The larger the better, but also takes longer it will take to compute. 
# Run grid search
grid = {'n_estimators': [50, 100, 150, 200], 'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 'random_state': [0]}
test_scores = []

# Loop through the parameter grid, set the hyperparameters, and save the scores
for g in ParameterGrid(grid):
    cfr.set_params(**g)  # ** is "unpacking" the dictionary
    cfr.fit(X_train, y_train)
    test_scores.append(cfr.score(X_test, y_test))

# Find best hyperparameters from the test score and print
best_idx = np.argmax(test_scores)
print('The best test score is:',test_scores[best_idx], ParameterGrid(grid)[best_idx])  

The best test score is: 0.8338658146964856 {'random_state': 0, 'n_estimators': 200, 'min_samples_split': 12, 'max_depth': 7}


In [14]:
#RandomForestClassifier from sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200, max_depth=7, min_samples_split = 12, random_state=0)
clf.fit(X_train, y_train)

#print the accuracy score
print('training accuracy is ',clf.score(X_train, y_train))
print('test accuracy is ',clf.score(X_test, y_test))

training accuracy is  0.8848614072494669
test accuracy is  0.8338658146964856


In [15]:
# Predicting the Test set results
y_pred = clf.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Making the Confusion Matrix
print('accuracy_score:',accuracy_score(y_test, y_pred))
print('recall_score:',recall_score(y_test, y_pred, average="weighted", pos_label="pos")) # tp / (tp + fn)
print('precision_score:',precision_score(y_test, y_pred, average="weighted", pos_label="pos")) # tp / (tp + fp)
print('f1_score:',f1_score(y_test, y_pred, average="weighted", pos_label="pos"))

df_cm = pd.DataFrame(cm, index = ('Actual: No','Actual: Yes'), columns = ('Predicted: No', 'Predicted: Yes'))
print("Test Data Accuracy: %.4f" %accuracy_score(y_test, y_pred))
df_cm

accuracy_score: 0.8338658146964856
recall_score: 0.8338658146964856
precision_score: 0.8392470334684532
f1_score: 0.8313761228161434
Test Data Accuracy: 0.8339


Unnamed: 0,Predicted: No,Predicted: Yes
Actual: No,101,38
Actual: Yes,14,160


## Support Vector Machine

In [17]:
# Support vector classification form sklearn.svm
from sklearn.svm import SVC
svm = SVC(kernel = 'linear', random_state = 0)
svm.fit(X_train, y_train)

#print the accuracy score
print('training accuracy is ',svm.score(X_train, y_train))
print('test accuracy is ',svm.score(X_test, y_test))

training accuracy is  0.814498933901919
test accuracy is  0.7955271565495208


In [18]:
# Predicting the Test set results
y_pred = svm.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Making the Confusion Matrix
print('accuracy_score:',accuracy_score(y_test, y_pred))
print('recall_score:',recall_score(y_test, y_pred, average="weighted", pos_label="pos")) # tp / (tp + fn)
print('precision_score:',precision_score(y_test, y_pred, average="weighted", pos_label="pos")) # tp / (tp + fp)
print('f1_score:',f1_score(y_test, y_pred, average="weighted", pos_label="pos"))

df_cm = pd.DataFrame(cm, index = ('Actual: No','Actual: Yes'), columns = ('Predicted: No', 'Predicted: Yes'))
print("Test Data Accuracy: %.4f" %accuracy_score(y_test, y_pred))
df_cm

accuracy_score: 0.7955271565495208
recall_score: 0.7955271565495208
precision_score: 0.7975476610722678
f1_score: 0.7931581798074884
Test Data Accuracy: 0.7955


Unnamed: 0,Predicted: No,Predicted: Yes
Actual: No,97,42
Actual: Yes,22,152


## Naive Bayes Algorithm

In [20]:
# Fitting classifier to the Training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

#print the accuracy score
print('training accuracy is ',gnb.score(X_train, y_train))
print('test accuracy is ',gnb.score(X_test, y_test))

training accuracy is  0.7921108742004265
test accuracy is  0.7763578274760383


In [21]:
# Predicting the Test set results
y_pred = gnb.predict(X_test)

# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Making the Confusion Matrix
print('accuracy_score:',accuracy_score(y_test, y_pred))
print('recall_score:',recall_score(y_test, y_pred)) # tp / (tp + fn)
print('precision_score:',precision_score(y_test, y_pred)) # tp / (tp + fp)
print('f1_score:',f1_score(y_test, y_pred))

df_cm = pd.DataFrame(cm, index = ('Actual: No','Actual: Yes'), columns = ('Predicted: No', 'Predicted: Yes'))
print("Test Data Accuracy: %.4f" %accuracy_score(y_test, y_pred))
df_cm

accuracy_score: 0.7763578274760383
recall_score: 0.7873563218390804
precision_score: 0.8058823529411765
f1_score: 0.7965116279069767
Test Data Accuracy: 0.7764


Unnamed: 0,Predicted: No,Predicted: Yes
Actual: No,106,33
Actual: Yes,37,137
