# Random Forest Model

In [1]:
# Import dependencies
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sqlalchemy import create_engine

# Load data from SQLite database

In [2]:
# Read the raw (not yet encoded) survey table from the SQLite database and
# discard its 'index' column, then display the result.
mental_df = (
    pd.read_sql_table(table_name='pre_encoded_survey', con='sqlite:///mental_health.db')
    .drop(columns=['index'])
)
mental_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,No,Yes,Yes,Somewhat easy,No,No,Yes,No,1,"Yes, they all did",...,Yes,Yes,Yes,Yes,1,Rarely,Sometimes,Male,United States of America,United States of America
1,Yes,Yes,No,Neither easy nor difficult,Yes,Maybe,No,No,1,I don't know,...,Yes,Yes,Yes,Yes,1,Sometimes,Sometimes,Female,United States of America,United States of America
2,Yes,I am not sure,Yes,Somewhat easy,Yes,Yes,No,Yes,1,"No, none did",...,No,No,Yes,No,1,Not applicable to me,Often,Male,United Kingdom,United Kingdom
3,I don't know,No,No,Somewhat easy,No,No,Yes,No,1,Some did,...,No,No,No,No,0,Not applicable to me,Not applicable to me,Male,United States of America,United States of America
4,Yes,Yes,Yes,Very easy,No,No,I don't know,No,1,Some did,...,Yes,Yes,Yes,Yes,1,Sometimes,Often,Female,United States of America,United States of America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856,Yes,I am not sure,Yes,Somewhat easy,No,No,Yes,No,1,Some did,...,I don't know,Maybe,Maybe,No,0,Rarely,Rarely,Male,Canada,Canada
857,I don't know,I am not sure,I don't know,Somewhat easy,Maybe,No,I don't know,No,1,I don't know,...,I don't know,Yes,Yes,Yes,1,Rarely,Often,Female,Canada,Canada
858,Yes,No,No,Somewhat easy,No,No,Yes,No,1,Some did,...,Yes,Yes,Yes,Yes,1,Rarely,Often,Female,United States of America,United States of America
859,Yes,Yes,Yes,Somewhat difficult,Maybe,Maybe,I don't know,Yes,1,Some did,...,Yes,Yes,Maybe,Yes,1,Rarely,Sometimes,Male,United States of America,United States of America


In [3]:
# Inspect the dtype pandas assigned to each survey column.
mental_df.dtypes

0     object
1     object
2     object
3     object
4     object
5     object
6     object
7     object
8      int64
9     object
10    object
11    object
12    object
13    object
14    object
15    object
16    object
17    object
18    object
19    object
20    object
21    object
22    object
23    object
24    object
25    object
26     int64
27    object
28    object
29    object
30    object
31    object
dtype: object

# Preprocessing

In [4]:
# Collect the names of every column whose dtype is "object"; these are the
# categorical columns that get one-hot encoded.
mental_cat = [
    column
    for column, dtype in mental_df.dtypes.items()
    if dtype == "object"
]

# Count the distinct values found in each categorical column
mental_df[mental_cat].nunique()

0     4
1     3
2     3
3     6
4     3
5     3
6     3
7     2
9     4
10    4
11    4
12    4
13    4
14    4
15    4
16    3
17    3
18    5
19    5
20    6
21    4
22    3
23    3
24    3
25    2
27    5
28    5
29    3
30    7
31    8
dtype: int64

In [5]:
# Create a OneHotEncoder instance.
# The keyword that requests dense output was renamed between scikit-learn
# releases (`sparse` -> `sparse_output`), so keep the encoder's default sparse
# output and densify it explicitly below; that works on every release.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list.
# Reusing mental_df's index keeps each encoded row attached to its source row
# when the two frames are joined on the index.
encode_df = pd.DataFrame(
    enc.fit_transform(mental_df[mental_cat]).toarray(),
    index=mental_df.index,
)

# Add the encoded variable names to the DataFrame.
# Newer scikit-learn exposes get_feature_names_out() in place of
# get_feature_names(); use whichever this installation provides.
if hasattr(enc, "get_feature_names_out"):
    encode_df.columns = enc.get_feature_names_out(mental_cat)
else:
    encode_df.columns = enc.get_feature_names(mental_cat)
encode_df.head()

Unnamed: 0,0_I don't know,0_No,0_Not eligible for coverage / N/A,0_Yes,1_I am not sure,1_No,1_Yes,2_I don't know,2_No,2_Yes,...,30_United Kingdom,30_United States of America,31_Australia,31_Canada,31_Germany,31_Netherlands,31_Other,31_Sweden,31_United Kingdom,31_United States of America
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
# Merge one-hot encoded features and drop the originals.
# Rows are matched on the index. The columns to remove are passed by keyword:
# the positional axis form `drop(labels, 1)` is not accepted by pandas 2.x.
# NOTE: this cell overwrites mental_df, so re-running it without first
# re-running the load cell fails (the original columns are already gone).
mental_df = mental_df.merge(encode_df, left_index=True, right_index=True)
mental_df = mental_df.drop(columns=mental_cat)
mental_df.head()

Unnamed: 0,8,26,0_I don't know,0_No,0_Not eligible for coverage / N/A,0_Yes,1_I am not sure,1_No,1_Yes,2_I don't know,...,30_United Kingdom,30_United States of America,31_Australia,31_Canada,31_Germany,31_Netherlands,31_Other,31_Sweden,31_United Kingdom,31_United States of America
0,1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Send encoded dataframe to database to be used in model

In [7]:
# Build the SQLAlchemy engine for the local SQLite database file.
engine = create_engine(
    "sqlite:///mental_health.db",
    echo=False,
)

In [8]:
# Persist the one-hot encoded survey DataFrame to the 'encoded_survey' table
# of the SQLite database; if_exists='replace' overwrites any existing table
# of that name.
mental_df.to_sql(name='encoded_survey', con=engine, if_exists='replace')

In [9]:
# Read the encoded survey table back from the SQLite database and discard the
# 'index' column that was stored alongside the data, then display the result.
mental_df = (
    pd.read_sql_table(table_name='encoded_survey', con='sqlite:///mental_health.db')
    .drop(columns=['index'])
)
mental_df

Unnamed: 0,8,26,0_I don't know,0_No,0_Not eligible for coverage / N/A,0_Yes,1_I am not sure,1_No,1_Yes,2_I don't know,...,30_United Kingdom,30_United States of America,31_Australia,31_Canada,31_Germany,31_Netherlands,31_Other,31_Sweden,31_United Kingdom,31_United States of America
0,1,1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
857,1,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
858,1,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
859,1,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Read the survey question text from the SQLite database (table
# 'pre_encoded_questions'), discard its 'index' column, and display it.
questions_df = (
    pd.read_sql_table(table_name='pre_encoded_questions', con='sqlite:///mental_health.db')
    .drop(columns=['index'])
)
questions_df

Unnamed: 0,Question
0,Does your employer provide mental health benef...
1,Do you know the options for mental health care...
2,Does your employer offer resources to learn mo...
3,If a mental health issue prompted you to reque...
4,Do you think that discussing a mental health d...
5,Do you think that discussing a physical health...
6,Do you feel that your employer takes mental he...
7,Have you heard of or observed negative consequ...
8,Do you have previous employers?
9,Have your previous employers provided mental h...


In [11]:
# Features: every column except the two one-hot indicators derived from
# survey column 25.
X = mental_df.drop(columns=["25_Yes", "25_No"])

# Target: the "25_Yes" indicator column.
y = mental_df.loc[:, "25_Yes"]

In [12]:
# Partition features and target into training and testing subsets. No split
# size is passed, so train_test_split's default applies; the fixed
# random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [13]:
# Instantiate the random forest: 128 trees, seeded so training is repeatable.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [14]:
# Train the random forest on the training split; the value returned by fit()
# is bound back to rf_model.
rf_model = rf_model.fit(X_train, y_train)

In [15]:
# Predict the target for each row of the test features and display the
# resulting array.
predictions = rf_model.predict(X_test)
predictions

array([0., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 1.,
       1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1., 1.,
       1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 1.,
       1., 0., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
       0., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1.,
       0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
       0., 1., 0., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 0., 1., 0.,
       0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0., 1.,
       0., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1.,
       0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0.])

In [16]:
# Compare the true test labels against the model's predictions.
cm = confusion_matrix(y_test, predictions)

# Wrap the matrix in a labelled DataFrame: rows are the actual classes,
# columns are the predicted classes.
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,75,23
Actual 1,7,111


In [17]:
# Fraction of test rows whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

# Display the Confusion Matrix and Evaluation Metrics

In [18]:
# Report the evaluation results in order: the confusion matrix (rich
# display), the overall accuracy, then the per-class classification report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,75,23
Actual 1,7,111


Accuracy Score : 0.8611111111111112
Classification Report
              precision    recall  f1-score   support

         0.0       0.91      0.77      0.83        98
         1.0       0.83      0.94      0.88       118

    accuracy                           0.86       216
   macro avg       0.87      0.85      0.86       216
weighted avg       0.87      0.86      0.86       216



In [19]:
# Read the fitted forest's feature_importances_ attribute and display it.
# The scores are paired positionally with X.columns in the next cell.
importances = rf_model.feature_importances_
importances

array([0.00000000e+00, 1.10419050e-01, 5.67529724e-03, 1.65687097e-03,
       3.34889219e-03, 4.77398577e-03, 2.86263014e-03, 6.59232613e-03,
       6.87360840e-03, 4.75727324e-03, 4.52849706e-03, 4.32812773e-03,
       3.84626568e-03, 2.44265684e-03, 4.74462438e-03, 3.93220193e-03,
       1.96594474e-03, 2.45223494e-03, 4.32743555e-03, 4.14851247e-03,
       2.76312046e-03, 3.94917584e-03, 3.04315937e-03, 2.26892999e-03,
       4.42000131e-03, 3.18407639e-03, 3.68580684e-03, 2.50538604e-03,
       1.92557573e-03, 4.96987345e-03, 4.02072982e-03, 7.53026483e-03,
       2.36400258e-03, 5.49362358e-03, 6.82871236e-03, 2.98918112e-03,
       1.99578370e-03, 1.32254134e-03, 3.64470239e-03, 4.60907129e-03,
       8.87053686e-04, 4.09469487e-03, 1.66981912e-03, 3.11731938e-03,
       2.70249597e-03, 3.47603523e-03, 2.49857420e-03, 4.73271396e-03,
       5.25853416e-03, 1.09213371e-03, 3.55670457e-03, 3.82392394e-03,
       1.48931802e-03, 5.92142403e-03, 4.83229807e-03, 5.52944146e-03,
      

In [20]:
# Pair each importance score with its feature name and list the pairs from
# most to least important.
sorted(
    zip(rf_model.feature_importances_, X.columns),
    reverse=True,
)

[(0.12024805797660318, '23_Yes'),
 (0.11041904981287384, '26'),
 (0.08948881936577566, '24_Yes'),
 (0.07240868904721894, '27_Not applicable to me'),
 (0.060197667718725555, '23_No'),
 (0.043720185842798506, '28_Not applicable to me'),
 (0.036815395479663025, '24_No'),
 (0.02912006295329515, '28_Often'),
 (0.020877304353417295, '22_No'),
 (0.01873988058843284, '22_Yes'),
 (0.01588588843548007, '23_Maybe'),
 (0.015299481191566024, '27_Rarely'),
 (0.013867646180869025, '24_Maybe'),
 (0.009319877532035678, '27_Sometimes'),
 (0.009232451751315603, '28_Sometimes'),
 (0.007530264833722061, '9_Some did'),
 (0.006873608396547313, '1_Yes'),
 (0.006828712358086473, '10_N/A (not currently aware)'),
 (0.006592326132076189, '1_No'),
 (0.006098751799722544, '21_Yes, I experienced'),
 (0.0059214240286992025, "15_I don't know"),
 (0.005675297236103962, "0_I don't know"),
 (0.005529441456887535, '15_Some did'),
 (0.005493623584711995, '10_I was aware of some'),
 (0.005258534156281813, '13_Yes, all of th

In [21]:
# Save our model to use later, then reload it to confirm the saved file
# reproduces the model's test-set score.

# Save to file in the current working directory
pkl_filename = "pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(rf_model, file)

# Load from file.
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# pickle files that you created yourself or otherwise trust.
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)

# Calculate the accuracy score and predict target values with the reloaded
# model; the printed score should match the accuracy reported for rf_model.
score = pickle_model.score(X_test, y_test)
print(f"Test score: {100 * score:.2f} %")
Ypredict = pickle_model.predict(X_test)

Test score: 86.11 %


Our model’s accuracy score is 86.11%, meaning that it correctly predicts whether an individual has a mental health disorder 86.11% of the time on the held-out test set, based on how they answer the survey questions (assuming they answer honestly). This model is not making a high-consequence prediction: it is intended for individuals working in tech who want to know their likelihood of having (or developing) a mental health disorder, or for tech companies that want to know whether offering certain mental health services would benefit their employees. For those purposes, an accuracy of 86.11% is sufficient.