In [1]:
#dependencies
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.simplefilter('ignore')
%matplotlib inline
from sklearn.utils import shuffle

In [2]:
# Load the pre-encoded dataset; the path is relative to this notebook.
encoded_csv_path = '../data/agg_encode_non_self_employee.csv'
aggr_encoded = pd.read_csv(encoded_csv_path)
# Optional row shuffle, left disabled:
# aggr_encoded = shuffle(aggr_encoded)

In [4]:
# The three columns that are modelled as targets; every other column is a feature.
target_list = ["treatment", "tech_company", "wellness_program"]
# 14 source X values: no_employees, benefits, care_options, seek_help, anonymity, leave,
# supervisor, coworkers, family_history, phys_health_interview, mental_health_interview,
# obs_consequence, age, gender
X = aggr_encoded.drop(columns=target_list)


In [5]:
# y1 = Has the employee sought mental health treatment? (treatment)
# y2 = Is your employer primarily a tech company/organization? (tech_company)
# y3 = Will the employer bring awareness to mental health in a wellness program? (wellness_program)

# Treatment

In [6]:
# Target 1: the "treatment" column; print both shapes to confirm the row counts agree.
y1 = aggr_encoded.loc[:, "treatment"]
print(X.shape, y1.shape)

(3089, 47) (3089,)


## Logistic Regression

In [None]:
#import 
from sklearn.model_selection import train_test_split

In [None]:
# Split into train/test partitions, stratified on the treatment label so both
# partitions keep the same class balance; the seed makes the split repeatable.
split_options = {"random_state": 42, "stratify": y1}
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, **split_options)
X_train.head()

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults; the bare name displays its repr.
classifier = LogisticRegression()
classifier


In [None]:
# Fit the logistic regression on the treatment training partition.
# As the last expression of the cell, the fitted estimator is displayed.
classifier.fit(X_train, y1_train)

In [None]:
# Accuracy on the rows the model was fitted on, then on the held-out rows.
# Both values are kept for the summary table at the end of the notebook.
treatment_log_reg_train, treatment_log_reg_test = (
    classifier.score(features, labels)
    for features, labels in ((X_train, y1_train), (X_test, y1_test))
)
print(f"Training Data Score: {treatment_log_reg_train}")
print(f"Testing Data Score: {treatment_log_reg_test}")


In [None]:
# Predict the held-out rows and eyeball the first 30 against the true labels.
predictions = classifier.predict(X_test)
predicted_head = predictions[:30]
actual_head = y1_test[:30].tolist()
print(f"First 30 Predictions:   {predicted_head}")
print(f"First 30 Actual labels: {actual_head}")


In [None]:
# Plot (predicted - actual) against the predicted label for both partitions.
train_pred = classifier.predict(X_train)
test_pred = classifier.predict(X_test)

plt.scatter(train_pred, train_pred - y1_train, c="blue", label="Training Data", s=100)
plt.scatter(test_pred, test_pred - y1_test, c="orange", label="Testing Data")
plt.legend()
# Zero-error reference line spanning the range of the true test labels.
plt.hlines(y=0, xmin=y1_test.min(), xmax=y1_test.max())
plt.title("Residual Plot")
plt.show()


In [None]:
# One row per feature column, paired with its fitted coefficient.
weights_by_column = {"cols": X.columns, "weights": classifier.coef_.ravel()}
pd.DataFrame(weights_by_column)

A family history of mental illness carries a strong weight in this model.

## Tree

In [None]:
# Decision tree section: remember the feature column labels so that feature
# importances can be paired with names later, then preview the feature matrix.
feature_names = X.columns
X.head()


In [None]:
#import decision tree library
from sklearn import tree

In [None]:
# Decision tree for y1 (treatment): fit on the training partition and report
# accuracy on the held-out partition.
clf = tree.DecisionTreeClassifier().fit(X_train, y1_train)
tree_test_score = clf.score(X_test, y1_test)

print(f"Testing Data Score: {tree_test_score}")

In [None]:
#import Random Forest library
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest for y1 (treatment).
# random_state pins the bootstrap/feature sampling so the scores that end up
# in the summary table are reproducible across runs.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y1_train)
treatment_rf_test = rf.score(X_test, y1_test)
# Report the test score computed above (the train score is computed in the
# next cell and does not exist yet on a fresh run).
print(f"Test Data Score: {treatment_rf_test}")
treatment_rf_test

In [None]:
# Accuracy of the fitted forest on its own training partition; kept for the
# summary table built at the end of the notebook.
treatment_rf_train = rf.score(X_train, y1_train)
print(f"Train Data Score: {treatment_rf_train}")

In [None]:
# Pair each importance with its feature name and list them from most to least
# important (tuples sort on the importance value first).
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

Age and family history are important features

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# KNN for y1 (treatment): sweep odd k from 1 to 19 and record the accuracy on
# both partitions so the next cell can plot them.
train_scores, test_scores = [], []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y1_train)
    train_score, test_score = knn.score(X_train, y1_train), knn.score(X_test, y1_test)
    train_scores += [train_score]
    test_scores += [test_score]
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

In [None]:
# Accuracy versus k for the treatment target. The k values mirror the sweep in
# the previous cell; both curves are labelled so they can be told apart.
k_values = range(1, 20, 2)
plt.plot(k_values, train_scores, marker='o', label="Training")
plt.plot(k_values, test_scores, marker="x", label="Testing")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# Refit KNN at k=19 and keep both accuracies for the summary table.
knn = KNeighborsClassifier(n_neighbors=19).fit(X_train, y1_train)
treatment_knn_train, treatment_knn_test = (
    knn.score(X_train, y1_train),
    knn.score(X_test, y1_test),
)
print('k=19 Train Acc: %.3f' % treatment_knn_train)
print('k=19 Test Acc: %.3f' % treatment_knn_test)


In [None]:
# Predicted treatment labels for the held-out rows from the k=19 model;
# the bare name displays the resulting array.
predict = knn.predict(X_test)
predict

## SVM

In [None]:
# Class display names used by the classification report in the next cell.
target_names = ["no", "yes"]

# Linear-kernel support vector classifier fitted on the treatment training partition.
from sklearn.svm import SVC
model = SVC(kernel='linear').fit(X_train, y1_train)
predictions = model.predict(X_test)

# Accuracy on each partition; both are kept for the summary table.
treatment_svm_train, treatment_svm_test = (
    model.score(X_train, y1_train),
    model.score(X_test, y1_test),
)

print(f"Training Data Score: {treatment_svm_train}")
print(f"Testing Data Score: {treatment_svm_test}")

In [None]:
# Per-class precision/recall/F1 for the SVM predictions on the treatment test set.
from sklearn.metrics import classification_report
treatment_report = classification_report(
    y1_test, predictions, target_names=target_names)
print(treatment_report)


# Tech Company

In [None]:
# Target 2: the "tech_company" column; print both shapes to confirm the row counts agree.
y2 = aggr_encoded.loc[:, "tech_company"]
print(X.shape, y2.shape)

In [None]:
# New train/test split stratified on tech_company. This rebinds X_train and
# X_test, so they no longer correspond to the treatment split.
split_options = {"random_state": 42, "stratify": y2}
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, **split_options)

## Logistic Regression

In [None]:
# Logistic regression for tech_company, with library defaults.
classifier = LogisticRegression()
classifier.fit(X_train, y2_train)

# Accuracy on each partition; both are kept for the summary table.
tech_log_reg_train, tech_log_reg_test = (
    classifier.score(features, labels)
    for features, labels in ((X_train, y2_train), (X_test, y2_test))
)
print(f"Training Data Score: {tech_log_reg_train}")
print(f"Testing Data Score: {tech_log_reg_test}")


Model has an accuracy of 78%

In [None]:
# Predict the held-out rows and eyeball the first 10 against the true labels.
predictions = classifier.predict(X_test)
predicted_head = predictions[:10]
actual_head = y2_test[:10].tolist()
print(f"First 10 Predictions:   {predicted_head}")
print(f"First 10 Actual labels: {actual_head}")


In [None]:
# Plot (predicted - actual) against the predicted label for both partitions.
train_pred = classifier.predict(X_train)
test_pred = classifier.predict(X_test)

plt.scatter(train_pred, train_pred - y2_train, c="blue", label="Training Data", s=100)
plt.scatter(test_pred, test_pred - y2_test, c="orange", label="Testing Data")
plt.legend()
# Zero-error reference line spanning the range of the true test labels.
plt.hlines(y=0, xmin=y2_test.min(), xmax=y2_test.max())
plt.title("Residual Plot")
plt.show()


In [None]:
# One row per feature column, paired with its fitted coefficient.
weights_by_column = {"cols": X.columns, "weights": classifier.coef_.ravel()}
pd.DataFrame(weights_by_column)

The number-of-employees category (1-5) has a strong weight.

## Tree

In [None]:
# Class display names for the tech_company target.
target_names = ["no", "yes"]

# Decision tree: fit on the training partition, then display test accuracy.
clf = tree.DecisionTreeClassifier().fit(X_train, y2_train)
clf.score(X_test, y2_test)

In [None]:
# Random forest for tech_company.
# random_state pins the bootstrap/feature sampling so the scores that end up
# in the summary table are reproducible across runs.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y2_train)
# Accuracy on the training partition; displayed and kept for the summary table.
tech_rf_train = rf.score(X_train, y2_train)
tech_rf_train

In [None]:
# Accuracy of the fitted forest on the held-out tech_company rows; kept for
# the summary table built at the end of the notebook.
tech_rf_test = rf.score(X_test, y2_test)
print(f"Testing Data Score: {tech_rf_test}")

In [None]:
# Pair each importance with its feature name and list them from most to least
# important (tuples sort on the importance value first).
sorted(zip(rf.feature_importances_, feature_names), reverse=True)


## KNN

In [None]:
# KNN for tech_company: sweep odd k from 1 to 19 and record the accuracy on
# both partitions so the next cell can plot them.
train_scores, test_scores = [], []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y2_train)
    train_score, test_score = knn.score(X_train, y2_train), knn.score(X_test, y2_test)
    train_scores += [train_score]
    test_scores += [test_score]
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
    


In [None]:
# Accuracy versus k for the tech_company target. The k values mirror the sweep
# in the previous cell; both curves are labelled so they can be told apart.
k_values = range(1, 20, 2)
plt.plot(k_values, train_scores, marker='o', label="Training")
plt.plot(k_values, test_scores, marker="x", label="Testing")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()


In [None]:
# Refit KNN at k=11 for tech_company and keep both accuracies for the summary table.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train, y2_train)

tech_knn_train = knn.score(X_train, y2_train)
tech_knn_test = knn.score(X_test, y2_test)

# Each label matches the partition its score was computed on.
print('k=11 Train Acc: %.3f' % tech_knn_train)
print('k=11 Test Acc: %.3f' % tech_knn_test)

## SVM

In [None]:
# Class display names used by the classification report in the next cell.
target_names = ["no", "yes"]

# Linear-kernel support vector classifier fitted on the tech_company training partition.
model = SVC(kernel='linear').fit(X_train, y2_train)
predictions = model.predict(X_test)

# Accuracy on each partition; both are kept for the summary table.
tech_svm_train, tech_svm_test = (
    model.score(X_train, y2_train),
    model.score(X_test, y2_test),
)

print(f"Training Data Score: {tech_svm_train}")
print(f"Testing Data Score: {tech_svm_test}")


In [None]:
# Per-class precision/recall/F1 for the SVM predictions on the tech_company test set.
tech_report = classification_report(
    y2_test, predictions, target_names=target_names)
print(tech_report)

# Wellness Program

In [None]:
# Target 3: the "wellness_program" column; print both shapes to confirm the row counts agree.
y3 = aggr_encoded.loc[:, "wellness_program"]
print(X.shape, y3.shape)

In [None]:
# New train/test split stratified on wellness_program. This rebinds X_train and
# X_test, so they no longer correspond to the earlier splits.
split_options = {"random_state": 42, "stratify": y3}
X_train, X_test, y3_train, y3_test = train_test_split(X, y3, **split_options)

## Logistic Regression

In [None]:
# Logistic regression for wellness_program, with library defaults.
classifier = LogisticRegression()
classifier.fit(X_train, y3_train)

# Accuracy on each partition; both are kept for the summary table.
wellness_log_reg_train, wellness_log_reg_test = (
    classifier.score(features, labels)
    for features, labels in ((X_train, y3_train), (X_test, y3_test))
)

print(f"Training Data Score: {wellness_log_reg_train}")
print(f"Testing Data Score: {wellness_log_reg_test}")


75% Accuracy 

In [None]:
# Predict the held-out rows and eyeball the first 10 against the true labels.
predictions = classifier.predict(X_test)
predicted_head = predictions[:10]
actual_head = y3_test[:10].tolist()
print(f"First 10 Predictions:   {predicted_head}")
print(f"First 10 Actual labels: {actual_head}")

In [None]:
# Plot (predicted - actual) against the predicted label for both partitions.
train_pred = classifier.predict(X_train)
test_pred = classifier.predict(X_test)

plt.scatter(train_pred, train_pred - y3_train, c="blue", label="Training Data", s=100)
plt.scatter(test_pred, test_pred - y3_test, c="orange", label="Testing Data")
plt.legend()
# Zero-error reference line spanning the range of the true test labels.
plt.hlines(y=0, xmin=y3_test.min(), xmax=y3_test.max())
plt.title("Residual Plot")
plt.show()



## Tree

In [None]:
# Class display names for the three-valued wellness_program target.
target_names = ["no", "yes", "I don't know"]

# Decision tree on the wellness_program split made earlier:
# fit on the training partition, then display test accuracy.
clf = tree.DecisionTreeClassifier().fit(X_train, y3_train)
clf.score(X_test, y3_test)

In [None]:
# Random forest for wellness_program.
from sklearn.ensemble import RandomForestClassifier
# random_state pins the bootstrap/feature sampling so the scores that end up
# in the summary table are reproducible across runs.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y3_train)
# Accuracy on the training partition; displayed and kept for the summary table.
wellness_rf_train = rf.score(X_train, y3_train)
wellness_rf_train

In [None]:
# Accuracy of the fitted forest on the held-out wellness_program rows; kept
# for the summary table built at the end of the notebook.
wellness_rf_test = rf.score(X_test, y3_test)
print(f"Testing Data Score: {wellness_rf_test}")

In [None]:
# Pair each importance with its feature name and list them from most to least
# important (tuples sort on the importance value first).
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

## KNN

In [None]:
# KNN for wellness_program: sweep odd k from 1 to 19 and record the accuracy
# on both partitions so the next cell can plot them.
train_scores, test_scores = [], []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y3_train)
    train_score, test_score = knn.score(X_train, y3_train), knn.score(X_test, y3_test)
    train_scores += [train_score]
    test_scores += [test_score]
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")
    


In [None]:
# Accuracy versus k for the wellness_program target. The k values mirror the
# sweep in the previous cell; both curves are labelled so they can be told apart.
k_values = range(1, 20, 2)
plt.plot(k_values, train_scores, marker='o', label="Training")
plt.plot(k_values, test_scores, marker="x", label="Testing")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# Refit KNN at k=19 for wellness_program and keep both accuracies for the summary table.
knn = KNeighborsClassifier(n_neighbors=19).fit(X_train, y3_train)

wellness_knn_train, wellness_knn_test = (
    knn.score(X_train, y3_train),
    knn.score(X_test, y3_test),
)

print('k=19 Test Acc: %.3f' % wellness_knn_test)
print('k=19 Train Acc: %.3f' % wellness_knn_train)


## SVM

In [None]:
# Class display names used by the classification report in the next cell.
target_names = ["no", "yes", "I don't know"]

# Linear-kernel support vector classifier fitted on the wellness_program training partition.
model = SVC(kernel='linear').fit(X_train, y3_train)
predictions = model.predict(X_test)

# Accuracy on each partition; both are kept for the summary table.
wellness_svm_train, wellness_svm_test = (
    model.score(X_train, y3_train),
    model.score(X_test, y3_test),
)

print(f"Testing Data Score: {wellness_svm_test}")
print(f"Training Data Score: {wellness_svm_train}")

In [None]:
# Per-class precision/recall/F1 for the SVM predictions on the wellness_program test set.
wellness_report = classification_report(
    y3_test, predictions, target_names=target_names)
print(wellness_report)

# Deep Learning Model 

In [None]:
# Preprocess: rescale the features with a MinMaxScaler fitted on the training rows only.
# NOTE: X_train/X_test here are whatever split ran most recently, i.e. the one
# stratified on wellness_program.
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler()
X_scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

import tensorflow as tf
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping


In [None]:
from tensorflow.python.keras import backend as k

## Treatment

In [None]:
label_encoder = LabelEncoder()
label_encoder.fit(y1_train)
encoded_y_train = label_encoder.transform(y1_train)
encoded_y_test = label_encoder.transform(y1_test)

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

y_train_categorical.shape

In [None]:
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=47))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [None]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# set early stopping as callback
callbacks = [EarlyStopping(monitor='val_loss', patience=2)]
model.fit(
    X_train_scaled,
    y_train_categorical,
    callbacks=callbacks,
    epochs=60,
    shuffle=True,
    verbose=2)

In [None]:
# Evaluate the model using the training data
treatment_dl_loss_train, treatment_dl_train = model.evaluate(
    X_train_scaled, y_train_categorical, verbose=2)
print(f"Train Loss: {treatment_dl_loss_train}, Train Accuracy: {treatment_dl_train}")

In [None]:
# Evaluate the model using the testing data
treatment_dl_loss_test, treatment_dl_test = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Test Loss: {treatment_dl_loss_test}, Test Accuracy: {treatment_dl_test}")

In [None]:
# Fresh, unstratified 75/25 split of the treatment target with a different seed.
# This rebinds X_train / X_test and replaces `model` (previously the neural network).
X_train, X_test, y_train, y_test = train_test_split(
    X, y1, test_size=0.25, random_state=1)

# KNN with library defaults, used for the confusion matrix and ROC curve below.
model = KNeighborsClassifier().fit(X_train, y_train)

y_predict = model.predict(X_test)
y_predict



In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the default KNN's treatment predictions on the held-out rows.
confusion_matrix(y_test, y_predict)

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

# roc_curve expects the score of the positive class. predict_proba orders its
# columns like model.classes_, so column 1 is the higher-valued label.
# Column 0 (the negative class) would mirror the curve.
# NOTE(review): assumes the treatment label is binary (0/1) - confirm.
y_predict_probabilities = model.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_predict_probabilities)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange',
         lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
y_predict_probabilities

## Tech Company

In [None]:
label_encoder = LabelEncoder()
label_encoder.fit(y2_train)
encoded_y_train = label_encoder.transform(y2_train)
encoded_y_test = label_encoder.transform(y2_test)

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

y_train_categorical.shape

In [None]:
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=47))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [None]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# set early stopping as callback
callbacks = [EarlyStopping(monitor='val_loss', patience=2)]
model.fit(
    X_train_scaled,
    y_train_categorical,
    callbacks=callbacks,
    epochs=60,
    shuffle=True,
    verbose=2)

In [None]:
# Evaluate the model using the training data
tech_dl_loss_train, tech_dl_train = model.evaluate(
    X_train_scaled, y_train_categorical, verbose=2)
print(f"Train Loss: {tech_dl_loss_train}, Train Accuracy: {tech_dl_train}")

In [None]:
# Evaluate the model using the testing data
tech_dl_loss_test, tech_dl_test = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {tech_dl_loss_test}, Accuracy: {tech_dl_test}")

In [None]:
# Fresh, unstratified 75/25 split of the tech_company target with a different seed.
# This rebinds X_train / X_test and replaces `model` (previously the neural network).
X_train, X_test, y_train, y_test = train_test_split(
    X, y2, test_size=0.25, random_state=1)

# KNN with library defaults, used for the confusion matrix and ROC curve below.
model = KNeighborsClassifier().fit(X_train, y_train)

y_predict = model.predict(X_test)
y_predict

In [None]:
# Confusion matrix of the default KNN's tech_company predictions on the held-out rows.
confusion_matrix(y_test, y_predict)

In [None]:
# roc_curve expects the score of the positive class. predict_proba orders its
# columns like model.classes_, so column 1 is the higher-valued label.
# Column 0 (the negative class) would mirror the curve.
# NOTE(review): assumes the tech_company label is binary (0/1) - confirm.
y_predict_probabilities = model.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_predict_probabilities)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange',
         lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line: performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()
y_predict_probabilities

## Wellness Program

In [None]:
# Integer-encode the wellness_program labels (encoder fitted on the training
# labels only), then one-hot encode them for the softmax output.
label_encoder = LabelEncoder().fit(y3_train)
encoded_y_train, encoded_y_test = (
    label_encoder.transform(labels) for labels in (y3_train, y3_test)
)

y_train_categorical, y_test_categorical = (
    to_categorical(encoded_y_train),
    to_categorical(encoded_y_test),
)

y_train_categorical.shape

In [None]:
# Feed-forward network: 47 inputs, four hidden ReLU layers of 100 units each,
# and a 3-way softmax output for the three wellness_program classes.
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=47))
for hidden_layer_index in range(3):
    model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=3, activation='softmax'))

In [None]:
# Print the layer-by-layer summary of the wellness_program network.
model.summary()

In [None]:
# Compile the model: Adam optimizer, categorical cross-entropy loss for the
# one-hot labels, and accuracy as the reported metric.
compile_settings = {
    "optimizer": 'adam',
    "loss": 'categorical_crossentropy',
    "metrics": ['accuracy'],
}
model.compile(**compile_settings)

In [None]:
# Early stopping watches val_loss, so part of the training data is held out
# as a validation set; without it the callback has nothing to monitor.
callbacks = [EarlyStopping(monitor='val_loss', patience=2)]
model.fit(
    X_train_scaled,
    y_train_categorical,
    validation_split=0.2,
    callbacks=callbacks,
    epochs=60,
    shuffle=True,
    verbose=2)

In [None]:
# Loss and accuracy of the wellness_program network on the scaled training data;
# the accuracy is kept for the summary table.
train_metrics = model.evaluate(X_train_scaled, y_train_categorical, verbose=2)
wellness_dl_loss_train, wellness_dl_train = train_metrics
print(f"Train Loss: {wellness_dl_loss_train}, Train Accuracy: {wellness_dl_train}")

In [None]:
# Loss and accuracy of the wellness_program network on the scaled test data;
# the accuracy is kept for the summary table.
test_metrics = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
wellness_dl_loss_test, wellness_dl_test = test_metrics
print(f"Test Loss: {wellness_dl_loss_test}, Test Accuracy: {wellness_dl_test}")

In [None]:
# Summary table: one row per (target, model) pair with its train and test accuracy.
targets = ["treatment", "tech_company", "wellness_program"]
model_labels = ["log_regression", "random_forest", "knn", "svm", "deep_learning"]

# Scores grouped per target, listed in the same order as model_labels.
train_scores_by_target = [
    [treatment_log_reg_train, treatment_rf_train, treatment_knn_train,
     treatment_svm_train, treatment_dl_train],
    [tech_log_reg_train, tech_rf_train, tech_knn_train,
     tech_svm_train, tech_dl_train],
    [wellness_log_reg_train, wellness_rf_train, wellness_knn_train,
     wellness_svm_train, wellness_dl_train],
]
test_scores_by_target = [
    [treatment_log_reg_test, treatment_rf_test, treatment_knn_test,
     treatment_svm_test, treatment_dl_test],
    [tech_log_reg_test, tech_rf_test, tech_knn_test,
     tech_svm_test, tech_dl_test],
    [wellness_log_reg_test, wellness_rf_test, wellness_knn_test,
     wellness_svm_test, wellness_dl_test],
]

model_results = pd.DataFrame({
    "target": [name for name in targets for _unused in model_labels],
    "train_score": [score for group in train_scores_by_target for score in group],
    "test_score": [score for group in test_scores_by_target for score in group],
    "model": model_labels * len(targets),
})

model_results


In [None]:
# Write the summary table to model_acc.csv next to the notebook,
# with a header row and without the index column.
csv_options = {"encoding": "utf-8", "index": False, "header": True}
model_results.to_csv("model_acc.csv", **csv_options)