In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from google.colab import files


uploaded = files.upload()
df = pd.read_csv("mental_health_workplace_survey.csv")


x = df.drop('BurnoutRisk', axis=1)
y = df['BurnoutRisk']
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=42)

# Encoding
categorical_cols = x_train.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoder.fit(x_train[categorical_cols])

x_train_encoded = pd.DataFrame(encoder.transform(x_train[categorical_cols]),columns=encoder.get_feature_names_out(categorical_cols),index=x_train.index)
x_test_encoded = pd.DataFrame(encoder.transform(x_test[categorical_cols]),columns=encoder.get_feature_names_out(categorical_cols),index=x_test.index)

# Numerical columns
numerical_cols = x_train.select_dtypes(include=['int64', 'float64']).columns
x_train_num = x_train[numerical_cols]
x_test_num = x_test[numerical_cols]


x_train_final = pd.concat([x_train_encoded, x_train_num], axis=1)
x_test_final = pd.concat([x_test_encoded, x_test_num], axis=1)

# Scaling
scaler = StandardScaler()
x_train_scaled = pd.DataFrame(scaler.fit_transform(x_train_final),index=x_train.index,columns=x_train_final.columns)
x_test_scaled = pd.DataFrame(scaler.transform(x_test_final),index=x_test.index,columns=x_test_final.columns)

models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

for name, model in models.items():
    model.fit(x_train_scaled, y_train)
    y_pred = model.predict(x_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")
    print(confusion_matrix(y_test, y_pred))

# Feature Importance
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(x_train_scaled, y_train)
features = pd.Series(random_forest.feature_importances_, index=x_train_scaled.columns)
top_features = features.sort_values(ascending=False).head(3).index.tolist()
print(top_features)

x_train_new = x_train_scaled[top_features]
x_test_new = x_test_scaled[top_features]

for name, model in models.items():
    model.fit(x_train_new, y_train)
    y_pred_new = model.predict(x_test_new)
    accuracy_new = accuracy_score(y_test, y_pred_new)
    print(f"{name} Accuracy: {accuracy_new:.2f}")
    print(confusion_matrix(y_test, y_pred_new))

Saving mental_health_workplace_survey.csv to mental_health_workplace_survey.csv
Decision Tree Accuracy: 1.00
[[399   0]
 [  0 201]]
Random Forest Accuracy: 1.00
[[399   0]
 [  0 201]]
K-Nearest Neighbors Accuracy: 0.74
[[355  44]
 [109  92]]
['BurnoutLevel', 'StressLevel', 'ManagerSupportScore']
Decision Tree Accuracy: 1.00
[[399   0]
 [  0 201]]
Random Forest Accuracy: 1.00
[[399   0]
 [  0 201]]
K-Nearest Neighbors Accuracy: 0.98
[[392   7]
 [  7 194]]
