In [45]:
import pandas as pd
import requests
import zipfile
from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import train_test_split

In [46]:
KAGGLE_DATASET_API_URL: str = (
    "https://www.kaggle.com/api/v1/datasets/download/osmi/mental-health-in-tech-2016"
)
CSV_FILE_NAME: str = "mental-heath-in-tech-2016_20161114.csv"

DATA_DIR: Path = Path.cwd().parent / "data"

In [47]:
def download_dataset(data_dir: str = DATA_DIR) -> None:
    """
    Downloads the 'mental-health-in-tech-2016' ZIP from Kaggle and extracts only
    the 'mental-heath-in-tech-2016_20161114.csv' file into data_dir. Anything else
    is deleted.
    """

    data_path: Path = Path(data_dir)
    data_path.mkdir(parents=True, exist_ok=True)

    csv_file: Path = data_path / CSV_FILE_NAME

    # Local path for the downloaded zip
    zip_path: Path = data_path / "mental-health-in-tech-2016.zip"

    # Download the ZIP file from Kaggle
    with requests.get(KAGGLE_DATASET_API_URL, stream=True) as r:
        r.raise_for_status()
        with zip_path.open("wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

    # Extract only the csv
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        for fileinfo in zip_ref.infolist():
            if fileinfo.filename.endswith(CSV_FILE_NAME):
                zip_ref.extract(fileinfo, path=str(data_path))

    zip_path.unlink()

    print(f"Dataset saved to '{csv_file}'.")

In [48]:
if not (DATA_DIR / CSV_FILE_NAME).exists():
    print("Downloading dataset...")
    download_dataset()

df = pd.read_csv(DATA_DIR / CSV_FILE_NAME)

df.sample(n=5, random_state=42)

Unnamed: 0,Are you self-employed?,How many employees does your company or organization have?,Is your employer primarily a tech company/organization?,Is your primary role within your company related to tech/IT?,Does your employer provide mental health benefits as part of healthcare coverage?,Do you know the options for mental health care available under your employer-provided coverage?,"Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?",Does your employer offer resources to learn more about mental health concerns and options for seeking help?,Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?,"If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:",...,"If you have a mental health issue, do you feel that it interferes with your work when being treated effectively?","If you have a mental health issue, do you feel that it interferes with your work when NOT being treated effectively?",What is your age?,What is your gender?,What country do you live in?,What US state or territory do you live in?,What country do you work in?,What US state or territory do you work in?,Which of the following best describes your work position?,Do you work remotely?
370,0,500-1000,1.0,,Yes,Yes,I don't know,I don't know,Yes,Somewhat easy,...,Not applicable to me,Not applicable to me,35,Female,United States of America,California,United States of America,California,Supervisor/Team Lead,Sometimes
560,0,6-25,1.0,,No,,No,No,Yes,Very easy,...,Not applicable to me,Not applicable to me,30,Male,South Africa,,South Africa,,Designer|Front-end Developer|Back-end Develope...,Sometimes
1007,0,6-25,1.0,,I don't know,I am not sure,No,I don't know,I don't know,Somewhat easy,...,Not applicable to me,Not applicable to me,25,Male,United States of America,Ohio,United States of America,Ohio,Supervisor/Team Lead|Support|Front-end Develop...,Sometimes
589,0,100-500,1.0,,I don't know,I am not sure,No,I don't know,No,Somewhat easy,...,Rarely,Often,33,Male,Bangladesh,,Bangladesh,,Back-end Developer,Sometimes
983,1,,,,,,,,,,...,Not applicable to me,Sometimes,35,m,United States of America,Michigan,United States of America,Michigan,Back-end Developer,Always


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1433 entries, 0 to 1432
Data columns (total 63 columns):
 #   Column                                                                                                                                                                            Non-Null Count  Dtype  
---  ------                                                                                                                                                                            --------------  -----  
 0   Are you self-employed?                                                                                                                                                            1433 non-null   int64  
 1   How many employees does your company or organization have?                                                                                                                        1146 non-null   object 
 2   Is your employer primarily a tech company/organization?     

Rename columns

In [50]:
column_rename_dict = {
    "Are you self-employed?": "self_employed",
    "How many employees does your company or organization have?": "company_size",
    "Is your employer primarily a tech company/organization?": "employer_tech_company",
    "Is your primary role within your company related to tech/IT?": "role_in_tech",
    "Does your employer provide mental health benefits as part of healthcare coverage?": "mental_health_benefits",
    "Do you know the options for mental health care available under your employer-provided coverage?": "mental_health_coverage_options",
    "Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?": "employer_discussed_mental_health",
    "Does your employer offer resources to learn more about mental health concerns and options for seeking help?": "mental_health_resources",
    "Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?": "anonymity_protection",
    "If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:": "mental_health_leave_request",
    "Do you think that discussing a mental health disorder with your employer would have negative consequences?": "negative_consequences_discussing_mental_health",
    "Do you think that discussing a physical health issue with your employer would have negative consequences?": "negative_consequences_discussing_physical_health",
    "Would you feel comfortable discussing a mental health disorder with your coworkers?": "comfortable_discussing_mental_health_coworkers",
    "Would you feel comfortable discussing a mental health disorder with your direct supervisor(s)?": "comfortable_discussing_mental_health_supervisor",
    "Do you feel that your employer takes mental health as seriously as physical health?": "employer_prioritizes_mental_health",
    "Have you heard of or observed negative consequences for co-workers who have been open about mental health issues in your workplace?": "observed_negative_consequences",
    "Do you have medical coverage (private insurance or state-provided) which includes treatment of \xa0mental health issues?": "medical_coverage_mental_health",
    "Do you know local or online resources to seek help for a mental health disorder?": "knowledge_of_resources",
    "If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to clients or business contacts?": "reveal_mental_health_to_clients",
    "If you have revealed a mental health issue to a client or business contact, do you believe this has impacted you negatively?": "negative_impact_revealing_to_clients",
    "If you have been diagnosed or treated for a mental health disorder, do you ever reveal this to coworkers or employees?": "reveal_mental_health_to_coworkers",
    "If you have revealed a mental health issue to a coworker or employee, do you believe this has impacted you negatively?": "negative_impact_revealing_to_coworkers",
    "Do you believe your productivity is ever affected by a mental health issue?": "productivity_affected_by_mental_health",
    "If yes, what percentage of your work time (time performing primary or secondary job functions) is affected by a mental health issue?": "percentage_work_time_affected",
    "Do you have previous employers?": "previous_employers",
    "Have your previous employers provided mental health benefits?": "previous_employers_mental_health_benefits",
    "Were you aware of the options for mental health care provided by your previous employers?": "previous_employers_coverage_options",
    "Did your previous employers ever formally discuss mental health (as part of a wellness campaign or other official communication)?": "previous_employers_discussed_mental_health",
    "Did your previous employers provide resources to learn more about mental health issues and how to seek help?": "previous_employers_resources",
    "Was your anonymity protected if you chose to take advantage of mental health or substance abuse treatment resources with previous employers?": "previous_employers_anonymity_protection",
    "Do you think that discussing a mental health disorder with previous employers would have negative consequences?": "negative_consequences_previous_employers",
    "Do you think that discussing a physical health issue with previous employers would have negative consequences?": "negative_consequences_physical_health_previous_employers",
    "Would you have been willing to discuss a mental health issue with your previous co-workers?": "willing_to_discuss_mental_health_previous_coworkers",
    "Would you have been willing to discuss a mental health issue with your direct supervisor(s)?": "willing_to_discuss_mental_health_previous_supervisor",
    "Did you feel that your previous employers took mental health as seriously as physical health?": "previous_employers_prioritized_mental_health",
    "Did you hear of or observe negative consequences for co-workers with mental health issues in your previous workplaces?": "observed_negative_consequences_previous",
    "Would you be willing to bring up a physical health issue with a potential employer in an interview?": "willing_to_discuss_physical_health_in_interview",
    "Why or why not?": "reason_discussing_physical_health",
    "Would you bring up a mental health issue with a potential employer in an interview?": "willing_to_discuss_mental_health_in_interview",
    "Why or why not?.1": "reason_discussing_mental_health",
    "Do you feel that being identified as a person with a mental health issue would hurt your career?": "mental_health_career_impact",
    "Do you think that team members/co-workers would view you more negatively if they knew you suffered from a mental health issue?": "coworkers_negative_view",
    "How willing would you be to share with friends and family that you have a mental illness?": "willing_to_share_with_family_friends",
    "Have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?": "observed_unsupportive_response",
    "Have your observations of how another individual who discussed a mental health disorder made you less likely to reveal a mental health issue yourself in your current workplace?": "observations_made_less_likely_to_reveal",
    "Do you have a family history of mental illness?": "family_history_of_mental_illness",
    "Have you had a mental health disorder in the past?": "past_mental_health_disorder",
    "Do you currently have a mental health disorder?": "current_mental_health_disorder",
    "If yes, what condition(s) have you been diagnosed with?": "diagnosed_conditions",
    "If maybe, what condition(s) do you believe you have?": "believed_conditions",
    "Have you been diagnosed with a mental health condition by a medical professional?": "diagnosed_by_professional",
    "If so, what condition(s) were you diagnosed with?": "conditions_diagnosed",
    "Have you ever sought treatment for a mental health issue from a mental health professional?": "sought_treatment",
    "If you have a mental health issue, do you feel that it interferes with your work when being treated effectively?": "interference_when_treated",
    "If you have a mental health issue, do you feel that it interferes with your work when NOT being treated effectively?": "interference_when_not_treated",
    "What is your age?": "age",
    "What is your gender?": "gender",
    "What country do you live in?": "country_of_residence",
    "What US state or territory do you live in?": "state_of_residence",
    "What country do you work in?": "work_country",
    "What US state or territory do you work in?": "work_state",
    "Which of the following best describes your work position?": "work_position",
    "Do you work remotely?": "work_remotely",
}


In [51]:
df.rename(columns=column_rename_dict, inplace=True)
df.sample(n=5, random_state=42)

Unnamed: 0,self_employed,company_size,employer_tech_company,role_in_tech,mental_health_benefits,mental_health_coverage_options,employer_discussed_mental_health,mental_health_resources,anonymity_protection,mental_health_leave_request,...,interference_when_treated,interference_when_not_treated,age,gender,country_of_residence,state_of_residence,work_country,work_state,work_position,work_remotely
370,0,500-1000,1.0,,Yes,Yes,I don't know,I don't know,Yes,Somewhat easy,...,Not applicable to me,Not applicable to me,35,Female,United States of America,California,United States of America,California,Supervisor/Team Lead,Sometimes
560,0,6-25,1.0,,No,,No,No,Yes,Very easy,...,Not applicable to me,Not applicable to me,30,Male,South Africa,,South Africa,,Designer|Front-end Developer|Back-end Develope...,Sometimes
1007,0,6-25,1.0,,I don't know,I am not sure,No,I don't know,I don't know,Somewhat easy,...,Not applicable to me,Not applicable to me,25,Male,United States of America,Ohio,United States of America,Ohio,Supervisor/Team Lead|Support|Front-end Develop...,Sometimes
589,0,100-500,1.0,,I don't know,I am not sure,No,I don't know,No,Somewhat easy,...,Rarely,Often,33,Male,Bangladesh,,Bangladesh,,Back-end Developer,Sometimes
983,1,,,,,,,,,,...,Not applicable to me,Sometimes,35,m,United States of America,Michigan,United States of America,Michigan,Back-end Developer,Always


Drop columns that are contingency questions

In [52]:
contingency_question_columns = [
    "diagnosed_conditions",  # Only applicable to those diagnosed
    "believed_conditions",  # Only applicable to those who answered "maybe"
    "diagnosed_by_professional",  # Only applicable to those diagnosed
    "conditions_diagnosed",  # Only applicable to those diagnosed
    "sought_treatment",  # Only applicable to those diagnosed
    "interference_when_treated",  # Only applicable to those diagnosed
    "interference_when_not_treated",  # Only applicable to those diagnosed
    "work_state",  # Only applicable to US residents
]

Drop open ended questions with unique values exceeding 10% of the total number of responses

In [53]:
open_ended_columns = [
    "reason_discussing_physical_health",
    "reason_discussing_mental_health",
    "work_position",
]

Drop columns with >30% missing values (15 columns)

In [54]:
threshold = 0.3
cols_with_over_30_missing = [
    col for col in df.columns if df[col].isna().mean() > threshold
]
cols_with_over_30_missing

['role_in_tech',
 'medical_coverage_mental_health',
 'knowledge_of_resources',
 'reveal_mental_health_to_clients',
 'negative_impact_revealing_to_clients',
 'reveal_mental_health_to_coworkers',
 'negative_impact_revealing_to_coworkers',
 'productivity_affected_by_mental_health',
 'percentage_work_time_affected',
 'observations_made_less_likely_to_reveal',
 'diagnosed_conditions',
 'believed_conditions',
 'conditions_diagnosed',
 'state_of_residence',
 'work_state']

Clean gender column

In [55]:
gender_mapping = {
    "male": [
        "male",
        "m",
        "male-ish",
        "maile",
        "mal",
        "male (cis)",
        "make",
        "male ",
        "man",
        "msle",
        "mail",
        "malr",
        "cis man",
        "cis male",
    ],
    "female": [
        "cis female",
        "f",
        "female",
        "woman",
        "femake",
        "female ",
        "cis-female/femme",
        "female (cis)",
        "femail",
    ],
    "other": [
        "trans-female",
        "something kinda male?",
        "queer/she/they",
        "non-binary",
        "nah",
        "all",
        "enby",
        "fluid",
        "genderqueer",
        "androgyne",
        "agender",
        "male leaning androgynous",
        "guy (-ish) ^_^",
        "trans woman",
        "neuter",
        "female (trans)",
        "queer",
        "ostensibly male, unsure what that really means",
        "p",
        "a little about you",
    ],
}

Prepare gender column transformer

In [56]:
class GenderTransformer(BaseEstimator, TransformerMixin):
    """
    Transforms the values of any column(s) passed in by ColumnTransformer
    using the specified mapping dictionary.
    """

    def __init__(self, mapping_dict: dict):
        self.mapping_dict = mapping_dict

    def fit(self, X: pd.DataFrame, y=None) -> "GenderTransformer":
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X_copy = X.copy()
        for col in X_copy.columns:
            X_copy[col] = X_copy[col].str.lower().fillna("")
            for category, variants in self.mapping_dict.items():
                X_copy.loc[X_copy[col].isin([v.lower() for v in variants]), col] = (
                    category
                )
        return X_copy

    def get_feature_names_out(self, input_features=None):
        return input_features


Prepare age column transformer

In [57]:
class AgeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, min_age: int = 18, max_age: int = 122):
        self.min_age = min_age
        self.max_age = max_age

    def fit(self, X: pd.DataFrame, y=None) -> "AgeTransformer":
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X_copy = X.copy()
        median_age = X_copy["age"].median()
        X_copy.loc[
            (X_copy["age"] < self.min_age) | (X_copy["age"] > self.max_age), "age"
        ] = median_age
        return X_copy

    def get_feature_names_out(self, input_features=None):
        return input_features

Make preprocessor

In [58]:
drop_columns = list(
    set(contingency_question_columns + open_ended_columns + cols_with_over_30_missing)
)

num_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)

cat_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder()),
    ]
)

gender_pipeline = Pipeline(
    [
        ("gender_transformer", GenderTransformer(mapping_dict=gender_mapping)),
        ("encoder", OrdinalEncoder()),
    ]
)

age_pipeline = Pipeline(
    [
        ("age_transformer", AgeTransformer(min_age=18, max_age=122)),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler()),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("drop", "drop", drop_columns),
        ("age", age_pipeline, ["age"]),
        ("gender", gender_pipeline, ["gender"]),
        (
            "num",
            num_pipeline,
            make_column_selector(pattern="^(?!age$).*", dtype_include="number"),
        ),
        ("y", OrdinalEncoder(), ["current_mental_health_disorder"]),
    ],
    remainder=cat_pipeline,
    # verbose_feature_names_out=False,
)

In [59]:
preprocessor = preprocessor.fit(df)

In [60]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

df_train_transformed = pd.DataFrame(
    preprocessor.transform(df_train), columns=preprocessor.get_feature_names_out()
)

df_test_transformed = pd.DataFrame(
    preprocessor.transform(df_test), columns=preprocessor.get_feature_names_out()
)

df_train_transformed.sample(n=5, random_state=42)

ValueError: Shape of passed values is (1146, 1), indices imply (1146, 235)

In [27]:
df_train_transformed.columns

Index(['age', 'gender', 'self_employed', 'employer_tech_company',
       'role_in_tech', 'medical_coverage_mental_health', 'previous_employers',
       'sought_treatment', 'company_size', 'mental_health_benefits',
       'mental_health_coverage_options', 'employer_discussed_mental_health',
       'mental_health_resources', 'anonymity_protection',
       'mental_health_leave_request',
       'negative_consequences_discussing_mental_health',
       'negative_consequences_discussing_physical_health',
       'comfortable_discussing_mental_health_coworkers',
       'comfortable_discussing_mental_health_supervisor',
       'employer_prioritizes_mental_health', 'observed_negative_consequences',
       'previous_employers_mental_health_benefits',
       'previous_employers_coverage_options',
       'previous_employers_discussed_mental_health',
       'previous_employers_resources',
       'previous_employers_anonymity_protection',
       'negative_consequences_previous_employers',
       'nega

In [19]:
X_train = df_train_transformed.drop(columns=["current_mental_health_disorder"])
y_train = df_train_transformed["current_mental_health_disorder"]

X_test = df_test_transformed.drop(columns=["current_mental_health_disorder"])
y_test = df_test_transformed["current_mental_health_disorder"]

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC

svm_clf = SVC()
svm_clf.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_clf.predict(X_test)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy:.2f}")

Confusion Matrix:
[[ 0 24 36]
 [ 0 52 54]
 [ 0 24 97]]
Accuracy Score: 0.52


In [21]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=10000)

# Train the model
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred_log_reg = log_reg.predict(X_test)

# Calculate confusion matrix
conf_matrix_log_reg = confusion_matrix(y_test, y_pred_log_reg)
print("Confusion Matrix:")
print(conf_matrix_log_reg)

# Calculate accuracy score
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print(f"Accuracy Score: {accuracy_log_reg:.2f}")

Confusion Matrix:
[[ 13  26  21]
 [ 15  71  20]
 [  6  14 101]]
Accuracy Score: 0.64


In [None]:
from xgboost import XGBClassifier

# Initialize the XGBoost model
xgb_clf = XGBClassifier()

# Train the model
xgb_clf.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_clf.predict(X_test)

# Calculate confusion matrix
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
print("Confusion Matrix:")
print(conf_matrix_xgb)

# Calculate accuracy score
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy Score: {accuracy_xgb:.2f}")

Confusion Matrix:
[[ 17  16  27]
 [ 16  69  21]
 [  8   9 104]]
Accuracy Score: 0.66


In [None]:
from xgboost import XGBRFClassifier

# Initialize the XGBoost model
xgbrf_clf = XGBRFClassifier()

# Train the model
xgbrf_clf.fit(X_train, y_train)

# Predict on the test set
y_pred_xgbrf = xgbrf_clf.predict(X_test)

# Calculate confusion matrix
conf_matrix_xgbrf = confusion_matrix(y_test, y_pred_xgbrf)
print("Confusion Matrix:")
print(conf_matrix_xgbrf)

# Calculate accuracy score
accuracy_xgbrf = accuracy_score(y_test, y_pred_xgbrf)
print(f"Accuracy Score: {accuracy_xgbrf:.2f}")

Confusion Matrix:
[[ 26   9  25]
 [ 10  72  24]
 [  6   3 112]]
Accuracy Score: 0.73
