In [1]:
# Importing Libraries required.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re

from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer


In [2]:
# Load the dataset from the CSV file
study_data= pd.read_csv("mental-heath-in-tech-2016_20161114.csv")

In [3]:
for col in study_data.columns:
    print(col)

Are you self-employed?
How many employees does your company or organization have?
Is your employer primarily a tech company/organization?
Is your primary role within your company related to tech/IT?
Does your employer provide mental health benefits as part of healthcare coverage?
Do you know the options for mental health care available under your employer-provided coverage?
Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?
Does your employer offer resources to learn more about mental health concerns and options for seeking help?
Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?
If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:
Do you think that discussing a mental health disorder with your employer would have negative consequences?
Do you think that disc

In [4]:
# Make a copy of the original colums to compare later
orig_columns = study_data.columns

# Rename the columns for better readability
study_data.columns = [
    "self_employed",
    "num_employees",
    "employer_tech_company",
    "role_related_to_tech",
    "mental_health_benefits",
    "know_mental_health_options",
    "employer_discussed_mh",
    "employer_mh_resources",
    "anonymity_protected",
    "mh_leave_comfort",
    "mh_discussion_negative",
    "ph_discussion_negative",
    "mh_comfort_coworkers",
    "mh_comfort_supervisor",
    "employer_mh_priority",
    "negative_consequences_observed",
    "medical_coverage_mh",
    "know_local_mh_resources",
    "reveal_mh_clients",
    "reveal_mh_clients_negative",
    "reveal_mh_coworkers",
    "reveal_mh_coworkers_negative",
    "productivity_affected",
    "work_time_affected_pct",
    "previous_employers",
    "prev_employers_mh_benefits",
    "prev_employers_mh_options",
    "prev_employers_discussed_mh",
    "prev_employers_mh_resources",
    "prev_employers_anonymity",
    "prev_employers_mh_negative",
    "prev_employers_ph_negative",
    "mh_comfort_prev_coworkers",
    "mh_comfort_prev_supervisor",
    "prev_employers_mh_priority",
    "prev_employers_negative_obs",
    "physical_health_in_interview",
    "physical_health_in_interview_reason",
    "mental_health_in_interview",
    "mental_health_in_interview_reason",
    "mh_hurt_career",
    "mh_viewed_negatively",
    "mh_share_friends_family",
    "unsupportive_response",
    "observed_mh_discussion_effect",
    "family_history_mh",
    "past_mh_disorder",
    "current_mh_disorder",
    "current_mh_condition",
    "maybe_mh_condition",
    "diagnosed_mh_condition",
    "diagnosed_mh_condition_details",
    "sought_mh_treatment",
    "mh_treatment_effective",
    "mh_treatment_ineffective",
    "age",
    "gender",
    "country_residence",
    "us_state_residence",
    "country_work",
    "us_state_work",
    "work_position",
    "work_remote"
]

print(orig_columns)

Index(['Are you self-employed?',
       'How many employees does your company or organization have?',
       'Is your employer primarily a tech company/organization?',
       'Is your primary role within your company related to tech/IT?',
       'Does your employer provide mental health benefits as part of healthcare coverage?',
       'Do you know the options for mental health care available under your employer-provided coverage?',
       'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?',
       'Does your employer offer resources to learn more about mental health concerns and options for seeking help?',
       'Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?',
       'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:',
       'Do you think that dis

In [5]:
# Compare the original columns with the new columns
for i in range(len(orig_columns)):
    print(orig_columns[i], " -> ", study_data.columns[i])

Are you self-employed?  ->  self_employed
How many employees does your company or organization have?  ->  num_employees
Is your employer primarily a tech company/organization?  ->  employer_tech_company
Is your primary role within your company related to tech/IT?  ->  role_related_to_tech
Does your employer provide mental health benefits as part of healthcare coverage?  ->  mental_health_benefits
Do you know the options for mental health care available under your employer-provided coverage?  ->  know_mental_health_options
Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?  ->  employer_discussed_mh
Does your employer offer resources to learn more about mental health concerns and options for seeking help?  ->  employer_mh_resources
Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?  ->  anonymity_protected
If a me

# Get a brief overview of the data

In [6]:
# Display main statistics of each column
describe = study_data.describe(include='all').T.to_string()
print(describe)

                                      count unique                                                                                                    top  freq       mean        std  min   25%   50%   75%    max
self_employed                        1433.0    NaN                                                                                                    NaN   NaN   0.200279   0.400349  0.0   0.0   0.0   0.0    1.0
num_employees                          1146      6                                                                                                 26-100   292        NaN        NaN  NaN   NaN   NaN   NaN    NaN
employer_tech_company                1146.0    NaN                                                                                                    NaN   NaN   0.770506   0.420691  0.0   1.0   1.0   1.0    1.0
role_related_to_tech                  263.0    NaN                                                                                                    Na

# Data cleaning

## Remove useless answers

In [7]:
# Removing uncertain answers  
study_data = study_data.replace("Not applicable to me",np.nan)
study_data = study_data.replace("I don't know",np.nan)
study_data = study_data.replace("I'm not sure",np.nan)

## Replace age outliers

In [8]:
# Display unique age values
print(study_data['age'].unique())

[ 39  29  38  43  42  30  37  44  28  34  35  52  32  25  31  26  33  27
  36  40  46  41  45  19  21  24  17  23  22  51  48  55  50  49  20  54
  47  56  57  63  99  61 323  62  53  58   3  66  59  15  65  74  70]


In [9]:
# Replace age outliers with NaN
age_copy = study_data['age'].copy()                     # Create a copy for later comparison
study_data['age'] = study_data['age'].apply(
    lambda x: np.nan if x < 15 or x > 75 else x         # Replace age outliers with NaN
    )
age_checksum = (study_data['age'] != age_copy).sum()    # Check how many rows were changed

print(f"Replaced age outliers with NaN for {age_checksum} rows.")

Replaced age outliers with NaN for 3 rows.


## Reduce age to categories

In [21]:
study_data['age'] = pd.cut(
    study_data['age'], 
    bins=[18, 30, 40, 50, 60, float('inf')],  # Extend bins to include all ages above 60
    labels=['18 to 30', '30 to 40', '40 to 50', '50 to 60', '60+'], 
    right=False
)
print(study_data['age'].value_counts())

age
30 to 40    535
18 to 30    306
40 to 50    178
50 to 60     41
60+           9
Name: count, dtype: int64


## Drop rows with too many missing values


In [10]:
original_count = study_data.shape[0]                                            # Get the original row count
study_data = study_data.dropna(thresh=study_data.shape[1] * 0.65)               # Drop rows with over 50% missing values
drop_lines_percent = (1 - (study_data.shape[0] / original_count)) *100          # Calculate the percentage of dropped rows

print(f"Dropped {drop_lines_percent}% of rows due to missing values.")

Dropped 25.33147243545011% of rows due to missing values.


## Drop columns with to many missing values

#### Get an overview of the nan percentage in the columns

In [11]:
# Calculate NaN percentages
nan_percentage = (study_data.isna().sum() / len(study_data)) * 100
# Sort by percentage in descending order
nan_percentage_sorted = nan_percentage.sort_values(ascending=False)

# Display the sorted percentages
print(nan_percentage_sorted.to_string())

reveal_mh_clients_negative             94.859813
reveal_mh_coworkers_negative           94.112150
reveal_mh_coworkers                    89.626168
work_time_affected_pct                 89.532710
reveal_mh_clients                      89.158879
medical_coverage_mh                    88.691589
productivity_affected                  88.691589
know_local_mh_resources                88.691589
role_related_to_tech                   79.252336
maybe_mh_condition                     77.663551
anonymity_protected                    67.102804
prev_employers_anonymity               63.831776
current_mh_condition                   54.018692
observed_mh_discussion_effect          48.224299
employer_mh_priority                   47.289720
diagnosed_mh_condition_details         42.056075
us_state_residence                     34.859813
employer_mh_resources                  34.672897
us_state_work                          34.299065
mental_health_benefits                 33.925234
mh_treatment_effecti

#### Drop all with more than 30% of missing values

In [12]:
for col_name, percentage in nan_percentage_sorted.items():         # Iterate over the sorted percentages 
    if percentage > 35:                                            # If the percentage is over 30%
        study_data.drop(col_name, axis=1, inplace=True)            # Drop the column from study_data

In [13]:
# Check the nan percentage again
# Calculate NaN percentages
nan_percentage = (study_data.isna().sum() / len(study_data)) * 100
# Sort by percentage in descending order
nan_percentage_sorted = nan_percentage.sort_values(ascending=False)

# Display the sorted percentages
print(nan_percentage_sorted.to_string())

us_state_residence                     34.859813
employer_mh_resources                  34.672897
us_state_work                          34.299065
mental_health_benefits                 33.925234
mh_treatment_effective                 30.934579
mh_treatment_ineffective               25.046729
mh_leave_comfort                       21.869159
prev_employers_mh_benefits             21.775701
prev_employers_mh_priority             21.682243
know_mental_health_options             20.654206
physical_health_in_interview_reason    20.654206
prev_employers_mh_negative             20.467290
employer_discussed_mh                  18.504673
mental_health_in_interview_reason      18.224299
family_history_mh                      16.261682
negative_consequences_observed         11.308411
mh_comfort_coworkers                   11.308411
employer_tech_company                  11.308411
mh_discussion_negative                 11.308411
num_employees                          11.308411
ph_discussion_negati

## Fill missing age values

In [14]:
age_copy_2 = study_data['age'].copy()                       # Create a copy for later comparison
age_median = study_data['age'].median()                     # Calculate the median age
study_data.fillna({'age': age_median}, inplace=True)        # Fill missing age values with the median
age_checksum_2 = (age_copy_2 != study_data['age']).sum()    # Check how many rows were changed

print(f"Filled missing age values with the median: {age_median}  for {age_checksum_2} rows.")

Filled missing age values with the median: 33.0  for 2 rows.


## Replace genders

In [15]:
gender_before = study_data["gender"].value_counts()

male = ['Male', 'male', 'Male ', 'M', 'm',
       'man', 'Cis male', 'Male.', 'male 9:1 female, roughly', 'Male (cis)', 'Man', 'Sex is male',
       'cis male', 'Malr', 'Dude', "I'm a man why didn't you make this a drop down question. You should of asked sex? And I would of answered yes please. Seriously how much text can this take? ",
       'mail', 'M|', 'Male/genderqueer', 'male ',
       'Cis Male', 'Male (trans, FtM)',
       'cisdude', 'cis man', 'MALE']
female = ['Female', 'female', 'I identify as female.', 'female ',
       'Female assigned at birth ', 'F', 'Woman', 'fm', 'f', 'Cis female ', 'Transitioned, M2F',
       'Genderfluid (born female)', 'Female or Multi-Gender Femme', 'Female ', 'woman', 'female/woman',
       'Cisgender Female', 'fem', 'Female (props for making this a freeform field, though)',
       ' Female', 'Cis-woman', 'female-bodied; no feelings about gender',
       'AFAB']

study_data["gender"] = study_data["gender"].replace(male, 0)
study_data["gender"] = study_data["gender"].replace(female, 2)
study_data["gender"] = study_data["gender"].apply(lambda x: 1 if x not in [0, 2] else x)

print("Before:\n",
      gender_before,
      "\n\n\nAfter:\n",
      study_data["gender"].value_counts())


Before:
 gender
Male                     458
male                     174
Female                   128
female                    81
M                         62
                        ... 
Man                        1
Sex is male                1
none of your business      1
genderqueer                1
Transgender woman          1
Name: count, Length: 61, dtype: int64 


After:
 gender
0    761
2    284
1     25
Name: count, dtype: int64


## Reducing countries of work

In [16]:
# Get the unique values of the 'country_work' column
print(study_data['country_work'].value_counts())

country_work
United States of America    703
United Kingdom              115
Canada                       52
Germany                      31
Australia                    28
Netherlands                  24
Ireland                      12
Brazil                       10
Sweden                       10
New Zealand                   7
France                        7
Denmark                       6
Switzerland                   6
Finland                       5
India                         4
Russia                        4
South Africa                  4
Bulgaria                      3
Norway                        3
Austria                       3
Chile                         3
Pakistan                      2
Israel                        2
Romania                       2
Estonia                       2
Belgium                       2
Spain                         2
Colombia                      2
Vietnam                       1
Hungary                       1
Guatemala                  

In [17]:
# Map countries to continents
# Define the continent mappings
continent_mapping = {
    **{country: "north_america" for country in ["United States of America", "Canada", "Mexico"]},
    **{country: "europe" for country in [
        "United Kingdom", "Netherlands", "Germany", "Sweden", "France",
        "Ireland", "Switzerland", "Bulgaria", "Finland", "Denmark",
        "Russia", "Spain", "Norway", "Austria", "Bosnia and Herzegovina",
        "Italy", "Poland", "Belgium", "Czech Republic"]},
}

# Replace the values in 'country_work' column with the mapped values or "Other"
study_data['country_work'] = study_data['country_work'].apply(lambda x: continent_mapping.get(x, "Other"))

# Display the unique values of the 'country_work' column
print(study_data['country_work'].value_counts())

country_work
north_america    756
europe           237
Other             77
Name: count, dtype: int64


## Reducing work position


In [18]:
# Get the unique values of the 'work_position' column
print(study_data['work_position'].value_counts().to_string())

work_position
Back-end Developer                                                                                                                            189
Front-end Developer                                                                                                                            98
Other                                                                                                                                          94
Supervisor/Team Lead                                                                                                                           54
Back-end Developer|Front-end Developer                                                                                                         47
DevOps/SysAdmin                                                                                                                                45
Executive Leadership                                                                                          

In [19]:
# Optimized category mapping with direct keyword-to-category association
keyword_to_category = {
    "supervisor": "Management",
    "team lead": "Management",
    "leadership": "Management",
    "executive": "Management",
    "back": "Development",
    "developer": "Development",
    "front": "Development",
    "devops": "Development",
    "sysadmin": "Development",
    "dev": "Development",
    "support": "Support",
    "helpdesk": "Support",
    "customer": "Support",
    "design": "Design",
    "ui": "Design",
    "ux": "Design",
    "creative": "Design",
    "sales": "Sales",
    "marketing": "Sales",
    "business": "Sales",
    "evangelist": "Advocacy",
    "advocate": "Advocacy",
    "hr": "Human Resources",
    "human resources": "Human Resources",
    "recruitment": "Human Resources",
}

# Function to classify roles
def categorize_role_optimized(role):
    role = role.lower()  # Standardize to lowercase
    for keyword, category in keyword_to_category.items():
        if keyword in role:
            return category
    return "Other"  # Default for unmatched roles

# Convert the entire column to lowercase for efficiency
study_data['work_position'] = study_data['work_position'].str.lower()

# Apply the optimized categorization function
study_data['work_position'] = study_data['work_position'].apply(categorize_role_optimized)

# Display the unique values and their counts
print(study_data['work_position'].value_counts().to_string())

work_position
Development        622
Management         269
Other              116
Support             34
Design              23
Human Resources      3
Sales                3


In [20]:
# Display main statistics of each column
describe = study_data.describe(include='all').T.to_string()

print("\n\n\n", study_data.shape, "\n\n\n")
print(describe)




 (1070, 47) 



                                      count unique                            top freq       mean       std   min   25%   50%   75%   max
self_employed                        1070.0    NaN                            NaN  NaN   0.113084  0.316844   0.0   0.0   0.0   0.0   1.0
num_employees                           949      6                         26-100  243        NaN       NaN   NaN   NaN   NaN   NaN   NaN
employer_tech_company                 949.0    NaN                            NaN  NaN    0.76607  0.423552   0.0   1.0   1.0   1.0   1.0
mental_health_benefits                  707      3                            Yes  474        NaN       NaN   NaN   NaN   NaN   NaN   NaN
know_mental_health_options              849      3                  I am not sure  287        NaN       NaN   NaN   NaN   NaN   NaN   NaN
employer_discussed_mh                   872      2                             No  670        NaN       NaN   NaN   NaN   NaN   NaN   NaN
employer_mh_res