In [102]:
# Loading the dataset

import pandas as pd
import numpy as np
from sklearn.utils import resample

df = pd.read_csv("survey.csv").dropna()


In [103]:
# Previewing the data dimensions
print(df.shape)

(86, 27)


In [104]:
#oversampling
df_minority = df[df['treatment'] == 'No']
df_majority = df[df['treatment'] == 'Yes']

df_minority_upsampled = resample(
    df_minority,
    replace = True,
    n_samples = len(df_majority),
    random_state = 42)

df_balanced = pd.concat([df_majority, df_minority_upsampled])

df_balanced = df_balanced.sample(frac=1, random_state=42)

print(df_balanced.shape)

(132, 27)


In [105]:
# Previewing the data columns
print(df_balanced.columns)

Index(['Timestamp', 'Age', 'Gender', 'Country', 'state', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments'],
      dtype='object')


In [106]:
# Previewing the data quick summary
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86 entries, 24 to 1249
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  86 non-null     object
 1   Age                        86 non-null     int64 
 2   Gender                     86 non-null     object
 3   Country                    86 non-null     object
 4   state                      86 non-null     object
 5   self_employed              86 non-null     object
 6   family_history             86 non-null     object
 7   treatment                  86 non-null     object
 8   work_interfere             86 non-null     object
 9   no_employees               86 non-null     object
 10  remote_work                86 non-null     object
 11  tech_company               86 non-null     object
 12  benefits                   86 non-null     object
 13  care_options               86 non-null     object
 14  wellness_

In [107]:
# Previewing the data - viewing first 5 records
print(df_balanced.head())

                Timestamp  Age  Gender        Country state self_employed  \
1078  2014-08-29 14:59:43   37  female  United States    NJ            No   
849   2014-08-28 16:57:46   40       M  United States    IL            No   
373   2014-08-27 15:22:20   32       F  United States    WA            No   
627   2014-08-28 02:17:42   34    male  United States    CA           Yes   
568   2014-08-27 19:45:36   31    male  United States    CA            No   

     family_history treatment work_interfere    no_employees  ...  \
1078            Yes       Yes      Sometimes          26-100  ...   
849              No        No         Rarely  More than 1000  ...   
373              No       Yes      Sometimes          26-100  ...   
627             Yes       Yes      Sometimes            6-25  ...   
568             Yes        No      Sometimes  More than 1000  ...   

              leave mental_health_consequence phys_health_consequence  \
1078     Don't know                       Yes    

In [108]:
# Preview the selected dataframe and check null values
print(df_balanced.info())
print(df_balanced.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132 entries, 1078 to 1024
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  132 non-null    object
 1   Age                        132 non-null    int64 
 2   Gender                     132 non-null    object
 3   Country                    132 non-null    object
 4   state                      132 non-null    object
 5   self_employed              132 non-null    object
 6   family_history             132 non-null    object
 7   treatment                  132 non-null    object
 8   work_interfere             132 non-null    object
 9   no_employees               132 non-null    object
 10  remote_work                132 non-null    object
 11  tech_company               132 non-null    object
 12  benefits                   132 non-null    object
 13  care_options               132 non-null    object
 14  wellne

In [109]:
# Replace null values in state column with "Unknown"
df_balanced['state'] = df_balanced['state'].fillna('Unknown')

In [110]:
# Replace null values in self_employed column with "Unknown"
df_balanced['self_employed'] = df_balanced['self_employed'].fillna('Unknown')

In [111]:
# Replace null values in work_interfere column with "Unknown"
df_balanced['work_interfere'] = df_balanced['work_interfere'].fillna('Unknown')

In [112]:
# Replace null values in work_interfere column with "Unknown"
df_balanced['comments'] = df_balanced['comments'].fillna('')

In [113]:
# I decided to fill misisng values with Unknown/empty string (for comments) to avoid dropping any records and losing data.
# If needed, these records can be dropped/handled when modeling.
# Comments has a significant amount of missing values that were filled with empty strings, so this column may need to be dropped in modeling if needed.

In [114]:
# Checking that there are no null values in any columns
print(df_balanced.isnull().sum())

Timestamp                    0
Age                          0
Gender                       0
Country                      0
state                        0
self_employed                0
family_history               0
treatment                    0
work_interfere               0
no_employees                 0
remote_work                  0
tech_company                 0
benefits                     0
care_options                 0
wellness_program             0
seek_help                    0
anonymity                    0
leave                        0
mental_health_consequence    0
phys_health_consequence      0
coworkers                    0
supervisor                   0
mental_health_interview      0
phys_health_interview        0
mental_vs_physical           0
obs_consequence              0
comments                     0
dtype: int64


In [115]:
# Standardizing inconsistent values in columns

# Timestamp
df_balanced['Timestamp'].unique()
df_balanced['Timestamp'].value_counts()

2014-08-28 16:57:46    6
2014-08-28 17:47:49    6
2014-08-27 14:39:20    6
2014-08-27 16:55:04    5
2014-08-27 19:45:36    5
                      ..
2014-08-27 16:27:47    1
2014-08-27 16:01:39    1
2014-08-29 12:54:31    1
2014-08-27 12:23:59    1
2014-08-27 13:35:23    1
Name: Timestamp, Length: 85, dtype: int64

In [116]:
# Age
df_balanced['Age'].unique()

array([37, 40, 32, 34, 31, 27, 46, 25, 29, 38, 30, 39, 33, 42, 44, 55, 45,
       26, 41, 50, 35, 49, 56, 36, 48, 43, 21, 23,  8, -1], dtype=int64)

In [117]:
min_age = 18
max_age = 120
df_balanced['Age'] = df_balanced['Age'].apply(lambda x: x if min_age <= x <= max_age else np.nan)

In [118]:
df_balanced['Age'].unique()

# Remains numeric, and any invalid values will just be NaN
# Skipped in calculations by default or can be filled with a statistic (mean, median, etc.) or can be dropped

array([37., 40., 32., 34., 31., 27., 46., 25., 29., 38., 30., 39., 33.,
       42., 44., 55., 45., 26., 41., 50., 35., 49., 56., 36., 48., 43.,
       21., 23., nan])

In [119]:
# Gender
df_balanced['Gender'].unique()
df_balanced['Gender'].value_counts()

Male                  63
M                     18
male                  14
female                10
Female                10
m                      4
F                      3
Female (cis)           3
cis-female/femme       1
Female                 1
f                      1
A little about you     1
Cis Male               1
woman                  1
p                      1
Name: Gender, dtype: int64

In [120]:
gender_map = {
    # Male variants
    'male': 'Male', 'm': 'Male', 'man': 'Male', 'cis male': 'Male', 'cis man': 'Male',
    'male ': 'Male', 'msle': 'Male', 'mal': 'Male', 'malr': 'Male', 'maile': 'Male',
    'make': 'Male', 'mail': 'Male', 'male (cis)': 'Male', 'm ': 'Male', 'guy (-ish) ^_^': 'Male',
    'male leaning androgynous': 'Male', 'ostensibly male, unsure what that really means': 'Male',
    'male-ish': 'Male',

    # Female variants
    'female': 'Female', 'f': 'Female', 'woman': 'Female', 'cis female': 'Female',
    'femake': 'Female', 'femail': 'Female', 'female ': 'Female', 'female (cis)': 'Female',
    'female (trans)': 'Female', 'trans-female': 'Female', 'trans woman': 'Female',
    'cis-female/femme': 'Female', 'cis female': 'Female', 'cisfemale': 'Female',

    # Everything else
    'queer': 'Other', 'non-binary': 'Other', 'enby': 'Other', 'agender': 'Other',
    'androgyne': 'Other', 'genderqueer': 'Other', 'fluid': 'Other', 'nah': 'Other',
    'all': 'Other', 'a little about you': 'Other', 'queer/she/they': 'Other',
    'something kinda male?': 'Other', 'p': 'Other'
}

In [121]:
df_balanced['Gender'] = df_balanced['Gender'].str.strip().str.lower().map(gender_map).fillna('Other')

In [122]:
df_balanced['Gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [123]:
df_balanced['Gender'].value_counts()

Male      100
Female     30
Other       2
Name: Gender, dtype: int64

In [124]:
# Country
df_balanced['Country'].unique()
df_balanced['Country'].value_counts()

United States    129
Israel             2
Bahamas, The       1
Name: Country, dtype: int64

In [125]:
# State
df_balanced['state'].unique()
df_balanced['state'].value_counts()

# Note: "Unknown" records may be those who left this field empty in the survey, which could indicate they are not in the US

CA    25
WA    21
TN    11
IL    11
TX     8
VA     8
OH     6
OR     5
MD     4
IN     4
VT     3
AL     3
GA     3
PA     3
FL     3
NY     2
MA     2
MN     2
WY     2
UT     1
NJ     1
NC     1
WI     1
MI     1
LA     1
Name: state, dtype: int64

In [126]:
# self_employed
df_balanced['self_employed'].unique()
df_balanced['self_employed'].value_counts()

No     113
Yes     19
Name: self_employed, dtype: int64

In [127]:
# family_history
df_balanced['family_history'].unique()
df_balanced['family_history'].value_counts()

No     69
Yes    63
Name: family_history, dtype: int64

In [128]:
# treatment
df_balanced['treatment'].unique()
df_balanced['treatment'].value_counts()

Yes    66
No     66
Name: treatment, dtype: int64

In [129]:
# work_interfere
df_balanced['work_interfere'].unique()
df_balanced['work_interfere'].value_counts()

# Ordinal Categorical Variable

Sometimes    73
Never        23
Rarely       22
Often        14
Name: work_interfere, dtype: int64

In [130]:
# no_employees
df_balanced['no_employees'].unique()
df_balanced['no_employees'].value_counts()

# Ordinal Categorical Variable

More than 1000    40
26-100            29
1-5               26
6-25              17
100-500           17
500-1000           3
Name: no_employees, dtype: int64

In [131]:
# remote_work
df_balanced['remote_work'].unique()
df_balanced['remote_work'].value_counts()

No     79
Yes    53
Name: remote_work, dtype: int64

In [132]:
# tech_company
df_balanced['tech_company'].unique()
df_balanced['tech_company'].value_counts()

Yes    115
No      17
Name: tech_company, dtype: int64

In [133]:
# benefits
df_balanced['benefits'].unique()
df_balanced['benefits'].value_counts()

Yes           67
Don't know    37
No            28
Name: benefits, dtype: int64

In [134]:
# care_options
df_balanced['care_options'].unique()
df_balanced['care_options'].value_counts()

Yes         63
Not sure    35
No          34
Name: care_options, dtype: int64

In [135]:
# wellness_program
df_balanced['wellness_program'].unique()
df_balanced['wellness_program'].value_counts()

No            68
Yes           41
Don't know    23
Name: wellness_program, dtype: int64

In [136]:
# seek_help
df_balanced['seek_help'].unique()
df_balanced['seek_help'].value_counts()

Yes           52
No            48
Don't know    32
Name: seek_help, dtype: int64

In [137]:
# anonymity
df_balanced['anonymity'].unique()
df_balanced['anonymity'].value_counts()

Don't know    75
Yes           51
No             6
Name: anonymity, dtype: int64

In [138]:
# leave
df_balanced['leave'].unique()
df_balanced['leave'].value_counts()

# Ordinal Categorical Variable

Don't know            48
Very easy             33
Somewhat easy         26
Very difficult        15
Somewhat difficult    10
Name: leave, dtype: int64

In [139]:
# mental_health_consequence
df_balanced['mental_health_consequence'].unique()
df_balanced['mental_health_consequence'].value_counts()

No       56
Maybe    46
Yes      30
Name: mental_health_consequence, dtype: int64

In [140]:
# phys_health_consequence
df_balanced['phys_health_consequence'].unique()
df_balanced['phys_health_consequence'].value_counts()

No       96
Maybe    25
Yes      11
Name: phys_health_consequence, dtype: int64

In [141]:
# coworkers
df_balanced['coworkers'].unique()
df_balanced['coworkers'].value_counts()

Some of them    77
No              30
Yes             25
Name: coworkers, dtype: int64

In [142]:
# supervisor
df_balanced['supervisor'].unique()
df_balanced['supervisor'].value_counts()

Yes             61
Some of them    42
No              29
Name: supervisor, dtype: int64

In [143]:
# mental_health_interview
df_balanced['mental_health_interview'].unique()
df_balanced['mental_health_interview'].value_counts()

No       112
Maybe     14
Yes        6
Name: mental_health_interview, dtype: int64

In [144]:
# phys_health_interview
df_balanced['phys_health_interview'].unique()
df_balanced['phys_health_interview'].value_counts()

Maybe    63
No       47
Yes      22
Name: phys_health_interview, dtype: int64

In [145]:
# mental_vs_physical
df_balanced['mental_vs_physical'].unique()
df_balanced['mental_vs_physical'].value_counts()

Don't know    52
Yes           44
No            36
Name: mental_vs_physical, dtype: int64

In [146]:
# obs_consequence
df_balanced['obs_consequence'].unique()
df_balanced['obs_consequence'].value_counts()

No     108
Yes     24
Name: obs_consequence, dtype: int64

In [147]:
# comments
df_balanced['comments'].unique()
df_balanced['comments'].value_counts()

While not personally affected I do have immediate family with mental health illness and my employer has been very supportive. Thanks for doing this survey.                                                                                                                    6
A lot of these answers aren't really applicable since I'm self employed as a sole proprietor.                                                                                                                                                                                  6
Regardless of a stated lack of negative consequences for discussing mental health issues with coworkers/superiors unconscious bias is a very real thing - as long as I don't *need* to inform my co-workers my mental health issues do not need to be public knowledge.        6
* Small family business - YMMV.                                                                                                                                                      

In [148]:
print(df_balanced.dtypes)

Timestamp                     object
Age                          float64
Gender                        object
Country                       object
state                         object
self_employed                 object
family_history                object
treatment                     object
work_interfere                object
no_employees                  object
remote_work                   object
tech_company                  object
benefits                      object
care_options                  object
wellness_program              object
seek_help                     object
anonymity                     object
leave                         object
mental_health_consequence     object
phys_health_consequence       object
coworkers                     object
supervisor                    object
mental_health_interview       object
phys_health_interview         object
mental_vs_physical            object
obs_consequence               object
comments                      object
d

In [149]:
# Object type variables need to be encoded because decision tree models require numerical values to make splits based on thresholds.
# Timestamp and comments were not encoded
# Timestamp can be dropped or encoded into separate numeric features if needed
# Comments can be dropped or encoded into binary incidating if the person did leave a comment or not.

# Create a binary column indicating if a comment was provided
df_balanced['comments_binary'] = df_balanced['comments'].apply(lambda x: 1 if pd.notnull(x) and x.strip() != '' else 0)

# Drop the 'comments' column as it is now represented by 'comment_provided'
df_balanced.drop(columns=['comments'], inplace=True)

# Map ordinal categorical variables to numeric values
work_interfere_map = {
    'Never': 0,
    'Rarely': 1,
    'Sometimes': 2,
    'Often': 3,
    'Unknown': -1
}

df_balanced['work_interfere'] = df_balanced['work_interfere'].map(work_interfere_map)

no_employees_map = {
    '1-5': 1,
    '6-25': 2,
    '26-100': 3,
    '100-500': 4,
    '500-1000': 5,
    'More than 1000': 6
}

df_balanced['no_employees'] = df_balanced['no_employees'].map(no_employees_map)

leave_map = {
    'Very easy': 0,
    'Somewhat easy': 1,
    'Somewhat difficult': 2,
    'Very difficult': 3,
    'Don\'t know': -1
}

df_balanced['leave'] = df_balanced['leave'].map(leave_map)


# For nominal categorical variables, one-hot encoding.
nominal_features = ['Gender', 'Country', 'state', 'self_employed', 'family_history', 'remote_work',
                    'tech_company', 'benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity',
                    'mental_health_consequence', 'phys_health_consequence', 'coworkers', 'supervisor', 
                    'mental_health_interview', 'phys_health_interview', 'mental_vs_physical', 'obs_consequence']

df_encoded_nominal = pd.get_dummies(df_balanced[nominal_features], drop_first=False)
df_balanced = pd.concat([df_balanced, df_encoded_nominal], axis=1)
df_balanced.drop(columns=nominal_features, inplace=True)

In [150]:
# Check column names in the DataFrame
print(df_balanced.columns)


Index(['Timestamp', 'Age', 'treatment', 'work_interfere', 'no_employees',
       'leave', 'comments_binary', 'Gender_Female', 'Gender_Male',
       'Gender_Other', 'Country_Bahamas, The', 'Country_Israel',
       'Country_United States', 'state_AL', 'state_CA', 'state_FL', 'state_GA',
       'state_IL', 'state_IN', 'state_LA', 'state_MA', 'state_MD', 'state_MI',
       'state_MN', 'state_NC', 'state_NJ', 'state_NY', 'state_OH', 'state_OR',
       'state_PA', 'state_TN', 'state_TX', 'state_UT', 'state_VA', 'state_VT',
       'state_WA', 'state_WI', 'state_WY', 'self_employed_No',
       'self_employed_Yes', 'family_history_No', 'family_history_Yes',
       'remote_work_No', 'remote_work_Yes', 'tech_company_No',
       'tech_company_Yes', 'benefits_Don't know', 'benefits_No',
       'benefits_Yes', 'care_options_No', 'care_options_Not sure',
       'care_options_Yes', 'wellness_program_Don't know',
       'wellness_program_No', 'wellness_program_Yes', 'seek_help_Don't know',
       'seek

In [151]:
df_balanced.head()

Unnamed: 0,Timestamp,Age,treatment,work_interfere,no_employees,leave,comments_binary,Gender_Female,Gender_Male,Gender_Other,...,mental_health_interview_No,mental_health_interview_Yes,phys_health_interview_Maybe,phys_health_interview_No,phys_health_interview_Yes,mental_vs_physical_Don't know,mental_vs_physical_No,mental_vs_physical_Yes,obs_consequence_No,obs_consequence_Yes
1078,2014-08-29 14:59:43,37.0,Yes,2,3,-1,1,1,0,0,...,1,0,1,0,0,1,0,0,1,0
849,2014-08-28 16:57:46,40.0,No,1,6,-1,1,0,1,0,...,1,0,1,0,0,1,0,0,1,0
373,2014-08-27 15:22:20,32.0,Yes,2,3,1,1,1,0,0,...,1,0,1,0,0,0,0,1,1,0
627,2014-08-28 02:17:42,34.0,Yes,2,2,1,1,0,1,0,...,1,0,0,1,0,0,1,0,1,0
568,2014-08-27 19:45:36,31.0,No,2,6,1,1,0,1,0,...,1,0,0,0,1,0,1,0,1,0


In [152]:
df_balanced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132 entries, 1078 to 1024
Data columns (total 84 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Timestamp                        132 non-null    object 
 1   Age                              130 non-null    float64
 2   treatment                        132 non-null    object 
 3   work_interfere                   132 non-null    int64  
 4   no_employees                     132 non-null    int64  
 5   leave                            132 non-null    int64  
 6   comments_binary                  132 non-null    int64  
 7   Gender_Female                    132 non-null    uint8  
 8   Gender_Male                      132 non-null    uint8  
 9   Gender_Other                     132 non-null    uint8  
 10  Country_Bahamas, The             132 non-null    uint8  
 11  Country_Israel                   132 non-null    uint8  
 12  Country_United Sta

In [153]:
df_balanced['treatment'] = df_balanced['treatment'].map({'Yes': 1, 'No': 0})

In [154]:
df_balanced = df_balanced.drop(columns = ['Timestamp'])

In [155]:
# Saving full cleaned, encoded dataset to csv file that can be used for modeling
df_balanced.to_csv("full_cleaned_tech_survey_oversampling.csv", index=False)