In [1]:
# Loading the dataset

import pandas as pd
import numpy as np

df = pd.read_csv("survey.csv")

In [2]:
# Previewing the data dimensions
print(df.shape)

(1259, 27)


In [3]:
# Previewing the data columns
print(df.columns)

Index(['Timestamp', 'Age', 'Gender', 'Country', 'state', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments'],
      dtype='object')


In [4]:
# Previewing the data quick summary
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  1259 non-null   object
 1   Age                        1259 non-null   int64 
 2   Gender                     1259 non-null   object
 3   Country                    1259 non-null   object
 4   state                      744 non-null    object
 5   self_employed              1241 non-null   object
 6   family_history             1259 non-null   object
 7   treatment                  1259 non-null   object
 8   work_interfere             995 non-null    object
 9   no_employees               1259 non-null   object
 10  remote_work                1259 non-null   object
 11  tech_company               1259 non-null   object
 12  benefits                   1259 non-null   object
 13  care_options               1259 non-null   object
 14  wellness

In [5]:
# Previewing the data - viewing first 5 records
print(df.head())

             Timestamp  Age  Gender         Country state self_employed  \
0  2014-08-27 11:29:31   37  Female   United States    IL           NaN   
1  2014-08-27 11:29:37   44       M   United States    IN           NaN   
2  2014-08-27 11:29:44   32    Male          Canada   NaN           NaN   
3  2014-08-27 11:29:46   31    Male  United Kingdom   NaN           NaN   
4  2014-08-27 11:30:22   31    Male   United States    TX           NaN   

  family_history treatment work_interfere    no_employees  ...  \
0             No       Yes          Often            6-25  ...   
1             No        No         Rarely  More than 1000  ...   
2             No        No         Rarely            6-25  ...   
3            Yes       Yes          Often          26-100  ...   
4             No        No          Never         100-500  ...   

                leave mental_health_consequence phys_health_consequence  \
0       Somewhat easy                        No                      No   
1 

In [6]:
# Selecting features of interest and creating new dataframe with only those columns
# 11 features selected (1 target feature: treatment)

selected_features = [
    'Age', 'Gender', 'self_employed', 'family_history', 'remote_work',
    'tech_company', 'benefits', 'care_options', 'seek_help', 'obs_consequence', 'treatment'
]

df_selected = df[selected_features]

print(df_selected.head())

   Age  Gender self_employed family_history remote_work tech_company  \
0   37  Female           NaN             No          No          Yes   
1   44       M           NaN             No          No           No   
2   32    Male           NaN             No          No          Yes   
3   31    Male           NaN            Yes          No          Yes   
4   31    Male           NaN             No         Yes          Yes   

     benefits care_options   seek_help obs_consequence treatment  
0         Yes     Not sure         Yes              No       Yes  
1  Don't know           No  Don't know              No        No  
2          No           No          No              No        No  
3          No          Yes          No             Yes       Yes  
4         Yes           No  Don't know              No        No  


In [7]:
# Preview the selected dataframe and check null values
print(df_selected.info())
print(df_selected.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Age              1259 non-null   int64 
 1   Gender           1259 non-null   object
 2   self_employed    1241 non-null   object
 3   family_history   1259 non-null   object
 4   remote_work      1259 non-null   object
 5   tech_company     1259 non-null   object
 6   benefits         1259 non-null   object
 7   care_options     1259 non-null   object
 8   seek_help        1259 non-null   object
 9   obs_consequence  1259 non-null   object
 10  treatment        1259 non-null   object
dtypes: int64(1), object(10)
memory usage: 108.3+ KB
None
Age                 0
Gender              0
self_employed      18
family_history      0
remote_work         0
tech_company        0
benefits            0
care_options        0
seek_help           0
obs_consequence     0
treatment           0
dtype: i

In [8]:
# Replace null values in self_employed column with "Unknown"
df_selected['self_employed'] = df_selected['self_employed'].fillna("Unknown")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['self_employed'] = df_selected['self_employed'].fillna("Unknown")


In [9]:
# Checking that there are no null values in any columns
print(df_selected.isnull().sum())

Age                0
Gender             0
self_employed      0
family_history     0
remote_work        0
tech_company       0
benefits           0
care_options       0
seek_help          0
obs_consequence    0
treatment          0
dtype: int64


In [10]:
# Standardizing inconsistent values in columns

# Age
df_selected['Age'].unique()

array([         37,          44,          32,          31,          33,
                35,          39,          42,          23,          29,
                36,          27,          46,          41,          34,
                30,          40,          38,          50,          24,
                18,          28,          26,          22,          19,
                25,          45,          21,         -29,          43,
                56,          60,          54,         329,          55,
       99999999999,          48,          20,          57,          58,
                47,          62,          51,          65,          49,
             -1726,           5,          53,          61,           8,
                11,          -1,          72], dtype=int64)

In [11]:
min_age = 0
max_age = 120
df_selected['Age'] = df_selected['Age'].apply(lambda x: x if min_age <= x <= max_age else np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Age'] = df_selected['Age'].apply(lambda x: x if min_age <= x <= max_age else np.nan)


In [12]:
df_selected['Age'].unique()

# Remains numeric, and any invalid values will just be NaN
# Skipped in calculations by default or can be filled with a statistic (mean, median, etc.) or can be dropped

array([37., 44., 32., 31., 33., 35., 39., 42., 23., 29., 36., 27., 46.,
       41., 34., 30., 40., 38., 50., 24., 18., 28., 26., 22., 19., 25.,
       45., 21., nan, 43., 56., 60., 54., 55., 48., 20., 57., 58., 47.,
       62., 51., 65., 49.,  5., 53., 61.,  8., 11., 72.])

In [13]:
# Gender
df_selected['Gender'].unique()
df_selected['Gender'].value_counts()

Male                                              615
male                                              206
Female                                            121
M                                                 116
female                                             62
F                                                  38
m                                                  34
f                                                  15
Make                                                4
Male                                                3
Woman                                               3
Cis Male                                            2
Man                                                 2
Female (trans)                                      2
Female                                              2
Trans woman                                         1
msle                                                1
male leaning androgynous                            1
Neuter                      

In [14]:
gender_map = {
    # Male variants
    'male': 'Male', 'm': 'Male', 'man': 'Male', 'cis male': 'Male', 'cis man': 'Male',
    'male ': 'Male', 'msle': 'Male', 'mal': 'Male', 'malr': 'Male', 'maile': 'Male',
    'make': 'Male', 'mail': 'Male', 'male (cis)': 'Male', 'm ': 'Male', 'guy (-ish) ^_^': 'Male',
    'male leaning androgynous': 'Male', 'ostensibly male, unsure what that really means': 'Male',
    'male-ish': 'Male',

    # Female variants
    'female': 'Female', 'f': 'Female', 'woman': 'Female', 'cis female': 'Female',
    'femake': 'Female', 'femail': 'Female', 'female ': 'Female', 'female (cis)': 'Female',
    'female (trans)': 'Female', 'trans-female': 'Female', 'trans woman': 'Female',
    'cis-female/femme': 'Female', 'cis female': 'Female', 'cisfemale': 'Female',

    # Everything else
    'queer': 'Other', 'non-binary': 'Other', 'enby': 'Other', 'agender': 'Other',
    'androgyne': 'Other', 'genderqueer': 'Other', 'fluid': 'Other', 'nah': 'Other',
    'all': 'Other', 'a little about you': 'Other', 'queer/she/they': 'Other',
    'something kinda male?': 'Other', 'p': 'Other'
}

In [15]:
df_selected['Gender'] = df_selected['Gender'].str.strip().str.lower().map(gender_map).fillna('Other')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Gender'] = df_selected['Gender'].str.strip().str.lower().map(gender_map).fillna('Other')


In [16]:
df_selected['Gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [17]:
df_selected['Gender'].value_counts()

Male      994
Female    251
Other      14
Name: Gender, dtype: int64

In [18]:
# self_employed
df_selected['self_employed'].unique()
df_selected['self_employed'].value_counts()

No         1095
Yes         146
Unknown      18
Name: self_employed, dtype: int64

In [19]:
# family_history
df_selected['family_history'].unique()
df_selected['family_history'].value_counts()

No     767
Yes    492
Name: family_history, dtype: int64

In [20]:
# remote_work
df_selected['remote_work'].unique()
df_selected['remote_work'].value_counts()

No     883
Yes    376
Name: remote_work, dtype: int64

In [21]:
# tech_company
df_selected['tech_company'].unique()
df_selected['tech_company'].value_counts()

Yes    1031
No      228
Name: tech_company, dtype: int64

In [22]:
# benefits
df_selected['benefits'].unique()
df_selected['benefits'].value_counts()

Yes           477
Don't know    408
No            374
Name: benefits, dtype: int64

In [23]:
# care_options
df_selected['care_options'].unique()
df_selected['care_options'].value_counts()

No          501
Yes         444
Not sure    314
Name: care_options, dtype: int64

In [24]:
# seek_help
df_selected['seek_help'].unique()
df_selected['seek_help'].value_counts()

No            646
Don't know    363
Yes           250
Name: seek_help, dtype: int64

In [25]:
# obs_consequence
df_selected['obs_consequence'].unique()
df_selected['obs_consequence'].value_counts()

No     1075
Yes     184
Name: obs_consequence, dtype: int64

In [26]:
# treatment
df_selected['treatment'].unique()
df_selected['treatment'].value_counts()

Yes    637
No     622
Name: treatment, dtype: int64

In [27]:
print(df_selected.dtypes)

Age                float64
Gender              object
self_employed       object
family_history      object
remote_work         object
tech_company        object
benefits            object
care_options        object
seek_help           object
obs_consequence     object
treatment           object
dtype: object


In [28]:
df_selected.to_csv("cleaned_tech_survey_with_unknown_not_encoded.csv", index=False)

In [29]:
# Object type variables need to be encoded because decision tree models require numerical values to make splits based on thresholds.
# For nominal categorical variables, one-hot encoding.

# List of columns to apply one-hot encoding to
categorical_columns = ['Gender', 'self_employed', 'family_history', 'remote_work', 
                       'tech_company', 'benefits', 'care_options', 'seek_help', 
                       'obs_consequence']

# Apply one-hot encoding using pandas
df_encoded = pd.get_dummies(df_selected, columns=categorical_columns, drop_first=False)

# Decision Trees don’t suffer from multicollinearity
# Decision trees split the data based on feature values, so having redundant features doesn’t cause the same issues as in models like linear regression.
# For decision trees, you can choose whether to drop the first category or not.

In [30]:
df_encoded.head()

Unnamed: 0,Age,treatment,Gender_Female,Gender_Male,Gender_Other,self_employed_No,self_employed_Unknown,self_employed_Yes,family_history_No,family_history_Yes,...,benefits_No,benefits_Yes,care_options_No,care_options_Not sure,care_options_Yes,seek_help_Don't know,seek_help_No,seek_help_Yes,obs_consequence_No,obs_consequence_Yes
0,37.0,Yes,1,0,0,0,1,0,1,0,...,0,1,0,1,0,0,0,1,1,0
1,44.0,No,0,1,0,0,1,0,1,0,...,0,0,1,0,0,1,0,0,1,0
2,32.0,No,0,1,0,0,1,0,1,0,...,1,0,1,0,0,0,1,0,1,0
3,31.0,Yes,0,1,0,0,1,0,0,1,...,1,0,0,0,1,0,1,0,0,1
4,31.0,No,0,1,0,0,1,0,1,0,...,0,1,1,0,0,1,0,0,1,0


In [31]:
df_encoded['treatment'] = df_encoded['treatment'].map({'Yes': 1, 'No': 0})

In [32]:
# Saving cleaned, encoded dataset to csv file that can be used for modeling
df_encoded.to_csv("cleaned_tech_survey_with_unknown.csv", index=False)

In [33]:
# Scaled version of that encoded dataset, for K-Means clustering

from sklearn.preprocessing import StandardScaler

df_encoded = pd.read_csv("cleaned_tech_survey_with_unknown.csv")

# Scale all the features using StandardScaler
scaler = StandardScaler()
df_encoded_scaled = pd.DataFrame(scaler.fit_transform(df_encoded), columns=df_encoded.columns)

# Saving the scaled dataset for K-Means clustering
df_encoded_scaled.to_csv("cleaned_tech_survey_scaled_with_unknown.csv", index=False)

In [34]:
# Scaled dataset shows that each numerical value has been transformed using standardization (z-score scaling)
# Each feature (column) was scaled so the mean of each feature is 0 and std dev is 1
df_encoded_scaled.head()

Unnamed: 0,Age,treatment,Gender_Female,Gender_Male,Gender_Other,self_employed_No,self_employed_Unknown,self_employed_Yes,family_history_No,family_history_Yes,...,benefits_No,benefits_Yes,care_options_No,care_options_Not sure,care_options_Yes,seek_help_Don't know,seek_help_No,seek_help_Yes,obs_consequence_No,obs_consequence_Yes
0,0.67564,0.988156,2.00398,-1.936735,-0.106042,-2.583956,8.303279,-0.362184,0.800912,-0.800912,...,-0.650076,1.280396,-0.812988,1.734807,-0.738096,-0.636501,-1.026564,2.00898,0.413718,-0.413718
1,1.625171,-1.011986,-0.499007,0.516333,-0.106042,-2.583956,8.303279,-0.362184,0.800912,-0.800912,...,-0.650076,-0.781009,1.23003,-0.576433,-0.738096,1.571089,-1.026564,-0.497765,0.413718,-0.413718
2,-0.002596,-1.011986,-0.499007,0.516333,-0.106042,-2.583956,8.303279,-0.362184,0.800912,-0.800912,...,1.538282,-0.781009,1.23003,-0.576433,-0.738096,-0.636501,0.974123,-0.497765,0.413718,-0.413718
3,-0.138243,0.988156,-0.499007,0.516333,-0.106042,-2.583956,8.303279,-0.362184,-1.248576,1.248576,...,1.538282,-0.781009,-0.812988,-0.576433,1.354838,-0.636501,0.974123,-0.497765,-2.417104,2.417104
4,-0.138243,-1.011986,-0.499007,0.516333,-0.106042,-2.583956,8.303279,-0.362184,0.800912,-0.800912,...,-0.650076,1.280396,1.23003,-0.576433,-0.738096,1.571089,-1.026564,-0.497765,0.413718,-0.413718
