In [2]:
# Loading the dataset

import pandas as pd
import numpy as np
from sklearn.utils import resample

df = pd.read_csv("survey.csv").dropna()

In [3]:
# Previewing the data dimensions
print(df.shape)

(86, 27)


In [4]:
#oversampling
df_minority = df[df['treatment'] == 'No']
df_majority = df[df['treatment'] == 'Yes']

In [5]:
df_minority_upsampled = resample(
    df_minority,
    replace = True,
    n_samples = len(df_majority),
    random_state = 42)

In [6]:
df_balanced = pd.concat([df_majority, df_minority_upsampled])

In [7]:
df_balanced = df_balanced.sample(frac=1, random_state=42)

In [8]:
# Previewing the data dimensions
print(df_balanced.shape)

(132, 27)


In [9]:
# Previewing the data columns
print(df_balanced.columns)

Index(['Timestamp', 'Age', 'Gender', 'Country', 'state', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments'],
      dtype='object')


In [10]:
# Previewing the data quick summary
print(df_balanced.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132 entries, 1078 to 1024
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  132 non-null    object
 1   Age                        132 non-null    int64 
 2   Gender                     132 non-null    object
 3   Country                    132 non-null    object
 4   state                      132 non-null    object
 5   self_employed              132 non-null    object
 6   family_history             132 non-null    object
 7   treatment                  132 non-null    object
 8   work_interfere             132 non-null    object
 9   no_employees               132 non-null    object
 10  remote_work                132 non-null    object
 11  tech_company               132 non-null    object
 12  benefits                   132 non-null    object
 13  care_options               132 non-null    object
 14  wellne

In [11]:
# Previewing the data - viewing first 5 records
print(df_balanced.head())

                Timestamp  Age  Gender        Country state self_employed  \
1078  2014-08-29 14:59:43   37  female  United States    NJ            No   
849   2014-08-28 16:57:46   40       M  United States    IL            No   
373   2014-08-27 15:22:20   32       F  United States    WA            No   
627   2014-08-28 02:17:42   34    male  United States    CA           Yes   
568   2014-08-27 19:45:36   31    male  United States    CA            No   

     family_history treatment work_interfere    no_employees  ...  \
1078            Yes       Yes      Sometimes          26-100  ...   
849              No        No         Rarely  More than 1000  ...   
373              No       Yes      Sometimes          26-100  ...   
627             Yes       Yes      Sometimes            6-25  ...   
568             Yes        No      Sometimes  More than 1000  ...   

              leave mental_health_consequence phys_health_consequence  \
1078     Don't know                       Yes    

In [12]:
# Selecting features of interest and creating new dataframe with only those columns
# 11 features selected (1 target feature: treatment)

selected_features = [
    'Age', 'Gender', 'self_employed', 'family_history', 'remote_work',
    'tech_company', 'benefits', 'care_options', 'seek_help', 'obs_consequence', 'treatment'
]

df_selected = df_balanced[selected_features]

print(df_selected.head())

      Age  Gender self_employed family_history remote_work tech_company  \
1078   37  female            No            Yes         Yes          Yes   
849    40       M            No             No          No          Yes   
373    32       F            No             No          No          Yes   
627    34    male           Yes            Yes          No          Yes   
568    31    male            No            Yes          No          Yes   

     benefits care_options   seek_help obs_consequence treatment  
1078      Yes          Yes  Don't know              No       Yes  
849       Yes          Yes         Yes              No        No  
373       Yes          Yes         Yes              No       Yes  
627        No          Yes          No              No       Yes  
568       Yes     Not sure  Don't know              No        No  


In [13]:
# Preview the selected dataframe and check null values
print(df_selected.info())
print(df_selected.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132 entries, 1078 to 1024
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Age              132 non-null    int64 
 1   Gender           132 non-null    object
 2   self_employed    132 non-null    object
 3   family_history   132 non-null    object
 4   remote_work      132 non-null    object
 5   tech_company     132 non-null    object
 6   benefits         132 non-null    object
 7   care_options     132 non-null    object
 8   seek_help        132 non-null    object
 9   obs_consequence  132 non-null    object
 10  treatment        132 non-null    object
dtypes: int64(1), object(10)
memory usage: 12.4+ KB
None
Age                0
Gender             0
self_employed      0
family_history     0
remote_work        0
tech_company       0
benefits           0
care_options       0
seek_help          0
obs_consequence    0
treatment          0
dtype: int64


In [14]:
# Replace null values in self_employed column with "Unknown"
df_selected['self_employed'] = df_selected['self_employed'].fillna("Unknown")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['self_employed'] = df_selected['self_employed'].fillna("Unknown")


In [15]:
# Checking that there are no null values in any columns
print(df_selected.isnull().sum())

Age                0
Gender             0
self_employed      0
family_history     0
remote_work        0
tech_company       0
benefits           0
care_options       0
seek_help          0
obs_consequence    0
treatment          0
dtype: int64


In [16]:
# Standardizing inconsistent values in columns

# Age
df_selected['Age'].unique()

array([37, 40, 32, 34, 31, 27, 46, 25, 29, 38, 30, 39, 33, 42, 44, 55, 45,
       26, 41, 50, 35, 49, 56, 36, 48, 43, 21, 23,  8, -1], dtype=int64)

In [17]:
min_age = 0
max_age = 120
df_selected['Age'] = df_selected['Age'].apply(lambda x: x if min_age <= x <= max_age else np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Age'] = df_selected['Age'].apply(lambda x: x if min_age <= x <= max_age else np.nan)


In [18]:
df_selected['Age'].unique()

# Remains numeric, and any invalid values will just be NaN
# Skipped in calculations by default or can be filled with a statistic (mean, median, etc.) or can be dropped

array([37., 40., 32., 34., 31., 27., 46., 25., 29., 38., 30., 39., 33.,
       42., 44., 55., 45., 26., 41., 50., 35., 49., 56., 36., 48., 43.,
       21., 23.,  8., nan])

In [19]:
# Gender
df_selected['Gender'].unique()
df_selected['Gender'].value_counts()

Male                  63
M                     18
male                  14
female                10
Female                10
m                      4
F                      3
Female (cis)           3
cis-female/femme       1
Female                 1
f                      1
A little about you     1
Cis Male               1
woman                  1
p                      1
Name: Gender, dtype: int64

In [20]:
gender_map = {
    # Male variants
    'male': 'Male', 'm': 'Male', 'man': 'Male', 'cis male': 'Male', 'cis man': 'Male',
    'male ': 'Male', 'msle': 'Male', 'mal': 'Male', 'malr': 'Male', 'maile': 'Male',
    'make': 'Male', 'mail': 'Male', 'male (cis)': 'Male', 'm ': 'Male', 'guy (-ish) ^_^': 'Male',
    'male leaning androgynous': 'Male', 'ostensibly male, unsure what that really means': 'Male',
    'male-ish': 'Male',

    # Female variants
    'female': 'Female', 'f': 'Female', 'woman': 'Female', 'cis female': 'Female',
    'femake': 'Female', 'femail': 'Female', 'female ': 'Female', 'female (cis)': 'Female',
    'female (trans)': 'Female', 'trans-female': 'Female', 'trans woman': 'Female',
    'cis-female/femme': 'Female', 'cis female': 'Female', 'cisfemale': 'Female',

    # Everything else
    'queer': 'Other', 'non-binary': 'Other', 'enby': 'Other', 'agender': 'Other',
    'androgyne': 'Other', 'genderqueer': 'Other', 'fluid': 'Other', 'nah': 'Other',
    'all': 'Other', 'a little about you': 'Other', 'queer/she/they': 'Other',
    'something kinda male?': 'Other', 'p': 'Other'
}

In [21]:
df_selected['Gender'] = df_selected['Gender'].str.strip().str.lower().map(gender_map).fillna('Other')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Gender'] = df_selected['Gender'].str.strip().str.lower().map(gender_map).fillna('Other')


In [22]:
df_selected['Gender'].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [23]:
df_selected['Gender'].value_counts()

Male      100
Female     30
Other       2
Name: Gender, dtype: int64

In [24]:
# self_employed
df_selected['self_employed'].unique()
df_selected['self_employed'].value_counts()

No     113
Yes     19
Name: self_employed, dtype: int64

In [25]:
# family_history
df_selected['family_history'].unique()
df_selected['family_history'].value_counts()

No     69
Yes    63
Name: family_history, dtype: int64

In [26]:
# remote_work
df_selected['remote_work'].unique()
df_selected['remote_work'].value_counts()

No     79
Yes    53
Name: remote_work, dtype: int64

In [27]:
# tech_company
df_selected['tech_company'].unique()
df_selected['tech_company'].value_counts()

Yes    115
No      17
Name: tech_company, dtype: int64

In [28]:
# benefits
df_selected['benefits'].unique()
df_selected['benefits'].value_counts()

Yes           67
Don't know    37
No            28
Name: benefits, dtype: int64

In [29]:
# care_options
df_selected['care_options'].unique()
df_selected['care_options'].value_counts()

Yes         63
Not sure    35
No          34
Name: care_options, dtype: int64

In [30]:
# seek_help
df_selected['seek_help'].unique()
df_selected['seek_help'].value_counts()

Yes           52
No            48
Don't know    32
Name: seek_help, dtype: int64

In [31]:
# obs_consequence
df_selected['obs_consequence'].unique()
df_selected['obs_consequence'].value_counts()

No     108
Yes     24
Name: obs_consequence, dtype: int64

In [32]:
# treatment
df_selected['treatment'].unique()
df_selected['treatment'].value_counts()

Yes    66
No     66
Name: treatment, dtype: int64

In [33]:
print(df_selected.dtypes)

Age                float64
Gender              object
self_employed       object
family_history      object
remote_work         object
tech_company        object
benefits            object
care_options        object
seek_help           object
obs_consequence     object
treatment           object
dtype: object


In [34]:
# Object type variables need to be encoded because decision tree models require numerical values to make splits based on thresholds.
# For nominal categorical variables, one-hot encoding.

# List of columns to apply one-hot encoding to
categorical_columns = ['Gender', 'self_employed', 'family_history', 'remote_work', 
                       'tech_company', 'benefits', 'care_options', 'seek_help', 
                       'obs_consequence']

# Apply one-hot encoding using pandas
df_encoded = pd.get_dummies(df_selected, columns=categorical_columns, drop_first=False)

# Decision Trees don’t suffer from multicollinearity
# Decision trees split the data based on feature values, so having redundant features doesn’t cause the same issues as in models like linear regression.
# For decision trees, you can choose whether to drop the first category or not.

In [35]:
df_encoded.head()

Unnamed: 0,Age,treatment,Gender_Female,Gender_Male,Gender_Other,self_employed_No,self_employed_Yes,family_history_No,family_history_Yes,remote_work_No,...,benefits_No,benefits_Yes,care_options_No,care_options_Not sure,care_options_Yes,seek_help_Don't know,seek_help_No,seek_help_Yes,obs_consequence_No,obs_consequence_Yes
1078,37.0,Yes,1,0,0,1,0,0,1,0,...,0,1,0,0,1,1,0,0,1,0
849,40.0,No,0,1,0,1,0,1,0,1,...,0,1,0,0,1,0,0,1,1,0
373,32.0,Yes,1,0,0,1,0,1,0,1,...,0,1,0,0,1,0,0,1,1,0
627,34.0,Yes,0,1,0,0,1,0,1,1,...,1,0,0,0,1,0,1,0,1,0
568,31.0,No,0,1,0,1,0,0,1,1,...,0,1,0,1,0,1,0,0,1,0


In [36]:
df_encoded['treatment'] = df_encoded['treatment'].map({'Yes': 1, 'No': 0})

In [37]:
# Saving cleaned, encoded dataset to csv file that can be used for modeling
df_encoded.to_csv("cleaned_tech_survey_oversample.csv", index=False)