In [237]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# PreProcessing


In [238]:
# Path of the survey export, resolved relative to the notebook's working directory.
file_path = "survey.csv"
# Read the whole CSV into the raw frame that later cells select columns from.
data = pd.read_csv(filepath_or_buffer=file_path)

In [239]:
data.shape

(1259, 27)

In [240]:
data.head()

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [241]:
data.dtypes

Timestamp                    object
Age                           int64
Gender                       object
Country                      object
state                        object
self_employed                object
family_history               object
treatment                    object
work_interfere               object
no_employees                 object
remote_work                  object
tech_company                 object
benefits                     object
care_options                 object
wellness_program             object
seek_help                    object
anonymity                    object
leave                        object
mental_health_consequence    object
phys_health_consequence      object
coworkers                    object
supervisor                   object
mental_health_interview      object
phys_health_interview        object
mental_vs_physical           object
obs_consequence              object
comments                     object
dtype: object

In [242]:
data.describe()

Unnamed: 0,Age
count,1259.0
mean,79428150.0
std,2818299000.0
min,-1726.0
25%,27.0
50%,31.0
75%,36.0
max,100000000000.0


In [243]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  1259 non-null   object
 1   Age                        1259 non-null   int64 
 2   Gender                     1259 non-null   object
 3   Country                    1259 non-null   object
 4   state                      744 non-null    object
 5   self_employed              1241 non-null   object
 6   family_history             1259 non-null   object
 7   treatment                  1259 non-null   object
 8   work_interfere             995 non-null    object
 9   no_employees               1259 non-null   object
 10  remote_work                1259 non-null   object
 11  tech_company               1259 non-null   object
 12  benefits                   1259 non-null   object
 13  care_options               1259 non-null   object
 14  wellness

#### Step 1: Check for missing values

In [244]:
data.isnull().sum()

Timestamp                       0
Age                             0
Gender                          0
Country                         0
state                         515
self_employed                  18
family_history                  0
treatment                       0
work_interfere                264
no_employees                    0
remote_work                     0
tech_company                    0
benefits                        0
care_options                    0
wellness_program                0
seek_help                       0
anonymity                       0
leave                           0
mental_health_consequence       0
phys_health_consequence         0
coworkers                       0
supervisor                      0
mental_health_interview         0
phys_health_interview           0
mental_vs_physical              0
obs_consequence                 0
comments                     1095
dtype: int64

Removing all the columns with more than 50% missing values and the columns which are not relevant for the analysis.

In [245]:

# Columns carried forward into the working frame. Not selected: Timestamp,
# state, self_employed, remote_work, tech_company, benefits, wellness_program,
# obs_consequence and comments.
selected_columns = [
    'Age',
    'Gender',
    'Country',
    'family_history',
    'treatment',
    'work_interfere',
    'no_employees',
    'care_options',
    'seek_help',
    'anonymity',
    'leave',
    'mental_health_consequence',
    'phys_health_consequence',
    'coworkers',
    'supervisor',
    'mental_health_interview',
    'phys_health_interview',
    'mental_vs_physical',
]
# Copy so that later edits to `df` never write through to the raw `data` frame.
df = data[selected_columns].copy()
df

Unnamed: 0,Age,Gender,Country,family_history,treatment,work_interfere,no_employees,care_options,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical
0,37,Female,United States,No,Yes,Often,6-25,Not sure,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes
1,44,M,United States,No,No,Rarely,More than 1000,No,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know
2,32,Male,Canada,No,No,Rarely,6-25,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No
3,31,Male,United Kingdom,Yes,Yes,Often,26-100,Yes,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No
4,31,Male,United States,No,No,Never,100-500,No,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,26,male,United Kingdom,No,Yes,,26-100,No,No,Don't know,Somewhat easy,No,No,Some of them,Some of them,No,No,Don't know
1255,32,Male,United States,Yes,Yes,Often,26-100,Yes,No,Yes,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes
1256,34,male,United States,Yes,Yes,Sometimes,More than 1000,Yes,No,Don't know,Somewhat difficult,Yes,Yes,No,No,No,No,No
1257,46,f,United States,No,No,,100-500,Yes,No,Don't know,Don't know,Yes,No,No,No,No,No,No


Renaming columns for better understanding

In [246]:
# Presentation labels for the snake_case survey fields. Age, Gender and Country
# are not listed here, so they keep their existing names.
display_names = {
    'family_history': 'Family History',
    'treatment': 'Treatment',
    'work_interfere': 'Work Interfere',
    'no_employees': 'No. of Employees',
    'care_options': 'Care Options',
    'seek_help': 'Seek Help',
    'anonymity': 'Anonymity',
    'leave': 'Leave',
    'mental_health_consequence': 'Mental Health Consequence',
    'phys_health_consequence': 'Physical Health Consequence',
    'coworkers': 'Coworkers',
    'supervisor': 'Supervisor',
    'mental_health_interview': 'Mental Health Interview',
    'mental_vs_physical': 'Mental vs Physical',
    'phys_health_interview': 'Physical Health Interview',
}
df = df.rename(columns=display_names)



In [247]:
df.head()

Unnamed: 0,Age,Gender,Country,Family History,Treatment,Work Interfere,No. of Employees,Care Options,Seek Help,Anonymity,Leave,Mental Health Consequence,Physical Health Consequence,Coworkers,Supervisor,Mental Health Interview,Physical Health Interview,Mental vs Physical
0,37,Female,United States,No,Yes,Often,6-25,Not sure,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes
1,44,M,United States,No,No,Rarely,More than 1000,No,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know
2,32,Male,Canada,No,No,Rarely,6-25,No,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No
3,31,Male,United Kingdom,Yes,Yes,Often,26-100,Yes,No,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No
4,31,Male,United States,No,No,Never,100-500,No,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know


In [248]:
df.isna().sum()

Age                              0
Gender                           0
Country                          0
Family History                   0
Treatment                        0
Work Interfere                 264
No. of Employees                 0
Care Options                     0
Seek Help                        0
Anonymity                        0
Leave                            0
Mental Health Consequence        0
Physical Health Consequence      0
Coworkers                        0
Supervisor                       0
Mental Health Interview          0
Physical Health Interview        0
Mental vs Physical               0
dtype: int64

Resolving Missing Values

In [249]:
# Impute the missing 'Work Interfere' answers with the most frequent answer.
work_interfere_raw = df['Work Interfere']
work_interfere_mode = work_interfere_raw.mode()[0]
df['Work Interfere'] = work_interfere_raw.fillna(work_interfere_mode)
# Sanity check: number of missing values left in the column after imputation.
print(df['Work Interfere'].isna().sum())

0


Checking for duplicate values and removing them

In [250]:
# Show rows that exactly repeat an earlier row (every column compared, first
# occurrence kept). The method is called explicitly: handing the bound method
# itself to .loc makes pandas invoke it with the frame as its `subset`
# argument, which only happens to produce the same mask.
df.loc[df.duplicated()]

Unnamed: 0,Age,Gender,Country,Family History,Treatment,Work Interfere,No. of Employees,Care Options,Seek Help,Anonymity,Leave,Mental Health Consequence,Physical Health Consequence,Coworkers,Supervisor,Mental Health Interview,Physical Health Interview,Mental vs Physical
821,35,Male,Denmark,Yes,Yes,Often,1-5,Yes,No,Don't know,Somewhat easy,No,No,Some of them,Some of them,Maybe,Yes,Yes
860,32,male,United Kingdom,Yes,Yes,Rarely,6-25,No,No,No,Don't know,Yes,Maybe,Some of them,No,No,No,No
1134,27,M,New Zealand,No,No,Rarely,26-100,Yes,Yes,No,Somewhat difficult,Yes,No,No,No,No,Maybe,No
1171,30,Male,United States,No,No,Sometimes,More than 1000,No,Yes,Don't know,Don't know,Yes,Maybe,No,No,No,No,Don't know
1218,28,male,Netherlands,No,No,Sometimes,6-25,Not sure,No,Don't know,Somewhat easy,Maybe,Maybe,Some of them,Some of them,No,No,Don't know


In [251]:
# Keep the first occurrence of each fully identical row. Reassigning (rather
# than using inplace=True) keeps the cell free of hidden in-place mutation.
df = df.drop_duplicates()
# Verification: this selection should now be empty. duplicated() is called
# explicitly instead of passing the bound method to .loc as a callable.
df.loc[df.duplicated()]

Unnamed: 0,Age,Gender,Country,Family History,Treatment,Work Interfere,No. of Employees,Care Options,Seek Help,Anonymity,Leave,Mental Health Consequence,Physical Health Consequence,Coworkers,Supervisor,Mental Health Interview,Physical Health Interview,Mental vs Physical


Cleaning All the Values in Gender Column

In [252]:
# Lower-case the free-text gender answers before they are compared with the spelling lists.
df['Gender'] = df['Gender'].str.lower()
# Lower-case free-text spellings that are mapped to each reporting bucket.
male = ["male", "m", "male-ish", "maile", "mal", "male (cis)", "make", "male ", "man","msle", "mail", "malr","cis man", "cis male"]
female = ["cis female", "f", "female", "woman",  "femake", "female ","cis-female/femme", "female (cis)", "femail"]
# Kept for reference only: categorize_gender() sends every value that is not in
# `male` or `female` to 'Other', so this list is not consulted by the function.
other = ["trans-female", "something kinda male?", "queer/she/they", "non-binary","nah", "all", "enby", "fluid", 
         "genderqueer", "androgyne", "agender", "male leaning androgynous", "guy (-ish) ^_^", "trans woman", "neuter", 
         "female (trans)", "queer", "ostensibly male, unsure what that really means", "p", "a little about you"]
def categorize_gender(gender):
    """Map one free-text gender answer to 'Female', 'Male' or 'Other'.

    String inputs are stripped of surrounding whitespace and lower-cased
    before the lookup, so the result does not depend on the caller having
    normalised the value first (e.g. "Male " and " F" are recognised).
    Non-string inputs such as None or NaN are compared as-is and fall through
    to 'Other'.

    Parameters
    ----------
    gender : object
        A single survey answer.

    Returns
    -------
    str
        'Female' if the normalised value is in `female`, 'Male' if it is in
        `male`, otherwise 'Other'.
    """
    if isinstance(gender, str):
        # Normalise here as well; harmless when the value is already clean.
        gender = gender.strip().lower()
    if gender in female:
        return 'Female'
    elif gender in male:
        return 'Male'
    else:
        return 'Other'

# Collapse every raw answer into its reporting bucket.
df['Gender'] = df['Gender'].apply(categorize_gender)
# Per-bucket row counts, rendered as a shaded table.
gender_counts = df.groupby('Gender').size().reset_index(name='Count')
gender_counts.style.background_gradient(cmap='Blues')

Unnamed: 0,Gender,Count
0,Female,247
1,Male,986
2,Other,21


<h5>Handling Outliers in Age:</h5>
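Implausible ages are removed first: values outside an IQR-style fence, and respondents younger than 20. To facilitate analysis by age, we then create a new feature called 'Age Group' that categorizes individuals into 5-year intervals, with a final open-ended '70+' group.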

In [253]:
# Fence used to discard implausible ages.
# NOTE: despite its name, `Q3` holds the 99th percentile rather than the 75th,
# so `IQR` and the resulting fence are wider than a textbook 1.5 * IQR rule.
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.99)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep rows inside the fence and with Age of at least 20.
within_fence = (df['Age'] >= lower_bound) & (df['Age'] <= upper_bound)
at_least_20 = df['Age'] >= 20
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the previous frame (SettingWithCopyWarning).
df = df[within_fence & at_least_20].copy()

# Left-closed 5-year bins: [20, 25), [25, 30), ..., [65, 70), then [70, inf).
bins = list(range(20, 71, 5)) + [float('inf')]
labels = [f'{i}-{i+4}' for i in range(20, 70, 5)] + ['70+']
df['Age Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)


In [254]:
df.shape

(1230, 19)

In [255]:
# Relabel the "Don't know" answer as 'Not Sure', then show the answer distribution.
df['Mental vs Physical'] = df['Mental vs Physical'].replace("Don't know", 'Not Sure')
df['Mental vs Physical'].value_counts()

Mental vs Physical
Not Sure    562
Yes         336
No          332
Name: count, dtype: int64

In [256]:
print(df['Work Interfere'].unique())

['Often' 'Rarely' 'Never' 'Sometimes']


In [257]:
# Frequency table of the 'Work Interfere' answers, most frequent first.
interfere_frequencies = df['Work Interfere'].value_counts()
work_interfere_counts = pd.DataFrame({
    'Work Interfere': interfere_frequencies.index,
    'count': interfere_frequencies.values,
})


In [258]:
print(df['Work Interfere'].unique())

['Often' 'Rarely' 'Never' 'Sometimes']


Confirming no missing values.

In [259]:
df.isnull().sum()

Age                            0
Gender                         0
Country                        0
Family History                 0
Treatment                      0
Work Interfere                 0
No. of Employees               0
Care Options                   0
Seek Help                      0
Anonymity                      0
Leave                          0
Mental Health Consequence      0
Physical Health Consequence    0
Coworkers                      0
Supervisor                     0
Mental Health Interview        0
Physical Health Interview      0
Mental vs Physical             0
Age Group                      0
dtype: int64

In [260]:
df['Age Group'].value_counts()

Age Group
25-29    358
30-34    345
35-39    206
20-24    140
40-44    113
45-49     36
50-54     15
55-59     11
60-64      4
65-69      1
70+        1
Name: count, dtype: int64

#### Step 2: Preparing for Encoding the categorical variables

In [261]:
df.columns

Index(['Age', 'Gender', 'Country', 'Family History', 'Treatment',
       'Work Interfere', 'No. of Employees', 'Care Options', 'Seek Help',
       'Anonymity', 'Leave', 'Mental Health Consequence',
       'Physical Health Consequence', 'Coworkers', 'Supervisor',
       'Mental Health Interview', 'Physical Health Interview',
       'Mental vs Physical', 'Age Group'],
      dtype='object')

In [262]:
# The raw Age column is dropped; the 'Age Group' column remains in the frame.
df = df.drop(columns='Age')

In [263]:
# Columns encoded as ordered integer codes.
ordinal_columns = [
    'Work Interfere',
    'No. of Employees',
    'Leave',
    'Treatment',
]

# Columns one-hot encoded as unordered categories.
nominal_columns = [
    'Gender',
    'Country',
    'Family History',
    'Care Options',
    'Seek Help',
    'Anonymity',
    'Mental Health Consequence',
    'Physical Health Consequence',
    'Coworkers',
    'Supervisor',
    'Mental Health Interview',
    'Physical Health Interview',
    'Mental vs Physical',
    'Age Group',
]

In [264]:
print(df['Work Interfere'].unique())

['Often' 'Rarely' 'Never' 'Sometimes']


In [265]:
# List the distinct values of each ordinal column.
for column_name in ordinal_columns:
    print(df[column_name].unique())

['Often' 'Rarely' 'Never' 'Sometimes']
['6-25' 'More than 1000' '26-100' '100-500' '1-5' '500-1000']
['Somewhat easy' "Don't know" 'Somewhat difficult' 'Very difficult'
 'Very easy']
['Yes' 'No']


In [266]:
# Low-to-high ordering of the answers for each ordinal column. The keys are
# listed in the same order as `ordinal_columns`, because the encoder matches
# the category lists to the columns by position.
ordinal_orderings = {
    'Work Interfere': ["Never", "Rarely", "Sometimes", "Often"],
    'No. of Employees': ["1-5", "6-25", "26-100", "100-500", "500-1000", "More than 1000"],
    'Leave': ["Very easy", "Somewhat easy", "Don't know", "Somewhat difficult", "Very difficult"],
    'Treatment': ["No", "Yes"],
}
ordinal_categories = list(ordinal_orderings.values())

#### Step 3: Apply ordinal encoding to ordinal columns


In [267]:
# Replace each ordinal answer with its position in the matching category list.
encoder = OrdinalEncoder(categories=ordinal_categories)
ordinal_codes = encoder.fit_transform(df[ordinal_columns])
# The encoder returns floats; rebuild a frame aligned to df and store the codes as integers.
df[ordinal_columns] = pd.DataFrame(
    ordinal_codes, columns=ordinal_columns, index=df.index
).astype(int)

In [268]:
df.head()

Unnamed: 0,Gender,Country,Family History,Treatment,Work Interfere,No. of Employees,Care Options,Seek Help,Anonymity,Leave,Mental Health Consequence,Physical Health Consequence,Coworkers,Supervisor,Mental Health Interview,Physical Health Interview,Mental vs Physical,Age Group
0,Female,United States,No,1,3,1,Not sure,Yes,Yes,1,No,No,Some of them,Yes,No,Maybe,Yes,35-39
1,Male,United States,No,0,1,5,No,Don't know,Don't know,2,Maybe,No,No,No,No,No,Not Sure,40-44
2,Male,Canada,No,0,1,1,No,No,Don't know,3,No,No,Yes,Yes,Yes,Yes,No,30-34
3,Male,United Kingdom,Yes,1,3,2,Yes,No,No,3,Yes,Yes,Some of them,No,Maybe,Maybe,No,30-34
4,Male,United States,No,0,0,3,No,Don't know,Don't know,2,No,No,Some of them,Yes,Yes,Yes,Not Sure,30-34


#### Step 4: Apply one-hot encoding to nominal columns

In [269]:
# One-hot encode the nominal columns into a dense 0/1 indicator matrix.
onehot_step = ('onehot', OneHotEncoder(sparse_output=False), nominal_columns)
column_transformer = ColumnTransformer(transformers=[onehot_step], remainder='passthrough')
# Only the nominal columns are passed in, so there is nothing left over for the passthrough remainder.
df_transformed = column_transformer.fit_transform(df[nominal_columns])
encoded_col_names = column_transformer.get_feature_names_out()
# Reuse df's index so the indicator columns line up row-for-row with the remaining columns.
df_encoded = pd.DataFrame(data=df_transformed, index=df.index, columns=encoded_col_names)
non_nominal_part = df.drop(columns=nominal_columns)
df = pd.concat([df_encoded, non_nominal_part], axis=1)


In [270]:
df.head()

Unnamed: 0,onehot__Gender_Female,onehot__Gender_Male,onehot__Gender_Other,onehot__Country_Australia,onehot__Country_Austria,onehot__Country_Belgium,onehot__Country_Bosnia and Herzegovina,onehot__Country_Brazil,onehot__Country_Bulgaria,onehot__Country_Canada,...,onehot__Age Group_45-49,onehot__Age Group_50-54,onehot__Age Group_55-59,onehot__Age Group_60-64,onehot__Age Group_65-69,onehot__Age Group_70+,Treatment,Work Interfere,No. of Employees,Leave
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,3,1,1
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,5,2
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1,3
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,3,2,3
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,3,2



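No normalization is applied: the continuous `Age` column was dropped in favour of `Age Group`, so the dataset now contains only one-hot indicator columns and small integer codes for the ordinal columns.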

#### Step 5: Split the data into training and testing sets

The target variable is "Treatment" and the rest of the columns are the features.

In [271]:
# Separate the target column from the feature matrix.
target_column = 'Treatment'
y = df[target_column]
X = df.drop(columns=[target_column])
