# Preprocessing
Link to data set: https://www.kaggle.com/datasets/osmi/mental-health-in-tech-survey


In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [41]:
# Relative path to the OSMI "Mental Health in Tech" survey CSV linked above.
file_path = "mental_health.csv"
# Raw survey responses; every later cell derives its data from this frame.
data = pd.read_csv(filepath_or_buffer=file_path)

In [42]:
# Dimensions of the raw frame as (rows, columns).
data.shape

In [43]:
# Preview the first rows to sanity-check how the CSV was parsed.
data.head()

In [44]:
# dtype pandas inferred for each raw column.
data.dtypes

In [45]:
# Summary statistics of the raw frame (default describe() settings).
data.describe()

In [46]:
# Per-column non-null counts and dtypes for the raw frame.
data.info()

#### Step 1: Check for missing values

In [47]:
# Number of missing values in each raw column.
data.isnull().sum()

Remove the columns with more than 50% missing values, along with the columns that are not relevant to the analysis.

In [48]:

# Columns carried forward into preprocessing.
# Deliberately excluded survey fields: Timestamp, state, self_employed, remote_work,
# tech_company, benefits, wellness_program, obs_consequence, comments.
selected_columns = [
    'Age', 'Gender', 'Country',
    'family_history', 'treatment',
    'work_interfere', 'no_employees',
    'care_options',
    'seek_help', 'anonymity', 'leave',
    'mental_health_consequence', 'phys_health_consequence', 'coworkers',
    'supervisor', 'mental_health_interview', 'phys_health_interview',
    'mental_vs_physical',
]
# .copy() gives df its own data, independent of the raw `data` frame.
df = data[selected_columns].copy()
df

Rename the columns to readable, title-cased labels.

In [49]:
# Raw survey column name -> display title used by the rest of the notebook.
# Age, Gender and Country are not in the mapping and keep their names.
column_titles = {
    'family_history': 'Family History',
    'treatment': 'Treatment',
    'work_interfere': 'Work Interfere',
    'no_employees': 'No. of Employees',
    'care_options': 'Care Options',
    'seek_help': 'Seek Help',
    'anonymity': 'Anonymity',
    'leave': 'Leave',
    'mental_health_consequence': 'Mental Health Consequence',
    'phys_health_consequence': 'Physical Health Consequence',
    'coworkers': 'Coworkers',
    'supervisor': 'Supervisor',
    'mental_health_interview': 'Mental Health Interview',
    'phys_health_interview': 'Physical Health Interview',
    'mental_vs_physical': 'Mental vs Physical',
}
df.rename(columns=column_titles, inplace=True)



In [50]:
# Confirm the new column titles on the first rows.
df.head()

In [51]:
# Missing values per column in the reduced frame.
df.isna().sum()

Resolving missing values: fill the gaps in `Work Interfere` with the column's most frequent value (the mode).

In [52]:
# mode() returns a Series of the most frequent value(s); take the first one.
work_interfere_mode = df['Work Interfere'].mode()[0]
# Keep existing answers and substitute the mode wherever the answer is missing.
df['Work Interfere'] = df['Work Interfere'].where(
    df['Work Interfere'].notna(), work_interfere_mode
)
# Remaining missing values in the column (expected to be 0 after the fill).
print(df['Work Interfere'].isnull().sum())

Check for duplicate rows and remove them.

In [53]:
# Rows that exactly repeat an earlier row.
# duplicated() is called explicitly to obtain a boolean mask; passing the bound method
# itself only worked because .loc invokes callables with the frame as their argument.
df.loc[df.duplicated()]

In [54]:
# Remove repeated rows, keeping the first occurrence of each.
df.drop_duplicates(inplace=True)
# Should now display an empty frame. duplicated() is called explicitly so .loc
# receives a boolean mask rather than the bound method.
df.loc[df.duplicated()]

Clean the free-text values in the `Gender` column by mapping each answer to `Male`, `Female`, or `Other`.

In [55]:
# Lower-case the free-text answers so they can be matched against the spelling lists.
df['Gender'] = df['Gender'].str.lower()
# Lower-case spellings grouped by the category they map to.
male = ["male", "m", "male-ish", "maile", "mal", "male (cis)", "make", "male ", "man","msle", "mail", "malr","cis man", "cis male"]
female = ["cis female", "f", "female", "woman",  "femake", "female ","cis-female/femme", "female (cis)", "femail"]
# `other` is kept for reference only: categorize_gender never consults it, because
# every answer not found in `male` or `female` is classified as 'Other'.
other = ["trans-female", "something kinda male?", "queer/she/they", "non-binary","nah", "all", "enby", "fluid", 
         "genderqueer", "androgyne", "agender", "male leaning androgynous", "guy (-ish) ^_^", "trans woman", "neuter", 
         "female (trans)", "queer", "ostensibly male, unsure what that really means", "p", "a little about you"]
def categorize_gender(gender):
    """Map a free-text gender answer to 'Male', 'Female', or 'Other'.

    The answer is stripped of surrounding whitespace and lower-cased before
    the lookup, so the result does not depend on the caller normalising the
    value first.

    Parameters
    ----------
    gender : object
        A single survey answer. Non-string values (e.g. NaN or None) are
        accepted.

    Returns
    -------
    str
        'Female' if the answer is listed in `female`, 'Male' if it is listed
        in `male`, otherwise 'Other' (including for non-string input).
    """
    # Missing or non-text answers cannot match any spelling.
    if not isinstance(gender, str):
        return 'Other'
    normalized = gender.strip().lower()
    # The untouched value is checked too, so list entries that themselves carry
    # whitespace (e.g. "male ") keep matching exactly as before.
    if normalized in female or gender in female:
        return 'Female'
    elif normalized in male or gender in male:
        return 'Male'
    else:
        return 'Other'

# Collapse every answer in the Gender column into one of the three categories.
df['Gender'] = df['Gender'].map(categorize_gender)
# Respondents per category, rendered with a blue gradient keyed to the count.
gender_counts = df.groupby('Gender').size().reset_index(name='Count')
gender_counts.style.background_gradient(cmap='Blues')

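<h5>Handling Outliers in Age:</h5>
We first drop implausible ages: values that fall outside a 1.5 × spread fence built from the 25th and 99th percentiles, and respondents younger than 20. To facilitate analysis by age, we then create a new feature called 'Age Group' that categorizes individuals into 5-year intervals (20-24, 25-29, ..., 65-69, and 70+).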

In [56]:
# Fence for implausible ages.
# NOTE(review): `Q3` is the 0.99 quantile, not the 0.75 quartile its name suggests, so
# `IQR` is really the 25th-99th percentile spread -- presumably chosen to keep the
# fence wide; confirm before changing. Names and values are kept as they were.
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.99)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep ages inside the fence (inclusive) and at least 20, where the first bin starts.
# .copy() makes the filtered result an independent frame, so the 'Age Group'
# assignment below (and later edits to df) cannot raise SettingWithCopyWarning.
age_in_range = df['Age'].between(lower_bound, upper_bound) & (df['Age'] >= 20)
df = df[age_in_range].copy()

# Left-closed 5-year bins: [20, 25), [25, 30), ..., [65, 70), then [70, inf).
bins = list(range(20, 71, 5)) + [float('inf')]
labels = [f'{i}-{i+4}' for i in range(20, 70, 5)] + ['70+']
df['Age Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)


In [57]:
# Rows and columns remaining after the age filter and the added 'Age Group' column.
df.shape

In [58]:
# Relabel the "Don't know" answer as 'Not Sure', then tally each answer.
df['Mental vs Physical'] = df['Mental vs Physical'].replace("Don't know", 'Not Sure')
df['Mental vs Physical'].value_counts()

In [59]:
# Distinct answers present in 'Work Interfere' after the missing values were filled.
print(df['Work Interfere'].unique())

In [60]:
# Two-column table: each 'Work Interfere' answer and how many rows gave it.
work_interfere_counts = (
    df['Work Interfere']
    .value_counts()
    .rename_axis('Work Interfere')
    .reset_index(name='count')
)


In [61]:
# NOTE(review): repeats the unique-values check printed two cells earlier; df is not modified in between.
print(df['Work Interfere'].unique())

Confirming no missing values.

In [62]:
# Missing values per column after cleaning.
df.isnull().sum()

In [63]:
# Number of respondents in each age group.
df['Age Group'].value_counts()

#### Step 2: Preparing for Encoding the categorical variables

In [64]:
# Column names available before encoding.
df.columns

In [65]:
# 'Age Group' now carries the age information, so the raw 'Age' column is dropped.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell re-runnable:
# a second run finds no 'Age' column and is a no-op rather than a KeyError.
df = df.drop(columns=['Age'], errors='ignore')

In [66]:
# Columns whose answers have a natural order. The order of this list matters: it is
# paired position by position with the category lists passed to the OrdinalEncoder.
ordinal_columns = [
    'Work Interfere',
    'No. of Employees',
    'Leave',
    'Treatment',
]

# Unordered categorical columns, to be one-hot encoded.
nominal_columns = [
    'Gender',
    'Country',
    'Family History',
    'Care Options',
    'Seek Help',
    'Anonymity',
    'Mental Health Consequence',
    'Physical Health Consequence',
    'Coworkers',
    'Supervisor',
    'Mental Health Interview',
    'Physical Health Interview',
    'Mental vs Physical',
    'Age Group',
]

In [67]:
# NOTE(review): same unique-values check as printed earlier; the loop in the next cell covers this column too.
print(df['Work Interfere'].unique())

In [68]:
# Print the distinct answers of each ordinal column so the category orderings
# can be written out against the actual values.
for column_name in ordinal_columns:
    print(df[column_name].unique())

In [69]:
# Low-to-high ordering of the answers for each ordinal column.
work_interfere_levels = ["Never", "Rarely", "Sometimes", "Often"]
company_size_levels = ["1-5", "6-25", "26-100", "100-500", "500-1000", "More than 1000"]
# "Don't know" is placed in the middle of the easy-to-difficult scale.
leave_levels = ["Very easy", "Somewhat easy", "Don't know", "Somewhat difficult", "Very difficult"]
treatment_levels = ["No", "Yes"]

# Positional: entry k lists the categories for ordinal_columns[k].
ordinal_categories = [
    work_interfere_levels,
    company_size_levels,
    leave_levels,
    treatment_levels,
]

#### Step 3: Apply ordinal encoding to ordinal columns


In [70]:
# Each ordinal column is mapped to the position of its answer in the matching
# ordinal_categories list (0 = first listed category).
encoder = OrdinalEncoder(categories=ordinal_categories)
encoded_ordinals = encoder.fit_transform(df[ordinal_columns])
# Store the encoded values as integers.
df[ordinal_columns] = encoded_ordinals.astype(int)

In [71]:
# Check the ordinal columns now hold integer codes.
df.head()

#### Step 4: Apply one-hot encoding to nominal columns

In [72]:
# One indicator column per category of each nominal column, returned as a dense array.
onehot_encoder = OneHotEncoder(sparse_output=False)
column_transformer = ColumnTransformer(
    transformers=[('onehot', onehot_encoder, nominal_columns)],
    remainder='passthrough',
)
# Only the nominal columns are passed in, so there are no remainder columns to pass through.
df_transformed = column_transformer.fit_transform(df[nominal_columns])
encoded_col_names = column_transformer.get_feature_names_out()
# Re-attach the original row index so the indicator columns line up with df.
df_encoded = pd.DataFrame(df_transformed, index=df.index, columns=encoded_col_names)
# Indicator columns first, followed by the columns that were not one-hot encoded.
non_nominal = df.drop(columns=nominal_columns)
df = pd.concat([df_encoded, non_nominal], axis=1)


In [73]:
# Inspect the fully encoded frame.
df.head()


No normalization is required because no continuous numerical columns remain in the dataset (`Age` was replaced by the categorical `Age Group`).

#### Step 5: Split the data into training and testing sets

The target variable is "Treatment" and the rest of the columns are the features.

In [74]:
# Target: the encoded 'Treatment' column. Features: every other column.
y = df['Treatment']
X = df.drop(columns='Treatment')


In [75]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [76]:
# Report the feature-matrix and target shapes of each split.
for split_label, features, target in (
    ("Training Set:", X_train, y_train),
    ("Testing Set:", X_test, y_test),
):
    print(split_label, features.shape, target.shape)