## Preparing data for modelling

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('../data/survey.csv')
df.head()

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [5]:
def clean_gender(gender):
  gender = str(gender).strip().lower()
  male_terms = ['male', 'm', 'man', 'msle', 'mail', 'malr', 'mal', 'cis male', 'cis man', 'guy', 'make']
  female_terms = ['female', 'f', 'woman', 'femake', 'cis female', 'cis-female/femme', 'female (cis)', 'female (trans)', 'trans-female']

  if gender in male_terms:
    return "Male"
  
  if gender in female_terms:
    return "Female"
  
  else:
    return "Other"

df['Gender'] = df['Gender'].apply(clean_gender)

In [6]:
df.head()

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,Male,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


### Select relevant features

In [8]:
selected_cols = [
    'Age', 'Gender', 'family_history', 'work_interfere', 'no_employees',
    'remote_work', 'benefits', 'care_options', 'wellness_program',
    'seek_help', 'anonymity', 'leave', 'mental_health_consequence',
    'phys_health_consequence', 'coworkers', 'supervisor',
    'mental_health_interview', 'phys_health_interview',
    'mental_vs_physical', 'obs_consequence', 'treatment'  # target
]

df = df[selected_cols]

### Handle categorical variables

In [11]:
#encode target
df['treatment'] = df['treatment'].map({'Yes': 1, 'No': 0})

# fill missing values with mode for categorical, median for age
for col in df.columns:
  if df[col].dtype == 'object':
    df[col] = df[col].fillna(df[col].mode()[0])
  else:
    df[col] = df[col].fillna(df[col].median())

#encode categorical features using LabelEncoder
cat_cols = df.select_dtypes(include='object').columns

le = LabelEncoder()
for col in cat_cols:
  df[col] = le.fit_transform(df[col])

In [12]:
df.head()

Unnamed: 0,Age,Gender,family_history,work_interfere,no_employees,remote_work,benefits,care_options,wellness_program,seek_help,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,treatment
0,37,0,0,1,4,0,2,1,1,2,...,2,1,1,1,2,1,0,2,0,
1,44,1,0,2,5,0,0,0,0,0,...,0,0,1,0,0,1,1,0,0,
2,32,1,0,2,4,0,1,0,1,1,...,1,1,1,2,2,2,2,1,0,
3,31,1,1,1,2,0,1,2,1,1,...,1,2,2,1,0,0,0,1,1,
4,31,1,0,0,1,1,2,0,0,0,...,0,1,1,1,2,2,2,0,0,


In [14]:
# scale age i.e numerical feature for better fitting and prevent outliers or extremes
scaler = StandardScaler()
df['Age'] = scaler.fit_transform(df[['Age']])

### Train/Test split

In [16]:
X = df.drop('treatment', axis=1)
y = df['treatment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (1007, 20)
Test set shape: (252, 20)
