## Import Libraries

In [1]:
import pandas as pd

## Load the dataset

In [45]:
# Read the raw training CSV straight from the project's GitHub repository
data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/ktxdev/mind-matters/refs/heads/master/data/raw/train.csv'
)

## Initial Exploration
### Shape and Structure

In [46]:
# Print the (rows, columns) dimensions, then preview the first five records
print(data.shape)
data.head(n=5)

(140700, 20)


Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


### Dropping features not useful for analysis

In [47]:
# 'id' and 'Name' only identify a respondent, so they are removed before analysis.
# errors='ignore' keeps this cell re-runnable: without it, a second run raises
# KeyError because the columns are already gone.
data.drop(columns=['id', 'Name'], inplace=True, errors='ignore')

### Data Types

In [48]:
# Show the dtype pandas assigned to each column of the frame
data.dtypes

Gender                                    object
Age                                      float64
City                                      object
Working Professional or Student           object
Profession                                object
Academic Pressure                        float64
Work Pressure                            float64
CGPA                                     float64
Study Satisfaction                       float64
Job Satisfaction                         float64
Sleep Duration                            object
Dietary Habits                            object
Degree                                    object
Have you ever had suicidal thoughts ?     object
Work/Study Hours                         float64
Financial Stress                         float64
Family History of Mental Illness          object
Depression                                 int64
dtype: object

### Data Types Conversion

In [49]:
# Numeric-coded rating columns and the target label are cast to the
# pandas 'category' dtype, one column at a time.
columns_to_categorize = [
    'Study Satisfaction',
    'Job Satisfaction',
    'Academic Pressure',
    'Work Pressure',
    'Financial Stress',
    'Depression',
]
for column_name in columns_to_categorize:
    data[column_name] = data[column_name].astype('category')

### Checking for Missing Values

In [53]:
# Number of missing entries in each column
data.isna().sum()

Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Depression                                    0
dtype: int64

### Basic Statistics
#### Continuous Variables

In [50]:
# Summary statistics (count, mean, std, min, quartiles, max); in the recorded
# output this covers the numeric columns Age, CGPA and Work/Study Hours.
data.describe()

Unnamed: 0,Age,CGPA,Work/Study Hours
count,140700.0,27898.0,140700.0
mean,40.388621,7.658636,6.252679
std,12.384099,1.464466,3.853615
min,18.0,5.03,0.0
25%,29.0,6.29,3.0
50%,42.0,7.77,6.0
75%,51.0,8.92,10.0
max,60.0,10.0,12.0


#### Categorical Variables

In [52]:
# Columns summarised as categorical variables (order fixes the column order of the table)
categorical_variables = [
    'Gender',
    'City',
    'Working Professional or Student',
    'Study Satisfaction',
    'Job Satisfaction',
    'Academic Pressure',
    'Work Pressure',
    'Depression',
    'Profession',
    'Sleep Duration',
    'Dietary Habits',
    'Degree',
    'Financial Stress',
    'Have you ever had suicidal thoughts ?',
    'Family History of Mental Illness',
]
# count / unique / top / freq for each of those columns
data.loc[:, categorical_variables].describe()

Unnamed: 0,Gender,City,Working Professional or Student,Study Satisfaction,Job Satisfaction,Academic Pressure,Work Pressure,Depression,Profession,Sleep Duration,Dietary Habits,Degree,Financial Stress,Have you ever had suicidal thoughts ?,Family History of Mental Illness
count,140700,140700,140700,27897.0,112790.0,27897.0,112782.0,140700,104070,140700,140696,140698,140696.0,140700,140700
unique,2,98,2,5.0,5.0,5.0,5.0,2,64,36,23,115,5.0,2,2
top,Male,Kalyan,Working Professional,4.0,2.0,3.0,2.0,0,Teacher,Less than 5 hours,Moderate,Class 12,2.0,No,No
freq,77464,6591,112799,6360.0,24783.0,7463.0,24373.0,115133,24906,38784,49705,14729,31451.0,71138,70758


## Univariate Analysis
### Numerical Columns

## Handle Missing Values

In [16]:
# Missing entries per column, as the starting point for the clean-up below
data.isna().sum()

Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Depression                                    0
dtype: int64

To handle the missing values in `Job Satisfaction` and `Study Satisfaction`, we'll merge the two columns into a single `Job/Study Satisfaction` column. Since each individual is either a student or a working professional, only one of these columns will have a value for each person, while the other will be empty. Consolidating them into a single column simplifies the dataset and eliminates almost all of these missing values, as `Job/Study Satisfaction` will contain the relevant rating for each person on a consistent 1 to 5 scale. The same will be done for `Academic Pressure` and `Work Pressure`, which are combined into an `Academic/Work Pressure` column. For `Profession`, the column already contains a `Student` value, so every student with a missing profession is assigned `Student`, and every working professional with a missing profession is assigned the `Working Professional` category. Finally, `CGPA` is missing for around 80% of records, and imputing it would be unreliable because the values are missing mostly for working professionals rather than students, so we will drop the feature.

In [30]:
# Per the notebook's narrative, each person is either a student or a working
# professional, so in each pair below normally only one column is populated.
# Every pair is merged into one column: the student value is taken first and
# the professional value fills the gaps.
merged_columns = {
    'Job/Study Satisfaction': ('Study Satisfaction', 'Job Satisfaction'),
    'Academic/Work Pressure': ('Academic Pressure', 'Work Pressure'),
}
for merged_name, (student_col, professional_col) in merged_columns.items():
    # Skip pairs merged by an earlier run of this cell: their source columns
    # were dropped then, and indexing them again would raise a KeyError.
    if merged_name in data.columns:
        continue
    data[merged_name] = data[student_col].fillna(data[professional_col])
    # Dropping the original columns now that the merged column holds their values
    data.drop([student_col, professional_col], axis=1, inplace=True)

# Fill missing values for profession with the person's role
missing_profession = data['Profession'].isnull()
for role in ('Student', 'Working Professional'):
    has_role = data['Working Professional or Student'] == role
    data.loc[has_role & missing_profession, 'Profession'] = role

# Dropping CGPA feature; errors='ignore' lets the cell be re-run after CGPA is gone
data.drop(columns=['CGPA'], inplace=True, errors='ignore')

#### Re-checking missing values

In [41]:
# Missing entries per column after the merge / fill / drop steps
data.isna().sum()

Gender                                    0
Age                                       0
City                                      0
Working Professional or Student           0
Profession                                0
Sleep Duration                            0
Dietary Habits                            4
Degree                                    2
Have you ever had suicidal thoughts ?     0
Work/Study Hours                          0
Financial Stress                          4
Family History of Mental Illness          0
Depression                                0
Job/Study Satisfaction                   15
Academic/Work Pressure                   21
dtype: int64

Since the number of records with missing values is no longer significant, I will drop those records.

In [43]:
# Remove, in place, every row that still has at least one missing value
data.dropna(axis=0, how='any', inplace=True)
# Printing the shape of the data after dropping records
print(data.shape)

(140661, 15)
