### Importing Essential Libraries and Metrics

In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

plt.style.use('seaborn-v0_8-whitegrid')

warnings.filterwarnings('ignore')

In [None]:
import kagglehub

# Fetch the student mental-health survey from Kaggle. No version is pinned in
# the handle, so the latest published version is downloaded. `path` is the
# local directory that holds the dataset files.
path = kagglehub.dataset_download(
    "shariful07/student-mental-health"
)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/student-mental-health


### Loading the Data

In [None]:
# Read the survey CSV from the directory returned by kagglehub above rather
# than from a hard-coded "/content/..." location, so the notebook still runs
# after a kernel restart without a manual file upload.
csv_path = Path(path) / "Student Mental health.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows.
df.head()

Unnamed: 0,Timestamp,Choose your gender,Age,What is your course?,Your current year of Study,What is your CGPA?,Marital status,Do you have Depression?,Do you have Anxiety?,Do you have Panic attack?,Did you seek any specialist for a treatment?
0,8/7/2020 12:02,Female,18.0,Engineering,year 1,3.00 - 3.49,No,Yes,No,Yes,No
1,8/7/2020 12:04,Male,21.0,Islamic education,year 2,3.00 - 3.49,No,No,Yes,No,No
2,8/7/2020 12:05,Male,19.0,BIT,Year 1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,8/7/2020 12:06,Female,22.0,Laws,year 3,3.00 - 3.49,Yes,Yes,No,No,No
4,8/7/2020 12:13,Male,23.0,Mathemathics,year 4,3.00 - 3.49,No,No,No,No,No


#### Understanding the data set

                                         
1.  Timestamp	     :                      Date and time of the response
2.  Choose your gender	:                 Respondent’s gender
3.  Age	:                                 Respondent’s age
4.  What is your course?	 :        Respondent’s field of study
5.  Your current year of Study	:         Academic year
6.  What is your CGPA?	:                 Grade average range
7.  Marital status	:                     Single or married status
8.  Do you have Depression?	 :            Has depression (Yes/No)
9.  Do you have Anxiety?	:               Has anxiety (Yes/No)
10. Do you have Panic attack? :           Has panic attacks (Yes/No)
11. Did you seek any specialist for a treatment? :	Sought professional help (Yes/No)

#### Initial Inspection

In [None]:
# (rows, columns) of the loaded survey frame.
df.shape

(101, 11)

In [None]:
df.info()  # per-column non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 11 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Timestamp                                     101 non-null    object 
 1   Choose your gender                            101 non-null    object 
 2   Age                                           100 non-null    float64
 3   What is your course?                          101 non-null    object 
 4   Your current year of Study                    101 non-null    object 
 5   What is your CGPA?                            101 non-null    object 
 6   Marital status                                101 non-null    object 
 7   Do you have Depression?                       101 non-null    object 
 8   Do you have Anxiety?                          101 non-null    object 
 9   Do you have Panic attack?                     101 non-null    obj



```
The name of each column represents one question in this survey.
```



In [None]:
# dtype of every column, shown as a Series indexed by column name.
df.dtypes

Unnamed: 0,0
Timestamp,object
Choose your gender,object
Age,float64
What is your course?,object
Your current year of Study,object
What is your CGPA?,object
Marital status,object
Do you have Depression?,object
Do you have Anxiety?,object
Do you have Panic attack?,object



```
Only two data types are present: 'object' and 'float64'.
```





In [None]:
# Count the missing entries in each column.
df.isna().sum()

Unnamed: 0,0
Timestamp,0
Choose your gender,0
Age,1
What is your course?,0
Your current year of Study,0
What is your CGPA?,0
Marital status,0
Do you have Depression?,0
Do you have Anxiety?,0
Do you have Panic attack?,0




```
One value is missing in the 'Age' column.
```



In [None]:
# Summary statistics for the numeric columns (only 'Age' at this point).
df.describe()

Unnamed: 0,Age
count,100.0
mean,20.53
std,2.49628
min,18.0
25%,18.0
50%,19.0
75%,23.0
max,24.0


In [None]:
# 3. Drop Columns (if any irrelevant columns exist)
# Show the current column names before removing anything.
print(df.columns)

# Gather every column whose name contains 'Unnamed' and drop them in one call.
# With no such column the list is empty and the frame is left unchanged.
unnamed_cols = [name for name in df.columns if 'Unnamed' in name]
df = df.drop(columns=unnamed_cols, errors='ignore')


Index(['Timestamp', 'Choose your gender', 'Age', 'What is your course?',
       'Your current year of Study', 'What is your CGPA?', 'Marital status',
       'Do you have Depression?', 'Do you have Anxiety?',
       'Do you have Panic attack?',
       'Did you seek any specialist for a treatment?'],
      dtype='object')


In [None]:
# 4. Handle Missing Values
print("Missing values:\n", df.isnull().sum())

# Forward-fill gaps: each missing cell takes the value from the row above it.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated since pandas 2.1. The result is reassigned instead of using
# inplace=True so the cell reads as an explicit transformation of `df`.
# Note: a missing value in the very first row has nothing above it and stays NaN.
df = df.ffill()


Missing values:
 Timestamp                                       0
Choose your gender                              0
Age                                             1
What is your course?                            0
Your current year of Study                      0
What is your CGPA?                              0
Marital status                                  0
Do you have Depression?                         0
Do you have Anxiety?                            0
Do you have Panic attack?                       0
Did you seek any specialist for a treatment?    0
dtype: int64


In [None]:
# 5. Encode Categorical Features
#
# The free-text survey answers are typed inconsistently (e.g. "year 1" vs
# "Year 1"), so each value is trimmed and lower-cased before encoding.
# Without this, every spelling variant receives its own integer code.
le = LabelEncoder()  # kept defined even when there is nothing left to encode
encoders = {}        # column name -> fitted LabelEncoder, for inverse_transform

# Encode all object (categorical) columns, fitting a fresh encoder per column
# so that each column's code -> label mapping stays recoverable afterwards.
for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].str.strip().str.lower())
    encoders[col] = le

df.head()


Unnamed: 0,Timestamp,Choose your gender,Age,What is your course?,Your current year of Study,What is your CGPA?,Marital status,Do you have Depression?,Do you have Anxiety?,Do you have Panic attack?,Did you seek any specialist for a treatment?
0,23,0,18.0,17,3,3,0,1,0,1,0
1,24,1,21.0,25,4,3,0,0,1,0,0
2,25,1,19.0,4,0,3,0,1,1,1,0
3,26,0,22.0,33,5,3,1,1,0,0,0
4,27,1,23.0,37,6,3,0,0,0,0,0


In [None]:
# 6. Define Features and Target

# The depression answer is the label; every other column is used as a feature.
# NOTE(review): this keeps the label-encoded 'Timestamp' column as a feature -
# confirm that is intended.
target_col = 'Do you have Depression?'
y = df[target_col]
X = df.drop(columns=[target_col])


In [None]:
# 7. Train-Test Split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# 8. Feature Scaling
# Fit the scaler on the training features only, then apply the same fitted
# transform to both splits so no test-set statistics enter the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# 9. Pipeline Complete – Check Output
# Report the dimensions of the scaled train and test feature matrices.
train_shape, test_shape = X_train_scaled.shape, X_test_scaled.shape

print("ETL Pipeline completed ✅")
print("X_train shape:", train_shape)
print("X_test shape:", test_shape)


ETL Pipeline completed ✅
X_train shape: (80, 10)
X_test shape: (21, 10)
