In [25]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import numpy as np
import requests
from io import StringIO
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder

In [26]:
dataset1_url = "https://raw.githubusercontent.com/hbedros/data622-assignment4/main/data/dataset1.csv"
dataset2_url = "https://raw.githubusercontent.com/hbedros/data622-assignment4/main/data/dataset2.csv"

response1 = requests.get(dataset1_url, verify=False)
response2 = requests.get(dataset2_url, verify=False)

data1 = StringIO(response1.text)
data2 = StringIO(response2.text)
dataset1 = pd.read_csv(data1)
dataset2 = pd.read_csv(data2)

print(dataset1.head())
print(dataset2.head())


   Gender  Age  Academic Pressure  Study Satisfaction     Sleep Duration  \
0    Male   28                2.0                 4.0          7-8 hours   
1    Male   28                4.0                 5.0          5-6 hours   
2    Male   25                1.0                 3.0          5-6 hours   
3    Male   23                1.0                 4.0  More than 8 hours   
4  Female   31                1.0                 5.0  More than 8 hours   

  Dietary Habits Have you ever had suicidal thoughts ?  Study Hours  \
0       Moderate                                   Yes            9   
1        Healthy                                   Yes            7   
2      Unhealthy                                   Yes           10   
3      Unhealthy                                   Yes            7   
4        Healthy                                   Yes            4   

   Financial Stress Family History of Mental Illness Depression  
0                 2                              Y



## Cleaning Dataset 1 - The Depression Dataset

In [27]:
#check for missing observations
print(dataset1.isnull().sum())

#No missing data present

df1 = pd.DataFrame(dataset1)

df1.columns = df1.columns.str.strip()

print("Cleaned Columns", df1.columns)

df1 = df1.rename(columns = {'Have you ever had suicidal thoughts ?':'suicidal thoughts'})

df1['Sleep Duration'] = pd.Categorical(df1['Sleep Duration'], categories=['Less than 5 hours', '5-6 hours', '7-8 hours', 'More than 8 hours'])

Gender                                   0
Age                                      0
Academic Pressure                        0
Study Satisfaction                       0
Sleep Duration                           0
Dietary Habits                           0
Have you ever had suicidal thoughts ?    0
Study Hours                              0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64
Cleaned Columns Index(['Gender', 'Age', 'Academic Pressure', 'Study Satisfaction',
       'Sleep Duration', 'Dietary Habits',
       'Have you ever had suicidal thoughts ?', 'Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression'],
      dtype='object')


In [28]:
numeric_cols_df1 = ['Age', 'Academic Pressure', 'Study Satisfaction', 'Study Hours', 'Financial Stress']
categorical_cols_df1 = ['Gender', 'Sleep Duration', 'Dietary Habits', 'suicidal thoughts', 'Family History of Mental Illness', 'Depression']

scaler = MinMaxScaler()
df1[numeric_cols_df1] = scaler.fit_transform(df1[numeric_cols_df1])

print(df1.head())



   Gender     Age  Academic Pressure  Study Satisfaction     Sleep Duration  \
0    Male  0.6250               0.25                0.75          7-8 hours   
1    Male  0.6250               0.75                1.00          5-6 hours   
2    Male  0.4375               0.00                0.50          5-6 hours   
3    Male  0.3125               0.00                0.75  More than 8 hours   
4  Female  0.8125               0.00                1.00  More than 8 hours   

  Dietary Habits suicidal thoughts  Study Hours  Financial Stress  \
0       Moderate               Yes     0.750000              0.25   
1        Healthy               Yes     0.583333              0.00   
2      Unhealthy               Yes     0.833333              0.75   
3      Unhealthy               Yes     0.583333              0.25   
4        Healthy               Yes     0.333333              0.25   

  Family History of Mental Illness Depression  
0                              Yes         No  
1             

## Cleaning Dataset 2 - Student Mental Health Survey

In [29]:
#check for nulls
print(dataset2.isnull().sum())


df2 = pd.DataFrame(dataset2)
df2.columns = df2.columns.str.strip()

print("Cleaned Columns:", df2.columns)

gender                      0
age                         0
university                  0
degree_level                0
degree_major                0
academic_year               0
cgpa                        0
residential_status          0
campus_discrimination       0
sports_engagement           0
average_sleep               0
study_satisfaction          0
academic_workload           0
academic_pressure           0
financial_concerns          0
social_relationships        0
depression                  0
anxiety                     0
isolation                   0
future_insecurity           0
stress_relief_activities    0
dtype: int64
Cleaned Columns: Index(['gender', 'age', 'university', 'degree_level', 'degree_major',
       'academic_year', 'cgpa', 'residential_status', 'campus_discrimination',
       'sports_engagement', 'average_sleep', 'study_satisfaction',
       'academic_workload', 'academic_pressure', 'financial_concerns',
       'social_relationships', 'depression', 'anxiety

In [30]:
# Convert CGPA ranges to their midpoint
def parse_range(val):
    if isinstance(val, str) and "-" in val:
        lower, upper = map(float, val.split("-"))
        return (lower + upper) / 2
    else:
        return float(val)

df2["cgpa"] = df2["cgpa"].apply(parse_range)

# Function to parse sleep range into a midpoint
def parse_sleep(val):
    if isinstance(val, str) and "hrs" in val:
        val = val.replace(" hrs", "").strip()
    if "-" in val:
        lower, upper = map(float, val.split("-"))
        return (lower + upper) / 2
    else:
        return float(val)

if "average_sleep" in df2.columns:
    df2["average_sleep"] = df2["average_sleep"].apply(parse_sleep)

numerical_cols = [
    "cgpa", "study_satisfaction", "average_sleep"
]

missing_cols = [col for col in numerical_cols if col not in df2.columns]
if missing_cols:
    print("Missing columns after cleaning:", missing_cols)
else:
    scaler = MinMaxScaler()
    df2[numerical_cols] = scaler.fit_transform(df2[numerical_cols])


In [31]:
# stress relief acticivties column convert to floats with one-hot encoding
if "stress_relief_activities" in df2.columns:
    activities = df2["stress_relief_activities"].str.get_dummies(sep=",")
    
    # We create dummy columns and concatenate them with the original dataframe
    # dummy columns needed so it doesn't have statistcal impact after the one-hot encoding
    df2 = pd.concat([df2, activities], axis=1)
    
    print(activities.head())


    Creative Outlets   Online Entertainment   Outdoor Activities   Sleep  \
0                  0                      1                    0       0   
1                  0                      0                    0       0   
2                  0                      1                    0       0   
3                  0                      0                    0       0   
4                  0                      0                    0       0   

    Social Connections   Sports and Fitness  Creative Outlets  Nothing  \
0                    1                    0                 0        0   
1                    0                    0                 0        0   
2                    0                    1                 0        0   
3                    0                    0                 0        0   
4                    0                    0                 0        0   

   Online Entertainment  Outdoor Activities  Religious Activities  Sleep  \
0                     

In [None]:
# Convert sports engagement to float
def parse_sports_engagement(val):
    if isinstance(val, str):
        val = val.strip().lower()  # Normalize the input
        if "no sports" in val:
            return 0.0  # Assign 0 for 'No Sports'
        elif "-" in val:
            # Handle ranges like "1-3 times"
            try:
                lower, upper = map(float, val.replace(" times", "").split("-"))
                return (lower + upper) / 2
            except ValueError:
                return np.nan  # Handle malformed ranges
        elif "7+" in val or "7+s" in val:
            # Assign a numeric value for '7+' or '7+s'
            return 7.5
        elif "time" in val:
            # Handle single occurrences like "1 time"
            try:
                return float(val.replace(" time", "").strip())
            except ValueError:
                return np.nan
    try:
        return float(val)
    except ValueError:
        return np.nan

print(df2["sports_engagement"].head())

0    No Sports
1    1-3 times
2    1-3 times
3    No Sports
4    No Sports
Name: sports_engagement, dtype: object


In [33]:
# put the cleaned data in amodul to call from EDA file
def get_cleaned_data2():
    return df2

print(df2.head())

   gender  age university   degree_level      degree_major academic_year  \
0    Male   20         PU  Undergraduate      Data Science      2nd year   
1    Male   20        UET   Postgraduate  Computer Science      3rd year   
2    Male   20       FAST  Undergraduate  Computer Science      3rd year   
3    Male   20        UET  Undergraduate  Computer Science      3rd year   
4  Female   20        UET  Undergraduate  Computer Science      3rd year   

       cgpa residential_status campus_discrimination sports_engagement  ...  \
0  0.866667         Off-Campus                    No         No Sports  ...   
1  0.866667         Off-Campus                    No         1-3 times  ...   
2  0.733333         Off-Campus                    No         1-3 times  ...   
3  0.733333          On-Campus                    No         No Sports  ...   
4  0.866667         Off-Campus                   Yes         No Sports  ...   

    Social Connections   Sports and Fitness  Creative Outlets  Nothi