# Integrated Mental Health & Cybersecurity Analysis

Datasets
- **Students' Social Media Addiction**
- **Cyber Security Indexes**


## 1. Imports and Data Loading

In [None]:
#Import each library that we use for the project
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression

from ipywidgets import interact, FloatSlider

import os

#Dataset paths in the following cells are built relative to the current working directory
project_dir = os.getcwd()

#Print the directory so you can confirm it is the folder that contains "Datasets"
print(project_dir)

In [None]:
#Load the students' social media dataset from the Datasets folder
social_csv_path = os.path.join(project_dir, 'Datasets/studentSocialMediaDataset.csv')
social_df = pd.read_csv(social_csv_path)

#Report the dimensions, then preview the first rows
print("Social media dataset shape:", social_df.shape)
social_df.head()

In [None]:
#Load the cybersecurity index dataset from the Datasets folder
cybersecurity_csv_path = os.path.join(project_dir, 'Datasets/cybersecurityDataset.csv')
cybersecurity_df = pd.read_csv(cybersecurity_csv_path)

#Report the dimensions, then preview the first rows
print("Cybersecurity dataset shape:", cybersecurity_df.shape)
cybersecurity_df.head()

In [None]:
#Keep only the columns used in the analysis; copy so later edits never touch the loaded frame
social_columns = [
    'Country',
    'Avg_Daily_Usage_Hours',
    'Sleep_Hours_Per_Night',
    'Mental_Health_Score',
    'Addicted_Score',
]
social_df = social_df[social_columns].copy()
social_df.head()

In [None]:
#Keep only the country, its region and the two index columns (CEI, GCI)
cybersecurity_columns = ['Country', 'Region', 'CEI', 'GCI']
cybersecurity_df = cybersecurity_df[cybersecurity_columns].copy()
cybersecurity_df.head()

## 2. Basic Cleaning & Type Handling

Here we:
- Handle simple missing values
- Find and fix outliers


#### Social Media Dataset

In [None]:
#Make a copy of the data so we don't mess up the original
social_df_cleaned = social_df.copy()

#Clean the dataframe: text columns are stripped and filled with the mode, other columns are filled with the median
for col in social_df_cleaned.columns:
    if social_df_cleaned[col].dtype == object:#If it's a text/string
        text_values = social_df_cleaned[col]
        #astype(str) turns a missing value into the literal text "nan", which fillna can no longer detect,
        #so the stripped text is kept only where the original value was present and NaN is restored elsewhere
        social_df_cleaned[col] = text_values.astype(str).str.strip().where(text_values.notna())
        mode_val = social_df_cleaned[col].mode(dropna=True)#Fill in missing data with the mode
        if len(mode_val) > 0:#mode() is empty when the whole column is missing
            social_df_cleaned[col] = social_df_cleaned[col].fillna(mode_val.iloc[0])

    elif social_df_cleaned[col].isna().any():
        median_val = social_df_cleaned[col].median()#Fill missing value with the median
        social_df_cleaned[col] = social_df_cleaned[col].fillna(median_val)

social_df_cleaned.info()

#### Cybersecurity Dataset

In [None]:
#Drop every row that has a missing value; dropna returns a new frame, so cybersecurity_df stays untouched
cybersecurity_df_cleaned = cybersecurity_df.dropna()
cybersecurity_df_cleaned.info()

In [None]:
#Function to find and remove outliers using the interquartile-range (IQR) fence rule
def find_and_remove_outliers(df, column_name, iqr_multiplier=1.5):
    """Return a copy of ``df`` without the rows whose ``column_name`` value is an outlier.

    A value is an outlier when it lies below ``Q1 - iqr_multiplier * IQR`` or
    above ``Q3 + iqr_multiplier * IQR``, where Q1 and Q3 are the 25th and 75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to filter. It is not modified.
        column_name: Name of the numeric column the fences are computed on.
        iqr_multiplier: Width of the fences in units of IQR. The default of 1.5
            reproduces the previous hard-coded behaviour.

    Returns:
        A tuple ``(df_clean, num_outliers, lower_bound, upper_bound)``:
        the filtered copy, the number of values outside the fences, and the
        two fence values.

    Note:
        Rows whose value is missing compare False against both fences, so they
        are dropped from ``df_clean`` without being counted in ``num_outliers``.
    """
    values = df[column_name]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - iqr_multiplier * IQR
    upper_bound = Q3 + iqr_multiplier * IQR

    #Build each boolean mask once rather than materialising a throwaway outlier frame
    is_outlier = (values < lower_bound) | (values > upper_bound)
    is_within_bounds = (values >= lower_bound) & (values <= upper_bound)

    num_outliers = int(is_outlier.sum())
    df_clean = df[is_within_bounds].copy()

    return df_clean, num_outliers, lower_bound, upper_bound

In [None]:
#Remove outliers from the numeric social media columns with find_and_remove_outliers
#Both names start on the same frame; the helper returns filtered copies, so social_df_cleaned is never modified
social_original = social_df_cleaned
social_clean = social_df_cleaned

print("Social Media Dataset:")
for col in ['Avg_Daily_Usage_Hours', 'Sleep_Hours_Per_Night','Mental_Health_Score', 'Addicted_Score']:
    #Each pass filters the rows that survived the previous column's filter
    social_clean_temp, num_outliers, lower, upper = find_and_remove_outliers(social_clean, col)
    #Report what was found per column instead of discarding the counts and bounds
    print(f"{col}: {num_outliers} outliers outside [{lower:.2f}, {upper:.2f}]")
    social_clean = social_clean_temp

rows_removed = len(social_original) - len(social_clean)
#Guard the percentage so an empty dataset reports 0.0% instead of raising ZeroDivisionError
pct_removed = rows_removed / len(social_original) * 100 if len(social_original) else 0.0

print(f"OG rows: {len(social_original)}")
print(f"After outlier removal: {len(social_clean)}")
print(f"Rows removed: {rows_removed} ({pct_removed:.1f}%)")

#NOTE(review): the later exploratory plots and the merge read social_df_cleaned, not social_clean,
#so the outlier-filtered rows are only reported here - confirm whether downstream cells should use social_clean

## 3. Dataset-Specific Exploratory Analysis

We explore each dataset separately before comparing patterns.


#### Social Media Dataset

In [None]:
#Summary statistics for the cleaned social media frame (social_df_cleaned, i.e. before the outlier filter)
social_df_cleaned.describe()

In [None]:
#Scatter plot: daily social media usage (x) against mental health score (y)
fig, ax = plt.subplots()
ax.scatter(
    social_df_cleaned["Avg_Daily_Usage_Hours"],
    social_df_cleaned["Mental_Health_Score"],
    color="lightpink",
)
ax.set_xlabel("Average Daily Social Media Usage (hours)")
ax.set_ylabel("Mental Health Score")
ax.set_title("Social Media Usage vs Mental Health Score")
plt.show()

In [None]:
#Scatter plot: hours of sleep per night (x) against mental health score (y)
fig, ax = plt.subplots()
ax.scatter(
    social_df_cleaned["Sleep_Hours_Per_Night"],
    social_df_cleaned["Mental_Health_Score"],
    color="lightpink",
)
ax.set_xlabel("Sleep Hours Per Night")
ax.set_ylabel("Mental Health Score")
ax.set_title("Sleep vs Mental Health Score")
plt.show()

In [None]:
#Scatter plot: social media addiction score (x) against mental health score (y)
fig, ax = plt.subplots()
ax.scatter(
    social_df_cleaned["Addicted_Score"],
    social_df_cleaned["Mental_Health_Score"],
    color="lightpink",
)
ax.set_xlabel("Addicted Score")
ax.set_ylabel("Mental Health Score")
ax.set_title("Addiction vs Mental Health Score")
plt.show()

#### Cybersecurity Dataset

In [None]:
#Summary statistics for the cybersecurity frame after rows with missing values were dropped
cybersecurity_df_cleaned.describe()

In [None]:
#List the distinct Region labels; these are the values a region-to-colour mapping has to cover
cybersecurity_df_cleaned["Region"].unique()

In [None]:
#Scatter plot to show CEI vs GCI, with each country coloured by its region
#The keys must match the Region labels exactly as they are spelled in the dataset
region_color_mapping = {
    "Asia-Pasific": "lightpink",
    "Europe": "lightblue",
    "Africa": "peachpuff",
    "South America": "lightgreen",
    "North America": "lavender"
}
#Colour for rows whose Region is missing or not listed in the mapping
fallback_color = "lightgray"

#Exactly one colour per row: skipping unmapped regions would leave the list shorter than the
#plotted data, and scatter() rejects a colour list whose length differs from x and y
plot_colors = [
    region_color_mapping.get(region, fallback_color)
    for region in cybersecurity_df["Region"]
]

plt.figure(figsize=(8, 5))
plt.scatter(cybersecurity_df["GCI"], cybersecurity_df["CEI"], c=plot_colors)
plt.title("Cybersecurity Exposure (CEI) vs Global Cybersecurity Index (GCI) by Region")
plt.xlabel("GCI")
plt.ylabel("CEI")
#Empty scatters exist only to create one legend entry per region colour
for region, color in region_color_mapping.items():
    plt.scatter([], [], color=color, label=region)
#Only advertise the fallback colour when it was actually used
if fallback_color in plot_colors:
    plt.scatter([], [], color=fallback_color, label="Other / unknown")
plt.legend(title="Region")
plt.tight_layout()
plt.show()


## 4. Predictive Modelling
We start by merging the two datasets on "Country" before fitting the regression models.

Model Testing:
- Linear Regression
- Single Variable Random Forest
- Multi-Variable Random Forest


In [None]:
#Merge the two cleaned datasets on the country name; the inner join keeps only countries present in both
merged_df = pd.merge(social_df_cleaned, cybersecurity_df_cleaned, on='Country', how='inner')

print("Merged dataset info:")
#info() prints its report itself and returns None, so wrapping it in print() adds a stray "None" line
merged_df.info()
print(f"Number of unique countries in merged dataset: {merged_df['Country'].nunique()}")
merged_df.head()

In [None]:
#Pairwise correlations between the mental health score, the social media predictors and CEI
correlation_columns = [
    'Mental_Health_Score',
    'Avg_Daily_Usage_Hours',
    'Sleep_Hours_Per_Night',
    'Addicted_Score',
    'CEI',
]
print("Correlation matrix:")
correlation_matrix = merged_df[correlation_columns].corr()
correlation_matrix

In [None]:
#Linear regression data: one predictor (mental health score), target CEI
linear_features = ['Mental_Health_Score']
X = merged_df[linear_features]
y = merged_df['CEI']

#Hold out 20% of the rows for testing; random_state=42 keeps the split reproducible
linear_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = linear_split

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

In [None]:
#Standardise the predictor; the scaler is fitted on the training rows only and then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
#Fit the linear regression on the scaled training data and predict the held-out rows
linear_model = LinearRegression().fit(X_train_scaled, y_train)
y_pred_linear = linear_model.predict(X_test_scaled)

#Evaluation metrics on the test split
r2_linear = r2_score(y_test, y_pred_linear)
mse_linear = mean_squared_error(y_test, y_pred_linear)
mae_linear = mean_absolute_error(y_test, y_pred_linear)

#Print out model evaluation metrics
print("Linear Regression Model")
linear_report = (
    ("R-squared Score", r2_linear),
    ("Mean Squared Error", mse_linear),
    ("Mean Absolute Error", mae_linear),
)
for metric_name, metric_value in linear_report:
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
#Analyze the correlation between CEI and Mental Health Score
print("Correlation between CEI and Mental Health Score:")

#Name the column explicitly: the leftover loop variable `col` from an earlier cell
#holds a different column name, which made this report the wrong correlation
corr = merged_df['CEI'].corr(merged_df['Mental_Health_Score'])

print(f"CEI vs Mental Health Score: {corr:.4f}")

In [None]:
#Single-variable random forest data: predictor is the mental health score, target is CEI
single_rf_features = ['Mental_Health_Score']
X = merged_df[single_rf_features]
y = merged_df['CEI']

#Same 80/20 split and seed as the linear regression
single_rf_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = single_rf_split

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

In [None]:
#Standardise the predictor (scaler fitted on the training rows only)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Forest settings: n_estimators was raised from 100 to 250; random_state=42 for reproducibility
single_rf_params = {"n_estimators": 250, "random_state": 42}
model = RandomForestRegressor(**single_rf_params)
model.fit(X_train_scaled, y_train)

In [None]:
#Predict the held-out rows and score the single-variable forest
y_pred = model.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

#Print evaluation metrics
print("Single Variable Random Forest Regression Model")
single_rf_report = (
    ("R-squared Score", r2),
    ("Mean Squared Error", mse),
    ("Mean Absolute Error", mae),
)
for metric_name, metric_value in single_rf_report:
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
#Scatter plot to visualize the difference between the predicted and the actual CEI
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.6, color="lightpink")
#Reference diagonal spanning the full CEI range: points on it are perfect predictions
cei_range = [y.min(), y.max()]
ax.plot(cei_range, cei_range, color="plum")
ax.set_xlabel('Actual CEI')
ax.set_ylabel('Predicted CEI')
ax.set_title('Actual vs Predicted CEI')
plt.show()

In [None]:
#Residual plot: prediction error (actual minus predicted) against the predicted CEI
residuals = y_test - y_pred
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals, alpha=0.6, color="lightpink")
ax.set_xlabel('Predicted CEI')
ax.set_ylabel('Actual CEI - Predicted CEI')
ax.set_title('Residual Plot for CEI Predictions')
plt.show()

In [None]:
#Prepare the data for the multi-variable random forest model
#The predictors now include all four social media columns; the target is still CEI
multi_features = [
    'Mental_Health_Score',
    'Avg_Daily_Usage_Hours',
    'Sleep_Hours_Per_Night',
    'Addicted_Score',
]
X_multi = merged_df[multi_features]
y_multi = merged_df['CEI']

multi_split = train_test_split(X_multi, y_multi, test_size=0.2, random_state=42)
X_train_multi, X_test_multi, y_train_multi, y_test_multi = multi_split

In [None]:
#Standardise the four predictors (scaler fitted on the training rows only)
scaler_multi = StandardScaler()
scaler_multi.fit(X_train_multi)
X_train_multi_scaled = scaler_multi.transform(X_train_multi)
X_test_multi_scaled = scaler_multi.transform(X_test_multi)

#Forest settings: n_estimators was raised from 100 to 300; random_state=42 for reproducibility
multi_rf_params = {"n_estimators": 300, "random_state": 42}
model_multi = RandomForestRegressor(**multi_rf_params)
model_multi.fit(X_train_multi_scaled, y_train_multi)

In [None]:
#Predict the held-out rows and score the multi-variable forest
y_pred_multi = model_multi.predict(X_test_multi_scaled)
r2_multi = r2_score(y_test_multi, y_pred_multi)
mse_multi = mean_squared_error(y_test_multi, y_pred_multi)
mae_multi = mean_absolute_error(y_test_multi, y_pred_multi)

#Print evaluation metrics
print("Multi-Variable Random Forest Regression Model")
multi_rf_report = (
    ("R-squared Score", r2_multi),
    ("Mean Squared Error", mse_multi),
    ("Mean Absolute Error", mae_multi),
)
for metric_name, metric_value in multi_rf_report:
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
#Correlation of CEI with each predictor used by the multi-variable model
print("Correlation between CEI and each predictor:")

cei_values = merged_df['CEI']
for col in X_multi.columns:
    predictor_values = merged_df[col]
    corr = cei_values.corr(predictor_values)
    print(f"CEI vs {col}: {corr:.4f}")

In [None]:
#Side-by-side table of the three models' test-set metrics
comparison_data = {
    'Model': [
        'Linear Regression',
        'Single Variable Random Forest',
        'Multi-Variable Random Forest',
    ],
    'R-squared': [r2_linear, r2, r2_multi],
    'MSE': [mse_linear, mse, mse_multi],
    'MAE': [mae_linear, mae, mae_multi],
}
comparison_df = pd.DataFrame(comparison_data)

print("Model Comparison")
comparison_df

In [None]:
#The observed minimum and maximum of each predictor become the slider limits
mental_health_scores = merged_df['Mental_Health_Score']
mental_health_min, mental_health_max = mental_health_scores.min(), mental_health_scores.max()

usage_hours = merged_df['Avg_Daily_Usage_Hours']
usage_min, usage_max = usage_hours.min(), usage_hours.max()

sleep_hours = merged_df['Sleep_Hours_Per_Night']
sleep_min, sleep_max = sleep_hours.min(), sleep_hours.max()

addiction_scores = merged_df['Addicted_Score']
addiction_min, addiction_max = addiction_scores.min(), addiction_scores.max()

In [None]:
#Slider callback: predicts CEI from the four user-selected values with the multi-variable random forest
def predict_cei(mental_health, usage, sleep, addiction):
    """Print the predicted CEI and its risk level for one set of inputs.

    The four values are placed in a one-row frame with the same column names
    the multi-variable model was trained on, scaled with ``scaler_multi`` and
    passed to ``model_multi``. A prediction above 0.70 is reported as high
    risk, above 0.40 as moderate risk, and anything else as low risk.
    Nothing is returned; the result is printed.
    """
    features = pd.DataFrame({
        'Mental_Health_Score': [mental_health],
        'Avg_Daily_Usage_Hours': [usage],
        'Sleep_Hours_Per_Night': [sleep],
        'Addicted_Score': [addiction],
    })
    prediction = model_multi.predict(scaler_multi.transform(features))[0]

    #Bands are checked from the highest threshold down; the first one exceeded wins
    risk_bands = ((0.70, "High CEI Risk"), (0.40, "Moderate CEI Risk"))
    level = next(
        (label for threshold, label in risk_bands if prediction > threshold),
        "Low CEI Risk",
    )

    print(f"Predicted CEI: {prediction:.3f}")
    print(f"Risk Level: {level}")

In [None]:
#Build one slider per predictor and wire them to predict_cei
def _cei_slider(column, lower, upper, label):
    #Slider over [lower, upper] in 0.1 steps, starting at the column's mean in the merged data
    return FloatSlider(
        value=merged_df[column].mean(),
        min=lower, max=upper, step=0.1,
        description=label
    )

interact(
    predict_cei,
    mental_health=_cei_slider("Mental_Health_Score", mental_health_min, mental_health_max, "Mental Health Score"),
    usage=_cei_slider("Avg_Daily_Usage_Hours", usage_min, usage_max, "Daily Usage Hours"),
    sleep=_cei_slider("Sleep_Hours_Per_Night", sleep_min, sleep_max, "Sleep Hours"),
    addiction=_cei_slider("Addicted_Score", addiction_min, addiction_max, "Addiction Score"),
)