### HAMZA YOUSAF
### C964 - CAPSTONE PROJECT

#                                                   STUDENT MENTAL HEALTH PROJECT

#### Importing Libraries and Configuring Logging

In [10]:
# Import necessary libraries for data processing, machine learning, and visualization
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import plotly.express as px
from dash import Dash, html, dcc, Input, Output
import logging

# Configure the root logger so each stage of the notebook reports a
# timestamped INFO-level message
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s: %(message)s',
    level=logging.INFO,
)

#### Defining Preprocessing Functions

In [11]:
# Silence pandas' chained-assignment warning for cleaner output.
# Caution: this is a global pandas setting, so it can also hide genuine warnings.
pd.set_option('mode.chained_assignment', None)

def preprocess_data(file_path):
    """Load the survey CSV and encode its text columns as numbers.

    Steps:
        1. Read the CSV and drop every row that has a missing value.
        2. Map the four Yes/No survey columns to 1/0. Answers are matched
           ignoring case and surrounding whitespace, so 'yes ' and ' NO'
           are accepted.
        3. Label-encode every remaining object-dtype column.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The cleaned frame with the Yes/No columns stored
        as int64 0/1 and the other object columns replaced by integer codes.

    Raises:
        KeyError: If one of the expected Yes/No columns is absent.
        ValueError: If a Yes/No column holds anything other than Yes/No.
            Failing here avoids silently turning such answers into NaN
            after the missing-value rows have already been dropped.
    """
    # Log the preprocessing step
    logging.info("Preprocessing data from: {}".format(file_path))
    # Load dataset and drop rows with missing values; the explicit copy makes
    # the column assignments below operate on an independent frame
    data_frame = pd.read_csv(file_path).dropna().copy()
    # Columns whose answers are 'Yes'/'No' and get mapped to 1/0
    binary_cols = ['Do you have Depression?', 'Do you have Anxiety?', 'Do you have Panic attack?', 'Did you seek any specialist for a treatment?']
    # Remaining text columns get label encoded (determined before the binary
    # columns are converted to integers)
    cat_cols = data_frame.select_dtypes(['object']).columns.difference(binary_cols)

    yes_no_codes = {'yes': 1, 'no': 0}
    for col in binary_cols:
        # Normalise case/whitespace so minor formatting differences still match
        normalized = data_frame[col].astype(str).str.strip().str.casefold()
        encoded = normalized.map(yes_no_codes)
        unmatched = encoded.isna()
        if unmatched.any():
            unexpected = sorted(data_frame.loc[unmatched, col].astype(str).unique())
            raise ValueError(
                "Column {!r} contains values other than Yes/No: {}".format(col, unexpected)
            )
        data_frame[col] = encoded.astype('int64')

    # Apply label encoding for other categorical columns
    for col in cat_cols:
        data_frame[col] = LabelEncoder().fit_transform(data_frame[col])
    return data_frame

#### Loading and Preprocessing the Data

In [12]:
# Survey export to analyse (relative path)
csv_file_path = 'SMHealth.csv'

# Read the CSV and encode its text columns so the frame is ready for modelling
data = preprocess_data(file_path=csv_file_path)

[2024-02-21 18:54:12,462] INFO: Preprocessing data from: SMHealth.csv


#### Splitting the Dataset and Scaling Features

In [13]:
# Split dataset into features and target variable
features = data.drop(columns='Did you seek any specialist for a treatment?')
target = data['Did you seek any specialist for a treatment?']

# Split data into training and test sets.
# Stratify on the target so both sets keep the same class proportions;
# without it a rare class can be missing from the 20% test set entirely.
# Stratification needs at least two rows per class, so fall back to a plain
# random split when that does not hold.
stratify_labels = target if target.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

# Feature scaling: fit on the training split only, then apply the same
# transformation to the test split so no test statistics leak into training
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### Training the Model

In [14]:
# Log the model training step
logging.info("Training the model...")
# Initialize and train the XGBoost classifier.
# `use_label_encoder` is omitted: it is deprecated in current XGBoost
# releases, which ignore it and emit a warning. The seed is pinned so
# repeated runs of the notebook train the same model.
classifier = XGBClassifier(eval_metric='logloss', random_state=42)
classifier.fit(X_train, y_train)

[2024-02-21 18:54:13,342] INFO: Training the model...


#### Evaluating the Model

In [15]:
# Score the held-out test split: overall accuracy plus a per-class text report
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# zero_division=0 makes undefined precision/recall values report as 0 without a warning
evaluation_report = classification_report(y_true=y_test, y_pred=y_pred, zero_division=0)

#### PCA for Visualization

In [16]:
# Apply PCA for dimensionality reduction for visualization purposes.
# PCA picks directions of largest variance, so the features are standardised
# first; otherwise the columns with the widest numeric range dominate the
# two components regardless of how informative they are.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(StandardScaler().fit_transform(features))

#### Initializing Dash Application for Interactive Visualization

In [17]:
# Create the Dash application that hosts the interactive dashboard
app = Dash(__name__)

# Page layout, top to bottom: title, chart, model accuracy, chart selector,
# and the text classification report
app.layout = html.Div(children=[
    html.H1(children="Analysis of Student Mental Health"),
    dcc.Graph(id='pca-visualization'),
    html.Div(children=f"Model Accuracy: {accuracy*100:.2f}%"),
    dcc.Dropdown(
        id='visualization-dropdown',
        options=[
            dict(label='PCA Components', value='PCA'),
            dict(label='Feature Importance', value='FI'),
            dict(label='Condition Analysis', value='CA'),
        ],
        value='PCA',
    ),
    html.Div(id='report', children=[html.Pre(children=evaluation_report)]),
])

# Callback for updating the graph based on the dropdown selection
@app.callback(
    Output('pca-visualization', 'figure'),
    [Input('visualization-dropdown', 'value')]
)
def update_figure(selected_value):
    """Build the figure for the view selected in the dropdown.

    Args:
        selected_value: The dropdown value: 'PCA' (scatter of the two PCA
            components coloured by the target), 'FI' (bar chart of the
            classifier's feature importances) or 'CA' (sunburst of the
            three condition columns).

    Returns:
        A Plotly figure for the selected view, or an empty dict when the
        value is none of the above (for example when the dropdown is cleared).
    """
    if selected_value == 'PCA':
        fig = px.scatter(x=pca_components[:,0], y=pca_components[:,1], color=target.map({0: 'No', 1: 'Yes'}), labels={'x': 'PCA 1', 'y': 'PCA 2'})
        fig.update_layout(title='PCA of Student Mental Health Data')
    elif selected_value == 'FI':
        # Name the axes explicitly; otherwise they are shown as "x" and "y"
        fig = px.bar(
            x=list(features.columns),
            y=classifier.feature_importances_,
            labels={'x': 'Feature', 'y': 'Importance'},
            title="Feature Importance",
        )
    elif selected_value == 'CA':
        # The condition columns hold 0/1 codes; turn them into readable
        # "<condition>: Yes/No" labels so each ring of the sunburst is
        # self-explanatory instead of showing bare 0/1 segments
        condition_names = {
            'Do you have Depression?': 'Depression',
            'Do you have Anxiety?': 'Anxiety',
            'Do you have Panic attack?': 'Panic attack',
        }
        labelled_conditions = pd.DataFrame({
            short_name: short_name + ': ' + data[column].map({0: 'No', 1: 'Yes'})
            for column, short_name in condition_names.items()
        })
        fig = px.sunburst(
            labelled_conditions,
            path=list(condition_names.values()),
            title="Sunburst Chart of Conditions",
        )
    else:
        fig = {}
    return fig

#### Running the Dash Application

In [18]:
# Start the Dash development server only when this code is executed directly
# (not when it is imported as a module)
if __name__ == '__main__':
    # `app.run` is the current entry point; `app.run_server` is deprecated in
    # Dash 2.x and no longer usable in Dash 3. Fall back to `run_server` only
    # on older releases that do not provide `app.run`.
    start_server = app.run if hasattr(app, 'run') else app.run_server
    start_server(debug=True)