# Health Risk Dataset

Goal: Use the 8 features to predict the 1 label (health risk).
Process:
- Embed all features into an 8-dimensional vector space
- When new data points arrive, project them into the same 8D space
- Compute the cosine similarity between each new vector and every stored vector (this step could be optimized, e.g. with lazy loading or caching)
- Take the k nearest neighbours (the k stored vectors with the highest similarity) and predict the risk level by majority vote among their labels

Source:
https://www.kaggle.com/datasets/ludocielbeckett/health-risk-prediction-anonymized-real-data

In [42]:
import numpy as np
import pandas as pd
import plotly.express as px
import sys
import plotly.graph_objects as go

sys.path.append('..')
from distances.distance_matrix import make_distance_matrix

# Load the raw dataset; the path is relative to the notebook's directory.
df = pd.read_csv("../data/Health-Risk.csv")

# Target column: the risk level we want to predict.
label = df['Risk_Level']

# Keep only the 8 feature columns. This is the same `iloc[:, 1:9]` slice that
# health_risk_predictor uses (its comment describes the excluded columns as the
# ID and Risk_Level), so the label does not leak into the feature frame.
df = df.iloc[:, 1:9]


In [39]:
# The following graph is ONLY for visualization, not for model training. We'll draw 3 orthogonal axes, and colour the vectors according to risk level.
color_map = {'Low': 'green', 'Normal': 'blue', 'Medium': 'orange', 'High': 'red'}

# Select the first 30 datapoints for plotting
df_sample = df.iloc[:30]
label_sample = label.iloc[:30]

# Pick the plotted features by column name rather than by position, so the axes
# stay correct regardless of which leading columns were sliced off `df` earlier.
# NOTE(review): 'Respiratory_Rate' and 'Systolic_BP' are assumed to be the CSV
# header names (taken from the original inline comments) - confirm against the file.
respiratory_rate = df_sample['Respiratory_Rate']
systolic_bp = df_sample['Systolic_BP']
temperature = df_sample['Temperature'].astype(float)  # Ensure numerical values for z axis


def _origin_segments(tips):
    """Return [0, tip, None, 0, tip, None, ...]: one origin-to-tip segment per value.

    The None entries break the line so a single trace can hold many separate vectors.
    """
    return [coord for tip in tips for coord in (0, tip, None)]


# Axis limits: largest absolute value per feature plus a small margin
x_max = max(abs(respiratory_rate.min()), abs(respiratory_rate.max())) + 0.5
y_max = max(abs(systolic_bp.min()), abs(systolic_bp.max())) + 1
z_max = max(abs(temperature.min()), abs(temperature.max())) + 0.2

# Create the 3D plot with skinnier arrows
fig = go.Figure()

# One line trace (all vectors) and one marker trace (all tips) per risk level.
# Sharing a legendgroup makes a legend click toggle every vector of that level.
for risk, risk_color in color_map.items():
    in_group = (label_sample == risk).to_numpy()
    if not in_group.any():
        continue  # no sampled row has this risk level, nothing to draw

    tip_x = respiratory_rate[in_group].tolist()
    tip_y = systolic_bp[in_group].tolist()
    tip_z = temperature[in_group].tolist()

    # Lines (vectors) from the origin to each data point
    fig.add_trace(go.Scatter3d(
        x=_origin_segments(tip_x),
        y=_origin_segments(tip_y),
        z=_origin_segments(tip_z),
        mode='lines',
        line=dict(color=risk_color, width=1.5),
        connectgaps=False,  # keep the None separators as breaks between vectors
        name=f'{risk} Risk',
        legendgroup=risk,
        showlegend=True
    ))
    # Tips (markers); hidden from the legend but toggled with their group
    fig.add_trace(go.Scatter3d(
        x=tip_x,
        y=tip_y,
        z=tip_z,
        mode='markers',
        marker=dict(size=4, color=risk_color, symbol='cross'),
        name=f'{risk} Risk Tip',
        legendgroup=risk,
        showlegend=False
    ))


# Update layout for visual appeal, centering at origin, always show axes, and maximize canvas
fig.update_layout(
    # Report the real sample size instead of a hard-coded number
    title=f'3D Vectors of Respiratory Rate, Systolic BP, and Temperature by Risk Level (Sample of {len(df_sample)})',
    scene=dict(
        xaxis_title='Respiratory Rate',
        yaxis_title='Systolic BP',
        zaxis_title='Temperature',
        xaxis=dict(range=[0, x_max], showspikes=True, showline=True, zeroline=True, showgrid=True, visible=True),
        yaxis=dict(range=[0, y_max], showspikes=True, showline=True, zeroline=True, showgrid=True, visible=True),
        zaxis=dict(range=[0, z_max], showspikes=True, showline=True, zeroline=True, showgrid=True, visible=True),
        aspectmode='cube',
        camera=dict(eye=dict(x=1.5, y=1.5, z=1.5))
    ),
    showlegend=True,
    autosize=True,
    margin=dict(l=0, r=0, b=0, t=40),
    height=900,
    width=1200
)

fig.show()

In [None]:
# Now onto the actual prediction example
import sys
import numpy as np

sys.path.append('..')

from distances.cos_sim import cosine_similarity

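# Recall that df and label are our features and labels respectively:
# label holds the risk level, whereas df holds everything else.
#
# Note: health_risk_predictor below re-reads the CSV and builds its own local
# df and label, so it does not depend on these notebook-level variables.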

def health_risk_predictor(new_vector: np.ndarray, k: int = 20) -> str:
    """Predict the risk level of one patient by cosine-similarity k-nearest neighbours.

    The dataset is loaded from ``../data/Health-Risk.csv``, the new vector is
    compared against every stored feature row with ``cosine_similarity``, and
    the most common ``Risk_Level`` among the ``k`` most similar rows is returned.

    Args:
        new_vector: The patient's feature values, in the same order as CSV
            columns 1..8. The second-to-last entry (Consciousness) may be an
            AVPU letter ('A', 'V', 'P', 'U') or an already-numeric value.
            Numeric strings are accepted, since a NumPy array built from mixed
            numbers and letters stores everything as strings.
        k: Number of nearest neighbours that vote. The default of 20 is the
            author's choice: a high k, because a false negative is very costly.

    Returns:
        The predicted risk level (the majority label among the neighbours).

    Raises:
        ValueError: If ``k`` is not positive, if ``new_vector`` does not have
            one value per feature column, if a value cannot be converted to a
            number (e.g. an unknown consciousness code), or if no stored row
            yields a finite similarity.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    df = pd.read_csv("../data/Health-Risk.csv")

    label = df['Risk_Level']

    # Note that the second-last feature column uses the AVPU system
    # A: Awake (most conscious)
    # V: Verbal (answers but may be drowsy)
    # P: Pain (responsive to pain)
    # U: Unresponsive
    avpu_map = {'A': 3, 'V': 2, 'P': 1, 'U': 0}
    df['Consciousness'] = df['Consciousness'].map(avpu_map)

    # Select only feature columns (exclude ID, Risk_Level)
    features = df.iloc[:, 1:9].to_numpy()
    # NOTE(review): features are compared unscaled; if cosine_similarity is the
    # standard cosine, large-magnitude columns will dominate - consider scaling.

    # Work on a plain Python list so the caller's array is never modified and
    # mixed/str-typed inputs can be converted element by element.
    raw_values = np.asarray(new_vector, dtype=object).ravel().tolist()
    if len(raw_values) != features.shape[1]:
        raise ValueError(
            f"new_vector must contain {features.shape[1]} values, got {len(raw_values)}"
        )

    # Map the 'Consciousness' value (second last entry) if it's an AVPU letter
    if isinstance(raw_values[-2], str):
        raw_values[-2] = avpu_map.get(raw_values[-2], raw_values[-2])

    try:
        query = np.array([float(value) for value in raw_values])
    except ValueError as err:
        raise ValueError(
            f"new_vector contains a non-numeric value (consciousness codes must be "
            f"one of {sorted(avpu_map)}): {raw_values!r}"
        ) from err

    similarities = np.array(
        [cosine_similarity(query, row) for row in features], dtype=float
    )

    # np.argsort places NaN last, so a row with a NaN similarity (e.g. from an
    # unmapped Consciousness value) would otherwise be picked as a "nearest"
    # neighbour. Rank only the rows whose similarity is finite.
    comparable = np.flatnonzero(np.isfinite(similarities))
    if comparable.size == 0:
        raise ValueError("no stored row produced a finite similarity to new_vector")
    ranked = comparable[np.argsort(similarities[comparable])]
    top_k_indices = ranked[-k:]  # Indices of the k largest similarities

    # Get their risk levels
    neighbor_risks = label.iloc[top_k_indices]

    # Predict by majority vote
    predicted_risk = neighbor_risks.value_counts().idxmax()
    print("Predicted risk level for new vector:", predicted_risk)
    return predicted_risk

# Two example patients. Each vector lists the 8 feature values in the column
# order the predictor slices from the CSV. Consciousness (second-to-last) is
# given as its letter code here; the predictor converts it to a number itself.
new_data = np.array([24, 85, 1, 90, 126, 38.2, 'A', 1])
new_data2 = np.array([18, 91, 1, 102, 89, 37.6, 'A', 0])

# Predict in order, so the printed lines follow the order of the examples.
health_risk, health_risk2 = [
    health_risk_predictor(patient) for patient in (new_data, new_data2)
]

# NOTE(review): two spot checks do not measure accuracy; a held-out evaluation
# would be needed to back the statement printed below.
print("So far so good. Model is accurate.")


Predicted risk level for new vector: High
Predicted risk level for new vector: Medium
