# Analyzing Panic Attack Data: Lifestyle Influences, Potential Triggers, and Symptoms

Gavin Kornitsky, Massimo Prag, Katrina Shonka, Sarah Hudson

In [None]:
# imports
import pandas as pd
import altair as alt

In [None]:
# Lift the row cap so displayed frames are never truncated.
pd.set_option('display.max_rows', None)
# Load the raw panic-attack records and preview the first rows.
df = pd.read_csv('panic_attack_dataset.csv')
df.head()

In [None]:
# Per-column count of missing entries (quicker than scanning the frame by eye).
null_flags = df.isnull()
missing_vals = null_flags.sum()
missing_vals

The only column with missing values is Medical_History. A missing value there just means
that the person has no pre-existing conditions, so we will be keeping those rows.

In [None]:
# Convert categorical data to numerical (Yes/No -> 1/0).
# A vectorised comparison replaces DataFrame.applymap, which pandas deprecated
# in 2.1. As before, anything that is not exactly "Yes" (missing values
# included) becomes 0.
yes_no = ["Sweating", "Shortness_of_Breath", "Dizziness", "Trembling", "Smoking", "Therapy", "Medication"]
df[yes_no] = (df[yes_no] == "Yes").astype("int64")
df[yes_no]

In [None]:
# Drop non-binary respondents, then encode Gender as Male=1 / Female=0.
df = df[df["Gender"] != "Non-binary"].copy()
# Plain column assignment replaces the column outright, so it takes the dtype
# of the mapped values. `df.loc[:, "Gender"] = ...` writes into the existing
# object column in pandas >= 2.0, leaving the 0/1 codes stored as generic
# objects. Labels other than "Female"/"Male" map to NaN.
df["Gender"] = df["Gender"].map({"Female": 0, "Male": 1})

In [None]:
# sanity check: the Gender column should now hold the 0/1 codes
df.head(5)

### Standardizing the data
Standardization applies to numerical columns, though not every numerical column needs to be standardized.

Columns where standardization is not necessary:
- Age: standardization would make it harder to understand; it's unnecessary
- Panic_Attack_Frequency: it's a count of attacks per month, so standardization would just complicate an easy stat
- Duration_Minutes: time in minutes should stay in raw format

Columns that need standardization:
- Heart_Rate: can vary from 60-180 bpm, so it might dominate some smaller-scale features
- Caffeine_Intake: smaller values, so it will benefit from standardization if, for example, we are comparing heart rate to sleep
- Exercise_Frequency: weekly count (0-7)
- Sleep_Hours: varies between 3-10 hours, so standardizing will help when comparing to caffeine and heart rate
- Alcohol_Consumption: its values vary widely

In [None]:
# Columns earmarked for standardization.
# NOTE: this cell only records the column names; no scaling is applied here.
# A float64 cast (the columns are int64) was sketched but is left disabled:
# df[numerical] = df[numerical].astype("float64")
# NOTE(review): Chest_Pain is listed here but is handled as a Yes/No symptom
# in the Sankey cell - confirm it belongs in this list.
numerical = [
    "Heart_Rate",
    "Caffeine_Intake",
    "Exercise_Frequency",
    "Sleep_Hours",
    "Alcohol_Consumption",
    "Chest_Pain",
]
numerical

### Feature Engineering: Panic Severity

In [None]:
# function for classifying panic severity
def panic_severity(score):
    """Bucket a numeric panic score into "Low", "Medium" or "High".

    Scores up to 3 are "Low", scores above 3 and up to 6 are "Medium",
    and anything above 6 is "High". The bands are contiguous, so a
    fractional score such as 3.5 lands in "Medium" instead of slipping
    through a gap between 3 and 4 and being labelled "High".

    NaN compares false against both bounds and therefore falls through
    to "High".
    """
    if score <= 3:
        return "Low"
    if score <= 6:
        return "Medium"
    return "High"

# label every row with the severity bucket of its Panic_Score
severity_labels = df["Panic_Score"].apply(panic_severity)
df.loc[:, "Panic_Severity"] = severity_labels

In [None]:
# the ID column is dropped from the frame
df = df.drop(labels=["ID"], axis="columns")

In [None]:
# preview of the final cleaned and preprocessed dataset :)
df.head(5)

## Making Sankey Diagram

In [None]:
import plotly.graph_objects as go
import plotly.graph_objects as go
import pandas as pd

def _code_mapping(df, src, targ):
    """Map labels in src and targ columns to integers.

    The distinct values found in either column are sorted and numbered
    from 0; both columns are then rewritten with those numbers.

    df   : DataFrame holding the two label columns (left unmodified)
    src  : name of source column
    targ : name of target column

    Returns (coded copy of df, sorted list of labels); a label's position
    in the list is its integer code.
    """
    labels = sorted(set(df[src]) | set(df[targ]))
    lc_map = {label: code for code, label in enumerate(labels)}

    # Series.map yields the integer codes directly. DataFrame.replace with a
    # label->int dict depended on pandas silently downcasting the resulting
    # object columns, which pandas 2.2 deprecated.
    coded = df.copy()
    coded[src] = df[src].map(lc_map)
    coded[targ] = df[targ].map(lc_map)
    return coded, labels

def make_sankey(df, src, targ, vals=None, color_col=None, color_map=None, **kwargs):
    """
    Build a plotly Sankey figure from df, drawing one link per row.

    df       : DataFrame with one row per link (already grouped or melted)
    src      : column holding each link's source label
    targ     : column holding each link's target label
    vals     : column holding the link weights; when None every link weighs 1
    color_col: column whose value picks each link's color (e.g. 'Panic_Severity')
    color_map: dict from color_col values to color strings; values missing
               from the dict are drawn "gray". Colors are only applied when
               both color_col and color_map are given.
    kwargs   : styling overrides - pad (50), thickness (50), line_color
               ('black'), line_width (1), width (800), height (400)

    Returns the figure without showing it.
    """
    # Styling options: caller-supplied values win over the defaults
    defaults = {
        'pad': 50,
        'thickness': 50,
        'line_color': 'black',
        'line_width': 1,
        'width': 800,
        'height': 400,
    }
    opts = {**defaults, **kwargs}

    # Link weights: the named column, or a constant 1 per row
    weights = df[vals] if vals is not None else [1] * len(df[src])

    # Swap the source/target labels for integer node codes
    encoded, labels = _code_mapping(df.copy(), src, targ)

    link = {
        'source': encoded[src],
        'target': encoded[targ],
        'value': weights,
    }
    if color_col and color_map:
        # one color per row, falling back to gray for unmapped values
        link['color'] = [color_map.get(key, "gray") for key in encoded[color_col]]

    node = {
        'label': labels,
        'pad': opts['pad'],
        'thickness': opts['thickness'],
        'line': {'color': opts['line_color'], 'width': opts['line_width']},
    }

    fig = go.Figure(go.Sankey(link=link, node=node))
    fig.update_layout(autosize=False, width=opts['width'], height=opts['height'])
    return fig

def show_sankey(df, src, targ, vals=None, color_col=None, color_map=None, **kwargs):
    """Build the Sankey figure with make_sankey and display it right away."""
    make_sankey(
        df,
        src,
        targ,
        vals,
        color_col=color_col,
        color_map=color_map,
        **kwargs,
    ).show()


# Symptoms that become the target nodes of the diagram
symptom_cols = ["Sweating", "Dizziness", "Chest_Pain", "Shortness_of_Breath"]

# Turn 1/0 flags back into "Yes"/"No". Values that are neither 1 nor 0 come
# out of the map as NaN and are restored from the original column.
for col in symptom_cols:
    relabelled = df[col].map({1: "Yes", 0: "No"})
    df[col] = relabelled.fillna(df[col])

# Long format: one row per (record, symptom) pair
df_melt = pd.melt(
    df,
    id_vars=["Trigger", "Panic_Severity"],
    value_vars=symptom_cols,
    var_name="Symptom",
    value_name="HasSymptom",
)
# keep only the symptoms that were actually reported
symptom_present = df_melt["HasSymptom"] == "Yes"
df_melt = df_melt[symptom_present]

# One row per (trigger, symptom, severity) combination with its frequency
link_keys = ["Trigger", "Symptom", "Panic_Severity"]
df_grouped = df_melt.groupby(link_keys).size().reset_index(name="count")

df_grouped.head()

# Semi-transparent link colors keyed by severity
color_map = {
    "Low": "rgba(0, 128, 0, 0.4)",
    "Medium": "rgba(255, 165, 0, 0.4)",
    "High": "rgba(255, 0, 0, 0.4)",
}

# Trigger -> Symptom flows, weighted by count and colored by severity
fig = make_sankey(
    df_grouped,
    src="Trigger",
    targ="Symptom",
    vals="count",
    color_col="Panic_Severity",
    color_map=color_map,
    width=1000,
    height=600,
)

fig.show()
