# Analyzing Panic Attack Data: Lifestyle Influences, Potenital Triggers, and Symptoms

Gavin Kornitsky, Massimo Prag, Katrina Shonka, Sarah Hudson

In [1]:
# imports
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('panic_attack_dataset.csv')
pd.set_option('display.max_rows', None)
df

Unnamed: 0,ID,Age,Gender,Panic_Attack_Frequency,Duration_Minutes,Trigger,Heart_Rate,Sweating,Shortness_of_Breath,Dizziness,...,Trembling,Medical_History,Medication,Caffeine_Intake,Exercise_Frequency,Sleep_Hours,Alcohol_Consumption,Smoking,Therapy,Panic_Score
0,1,56,Female,9,5,Caffeine,134,Yes,No,Yes,...,No,Anxiety,No,2,3,6.4,5,Yes,Yes,5
1,2,46,Male,8,9,Stress,139,Yes,Yes,No,...,No,PTSD,Yes,2,5,5.0,3,No,Yes,7
2,3,32,Female,6,31,PTSD,141,No,Yes,Yes,...,No,Depression,No,4,0,8.3,8,No,Yes,7
3,4,60,Male,5,20,Caffeine,109,Yes,Yes,No,...,Yes,Depression,No,3,3,5.3,8,No,No,1
4,5,25,Non-binary,6,10,Caffeine,101,Yes,No,Yes,...,Yes,Depression,No,3,6,7.2,2,No,No,5
5,6,38,Male,0,44,Social Anxiety,154,Yes,Yes,Yes,...,No,Depression,Yes,1,4,4.8,6,No,No,8
6,7,56,Male,0,17,PTSD,108,Yes,Yes,Yes,...,No,Anxiety,No,0,6,6.0,4,Yes,Yes,7
7,8,36,Male,7,39,PTSD,120,Yes,No,No,...,Yes,PTSD,No,0,4,8.5,8,No,Yes,2
8,9,40,Non-binary,1,23,Unknown,121,Yes,No,No,...,No,Anxiety,No,4,5,4.8,1,No,Yes,8
9,10,28,Female,9,16,Caffeine,144,No,No,Yes,...,Yes,Anxiety,No,4,6,7.3,2,No,Yes,2


In [3]:
# check for missing values
missing_vals = df.isnull().sum()
missing_vals # quicker than manually checking df for them

ID                          0
Age                         0
Gender                      0
Panic_Attack_Frequency      0
Duration_Minutes            0
Trigger                     0
Heart_Rate                  0
Sweating                    0
Shortness_of_Breath         0
Dizziness                   0
Chest_Pain                  0
Trembling                   0
Medical_History           122
Medication                  0
Caffeine_Intake             0
Exercise_Frequency          0
Sleep_Hours                 0
Alcohol_Consumption         0
Smoking                     0
Therapy                     0
Panic_Score                 0
dtype: int64

the only column with missing values is Medical_History, and the missing values for that column just mean 
that those people dont have prexisting conditions, so we will be keeping those

In [12]:
# convert categorical data to numerical (Yes/No -> 1/0)
yes_no = ["Sweating", "Shortness_of_Breath", "Dizziness", "Trembling", "Smoking", "Therapy", "Medication"]
df[yes_no] = df[yes_no].applymap(lambda x: 1 if x =="Yes" else 0) # assigns 1s to yes val and 0 to no
df[yes_no]

Unnamed: 0,Sweating,Shortness_of_Breath,Dizziness,Trembling,Smoking,Therapy,Medication
0,1,0,1,0,1,1,0
1,1,1,0,0,0,1,1
2,0,1,1,0,0,1,0
3,1,1,0,1,0,0,0
4,1,0,1,1,0,0,0
5,1,1,1,0,0,0,1
6,1,1,1,0,1,1,0
7,1,0,0,1,0,1,0
8,1,0,0,0,0,1,0
9,0,0,1,1,0,1,0


In [13]:
# removing non-binary individuals, making male=1, female=0
df = df[df["Gender"] != "Non-binary"].copy()
df.loc[:, "Gender"] = df["Gender"].map({"Female": 0, "Male": 1})

In [14]:
# checking to see if gender is correctly mapped
df.head()

Unnamed: 0,Age,Gender,Panic_Attack_Frequency,Duration_Minutes,Trigger,Heart_Rate,Sweating,Shortness_of_Breath,Dizziness,Chest_Pain,...,Medical_History,Medication,Caffeine_Intake,Exercise_Frequency,Sleep_Hours,Alcohol_Consumption,Smoking,Therapy,Panic_Score,Panic_Severity
0,56,0,9,5,Caffeine,0.586361,1,0,1,Yes,...,Anxiety,0,-0.314174,0.021838,-0.058065,0.201053,1,1,5,Medium
1,46,1,8,9,Stress,0.800401,1,1,0,No,...,PTSD,1,-0.314174,0.992402,-1.054478,-0.488272,0,1,7,High
2,32,0,6,31,PTSD,0.886017,0,1,1,No,...,Depression,0,0.851233,-1.434008,1.29421,1.235041,0,1,7,High
3,60,1,5,20,Caffeine,-0.483836,1,1,0,No,...,Depression,0,0.268529,0.021838,-0.840961,1.235041,0,0,1,Low
5,38,1,0,44,Social Anxiety,1.442519,1,1,1,No,...,Depression,1,-0.896878,0.50712,-1.196823,0.545716,0,0,8,High


### Standardizing the data
for numerical columns. not every numerical column needs to be standardized though

Columns where standardization is not neccessary:
- Age: Standardization would make it harder to understand, its unneccesary
- Panic_Attack_Frequency: its a count of attack per month, standardization would just complicate an easy stat
- Duration_Minutes: time in mins should be in raw format

Columns that need standardization:
- Heart_Rate: Can vary from 60-180 bpm, might dominate some smaller scale features
- Caffeine_Intake: smaller values, so it will benefit from standardization if for ex we are comparing heart rate to sleep
- Exercise_Frequency: weekly count (0-7)
- Sleep_Hours: varies between 3-10 hours so standardizing will help for comparing to caffeine and heart rate 
- Alcohol_Consumption: this one is so varied

In [15]:
# standardizing
numerical = ["Heart_Rate", "Caffeine_Intake", "Exercise_Frequency", "Sleep_Hours", "Alcohol_Consumption"]
# dtype is int64, but pandas needs it as float64 for StanardScaler
df[numerical] = df[numerical].astype("float64")

# standardization
scaler = StandardScaler()
df[numerical] = scaler.fit_transform(df[numerical])

### Feature Engineering: Panic Severity

In [16]:
# function for classifying panic severity
def panic_severity(score):
    if score <=3:
        return "Low"
    elif 4 <= score <= 6:
        return "Medium"
    else:
        return "High"

# applying function to df
df.loc[:, "Panic_Severity"] = df["Panic_Score"].apply(panic_severity)

In [17]:
df = df.drop(columns=["ID"])

KeyError: "['ID'] not found in axis"

In [None]:
# final cleaned and preprocessed dataset :)
df.head()

## Making Sankey Diagram

In [32]:
import plotly.graph_objects as go
import plotly.graph_objects as go
import pandas as pd

def _code_mapping(df, src, targ):
    """ Map labels in src and targ columns to integers """
    labels = sorted(list(set(list(df[src]) + list(df[targ]))))
    codes = list(range(len(labels)))
    lc_map = dict(zip(labels, codes))
    df = df.replace({src: lc_map, targ: lc_map})
    return df, labels

def make_sankey(df, src, targ, vals=None, color_col=None, color_map=None, **kwargs):
    """
    Generate a Sankey diagram with optional link coloring by color_col + color_map.
    
    df       : DataFrame (already grouped or melted as needed)
    src      : name of source column
    targ     : name of target column
    vals     : name of values column (e.g. 'count'), or None
    color_col: name of column that determines link colors (e.g. 'Panic_Severity')
    color_map: dict from color_col values to a color string (e.g. {"Low": "green", "High": "red"})
    kwargs   : additional Sankey styling parameters (pad, thickness, line_color, line_width, etc.)
    """
    # Build the link values
    if vals is not None:
        values = df[vals]
    else:
        values = [1]*len(df[src])

    # Convert source/target text to integer codes
    df_for_sankey = df.copy()
    df_for_sankey, labels = _code_mapping(df_for_sankey, src, targ)

    # Build link dictionary
    link = {
        'source': df_for_sankey[src],
        'target': df_for_sankey[targ],
        'value': values
    }

    if color_col and color_map:
        link_colors = []
        for i, row in df_for_sankey.iterrows():
            severity = row[color_col]  # e.g. "Low", "Medium", "High"
            link_colors.append(color_map.get(severity, "gray"))
        link['color'] = link_colors  # set link colors
 
    pad = kwargs.get('pad', 50)
    thickness = kwargs.get('thickness', 50)
    line_color = kwargs.get('line_color', 'black')
    line_width = kwargs.get('line_width', 1)
    node = {
        'label': labels,
        'pad': pad,
        'thickness': thickness,
        'line': {'color': line_color, 'width': line_width}
    }

    sk = go.Sankey(link=link, node=node)
    fig = go.Figure(sk)

    width = kwargs.get('width', 800)
    height = kwargs.get('height', 400)
    fig.update_layout(autosize=False, width=width, height=height)
    return fig

def show_sankey(df, src, targ, vals=None, color_col=None, color_map=None, **kwargs):
    fig = make_sankey(df, src, targ, vals, color_col=color_col, color_map=color_map, **kwargs)
    fig.show()


symptom_cols = ["Sweating", "Dizziness", "Chest_Pain", "Shortness_of_Breath"]


for col in symptom_cols:
    df[col] = df[col].map({1: "Yes", 0: "No"}).fillna(df[col]) 

df_melt = pd.melt(
    df,
    id_vars=["Trigger", "Panic_Severity"],
    value_vars=symptom_cols,
    var_name="Symptom",
    value_name="HasSymptom"
)
df_melt = df_melt[df_melt["HasSymptom"] == "Yes"]


df_grouped = df_melt.groupby(["Trigger", "Symptom", "Panic_Severity"]).size().reset_index(name="count")

df_grouped.head()

color_map = {
    "Low":   "rgba(0, 128, 0, 0.4)",       
    "Medium":"rgba(255, 165, 0, 0.4)",     
    "High":  "rgba(255, 0, 0, 0.4)"       
}


fig = make_sankey(
    df_grouped,
    src="Trigger",
    targ="Symptom",
    vals="count",
    color_col="Panic_Severity",
    color_map=color_map,
    width=1000,
    height=600
)

fig.show()
