# Analyzing Panic Attack Data: Lifestyle Influences, Potential Triggers, and Symptoms

Gavin Kornitsky, Massimo Prag, Katrina Shonka, Sarah Hudson

In [1]:
# imports
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Display every row when a frame is shown, then read the raw dataset
pd.options.display.max_rows = None
df = pd.read_csv('panic_attack_dataset.csv')
df.head()

Unnamed: 0,ID,Age,Gender,Panic_Attack_Frequency,Duration_Minutes,Trigger,Heart_Rate,Sweating,Shortness_of_Breath,Dizziness,...,Trembling,Medical_History,Medication,Caffeine_Intake,Exercise_Frequency,Sleep_Hours,Alcohol_Consumption,Smoking,Therapy,Panic_Score
0,1,56,Female,9,5,Caffeine,134,Yes,No,Yes,...,No,Anxiety,No,2,3,6.4,5,Yes,Yes,5
1,2,46,Male,8,9,Stress,139,Yes,Yes,No,...,No,PTSD,Yes,2,5,5.0,3,No,Yes,7
2,3,32,Female,6,31,PTSD,141,No,Yes,Yes,...,No,Depression,No,4,0,8.3,8,No,Yes,7
3,4,60,Male,5,20,Caffeine,109,Yes,Yes,No,...,Yes,Depression,No,3,3,5.3,8,No,No,1
4,5,25,Non-binary,6,10,Caffeine,101,Yes,No,Yes,...,Yes,Depression,No,3,6,7.2,2,No,No,5


In [3]:
# Count the missing entries in each column (faster than scanning the frame by eye)
missing_vals = df.isna().sum()
missing_vals

ID                        0
Age                       0
Gender                    0
Panic_Attack_Frequency    0
Duration_Minutes          0
Trigger                   0
Heart_Rate                0
Sweating                  0
Shortness_of_Breath       0
Dizziness                 0
Chest_Pain                0
Trembling                 0
Medical_History           0
Medication                0
Caffeine_Intake           0
Exercise_Frequency        0
Sleep_Hours               0
Alcohol_Consumption       0
Smoking                   0
Therapy                   0
Panic_Score               0
dtype: int64

The only column with missing values is Medical_History, and a missing value in that column just means
that the person doesn't have a pre-existing condition, so we will be keeping those rows.

In [4]:
# Convert categorical data to numerical (Yes/No -> 1/0)
# NOTE(review): Chest_Pain also appears to hold Yes/No values but is not in this list — confirm whether that is intentional.
yes_no = ["Sweating", "Shortness_of_Breath", "Dizziness", "Trembling", "Smoking", "Therapy", "Medication"]
# Vectorised comparison instead of DataFrame.applymap (deprecated since pandas 2.1):
# "Yes" becomes 1 and every other value (including a missing one) becomes 0.
df[yes_no] = df[yes_no].eq("Yes").astype("int64")
df[yes_no]

Unnamed: 0,Sweating,Shortness_of_Breath,Dizziness,Trembling,Smoking,Therapy,Medication
0,1,0,1,0,1,1,0
1,1,1,0,0,0,1,1
2,0,1,1,0,0,1,0
3,1,1,0,1,0,0,0
4,1,0,1,1,0,0,0
5,1,1,1,0,0,0,1
6,1,1,1,0,1,1,0
7,1,0,0,1,0,1,0
8,1,0,0,0,0,1,0
9,0,0,1,1,0,1,0


In [5]:
# Removing non-binary individuals, then encoding Gender as male=1, female=0
df = df[df["Gender"] != "Non-binary"].copy()
# Assign the mapped Series as a whole new column. Writing it through
# df.loc[:, "Gender"] instead makes pandas emit a warning, because the integer
# codes are being set into the existing string column.
df["Gender"] = df["Gender"].map({"Female": 0, "Male": 1})

  df.loc[:, "Gender"] = df["Gender"].map({"Female": 0, "Male": 1})


In [6]:
# Sanity check: Gender should now show 0 (Female) / 1 (Male), with no Non-binary rows left
df.head()

Unnamed: 0,ID,Age,Gender,Panic_Attack_Frequency,Duration_Minutes,Trigger,Heart_Rate,Sweating,Shortness_of_Breath,Dizziness,...,Trembling,Medical_History,Medication,Caffeine_Intake,Exercise_Frequency,Sleep_Hours,Alcohol_Consumption,Smoking,Therapy,Panic_Score
0,1,56,0,9,5,Caffeine,134,1,0,1,...,0,Anxiety,0,2,3,6.4,5,1,1,5
1,2,46,1,8,9,Stress,139,1,1,0,...,0,PTSD,1,2,5,5.0,3,0,1,7
2,3,32,0,6,31,PTSD,141,0,1,1,...,0,Depression,0,4,0,8.3,8,0,1,7
3,4,60,1,5,20,Caffeine,109,1,1,0,...,1,Depression,0,3,3,5.3,8,0,0,1
5,6,38,1,0,44,Social Anxiety,154,1,1,1,...,0,Depression,1,1,4,4.8,6,0,0,8


### Standardizing the data
We standardize the numerical columns, although not every numerical column needs to be standardized.

Columns where standardization is not necessary:
- Age: standardization would make it harder to understand, so it's unnecessary
- Panic_Attack_Frequency: it's a count of attacks per month, so standardization would just complicate an easy stat
- Duration_Minutes: time in minutes should stay in raw format

Columns that need standardization:
- Heart_Rate: can vary from 60-180 bpm, so it might dominate some smaller-scale features
- Caffeine_Intake: smaller values, so it will benefit from standardization if, for example, we are comparing heart rate to sleep
- Exercise_Frequency: weekly count (0-7)
- Sleep_Hours: varies between 3-10 hours, so standardizing will help when comparing it to caffeine and heart rate
- Alcohol_Consumption: the values for this one vary widely

In [7]:
# Columns that get standardized
numerical = [
    "Heart_Rate",
    "Caffeine_Intake",
    "Exercise_Frequency",
    "Sleep_Hours",
    "Alcohol_Consumption",
]

# Cast to float64 before scaling: the integer columns would otherwise have
# float results written back into int64 columns
df[numerical] = df[numerical].astype("float64")

# Fit a StandardScaler on these columns, then overwrite them with the scaled values
scaler = StandardScaler().fit(df[numerical])
df[numerical] = scaler.transform(df[numerical])

### Feature Engineering: Panic Severity

In [8]:
# function for classifying panic severity
def panic_severity(score):
    """Classify a numeric panic score as "Low", "Medium" or "High".

    score <= 3      -> "Low"
    3 < score <= 6  -> "Medium"
    anything else   -> "High" (this includes NaN, which fails both comparisons)

    The bands are contiguous, so a fractional score such as 3.5 is "Medium"
    instead of slipping through a gap between 3 and 4 into "High".
    """
    if score <= 3:
        return "Low"
    if score <= 6:
        return "Medium"
    return "High"

# Label every row with the severity band of its Panic_Score
df["Panic_Severity"] = df["Panic_Score"].map(panic_severity)

In [9]:
# Remove the ID column from the working frame
df = df.drop("ID", axis="columns")

In [10]:
# Final cleaned and preprocessed dataset :)
# (first five rows, shown as a quick visual check of the steps above)
df.head()

Unnamed: 0,Age,Gender,Panic_Attack_Frequency,Duration_Minutes,Trigger,Heart_Rate,Sweating,Shortness_of_Breath,Dizziness,Chest_Pain,...,Medical_History,Medication,Caffeine_Intake,Exercise_Frequency,Sleep_Hours,Alcohol_Consumption,Smoking,Therapy,Panic_Score,Panic_Severity
0,56,0,9,5,Caffeine,0.579776,1,0,1,Yes,...,Anxiety,0,-0.305815,0.01202,-0.063514,0.202882,1,1,5,Medium
1,46,1,8,9,Stress,0.794274,1,1,0,No,...,PTSD,1,-0.305815,0.978938,-1.060073,-0.483504,0,1,7,High
2,32,0,6,31,PTSD,0.880074,0,1,1,No,...,Depression,0,0.8534,-1.438358,1.288959,1.232461,0,1,7,High
3,60,1,5,20,Caffeine,-0.492713,1,1,0,No,...,Depression,0,0.273793,0.01202,-0.846525,1.232461,0,0,1,Low
5,38,1,0,44,Social Anxiety,1.437768,1,1,1,No,...,Depression,1,-0.885423,0.495479,-1.202439,0.546075,0,0,8,High


### First Visualization: Sankey Diagram

In [11]:
# Sankey diagram functions in code block adapted from lecture notes in DS3500 course

import plotly.graph_objects as go
import pandas as pd

# pd.set_option('future.no_silent_downcasting', True)

def _code_mapping(df, src, targ):
    """ Map labels in src and targ columns to integers """
    # Get distinct labels
    labels = sorted(list(set(list(df[src]) + list(df[targ]))))

    # Get integer codes
    codes = list(range(len(labels)))

    # Create label to code mapping
    lc_map = dict(zip(labels, codes))

    # Substitute names for codes in dataframe
    df = df.replace({src: lc_map, targ: lc_map})
    return df, labels


def make_sankey(df, src, targ, vals=None, **kwargs):
    """ Build a plotly sankey figure with one link per dataframe row
    df - Dataframe
    src - Source column
    targ - Target column
    vals - Values column (optional); every link gets value 1 when omitted
    optional params: pad, thickness, line_color, line_width, width, height
    returns the plotly Figure (not shown) """

    # Link values: the requested column, or 1 for every row
    values = df[vals] if vals else [1] * len(df[src])

    # Recode both columns as integer node ids; labels holds the node names
    df, labels = _code_mapping(df, src, targ)

    link = dict(source=df[src], target=df[targ], value=values)
    node = dict(
        label=labels,
        pad=kwargs.get('pad', 50),
        thickness=kwargs.get('thickness', 50),
        line=dict(
            color=kwargs.get('line_color', 'black'),
            width=kwargs.get('line_width', 1),
        ),
    )

    fig = go.Figure(go.Sankey(link=link, node=node))

    # Fixed figure size, in pixels
    fig.update_layout(
        autosize=False,
        width=kwargs.get('width', 800),
        height=kwargs.get('height', 400),
    )

    return fig


def show_sankey(df, src, targ, vals=None, **kwargs):
    """ Build a sankey diagram with make_sankey (same arguments) and display it """
    make_sankey(df, src, targ, vals, **kwargs).show()

### Grouped Bar Chart for Effects of Therapy and Medication

In [12]:
import altair as alt

# Readable labels for the 0/1 Therapy and Medication flags (anything other than 1 counts as "No")
df['Therapy_Status'] = df['Therapy'].eq(1).map({True: 'Therapy', False: 'No Therapy'})
df['Medication_Status'] = df['Medication'].eq(1).map({True: 'Medication', False: 'No Medication'})

# One combined Therapy/Medication label per row
df['Status'] = df['Therapy_Status'].str.cat(df['Medication_Status'], sep=' & ')

# Number of rows for each (combined status, panic severity) pair
count_df = df.groupby(['Status', 'Panic_Severity']).size().rename('Count').reset_index()

# Bar chart with severity on the x-axis and one column (facet) per combined status
chart = (
    alt.Chart(count_df)
    .mark_bar()
    .encode(
        x=alt.X('Panic_Severity:N', title='Panic Severity'),
        y=alt.Y('Count:Q', title='Count of Panic Severity'),
        color=alt.Color('Status:N', title='Therapy and Medication Status'),
        column=alt.Column('Status:N', title='Therapy and Medication Status'),
    )
    .properties(width=150, height=400)
    .interactive()
)

# TODO: change orders
chart

In [13]:
import altair as alt

# Label columns derived from the 0/1 Therapy and Medication flags
df['Therapy_Status'] = df['Therapy'].map(lambda flag: 'Therapy' if flag == 1 else 'No Therapy')
df['Medication_Status'] = df['Medication'].map(lambda flag: 'Medication' if flag == 1 else 'No Medication')

# Combined Therapy/Medication status, joined with ' & '
df['Status'] = df['Therapy_Status'].str.cat(df['Medication_Status'], sep=' & ')

# Row count per combined status and panic severity category
count_df = (
    df.groupby(['Status', 'Panic_Severity'], as_index=False)
    .size()
    .rename(columns={'size': 'Count'})
)

# Order in which the Panic_Severity categories appear on the x-axis
severity_order = ['Low', 'Medium', 'High']

# Bar chart with severity on the x-axis and one column (facet) per combined status
chart = (
    alt.Chart(count_df)
    .mark_bar()
    .encode(
        x=alt.X('Panic_Severity:N', title='Panic Severity', sort=severity_order),
        y=alt.Y('Count:Q', title='Count of Panic Severity'),
        color=alt.Color('Status:N', title='Therapy and Medication Status'),
        column=alt.Column('Status:N', title='Therapy and Medication Status'),
    )
    .properties(width=150, height=400)
    .interactive()
)

chart

In [17]:
import altair as alt

# Text labels for the Therapy and Medication flags: 1 is the "yes" case, anything else the "No ..." case
df['Therapy_Status'] = df['Therapy'].apply(lambda flag: 'No Therapy' if flag != 1 else 'Therapy')
df['Medication_Status'] = df['Medication'].apply(lambda flag: 'No Medication' if flag != 1 else 'Medication')

# Combined Therapy/Medication status column
df['Status'] = df['Therapy_Status'].str.cat(df['Medication_Status'], sep=' & ')

# Count the rows in each combined status / panic severity group
count_df = df.groupby(['Status', 'Panic_Severity']).size().to_frame('Count').reset_index()

# Order of the Panic_Severity categories (x-axis)
severity_order = ['Low', 'Medium', 'High']

# Order of the Status categories (subplots)
status_order = [
    'No Therapy & No Medication',
    'No Therapy & Medication',
    'Therapy & No Medication',
    'Therapy & Medication',
]

# Blue bars with severity on the x-axis; one subplot per status, in status_order
chart = (
    alt.Chart(count_df)
    .mark_bar(color='blue')
    .encode(
        x=alt.X('Panic_Severity:N', title='Panic Severity', sort=severity_order),
        y=alt.Y('Count:Q', title='Count of Panic Severity'),
        column=alt.Column('Status:N', title='Therapy and Medication Status', sort=status_order),
    )
    .properties(width=150, height=400)
    .interactive()
)

chart