# Analyzing Panic Attack Data: Lifestyle Influences, Potential Triggers, and Symptoms

Gavin Kornitsky, Massimo Prag, Katrina Shonka, Sarah Hudson

In [None]:
# imports
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [None]:
# Lift the row cap so displayed dataframes are never truncated
pd.options.display.max_rows = None

# Load the raw panic attack data and preview the first rows
df = pd.read_csv('panic_attack_dataset.csv')
df.head()

In [None]:
# List the column names available in the dataset
df.columns

In [None]:
# Show the distinct values that appear in the Medical_History column
df['Medical_History'].unique()

In [None]:
# Count the missing entries in every column
missing_vals = df.isna().sum()
missing_vals  # displayed here so we don't have to scan df by hand

The only column with missing values is Medical_History. A missing value in that column just means 
that the person doesn't have pre-existing conditions, so we will be keeping those rows.

In [None]:
# convert categorical data to numerical (Yes/No -> 1/0)
yes_no = ["Sweating", "Shortness_of_Breath", "Dizziness", "Trembling", "Smoking", "Therapy", "Chest_Pain", "Medication"]
# Vectorized comparison instead of DataFrame.applymap (deprecated since pandas 2.1):
# "Yes" becomes 1; every other value, including a missing one, becomes 0
df[yes_no] = df[yes_no].eq("Yes").astype(int)
df[yes_no]

In [None]:
# removing non-binary individuals, making male=1, female=0
df = df[df["Gender"] != "Non-binary"].copy()
# Plain column assignment replaces the column, so Gender takes the numeric dtype
# of the mapped codes. Assigning through `df.loc[:, "Gender"]` instead writes into
# the existing string (object) column in place and leaves the codes as objects.
# Any value other than "Female"/"Male" is not in the mapping and becomes NaN.
df["Gender"] = df["Gender"].map({"Female": 0, "Male": 1})

In [None]:
# checking to see if gender is correctly mapped:
# the Gender column should now show 0 (Female) / 1 (Male) instead of strings
df.head()

### Standardizing the data
We standardize the numerical columns, although not every numerical column needs to be standardized.

Columns where standardization is not necessary:
- Age: standardization would make it harder to interpret, so it is unnecessary
- Panic_Attack_Frequency: it is a count of attacks per month; standardization would just complicate an easy-to-read statistic
- Duration_Minutes: time in minutes should stay in its raw format

Columns that need standardization:
- Heart_Rate: can vary from 60-180 bpm, so it might dominate some smaller-scale features
- Caffeine_Intake: smaller values, so it will benefit from standardization if, for example, we are comparing it with heart rate or sleep
- Exercise_Frequency: weekly count (0-7)
- Sleep_Hours: varies between 3-10 hours, so standardizing will help when comparing it to caffeine and heart rate
- Alcohol_Consumption: values vary widely, so standardizing keeps it comparable with the other features

In [None]:
# Columns to rescale with StandardScaler
numerical = ["Heart_Rate", "Caffeine_Intake", "Exercise_Frequency", "Sleep_Hours", "Alcohol_Consumption"]

scaler = StandardScaler()

# Cast to float64 first so the columns can hold the scaled (fractional) values
df[numerical] = df[numerical].astype(float)

# Fit the scaler on these columns and overwrite them with the standardized values
df[numerical] = scaler.fit_transform(df[numerical])

### Feature Engineering: Panic Severity

In [None]:
# function for classifying panic severity
def panic_severity(score):
    """Bucket a numeric panic score into "Low", "Medium" or "High".

    score <= 3      -> "Low"
    3 < score <= 6  -> "Medium"
    anything else   -> "High" (scores above 6, and NaN, which fails
                       both comparisons)

    The middle bucket starts right after 3 rather than at 4, so a
    fractional score such as 3.5 lands in "Medium" instead of falling
    through to "High".
    """
    if score <= 3:
        return "Low"
    if score <= 6:
        return "Medium"
    return "High"

# Label every row with the severity bucket of its Panic_Score
df["Panic_Severity"] = df["Panic_Score"].map(panic_severity)

In [None]:
# Drop the ID column
df = df.drop("ID", axis=1)

In [None]:
# final cleaned and preprocessed dataset :)
# (preview of the first rows)
df.head()

### First Visualization: Sankey Diagram

In [None]:
# Sankey diagram functions in code block adapted from lecture notes in DS3500 course

import plotly.graph_objects as go
import pandas as pd

# pd.set_option('future.no_silent_downcasting', True)

def _code_mapping(df, src, targ):
    """ Map labels in src and targ columns to integers """
    # Get distinct labels
    labels = sorted(list(set(list(df[src]) + list(df[targ]))))

    # Get integer codes
    codes = list(range(len(labels)))

    # Create label to code mapping
    lc_map = dict(zip(labels, codes))

    # Substitute names for codes in dataframe
    df = df.replace({src: lc_map, targ: lc_map})
    return df, labels


def make_sankey(df, src, targ, vals=None, **kwargs):
    """ Build a plotly Sankey figure from two label columns of a dataframe
    df - Dataframe
    src - Source column
    targ - Target column
    vals - Values column (optional); every link gets value 1 when omitted
    optional params: pad, thickness, line_color, line_width, width, height

    Returns the figure without showing it. """

    # Link values: the requested column, or a uniform 1 per row
    values = df[vals] if vals else [1] * len(df[src])

    # Swap the labels in src/targ for integer codes
    coded, labels = _code_mapping(df, src, targ)

    link = {
        'source': coded[src],
        'target': coded[targ],
        'value': values,
    }

    # Node styling, overridable through kwargs
    node = {
        'label': labels,
        'pad': kwargs.get('pad', 50),
        'thickness': kwargs.get('thickness', 50),
        'line': {
            'color': kwargs.get('line_color', 'black'),
            'width': kwargs.get('line_width', 1),
        },
    }

    fig = go.Figure(go.Sankey(link=link, node=node))

    # Fixed figure size in pixels
    fig.update_layout(
        autosize=False,
        width=kwargs.get('width', 800),
        height=kwargs.get('height', 400),
    )

    return fig


def show_sankey(df, src, targ, vals=None, **kwargs):
    """ Build a sankey diagram with make_sankey and display it
    (same parameters as make_sankey) """
    make_sankey(df, src, targ, vals, **kwargs).show()

### Grouped Bar Chart for Effects of Therapy and Medication

In [None]:
import altair as alt

# Turn the 0/1 Therapy and Medication flags into readable labels
# (anything that is not 1 counts as "No ...")
df['Therapy_Status'] = df['Therapy'].eq(1).map({True: 'Therapy', False: 'No Therapy'})
df['Medication_Status'] = df['Medication'].eq(1).map({True: 'Medication', False: 'No Medication'})

# One combined label per row, e.g. "Therapy & No Medication"
df['Status'] = df['Therapy_Status'] + ' & ' + df['Medication_Status']

# Number of rows for each (combined status, panic severity) pair
count_df = (
    df.groupby(['Status', 'Panic_Severity'])
    .size()
    .reset_index(name='Count')
)

# Display order for the x-axis categories
severity_order = ['Low', 'Medium', 'High']

# Display order for the subplots, one per combined status
status_order = [
    'No Therapy & No Medication',
    'No Therapy & Medication',
    'Therapy & No Medication',
    'Therapy & Medication',
]

# Grouped bar chart: severity on the x-axis, one column of bars per status
chart = (
    alt.Chart(count_df)
    .mark_bar(color='blue')
    .encode(
        x=alt.X('Panic_Severity:N', title='Panic Severity', sort=severity_order),
        y=alt.Y('Count:Q', title='Count of Panic Severity'),
        # Subplots follow the order given in status_order
        column=alt.Column('Status:N', title='Therapy and Medication Status', sort=status_order),
    )
    .properties(width=150, height=400)
    .interactive()
)

chart

In [None]:
# Keep only the five symptom columns and preview them
symptom_df = df.loc[:, ['Sweating', 'Shortness_of_Breath', 'Dizziness', 'Chest_Pain', 'Trembling']]

symptom_df.head()

### Creation of CSV for Stacked Bar Chart for Proportions of Yes/No for Each Symptom

In [None]:
import pandas as pd
from collections import Counter

# Restrict the data to the five symptom columns
symptom_df = df.loc[:, ['Sweating', 'Shortness_of_Breath', 'Dizziness', 'Chest_Pain', 'Trembling']]

# Collect one (symptom, response) pair per cell, column by column
symptom_responses = []
for col in symptom_df.columns:
    symptom_responses.extend((col, response) for response in symptom_df[col])

# Tally how often each (symptom, response) pair occurs
response_counts = Counter(symptom_responses)

# One row per pair: a "<symptom> - <response>" label and its count
symptom_data = pd.DataFrame({
    "Symptom - Response": [f"{symptom} - {response}" for symptom, response in response_counts],
    "Count": list(response_counts.values()),
})

# Order the rows by label, renumber them, and print the result
symptom_data = symptom_data.sort_values(by="Symptom - Response").reset_index(drop=True)
print(symptom_data)
