In [21]:
# Libraries 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats import shapiro
import statsmodels.api as sm

In [22]:
df = pd.read_csv('../Data/Clean/full_dataset.csv', low_memory=False)

In [23]:
# Normalize the column names by removing leading/trailing spaces and converting to uppercase
df.columns = df.columns.str.strip().str.upper()

# Select columns from index 14 onwards for renaming
columns_to_rename = df.columns[14:]  # This selects columns from index 14 onwards

# Define a dictionary to rename these selected columns for better visual representation
rename_columns = {col: col.replace('_', ' ').title() for col in columns_to_rename}

# Apply the new column names only for the selected columns
df.rename(columns=rename_columns, inplace=True)

# Display the first few rows to verify the changes
df.head()


Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,...,Dysphagia,Mobility Decreased,Injection Site Reaction,Lip Swelling,Sleep Disorder,Hot Flush,Abdominal Pain,Feeling Cold,Swollen Tongue,Lymph Node Pain
0,902418,NJ,56.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
1,902440,AZ,35.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
2,902446,WV,55.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
3,902464,LA,42.0,1,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
4,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,...,0,0,0,0,0,0,0,0,0,0


2. Feature Engineering


In [24]:
# Create symptom_count (Total Symptoms Reported)

# Symptom columns
symptom_cols = df.columns[14:]  # All columns from index 14 onward seem to represent symptoms

# Create SYMPTOM_COUNT by summing up all symptom columns
df['SYMPTOM_COUNT'] = df[symptom_cols].sum(axis=1)

In [8]:
agg_df = df.groupby('SEX')[symptom_cols].sum()
agg_df

Unnamed: 0_level_0,None,Headache,Chills,Fatigue,Pyrexia,Pain,Nausea,Dizziness,Injection Site Pain,Pain In Extremity,...,Dysphagia,Mobility Decreased,Injection Site Reaction,Lip Swelling,Sleep Disorder,Hot Flush,Abdominal Pain,Feeling Cold,Swollen Tongue,Lymph Node Pain
SEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,13611,3990,2755,2708,2567,2655,2532,2315,1942,1754,...,191,184,192,195,163,176,163,169,178,187
1,3111,871,793,713,821,593,415,437,421,285,...,27,30,21,14,45,30,43,35,26,15


In [11]:
# Export to CSV
agg_df.to_csv('../Data/Clean/symptom_count_by_sex.csv')

In [10]:
def top_symptoms_by_demographic_relative(df, demo_col, top_n=10):
    """
    Aggregates data to show the most common symptoms by a demographic parameter in relative proportions.
    Args:
    - df (pd.DataFrame): The dataframe containing demographic and symptom data.
    - demo_col (str): The column name corresponding to the demographic parameter (e.g., ‘age’, ‘gender’).
    - top_n (int): The number of top symptoms to return (default is 10).
    Returns:
    - pd.DataFrame: A table showing the most common symptoms by the specified demographic in percentages.
    """
    # Identify symptom columns (assuming binary encoding: 1 = yes, 0 = no)
    symptom_cols = df.columns[14:]
    # Group by the demographic column and sum the symptoms
    agg_df = df.groupby(demo_col)[symptom_cols].sum()
    # Get total number of patients in each demographic group
    counts_df = df.groupby(demo_col).size()
    # Calculate the relative frequency (percentage) of each symptom
    relative_df = agg_df.div(counts_df, axis=0) * 100  # Multiply by 100 to get percentage
    # Sort the symptoms for each demographic group and get the top N
    top_symptoms = relative_df.apply(lambda row: row.nlargest(top_n), axis=1)
    return top_symptoms

In [11]:
df_agg = top_symptoms_by_demographic_relative(df, 'SEX', top_n=20)
df_agg

Unnamed: 0_level_0,Arthralgia,Asthenia,Chills,Cough,Dizziness,Dyspnoea,Fatigue,Headache,Hyperhidrosis,Injection Site Erythema,...,Myalgia,Nausea,None,Pain,Pain In Extremity,Paraesthesia,Pruritus,Pyrexia,Rash,SYMPTOM_COUNT
SEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,6.595745,,17.2403,,14.486859,5.450563,16.946183,24.968711,,9.349186,...,7.38423,15.844806,85.175219,16.614518,10.97622,6.276596,7.715895,16.06383,5.982478,430.093867
1,7.629734,4.57223,22.244039,3.674614,12.258065,4.263675,20.0,24.431978,4.936886,,...,10.238429,11.640954,87.265077,16.633941,7.99439,4.151473,,23.029453,3.955119,383.338008


In [12]:
df_agg.to_csv('../Data/Clean/top_symptoms_by_sex.csv')

In [25]:
list(df.columns)

['VAERS_ID',
 'STATE',
 'AGE_YRS',
 'SEX',
 'DIED',
 'L_THREAT',
 'HOSPITAL',
 'DISABLE',
 'RECOVD',
 'VAX_DATE',
 'BIRTH_DEFECT',
 'VAX_LOT',
 'VAX_MANU',
 'VAX_DOSE_SERIES',
 'None',
 'Headache',
 'Chills',
 'Fatigue',
 'Pyrexia',
 'Pain',
 'Nausea',
 'Dizziness',
 'Injection Site Pain',
 'Pain In Extremity',
 'Injection Site Erythema',
 'Myalgia',
 'Pruritus',
 'Arthralgia',
 'Injection Site Swelling',
 'Paraesthesia',
 'Rash',
 'Dyspnoea',
 'Injection Site Pruritus',
 'Urticaria',
 'Erythema',
 'Vomiting',
 'Injection Site Warmth',
 'Hypoaesthesia',
 'Flushing',
 'Diarrhoea',
 'Malaise',
 'Hyperhidrosis',
 'Palpitations',
 'Lymphadenopathy',
 'Asthenia',
 'Heart Rate Increased',
 'Cough',
 'Chest Discomfort',
 'Paraesthesia Oral',
 'Feeling Abnormal',
 'Tachycardia',
 'Feeling Hot',
 'Sars-Cov-2 Test Negative',
 'Oropharyngeal Pain',
 'Throat Irritation',
 'Tremor',
 'Peripheral Swelling',
 'Blood Pressure Increased',
 'Body Temperature Increased',
 'Skin Warm',
 'Throat Tightness'

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19545 entries, 0 to 19544
Data columns (total 95 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   VAERS_ID                           19545 non-null  int64  
 1   STATE                              19545 non-null  object 
 2   AGE_YRS                            19545 non-null  float64
 3   SEX                                19545 non-null  int64  
 4   DIED                               19545 non-null  int64  
 5   L_THREAT                           19545 non-null  int64  
 6   HOSPITAL                           19545 non-null  int64  
 7   DISABLE                            19545 non-null  int64  
 8   RECOVD                             19545 non-null  int64  
 9   VAX_DATE                           19545 non-null  object 
 10  BIRTH_DEFECT                       19545 non-null  int64  
 11  VAX_LOT                            6677 non-null   obj

In [15]:
# Step 1: Create Age Groups
def create_age_groups(df, age_col):
    bins = [0, 18, 35, 50, 65, 100]  # Define the bins (adjust based on your dataset)
    labels = ['0-18', '19-35', '36-50', '51-65', '66+']  # Labels for the bins
    df['age_group'] = pd.cut(df[age_col], bins=bins, labels=labels, right=False)
    return df
# Step 2: Get top symptoms by age group (relative)
def top_symptoms_by_age_group(df, age_col, top_n=10):
    # First, create age groups if not already done
    df = create_age_groups(df, age_col)
    # Use the function to get the top symptoms by age group
    return top_symptoms_by_demographic_relative(df, 'age_group', top_n)

In [16]:
top_symptoms_by_age_group(df, 'AGE_YRS', top_n=20)

  agg_df = df.groupby(demo_col)[symptom_cols].sum()


TypeError: category type does not support sum operations

In [26]:
# Symptom columns
symptom_cols = df.columns[14:]  

# Create the 'SYMPTOM_COUNT' column (this sums up the symptoms for each patient)
df['SYMPTOM_COUNT'] = df[symptom_cols].sum(axis=1)

In [27]:
# Convert 'VAX_DOSE_SERIES' to string to handle non-numeric values like '7+'
df['VAX_DOSE_SERIES'] = df['VAX_DOSE_SERIES'].astype(str)

# Filter for single dose ('1')
df_single_dose = df[df['VAX_DOSE_SERIES'] == '1']

In [28]:
df_single_dose

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,...,Mobility Decreased,Injection Site Reaction,Lip Swelling,Sleep Disorder,Hot Flush,Abdominal Pain,Feeling Cold,Swollen Tongue,Lymph Node Pain,SYMPTOM_COUNT
0,902418,NJ,56.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,4
1,902440,AZ,35.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,4
2,902446,WV,55.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,8
4,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,...,0,0,0,0,0,0,0,0,0,10
6,902490,NM,37.0,0,0,0,0,0,0,12/15/2020,...,0,0,0,0,0,0,0,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19525,931025,TN,30.0,0,0,0,0,0,0,12/30/2020,...,0,0,0,0,0,0,0,0,0,8
19526,931026,OH,34.0,0,0,0,0,0,1,01/08/2021,...,0,0,0,0,0,0,0,0,0,12
19530,931031,TX,44.0,0,0,0,0,0,0,12/18/2020,...,0,0,0,0,0,0,0,0,0,14
19538,932378,VA,59.0,0,0,0,0,0,0,12/24/2020,...,0,0,0,0,0,0,0,0,0,10


In [29]:
#save df_single_dose
#df_single_dose.to_csv('../Data/Clean/df_single_dose.csv')

In [None]:
#df_vax_manu = df[df['VAX_MANU'].notna()]
#df_vax_manu

In [115]:
# Convert the column to numeric, forcing errors (like '7+') to NaN
#df['VAX_DOSE_SERIES'] = pd.to_numeric(df['VAX_DOSE_SERIES'], errors='coerce')

# Fill NaN values with 0 (or another value, if contextually appropriate)
#df['VAX_DOSE_SERIES'] = df['VAX_DOSE_SERIES'].fillna(0)

# Convert the column to integers
#df['VAX_DOSE_SERIES'] = df['VAX_DOSE_SERIES'].astype(int)

# Filter for multiple doses (greater than 1)
#df_multiple_dose = df[df['VAX_DOSE_SERIES'] > 1]

## Further Transformations for Tableau

In [30]:
# List of symptom columns in 
symptom_columns = df.columns[14:94]  # Adjust this based on where symptom columns start

# Sum each symptom across all rows (patients)
symptom_counts = df[symptom_columns].sum().sort_values(ascending=False)

# Display top 10 symptoms
top_10_symptoms = symptom_counts.head(10)
print(top_10_symptoms)


None                   16722
Headache                4861
Chills                  3548
Fatigue                 3421
Pyrexia                 3388
Pain                    3248
Nausea                  2947
Dizziness               2752
Injection Site Pain     2363
Pain In Extremity       2039
dtype: int64


In [31]:
# Step 1: Aggregating Symptoms in Python
# List of symptom columns 
symptom_columns = df.columns[14:94]

# Create a new column 'TOTAL_SYMPTOMS' that aggregates the symptoms for each patient
df['TOTAL_SYMPTOMS'] = df[symptom_columns].sum(axis=1)



In [32]:
# Classify patients into symptom severity based on the TOTAL_SYMPTOMS count and hospitalization status
def classify_severity(row):
    # Check if the patient required hospitalization (assuming 1 = Hospitalized)
    if row['HOSPITAL'] == 1:
        return 'Severe'
    elif row['TOTAL_SYMPTOMS'] == 0:
        return 'No Symptoms'
    elif row['TOTAL_SYMPTOMS'] <= 3:
        return 'Mild'
    elif row['TOTAL_SYMPTOMS'] <= 7:
        return 'Moderate'
    else:
        return 'Severe'

# Apply classification based on TOTAL_SYMPTOMS and hospitalization status
df['SYMPTOM_SEVERITY'] = df.apply(classify_severity, axis=1)

In [33]:
df

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,...,Lip Swelling,Sleep Disorder,Hot Flush,Abdominal Pain,Feeling Cold,Swollen Tongue,Lymph Node Pain,SYMPTOM_COUNT,TOTAL_SYMPTOMS,SYMPTOM_SEVERITY
0,902418,NJ,56.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,4,2,Mild
1,902440,AZ,35.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,4,2,Mild
2,902446,WV,55.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,8,4,Moderate
3,902464,LA,42.0,1,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,4,2,Mild
4,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,...,0,0,0,0,0,0,0,10,5,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19540,1279599,CA,43.0,1,0,0,0,0,1,01/05/2021,...,0,0,0,0,0,0,0,6,3,Mild
19541,1279600,CA,27.0,0,0,0,0,0,0,01/05/2021,...,0,0,0,0,0,0,0,10,5,Moderate
19542,2019715,NY,35.0,0,0,0,1,1,0,08/06/2021,...,0,0,0,0,0,0,0,2,1,Severe
19543,2427491,VA,37.0,0,0,0,0,0,0,01/05/2021,...,0,0,0,0,0,0,0,4,2,Mild


In [36]:
# Keep only the relevant columns
columns_to_keep = ['VAERS_ID', 'STATE', 'AGE_YRS', 'SEX', 'DIED', 'L_THREAT', 'HOSPITAL', 
                   'SYMPTOM_COUNT', 'TOTAL_SYMPTOMS', 'SYMPTOM_SEVERITY']

df_demo = df[columns_to_keep]

# Save the cleaned dataset for Tableau
#df_demo.to_csv('subset_df_symptoms.csv', index=False)


In [None]:
# Save the cleaned dataset for Tableau
#df_cleaned.to_csv('cleaned_dataset_for_tableau.csv', index=False)