In [1]:
#Libraries
import pandas as pd
from functions import query_to_df

# Set Pandas display options to show all rows and columns
# pd.set_option("display.max_rows", None)  # None means no limit
# pd.set_option("display.max_columns", None)  # None means no limit

In [None]:
sql = """
SELECT 
    e.*,
    v.VAX_LOT, v.VAX_MANU, v.VAX_DOSE_SERIES,
    s.SYMPTOM1, s.SYMPTOM2, s.SYMPTOM3, s.SYMPTOM4, s.SYMPTOM5
FROM event_data AS e
LEFT JOIN sample_vax_data AS v
ON e.VAERS_ID = v.VAERS_ID
LEFT JOIN sample_sympt_data AS s
ON e.VAERS_ID = s.VAERS_ID
"""
df = query_to_df(sql)
df.head()

In [None]:
#df.to_csv("raw_sql_data.csv")

In [25]:
df = pd.read_csv("raw_sql_data.csv", index_col=False)
df

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,BIRTH_DEFECT,VAX_LOT,VAX_MANU,VAX_DOSE_SERIES,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
0,902418,NJ,56.0,0,0,0,0,0,1,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,hypoaesthesia,injection site hypoaesthesia,,,
1,902440,AZ,35.0,0,0,0,0,0,1,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,headache,,,,
2,902446,WV,55.0,0,0,0,0,0,1,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,erythema,feeling hot,flushing,,
3,902464,LA,42.0,1,0,0,0,0,1,12/15/2020,0,,,,dizziness,electrocardiogram normal,hyperhidrosis,laboratory test normal,presyncope
4,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,sensory disturbance,tremor,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25959,931039,IN,44.0,0,0,0,0,0,1,01/07/2021,0,,,,pyrexia,,,,
25960,931039,IN,44.0,0,0,0,0,0,1,01/07/2021,0,,,,headache,injection site hypersensitivity,injection site pain,lethargy,pain
25961,931048,IN,65.0,1,0,0,0,0,0,01/07/2021,0,,,,,,,,
25962,931061,ME,61.0,0,0,0,0,0,1,12/29/2020,0,,,,,,,,


In [26]:
# Drop rows where 'VAX_LOT' or 'VAX_MANU' have null values
df = df.dropna(subset=['VAX_MANU'])

In [27]:
df

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,BIRTH_DEFECT,VAX_LOT,VAX_MANU,VAX_DOSE_SERIES,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
0,902418,NJ,56.0,0,0,0,0,0,1,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,hypoaesthesia,injection site hypoaesthesia,,,
1,902440,AZ,35.0,0,0,0,0,0,1,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,headache,,,,
2,902446,WV,55.0,0,0,0,0,0,1,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,erythema,feeling hot,flushing,,
4,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,sensory disturbance,tremor,,,
5,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,dysgeusia,oral pruritus,paraesthesia,paraesthesia oral,parosmia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25941,930882,CA,33.0,0,0,0,0,0,0,12/31/2020,0,039K20A,MODERNA,1,erythema,pruritus,rash macular,skin warm,
25952,930989,MD,26.0,0,0,0,0,0,0,01/08/2021,0,012L20A,MODERNA,1,presyncope,,,,
25953,930989,MD,26.0,0,0,0,0,0,0,01/08/2021,0,012L20A,MODERNA,1,muscle twitching,musculoskeletal disorder,myalgia,nausea,paraesthesia
25954,930989,MD,26.0,0,0,0,0,0,0,01/08/2021,0,012L20A,MODERNA,1,dysstasia,fine motor skill dysfunction,headache,hyperventilation,malaise


In [11]:
# Replace empty symptom fields with "None"
symptom_columns = ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']
df[symptom_columns] = df[symptom_columns].replace('', 'None')

In [12]:
# Combine SYMPTOM1 to SYMPTOM5 into a single column
df['ALL_SYMPTOMS'] = df[symptom_columns].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)


In [13]:
# Apply One-Hot Encoding on the combined symptoms
symptom_dummies = df['ALL_SYMPTOMS'].str.get_dummies(sep=',')

# Merge the one-hot encoded symptoms back into the original dataframe
df = pd.concat([df, symptom_dummies], axis=1)

In [14]:
# Display all column names
list(df.columns)

['VAERS_ID',
 'STATE',
 'AGE_YRS',
 'SEX',
 'DIED',
 'L_THREAT',
 'HOSPITAL',
 'DISABLE',
 'RECOVD',
 'VAX_DATE',
 'BIRTH_DEFECT',
 'VAX_LOT',
 'VAX_MANU',
 'VAX_DOSE_SERIES',
 'SYMPTOM1',
 'SYMPTOM2',
 'SYMPTOM3',
 'SYMPTOM4',
 'SYMPTOM5',
 'ALL_SYMPTOMS',
 ' visual',
 'abdominal discomfort',
 'abdominal distension',
 'abdominal pain',
 'abdominal pain lower',
 'abdominal pain upper',
 'abnormal behaviour',
 'abnormal dreams',
 'abnormal sensation in eye',
 'abortion spontaneous',
 'abscess',
 'acoustic stimulation tests abnormal',
 'activated partial thromboplastin time',
 'activated partial thromboplastin time shortened',
 'acute coronary syndrome',
 'acute kidney injury',
 'acute myocardial infarction',
 'acute respiratory failure',
 'administration site swelling',
 'adnexa uteri mass',
 'adult failure to thrive',
 'adverse drug reaction',
 'adverse event',
 'adverse reaction',
 'affect lability',
 'ageusia',
 'aggression',
 'agitation',
 'alanine aminotransferase increased',
 'ala

In [19]:
df

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,...,wrong technique in product usage process,x-ray,x-ray abnormal,x-ray dental normal,x-ray limb,x-ray limb normal,x-ray normal,x-ray with contrast,yawning,yellow skin
0,902418,NJ,56.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
1,902440,AZ,35.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
2,902446,WV,55.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
4,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
5,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25941,930882,CA,33.0,0,0,0,0,0,0,12/31/2020,...,0,0,0,0,0,0,0,0,0,0
25952,930989,MD,26.0,0,0,0,0,0,0,01/08/2021,...,0,0,0,0,0,0,0,0,0,0
25953,930989,MD,26.0,0,0,0,0,0,0,01/08/2021,...,0,0,0,0,0,0,0,0,0,0
25954,930989,MD,26.0,0,0,0,0,0,0,01/08/2021,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# 2035 cols 
# # Select the symptom columns
sympt_cols_df = df.iloc[:, -10:]
# Calculate the occurrences of each symptom
symptom_counts = sympt_cols_df.sum(axis=0)

TypeError: can only concatenate str (not "int") to str

In [9]:
symptom_counts

 insomnia type            1
 visual                   2
abdominal adhesions       1
abdominal discomfort    157
abdominal distension     28
                       ... 
x-ray limb normal         4
x-ray normal             11
x-ray with contrast       1
yawning                   2
yellow skin               1
Length: 2035, dtype: int64

In [10]:
# Create a DataFrame to display the column names and their counts
symptom_table = pd.DataFrame({
    "Symptom": symptom_counts.index,
    "Occurrences": symptom_counts.values
})
# Sort the table by occurrences in descending order
symptom_table_sorted = symptom_table.sort_values(by="Occurrences", ascending=False)
# Display the sorted table
display(symptom_table_sorted)

Unnamed: 0,Symptom,Occurrences
823,headache,4863
389,chills,3549
690,fatigue,3423
1545,pyrexia,3390
1376,pain,3250
...,...,...
958,inflammatory marker test,1
967,infusion site joint movement impairment,1
968,infusion site warmth,1
971,injection related reaction,1


In [11]:
#Filter symptoms with more than 200 occurrences
symptoms_to_keep = symptom_table_sorted[symptom_table_sorted['Occurrences'] > 200]['Symptom'].tolist()
len(symptoms_to_keep)

79

In [12]:
# Get the original columns
original_cols = df.columns[:14].tolist()
original_cols

['VAERS_ID',
 'STATE',
 'AGE_YRS',
 'SEX',
 'DIED',
 'L_THREAT',
 'HOSPITAL',
 'DISABLE',
 'RECOVD',
 'VAX_DATE',
 'BIRTH_DEFECT',
 'VAX_LOT',
 'VAX_MANU',
 'VAX_DOSE_SERIES']

In [13]:
columns_to_keep = original_cols + symptoms_to_keep

In [14]:
df_processed = df[columns_to_keep]

In [15]:
df_processed

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,...,dysphagia,mobility decreased,injection site reaction,lip swelling,sleep disorder,hot flush,abdominal pain,feeling cold,swollen tongue,lymph node pain
0,902418,NJ,56.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
1,902440,AZ,35.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
2,902446,WV,55.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
3,902464,LA,42.0,1,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
4,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25959,931039,IN,44.0,0,0,0,0,0,1,01/07/2021,...,0,0,0,0,0,0,0,0,0,0
25960,931039,IN,44.0,0,0,0,0,0,1,01/07/2021,...,0,0,0,0,0,0,0,0,0,0
25961,931048,IN,65.0,1,0,0,0,0,0,01/07/2021,...,0,0,0,0,0,0,0,0,0,0
25962,931061,ME,61.0,0,0,0,0,0,1,12/29/2020,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Original columns without VAERS_ID
original_cols_without_id = [col for col in original_cols if col != 'VAERS_ID']
original_cols_without_id

['STATE',
 'AGE_YRS',
 'SEX',
 'DIED',
 'L_THREAT',
 'HOSPITAL',
 'DISABLE',
 'RECOVD',
 'VAX_DATE',
 'BIRTH_DEFECT',
 'VAX_LOT',
 'VAX_MANU',
 'VAX_DOSE_SERIES']

In [18]:
# Define the aggregation dictionary
agg_dict = {col: "first" for col in original_cols_without_id}  # Max for all symptom columns
agg_dict.update({col: "max" for col in symptoms_to_keep})  # First for all other columns
# Group by ‘vars ID’ and apply the aggregation
df_merged = df_processed.groupby("VAERS_ID").agg(agg_dict).reset_index()
# Display the result
df_merged

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,...,dysphagia,mobility decreased,injection site reaction,lip swelling,sleep disorder,hot flush,abdominal pain,feeling cold,swollen tongue,lymph node pain
0,902418,NJ,56.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
1,902440,AZ,35.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
2,902446,WV,55.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
3,902464,LA,42.0,1,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
4,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19571,1279599,CA,43.0,1,0,0,0,0,1,01/05/2021,...,0,0,0,0,0,0,0,0,0,0
19572,1279600,CA,27.0,0,0,0,0,0,0,01/05/2021,...,0,0,0,0,0,0,0,0,0,0
19573,2019715,NY,35.0,0,0,0,1,1,0,08/06/2021,...,0,0,0,0,0,0,0,0,0,0
19574,2427491,VA,37.0,0,0,0,0,0,0,01/05/2021,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df_merged

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,...,dysphagia,mobility decreased,injection site reaction,lip swelling,sleep disorder,hot flush,abdominal pain,feeling cold,swollen tongue,lymph node pain
0,902418,NJ,56.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
1,902440,AZ,35.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
2,902446,WV,55.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
3,902464,LA,42.0,1,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
4,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19571,1279599,CA,43.0,1,0,0,0,0,1,01/05/2021,...,0,0,0,0,0,0,0,0,0,0
19572,1279600,CA,27.0,0,0,0,0,0,0,01/05/2021,...,0,0,0,0,0,0,0,0,0,0
19573,2019715,NY,35.0,0,0,0,1,1,0,08/06/2021,...,0,0,0,0,0,0,0,0,0,0
19574,2427491,VA,37.0,0,0,0,0,0,0,01/05/2021,...,0,0,0,0,0,0,0,0,0,0
