In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../Data/Clean/full_dataset.csv', low_memory=False)

In [3]:
# Normalize the column names by removing leading/trailing spaces and converting to uppercase
df.columns = df.columns.str.strip().str.upper()

# Select columns from index 14 onwards for renaming
columns_to_rename = df.columns[14:]  # This selects columns from index 14 onwards

# Define a dictionary to rename these selected columns for better visual representation
rename_columns = {col: col.replace('_', ' ').title() for col in columns_to_rename}

# Apply the new column names only for the selected columns
df.rename(columns=rename_columns, inplace=True)

# Display the first few rows to verify the changes
df.head()


Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,...,Dysphagia,Mobility Decreased,Injection Site Reaction,Lip Swelling,Sleep Disorder,Hot Flush,Abdominal Pain,Feeling Cold,Swollen Tongue,Lymph Node Pain
0,902418,NJ,56.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
1,902440,AZ,35.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
2,902446,WV,55.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
3,902464,LA,42.0,1,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,0
4,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Create symptom_count (Total Symptoms Reported)

# Symptom columns
symptom_cols = df.columns[14:]  # All columns from index 14 onward seem to represent symptoms

# Create SYMPTOM_COUNT by summing up all symptom columns
df['SYMPTOM_COUNT'] = df[symptom_cols].sum(axis=1)

In [5]:
df

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,...,Mobility Decreased,Injection Site Reaction,Lip Swelling,Sleep Disorder,Hot Flush,Abdominal Pain,Feeling Cold,Swollen Tongue,Lymph Node Pain,SYMPTOM_COUNT
0,902418,NJ,56.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,2
1,902440,AZ,35.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,2
2,902446,WV,55.0,0,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,4
3,902464,LA,42.0,1,0,0,0,0,1,12/15/2020,...,0,0,0,0,0,0,0,0,0,2
4,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19540,1279599,CA,43.0,1,0,0,0,0,1,01/05/2021,...,0,0,0,0,0,0,0,0,0,3
19541,1279600,CA,27.0,0,0,0,0,0,0,01/05/2021,...,0,0,0,0,0,0,0,0,0,5
19542,2019715,NY,35.0,0,0,0,1,1,0,08/06/2021,...,0,0,0,0,0,0,0,0,0,1
19543,2427491,VA,37.0,0,0,0,0,0,0,01/05/2021,...,0,0,0,0,0,0,0,0,0,2


In [6]:
agg_df = df.groupby('SEX')[symptom_cols].sum()
agg_df

Unnamed: 0_level_0,None,Headache,Chills,Fatigue,Pyrexia,Pain,Nausea,Dizziness,Injection Site Pain,Pain In Extremity,...,Dysphagia,Mobility Decreased,Injection Site Reaction,Lip Swelling,Sleep Disorder,Hot Flush,Abdominal Pain,Feeling Cold,Swollen Tongue,Lymph Node Pain
SEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,13611,3990,2755,2708,2567,2655,2532,2315,1942,1754,...,191,184,192,195,163,176,163,169,178,187
1,3111,871,793,713,821,593,415,437,421,285,...,27,30,21,14,45,30,43,35,26,15


In [None]:
# Export to CSV
agg_df.to_csv('../Data/Clean/symptom_count_by_sex.csv')

In [11]:
# Convert 'VAX_DOSE_SERIES' to string to handle non-numeric values like '7+'
df['VAX_DOSE_SERIES'] = df['VAX_DOSE_SERIES'].astype(str)

# Filter for single dose ('1')
df_single_dose = df[df['VAX_DOSE_SERIES'] == '1']
df_multi_dose = df[df['VAX_DOSE_SERIES'] !='1']


In [14]:
# Assuming 'df' is your DataFrame, and 'symptom_columns' contains the one-hot encoded symptom columns

# Create a new column 'Symptoms' that lists all symptoms present for each row
df['Symptoms'] = df.apply(lambda row: ', '.join([col for col in symptom_columns if row[col] == 1]), axis=1)

# Now you can drop the one-hot encoded symptom columns if you don't need them anymore
df_flattened = df.drop(columns=symptom_columns)


In [15]:
df_flattened

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,BIRTH_DEFECT,VAX_LOT,VAX_MANU,VAX_DOSE_SERIES,None,SYMPTOM_COUNT,Symptoms
0,902418,NJ,56.0,0,0,0,0,0,1,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,1,2,Hypoaesthesia
1,902440,AZ,35.0,0,0,0,0,0,1,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,1,2,Headache
2,902446,WV,55.0,0,0,0,0,0,1,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,1,4,"Erythema, Flushing, Feeling Hot"
3,902464,LA,42.0,1,0,0,0,0,1,12/15/2020,0,,,,0,2,"Dizziness, Hyperhidrosis"
4,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,1,5,"Paraesthesia, Paraesthesia Oral, Tremor, Dysge..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19540,1279599,CA,43.0,1,0,0,0,0,1,01/05/2021,0,,,,1,3,"Pruritus, Paraesthesia"
19541,1279600,CA,27.0,0,0,0,0,0,0,01/05/2021,0,EK5730,PFIZER\BIONTECH,1,0,5,"Headache, Chills, Pain, Myalgia, Lymphadenopathy"
19542,2019715,NY,35.0,0,0,0,1,1,0,08/06/2021,0,,,,1,1,
19543,2427491,VA,37.0,0,0,0,0,0,0,01/05/2021,0,,,,1,2,Pyrexia


In [30]:
df_tableau = df_flattened.dropna()

In [34]:
df_tab = df_tableau.drop(columns=['None'])

In [35]:
df_tab

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,BIRTH_DEFECT,VAX_LOT,VAX_MANU,VAX_DOSE_SERIES,SYMPTOM_COUNT,Symptoms
0,902418,NJ,56.0,0,0,0,0,0,1,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,2,Hypoaesthesia
1,902440,AZ,35.0,0,0,0,0,0,1,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,2,Headache
2,902446,WV,55.0,0,0,0,0,0,1,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,4,"Erythema, Flushing, Feeling Hot"
4,902465,AR,60.0,0,0,0,0,0,0,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,5,"Paraesthesia, Paraesthesia Oral, Tremor, Dysge..."
6,902490,NM,37.0,0,0,0,0,0,0,12/15/2020,0,EH9899,PFIZER\BIONTECH,1,3,"Headache, Pain"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19525,931025,TN,30.0,0,0,0,0,0,0,12/30/2020,0,025L20A,MODERNA,1,4,"Injection Site Erythema, Injection Site Swelli..."
19526,931026,OH,34.0,0,0,0,0,0,1,01/08/2021,0,026L20A,MODERNA,1,6,"Headache, Chills, Nausea, Pain In Extremity, P..."
19530,931031,TX,44.0,0,0,0,0,0,0,12/18/2020,0,EK5730,PFIZER\BIONTECH,1,7,"Pyrexia, Pain, Nausea, Diarrhoea, Cough, Chest..."
19538,932378,VA,59.0,0,0,0,0,0,0,12/24/2020,0,039K20A,MODERNA,1,5,"Headache, Myalgia, Hypoaesthesia, Paraesthesia..."


In [36]:
#Save the cleaned dataframe to a CSV file
#df_tab.to_csv('../Data/Clean/tableau_subframe.csv')