In [67]:
#Cleaning VAERS VAX
import pandas as pd 

In [68]:
# Load the VAERS Data (Demographics and Event Information)
vaers_vax = pd.read_csv('VAERSVAX.csv', low_memory=False)

In [69]:
vaers_vax.head(10)


Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,902418,COVID19,PFIZER\BIONTECH,EH9899,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
1,902440,COVID19,PFIZER\BIONTECH,EH 9899,1,SYR,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
2,902446,COVID19,PFIZER\BIONTECH,EH9899,1,IM,RA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,902464,COVID19,PFIZER\BIONTECH,EH9899,UNK,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
4,902465,COVID19,PFIZER\BIONTECH,EH9899,1,IM,RA,COVID19 (COVID19 (PFIZER-BIONTECH))
5,902468,COVID19,PFIZER\BIONTECH,EH9899,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
6,902479,COVID19,PFIZER\BIONTECH,,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
7,902490,COVID19,PFIZER\BIONTECH,EH9899,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
8,902491,COVID19,PFIZER\BIONTECH,EH9899,1,IM,,COVID19 (COVID19 (PFIZER-BIONTECH))
9,902492,COVID19,PFIZER\BIONTECH,EH9899,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))


Cleaning Process

In [70]:
# Remove leading/trailing spaces from all columns
vaers_vax_cleaned = vaers_vax.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [71]:
# Standardize the 'VAX_LOT' column (Remove spaces, make uppercase)
vaers_vax_cleaned['VAX_LOT'] = vaers_vax_cleaned['VAX_LOT'].str.replace(" ", "").str.upper()

In [72]:
# Check the distribution of VAX_ROUTE
route_distribution = vaers_vax_cleaned['VAX_ROUTE'].value_counts()
print(route_distribution)


VAX_ROUTE
IM     499361
SYR    166927
OT     134185
UN       8064
SC       3008
ID        550
JET       166
IN         14
PO          7
Name: count, dtype: int64


In [73]:
# Group 'SYR' with 'IM', and everything else as 'Other'
vaers_vax_cleaned['VAX_ROUTE_CLEANED'] = vaers_vax_cleaned['VAX_ROUTE'].replace({
    'SYR': 'SYR',   # SYR as Syringe
    'IM': 'IM',    # IM as Intramuscular
    'ID': 'OT',    # Intradermal as 'Other'
    'JET': 'OT',   # Jet injection as 'Other'
    'IN': 'OT',    # Intranasal as 'Other'
    'PO': 'OT',    # Oral as 'Other'
    'OT': 'OT',    # Other as 'Other'
    'UN': 'OT',    # Unknown as 'Other'
    # Optionally, you can drop any rare or undefined routes later
})

# Drop any rows where the VAX_ROUTE_CLEANED is now set as 'OT' (optional)
vaers_vax_cleaned = vaers_vax_cleaned[vaers_vax_cleaned['VAX_ROUTE_CLEANED'] != 'OT']

# Verify changes
print(vaers_vax_cleaned['VAX_ROUTE_CLEANED'].value_counts())


VAX_ROUTE_CLEANED
IM     499361
SYR    166927
SC       3008
Name: count, dtype: int64


In [74]:
vaers_vax_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 930530 entries, 0 to 1073515
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   VAERS_ID           930530 non-null  int64 
 1   VAX_TYPE           930530 non-null  object
 2   VAX_MANU           930530 non-null  object
 3   VAX_LOT            677042 non-null  object
 4   VAX_DOSE_SERIES    925789 non-null  object
 5   VAX_ROUTE          669296 non-null  object
 6   VAX_SITE           707413 non-null  object
 7   VAX_NAME           930530 non-null  object
 8   VAX_ROUTE_CLEANED  669296 non-null  object
dtypes: int64(1), object(8)
memory usage: 71.0+ MB


In [75]:
# Check for null values in each column of vaers_vax
print(vaers_vax_cleaned.isnull().sum())


VAERS_ID                  0
VAX_TYPE                  0
VAX_MANU                  0
VAX_LOT              253488
VAX_DOSE_SERIES        4741
VAX_ROUTE            261234
VAX_SITE             223117
VAX_NAME                  0
VAX_ROUTE_CLEANED    261234
dtype: int64


In [76]:
# Drop rows where 'VAX_LOT', 'VAX_ROUTE', or 'VAX_SITE' are missing
vaers_vax_cleaned = vaers_vax.dropna(subset=['VAX_LOT', 'VAX_ROUTE', 'VAX_SITE'])

# Check the remaining rows
print(vaers_vax_cleaned.info())


<class 'pandas.core.frame.DataFrame'>
Index: 568142 entries, 0 to 1073508
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   VAERS_ID         568142 non-null  int64 
 1   VAX_TYPE         568142 non-null  object
 2   VAX_MANU         568142 non-null  object
 3   VAX_LOT          568142 non-null  object
 4   VAX_DOSE_SERIES  564127 non-null  object
 5   VAX_ROUTE        568142 non-null  object
 6   VAX_SITE         568142 non-null  object
 7   VAX_NAME         568142 non-null  object
dtypes: int64(1), object(7)
memory usage: 39.0+ MB
None


In [77]:
# Drop the original 'VAX_ROUTE' column
vaers_vax_cleaned.drop('VAX_ROUTE', axis=1, inplace=True)

# Rename 'VAX_ROUTE_CLEANED' to 'VAX_ROUTE'
vaers_vax_cleaned.rename(columns={'VAX_ROUTE_CLEANED': 'VAX_ROUTE'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vaers_vax_cleaned.drop('VAX_ROUTE', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vaers_vax_cleaned.rename(columns={'VAX_ROUTE_CLEANED': 'VAX_ROUTE'}, inplace=True)


In [78]:
vaers_vax_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 568142 entries, 0 to 1073508
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   VAERS_ID         568142 non-null  int64 
 1   VAX_TYPE         568142 non-null  object
 2   VAX_MANU         568142 non-null  object
 3   VAX_LOT          568142 non-null  object
 4   VAX_DOSE_SERIES  564127 non-null  object
 5   VAX_SITE         568142 non-null  object
 6   VAX_NAME         568142 non-null  object
dtypes: int64(1), object(6)
memory usage: 34.7+ MB


In [79]:
vaers_vax_cleaned.drop(columns=['VAX_SITE'])

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_NAME
0,902418,COVID19,PFIZER\BIONTECH,EH9899,1,COVID19 (COVID19 (PFIZER-BIONTECH))
1,902440,COVID19,PFIZER\BIONTECH,EH 9899,1,COVID19 (COVID19 (PFIZER-BIONTECH))
2,902446,COVID19,PFIZER\BIONTECH,EH9899,1,COVID19 (COVID19 (PFIZER-BIONTECH))
3,902464,COVID19,PFIZER\BIONTECH,EH9899,UNK,COVID19 (COVID19 (PFIZER-BIONTECH))
4,902465,COVID19,PFIZER\BIONTECH,EH9899,1,COVID19 (COVID19 (PFIZER-BIONTECH))
...,...,...,...,...,...,...
1073499,2776226,COVID19,PFIZER\BIONTECH,EN6202,1,COVID19 (COVID19 (PFIZER-BIONTECH))
1073500,2776226,COVID19,PFIZER\BIONTECH,EN6202,2,COVID19 (COVID19 (PFIZER-BIONTECH))
1073501,2776230,COVID19,MODERNA,AU5557B,2,COVID19 (COVID19 (MODERNA))
1073502,2776253,COVID19,PFIZER\BIONTECH,LE1811,1,COVID19 (COVID19 (PFIZER-BIONTECH))


In [80]:
# Define a function to clean the VAX_NAME column
def clean_vax_name(vax_name):
    # Remove unnecessary characters and extra spaces
    cleaned_name = vax_name.replace("COVID19 (COVID19 (", "").replace("))", "").strip()
    return cleaned_name

# Apply the cleaning function to the VAX_NAME column
vaers_vax_cleaned['VAX_NAME'] = vaers_vax_cleaned['VAX_NAME'].apply(clean_vax_name)

# Display the cleaned column
vaers_vax_cleaned[['VAX_NAME']].head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vaers_vax_cleaned['VAX_NAME'] = vaers_vax_cleaned['VAX_NAME'].apply(clean_vax_name)


Unnamed: 0,VAX_NAME
0,PFIZER-BIONTECH
1,PFIZER-BIONTECH
2,PFIZER-BIONTECH
3,PFIZER-BIONTECH
4,PFIZER-BIONTECH


In [81]:
vaers_vax_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 568142 entries, 0 to 1073508
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   VAERS_ID         568142 non-null  int64 
 1   VAX_TYPE         568142 non-null  object
 2   VAX_MANU         568142 non-null  object
 3   VAX_LOT          568142 non-null  object
 4   VAX_DOSE_SERIES  564127 non-null  object
 5   VAX_SITE         568142 non-null  object
 6   VAX_NAME         568142 non-null  object
dtypes: int64(1), object(6)
memory usage: 34.7+ MB


In [82]:
# Save the vaers_vax DataFrame to a CSV file
vaers_vax_cleaned.to_csv('vaers_vax_cleaned.csv', index=False)


In [83]:
#Load the VAERS Data (Demographics and Event Information)
vaers_data_cleaned = pd.read_csv('vaers_data_cleaned.csv', low_memory=False)

In [84]:
vaers_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 735849 entries, 0 to 735848
Data columns (total 17 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   VAERS_ID      735849 non-null  int64  
 1   RECVDATE      735849 non-null  object 
 2   STATE         735849 non-null  object 
 3   AGE_YRS       735849 non-null  float64
 4   SEX           735849 non-null  int64  
 5   SYMPTOM_TEXT  735849 non-null  object 
 6   DIED          735849 non-null  int64  
 7   DATEDIED      11475 non-null   object 
 8   L_THREAT      735849 non-null  int64  
 9   HOSPITAL      735849 non-null  int64  
 10  DISABLE       735849 non-null  int64  
 11  RECOVD        735849 non-null  int64  
 12  VAX_DATE      735849 non-null  object 
 13  ONSET_DATE    735849 non-null  object 
 14  PRIOR_VAX     735849 non-null  object 
 15  BIRTH_DEFECT  735849 non-null  int64  
 16  ALLERGIES     353899 non-null  object 
dtypes: float64(1), int64(8), object(8)
memory usage:

In [85]:
vaers_vax_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 568142 entries, 0 to 1073508
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   VAERS_ID         568142 non-null  int64 
 1   VAX_TYPE         568142 non-null  object
 2   VAX_MANU         568142 non-null  object
 3   VAX_LOT          568142 non-null  object
 4   VAX_DOSE_SERIES  564127 non-null  object
 5   VAX_SITE         568142 non-null  object
 6   VAX_NAME         568142 non-null  object
dtypes: int64(1), object(6)
memory usage: 34.7+ MB


In [86]:
#Merge the two dataframes
vaersdata_merged = pd.merge(vaers_data_cleaned, vaers_vax_cleaned, on='VAERS_ID', how='left')

In [87]:
# Merge the two datasets on 'VAERS_ID' and specify suffixes to avoid duplicate column names
merged_data = pd.merge(vaers_data_cleaned, vaers_vax, on='VAERS_ID', how='inner', suffixes=('_data', '_vax'))

# Check the first few rows of the merged dataset
print(merged_data.head())

   VAERS_ID    RECVDATE STATE  AGE_YRS  SEX  \
0    902418  2020-12-15    NJ     56.0    0   
1    902440  2020-12-15    AZ     35.0    0   
2    902446  2020-12-15    WV     55.0    0   
3    902464  2020-12-15    LA     42.0    1   
4    902465  2020-12-15    AR     60.0    0   

                                        SYMPTOM_TEXT  DIED DATEDIED  L_THREAT  \
0  Patient experienced mild numbness traveling fr...     0      NaN         0   
1                                       C/O Headache     0      NaN         0   
2  felt warm, hot and face and ears were red and ...     0      NaN         0   
3  within 15 minutes progressive light-headedness...     0      NaN         0   
4  Pt felt wave come over body @ 1218 starting in...     0      NaN         0   

   HOSPITAL  ...  PRIOR_VAX  BIRTH_DEFECT  \
0         0  ...    Unknown             0   
1         0  ...    Unknown             0   
2         0  ...    Unknown             0   
3         0  ...    Unknown             0   
4    

In [88]:
#csv
vaersdata_merged.to_csv('vaersdata_merged.csv', index=False)

In [89]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 782639 entries, 0 to 782638
Data columns (total 24 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   VAERS_ID         782639 non-null  int64  
 1   RECVDATE         782639 non-null  object 
 2   STATE            782639 non-null  object 
 3   AGE_YRS          782639 non-null  float64
 4   SEX              782639 non-null  int64  
 5   SYMPTOM_TEXT     782639 non-null  object 
 6   DIED             782639 non-null  int64  
 7   DATEDIED         14398 non-null   object 
 8   L_THREAT         782639 non-null  int64  
 9   HOSPITAL         782639 non-null  int64  
 10  DISABLE          782639 non-null  int64  
 11  RECOVD           782639 non-null  int64  
 12  VAX_DATE         782639 non-null  object 
 13  ONSET_DATE       782639 non-null  object 
 14  PRIOR_VAX        782639 non-null  object 
 15  BIRTH_DEFECT     782639 non-null  int64  
 16  ALLERGIES        379528 non-null  obje

In [90]:
# Check unique VAERS_ID counts
print(f"Unique VAERS_IDs in vaers_data_cleaned: {vaers_data_cleaned['VAERS_ID'].nunique()}")
print(f"Unique VAERS_IDs in vaers_vax: {vaers_vax['VAERS_ID'].nunique()}")


Unique VAERS_IDs in vaers_data_cleaned: 735849
Unique VAERS_IDs in vaers_vax: 1012894


In [91]:
# Check the intersection of VAERS_IDs
common_ids = set(vaers_data_cleaned['VAERS_ID']).intersection(set(vaers_vax['VAERS_ID']))
print(f"Number of common VAERS_IDs: {len(common_ids)}")


Number of common VAERS_IDs: 735849


In [92]:
vaers_data_cleaned.shape

(735849, 17)

In [93]:
vaers_vax.shape

(1073516, 8)

In [94]:
# Perform an inner join to keep only the common records in both datasets
merged_data = pd.merge(vaers_data_cleaned, vaers_vax, on='VAERS_ID', how='inner')

# Check the shape and first few rows of the merged data
merged_data.shape
merged_data.head()


Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,SEX,SYMPTOM_TEXT,DIED,DATEDIED,L_THREAT,HOSPITAL,...,PRIOR_VAX,BIRTH_DEFECT,ALLERGIES,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,902418,2020-12-15,NJ,56.0,0,Patient experienced mild numbness traveling fr...,0,,0,0,...,Unknown,0,none,COVID19,PFIZER\BIONTECH,EH9899,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
1,902440,2020-12-15,AZ,35.0,0,C/O Headache,0,,0,0,...,Unknown,0,,COVID19,PFIZER\BIONTECH,EH 9899,1,SYR,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
2,902446,2020-12-15,WV,55.0,0,"felt warm, hot and face and ears were red and ...",0,,0,0,...,Unknown,0,"Contrast Dye IV contrast, shellfish, strawberry",COVID19,PFIZER\BIONTECH,EH9899,1,IM,RA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,902464,2020-12-15,LA,42.0,1,within 15 minutes progressive light-headedness...,0,,0,0,...,Unknown,0,none,COVID19,PFIZER\BIONTECH,EH9899,UNK,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
4,902465,2020-12-15,AR,60.0,0,Pt felt wave come over body @ 1218 starting in...,0,,0,0,...,Unknown,0,Biaxin,COVID19,PFIZER\BIONTECH,EH9899,1,IM,RA,COVID19 (COVID19 (PFIZER-BIONTECH))


In [95]:
merged_data.shape

(782639, 24)

In [96]:
# Convert 'ALLERGIES' column to binary 'HAS_ALLERGIES'
merged_data['HAS_ALLERGIES'] = merged_data['ALLERGIES'].notna().astype(int)

# Drop the original 'ALLERGIES' column
merged_data.drop('ALLERGIES', axis=1, inplace=True)

# Check the changes
print(merged_data[['VAERS_ID', 'HAS_ALLERGIES']].head())


   VAERS_ID  HAS_ALLERGIES
0    902418              1
1    902440              0
2    902446              1
3    902464              1
4    902465              1
