# Data_Cleaning


In [31]:
#Libraries
import re
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

%matplotlib inline

In [32]:
# Read the raw VAERS report table (demographics and event information).
# low_memory=False asks pandas to infer column types from the whole file
# rather than chunk by chunk.
raw_vaers_path = '../Data/VAERSDATA.csv'
vaers_data = pd.read_csv(filepath_or_buffer=raw_vaers_path, low_memory=False)

In [33]:
# Preview the first rows of the raw table to sanity-check the load.
vaers_data.head()

Unnamed: 0,VAERS_ID,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,...,CUR_ILL,HISTORY,PRIOR_VAX,SPLTTYPE,FORM_VERS,TODAYS_DATE,BIRTH_DEFECT,OFC_VISIT,ER_ED_VISIT,ALLERGIES
0,902418,12/15/2020,NJ,56.0,56.0,,F,,Patient experienced mild numbness traveling fr...,,...,none,none,,,2,12/15/2020,,,,none
1,902440,12/15/2020,AZ,35.0,35.0,,F,,C/O Headache,,...,,,,,2,12/15/2020,,,,
2,902446,12/15/2020,WV,55.0,55.0,,F,,"felt warm, hot and face and ears were red and ...",,...,none,"Hypertension, sleep apnea, hypothyroidism",,,2,12/15/2020,,,,"Contrast Dye IV contrast, shellfish, strawberry"
3,902464,12/15/2020,LA,42.0,42.0,,M,,within 15 minutes progressive light-headedness...,,...,none,none,,,2,12/15/2020,,,Y,none
4,902465,12/15/2020,AR,60.0,60.0,,F,,Pt felt wave come over body @ 1218 starting in...,,...,"Bronchitis, finished prednisone on 12-13-20","hypertension, fibromyalgia",,,2,12/15/2020,,,,Biaxin


In [34]:
# Summarize column dtypes and non-null counts of the raw table.
vaers_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1012894 entries, 0 to 1012893
Data columns (total 35 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   VAERS_ID      1012894 non-null  int64  
 1   RECVDATE      1012894 non-null  object 
 2   STATE         842425 non-null   object 
 3   AGE_YRS       909608 non-null   float64
 4   CAGE_YR       809314 non-null   float64
 5   CAGE_MO       5374 non-null     float64
 6   SEX           1012894 non-null  object 
 7   RPT_DATE      1130 non-null     object 
 8   SYMPTOM_TEXT  1011423 non-null  object 
 9   DIED          18951 non-null    object 
 10  DATEDIED      16828 non-null    object 
 11  L_THREAT      15197 non-null    object 
 12  ER_VISIT      144 non-null      object 
 13  HOSPITAL      90081 non-null    object 
 14  HOSPDAYS      53040 non-null    float64
 15  X_STAY        505 non-null      object 
 16  DISABLE       18274 non-null    object 
 17  RECOVD        882224 non-nu

In [35]:
# List every column name so the ones to drop can be chosen below.
vaers_data.columns

Index(['VAERS_ID', 'RECVDATE', 'STATE', 'AGE_YRS', 'CAGE_YR', 'CAGE_MO', 'SEX',
       'RPT_DATE', 'SYMPTOM_TEXT', 'DIED', 'DATEDIED', 'L_THREAT', 'ER_VISIT',
       'HOSPITAL', 'HOSPDAYS', 'X_STAY', 'DISABLE', 'RECOVD', 'VAX_DATE',
       'ONSET_DATE', 'NUMDAYS', 'LAB_DATA', 'V_ADMINBY', 'V_FUNDBY',
       'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'SPLTTYPE',
       'FORM_VERS', 'TODAYS_DATE', 'BIRTH_DEFECT', 'OFC_VISIT', 'ER_ED_VISIT',
       'ALLERGIES'],
      dtype='object')

In [37]:
# Columns judged unnecessary for the analysis; every other column is kept.
columns_to_drop = [
    'RECVDATE', 'CAGE_YR', 'CAGE_MO', 'RPT_DATE', 'DATEDIED', 'ER_VISIT',
    'HOSPDAYS', 'X_STAY', 'ONSET_DATE', 'NUMDAYS', 'LAB_DATA', 'V_ADMINBY',
    'V_FUNDBY', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'SPLTTYPE',
    'OFC_VISIT', 'ER_ED_VISIT', 'ALLERGIES', 'FORM_VERS', 'TODAYS_DATE',
]

# Build the working frame from the raw one; vaers_data itself is left untouched.
data_df = vaers_data.drop(columns_to_drop, axis='columns')

# Report the resulting shape as a quick check on the column count.
print("Shape of VAERS DataFrame after dropping irrelevant columns:", data_df.shape)

Shape of VAERS DataFrame after dropping irrelevant columns: (1012894, 12)


In [None]:
# Preview the first rows of the reduced frame.
data_df.head()

In [38]:
# Tally the missing entries per column across the whole frame.
null_values = data_df.isna().sum()

# Print the heading and the per-column counts on separate lines.
print("Count of null values in each column:", null_values, sep="\n")


Count of null values in each column:
VAERS_ID              0
STATE            170469
AGE_YRS          103286
SEX                   0
SYMPTOM_TEXT       1471
DIED             993943
L_THREAT         997697
HOSPITAL         922813
DISABLE          994620
RECOVD           130670
VAX_DATE          73924
BIRTH_DEFECT    1012281
dtype: int64


In [39]:
# Identifier and demographic fields that every retained report must have.
required_fields = ['VAERS_ID', 'STATE', 'AGE_YRS', 'SEX']

# Discard any row that is missing at least one of the required fields.
data_df = data_df.dropna(axis=0, how='any', subset=required_fields)

# Report the new shape to show how many rows survived.
print("Shape of VAERS DataFrame after dropping rows with nulls in critical columns:", data_df.shape)

# Confirm that none of the required fields still contains a missing value.
print("\nRemaining missing values in critical columns:")
print(data_df[required_fields].isna().sum())

Shape of VAERS DataFrame after dropping rows with nulls in critical columns: (787806, 12)

Remaining missing values in critical columns:
VAERS_ID    0
STATE       0
AGE_YRS     0
SEX         0
dtype: int64


In [40]:
# Two-letter abbreviations of the 50 U.S. states, as a list.
valid_states = (
    'AL AK AZ AR CA CO CT DE FL GA HI ID IL IN IA KS '
    'KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY '
    'NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV '
    'WI WY'
).split()

# Keep only the reports whose STATE is one of those abbreviations.
is_valid_state = data_df['STATE'].isin(valid_states)
data_df = data_df[is_valid_state]

In [41]:
# Exclude reports whose SEX is recorded as 'U' (Unknown).
sex_is_known = data_df['SEX'].ne('U')
data_df = data_df.loc[sex_is_known]

# Report the shape after the exclusion.
print("Shape of VAERS DataFrame after removing 'Unknown' sex entries:", data_df.shape)

Shape of VAERS DataFrame after removing 'Unknown' sex entries: (772636, 12)


In [42]:
# Count each distinct SEX value (missing values included) to confirm that
# no 'U' entries remain after the filter above.
sex_value_counts = data_df['SEX'].value_counts(dropna=False)
print("SEX column value counts:", sex_value_counts, sep="\n")


SEX column value counts:
SEX
F    522793
M    249843
Name: count, dtype: int64


In [43]:
# Encode DIED as a 0/1 flag: 1 where the report says 'Y', 0 for anything else
# (including missing values, since NaN never compares equal to 'Y').
# The comparison is vectorized instead of calling a Python lambda per row, and
# .assign() returns a new frame rather than writing into a frame that came out
# of boolean filtering (which can raise SettingWithCopyWarning).
data_df = data_df.assign(DIED=data_df['DIED'].eq('Y').astype('int64'))

# Check the result of the transformation
print("Value counts for the DIED column after transformation:")
print(data_df['DIED'].value_counts())


Value counts for the DIED column after transformation:
DIED
0    760514
1     12122
Name: count, dtype: int64


In [44]:
# Encode the HOSPITAL, DISABLE and BIRTH_DEFECT columns as 0/1 flags:
# 1 where the value is 'Y', 0 for anything else (including missing values).
# One vectorized comparison per column replaces three copy-pasted row-wise
# lambdas; .assign() returns a new frame and keeps each column in its
# original position.
outcome_flag_columns = ['HOSPITAL', 'DISABLE', 'BIRTH_DEFECT']
data_df = data_df.assign(
    **{
        column: data_df[column].eq('Y').astype('int64')
        for column in outcome_flag_columns
    }
)

In [45]:
# Inspect dtypes and non-null counts after the flag columns were converted.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 772636 entries, 0 to 1012886
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   VAERS_ID      772636 non-null  int64  
 1   STATE         772636 non-null  object 
 2   AGE_YRS       772636 non-null  float64
 3   SEX           772636 non-null  object 
 4   SYMPTOM_TEXT  771639 non-null  object 
 5   DIED          772636 non-null  int64  
 6   L_THREAT      13776 non-null   object 
 7   HOSPITAL      772636 non-null  int64  
 8   DISABLE       772636 non-null  int64  
 9   RECOVD        695682 non-null  object 
 10  VAX_DATE      760654 non-null  object 
 11  BIRTH_DEFECT  772636 non-null  int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 76.6+ MB


In [46]:
# Keep only rows that have a STATE value.
# Rows with a missing STATE were already removed by earlier cells, so this
# acts as a safeguard rather than a new filter.
data_df = data_df[data_df['STATE'].notna()]


In [47]:
# Fields that must be present for a VAERS report to stay in the data set.
critical_columns_vaers = ['AGE_YRS', 'SEX', 'STATE', 'SYMPTOM_TEXT', 'DIED', 'VAX_DATE']

# Remove every row that lacks a value in any of those fields.
data_df = data_df.dropna(axis=0, how='any', subset=critical_columns_vaers)

# The same treatment is meant to be applied to the other datasets
# (symptoms and vaccination data).

# Confirm the identifier and demographic fields have no missing values left.
id_and_demographics = ['VAERS_ID', 'STATE', 'AGE_YRS', 'SEX']
print(data_df[id_and_demographics].isna().sum())


VAERS_ID    0
STATE       0
AGE_YRS     0
SEX         0
dtype: int64


In [48]:
# Show the (rows, columns) count after removing rows with missing critical fields.
data_df.shape

(759664, 12)

In [49]:
# Count the rows that still have no SYMPTOM_TEXT.
missing_symptom_text = data_df['SYMPTOM_TEXT'].isna().sum()
print(missing_symptom_text)


0


In [None]:
# Preview the first rows of the frame before encoding SEX.
data_df.head()

In [50]:
# Add SEX_BINARY: 1 for Male ('M'), 0 for every other value.
# A vectorized comparison replaces the per-row Python lambda, and .assign()
# returns a new frame with the column appended instead of writing into the
# existing (previously filtered) frame.
data_df = data_df.assign(SEX_BINARY=data_df['SEX'].eq('M').astype('int64'))

# Verify the result
print(data_df[['SEX', 'SEX_BINARY']].head())

  SEX  SEX_BINARY
0   F           0
1   F           0
2   F           0
3   M           1
4   F           0


In [51]:
# Check unique values in SEX to confirm the actual values
print(data_df['SEX'].unique())

# Recompute SEX_BINARY (already created in an earlier cell) with an explicit
# mapping: 'M' -> 1, 'F' -> 0. Any value outside the mapping becomes NaN
# rather than being silently counted as female; if that ever happens the
# column is float instead of int.
data_df = data_df.assign(SEX_BINARY=data_df['SEX'].map({'M': 1, 'F': 0}))

# Verify the result
print(data_df[['SEX', 'SEX_BINARY']].head())

['F' 'M']
  SEX  SEX_BINARY
0   F           0
1   F           0
2   F           0
3   M           1
4   F           0


In [52]:
# Confirm the distribution of 'SEX' values (should only be 'M' and 'F')
print(data_df['SEX'].value_counts())

# Recompute SEX_BINARY (already created in earlier cells): 1 for Male ('M'),
# 0 for every other value. Vectorized comparison instead of a per-row lambda;
# .assign() returns a new frame rather than mutating the existing one.
data_df = data_df.assign(SEX_BINARY=data_df['SEX'].eq('M').astype('int64'))

# Verify the result
print(data_df[['SEX', 'SEX_BINARY']].head())

# Check value counts for the new SEX_BINARY column to confirm the transformation
print(data_df['SEX_BINARY'].value_counts())


SEX
F    514900
M    244764
Name: count, dtype: int64
  SEX  SEX_BINARY
0   F           0
1   F           0
2   F           0
3   M           1
4   F           0
SEX_BINARY
0    514900
1    244764
Name: count, dtype: int64


In [None]:
# Overwrite SEX with its 0/1 encoding, then discard the helper column.
# NOTE(review): this cell has no execution count, and the saved outputs further
# down still show a string SEX column plus SEX_BINARY - confirm whether it is
# meant to run before the export.
# This cell is not re-runnable: SEX_BINARY no longer exists after the first run.
data_df = (
    data_df
    .assign(SEX=data_df['SEX_BINARY'])
    .drop(columns=['SEX_BINARY'])
)

# Show the first rows to check the replacement.
print(data_df.head())


In [None]:
# Keep reports whose AGE_YRS lies in the closed interval [0.5, 100];
# anything outside that range is treated as an outlier.
# NOTE(review): this cell has no execution count - confirm it is meant to run
# before the export.
age_in_range = data_df['AGE_YRS'].between(0.5, 100)
data_df = data_df[age_in_range]

# Report the youngest and oldest remaining ages.
remaining_ages = data_df['AGE_YRS']
print("Min and Max Age after filtering:", remaining_ages.min(), "-", remaining_ages.max())

In [53]:
# Encode RECOVD and L_THREAT as 0/1 flags: 1 where the value is 'Y', 0 for
# anything else (including NaN).
# NOTE(review): every non-'Y' value, missing or not, collapses to 0 - confirm
# that is the intended meaning for RECOVD.
# Vectorized comparisons replace the duplicated per-row lambdas; .assign()
# returns a new frame and keeps both columns in their original positions.
data_df = data_df.assign(
    **{
        column: data_df[column].eq('Y').astype('int64')
        for column in ('RECOVD', 'L_THREAT')
    }
)

# Verify the result
print("Value counts for RECOVD after binary transformation:")
print(data_df['RECOVD'].value_counts())

print("\nValue counts for L_THREAT after binary transformation:")
print(data_df['L_THREAT'].value_counts())

Value counts for RECOVD after binary transformation:
RECOVD
0    500532
1    259132
Name: count, dtype: int64

Value counts for L_THREAT after binary transformation:
L_THREAT
0    745990
1     13674
Name: count, dtype: int64


In [None]:
# Inspect dtypes and non-null counts after the remaining flag conversions.
data_df.info()

In [None]:
# Count the missing values per column.
missing_per_column = data_df.isna().sum()
print(missing_per_column)


In [54]:
# Keep only the rows that have a SYMPTOM_TEXT value.
has_symptom_text = data_df['SYMPTOM_TEXT'].notna()
data_df = data_df[has_symptom_text]

# Show the per-column count of missing values.
print(data_df.isna().sum())

VAERS_ID        0
STATE           0
AGE_YRS         0
SEX             0
SYMPTOM_TEXT    0
DIED            0
L_THREAT        0
HOSPITAL        0
DISABLE         0
RECOVD          0
VAX_DATE        0
BIRTH_DEFECT    0
SEX_BINARY      0
dtype: int64


In [55]:
# Final check of dtypes and non-null counts before export.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 759664 entries, 0 to 1012886
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   VAERS_ID      759664 non-null  int64  
 1   STATE         759664 non-null  object 
 2   AGE_YRS       759664 non-null  float64
 3   SEX           759664 non-null  object 
 4   SYMPTOM_TEXT  759664 non-null  object 
 5   DIED          759664 non-null  int64  
 6   L_THREAT      759664 non-null  int64  
 7   HOSPITAL      759664 non-null  int64  
 8   DISABLE       759664 non-null  int64  
 9   RECOVD        759664 non-null  int64  
 10  VAX_DATE      759664 non-null  object 
 11  BIRTH_DEFECT  759664 non-null  int64  
 12  SEX_BINARY    759664 non-null  int64  
dtypes: float64(1), int64(8), object(4)
memory usage: 81.1+ MB


In [56]:
# Preview the first rows of the cleaned frame that is about to be exported.
data_df.head()

Unnamed: 0,VAERS_ID,STATE,AGE_YRS,SEX,SYMPTOM_TEXT,DIED,L_THREAT,HOSPITAL,DISABLE,RECOVD,VAX_DATE,BIRTH_DEFECT,SEX_BINARY
0,902418,NJ,56.0,F,Patient experienced mild numbness traveling fr...,0,0,0,0,1,12/15/2020,0,0
1,902440,AZ,35.0,F,C/O Headache,0,0,0,0,1,12/15/2020,0,0
2,902446,WV,55.0,F,"felt warm, hot and face and ears were red and ...",0,0,0,0,1,12/15/2020,0,0
3,902464,LA,42.0,M,within 15 minutes progressive light-headedness...,0,0,0,0,1,12/15/2020,0,1
4,902465,AR,60.0,F,Pt felt wave come over body @ 1218 starting in...,0,0,0,0,0,12/15/2020,0,0


In [57]:
# Write the cleaned event table to disk without the DataFrame index.
# The target folder is created first (no error if it already exists), because
# to_csv fails when the parent directory is missing, e.g. on a fresh checkout.
clean_dir = Path('../Data/Clean')
clean_dir.mkdir(parents=True, exist_ok=True)
data_df.to_csv(clean_dir / 'event_data.csv', index=False)