# Cleaning Symptoms


In [9]:
import os

import numpy as np
import pandas as pd

In [10]:
# Read the raw VAERS symptoms table: one VAERS_ID plus up to five
# symptom / symptom-version column pairs per row.
# low_memory=False turns off pandas' chunked parsing, which avoids
# mixed-dtype inference within a column.
sym_df = pd.read_csv(
    filepath_or_buffer='../Data/VAERSSYMPTOMS.csv',
    low_memory=False,
)

In [11]:
# Peek at the first five rows to confirm the file parsed into the expected columns.
sym_df.head()

Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOMVERSION1,SYMPTOM2,SYMPTOMVERSION2,SYMPTOM3,SYMPTOMVERSION3,SYMPTOM4,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5
0,902418,Hypoaesthesia,24.0,Injection site hypoaesthesia,24.0,,,,,,
1,902440,Headache,23.1,,,,,,,,
2,902446,Erythema,23.1,Feeling hot,23.1,Flushing,23.1,,,,
3,902464,Dizziness,23.1,Electrocardiogram normal,23.1,Hyperhidrosis,23.1,Laboratory test normal,23.1,Presyncope,23.1
4,902465,Dysgeusia,23.1,Oral pruritus,23.1,Paraesthesia,23.1,Paraesthesia oral,23.1,Parosmia,23.1


In [12]:
# List the column names available in the raw table.
sym_df.columns

Index(['VAERS_ID', 'SYMPTOM1', 'SYMPTOMVERSION1', 'SYMPTOM2',
       'SYMPTOMVERSION2', 'SYMPTOM3', 'SYMPTOMVERSION3', 'SYMPTOM4',
       'SYMPTOMVERSION4', 'SYMPTOM5', 'SYMPTOMVERSION5'],
      dtype='object')

In [13]:
# Drop the SYMPTOMVERSION* columns; only VAERS_ID and the symptom terms are kept.
# The columns are selected by prefix from the *current* frame rather than from a
# hard-coded list, so the cell is safe to re-run: once they are gone the list is
# empty and drop() is a no-op instead of raising KeyError.
version_cols = [col for col in sym_df.columns if col.startswith('SYMPTOMVERSION')]
sym_df = sym_df.drop(columns=version_cols)

In [14]:
# Re-inspect the table after the column drop.
# Note that a VAERS_ID can appear on more than one row (e.g. 902465 and 902468
# in the output) -- apparently when a report lists more than five symptoms.
sym_df.head(20)

Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
0,902418,Hypoaesthesia,Injection site hypoaesthesia,,,
1,902440,Headache,,,,
2,902446,Erythema,Feeling hot,Flushing,,
3,902464,Dizziness,Electrocardiogram normal,Hyperhidrosis,Laboratory test normal,Presyncope
4,902465,Dysgeusia,Oral pruritus,Paraesthesia,Paraesthesia oral,Parosmia
5,902465,Sensory disturbance,Tremor,,,
6,902468,Chest discomfort,Chills,Defaecation urgency,Diarrhoea,Dizziness
7,902468,Dyspnoea,Feeling abnormal,Flushing,Presyncope,
8,902479,Chest pain,Feeling abnormal,Flushing,Intensive care,
9,902490,Headache,Pain,,,


In [15]:
# Two cleaning steps, applied in order:
#   1. discard rows that have no primary symptom (SYMPTOM1 is missing);
#   2. turn every remaining NaN into Python None, which SQL drivers treat as NULL.
# NOTE(review): the CSV export later in this notebook writes None and NaN alike
# as empty fields, so step 2 only matters while sym_df is used in memory.
sym_df = (
    sym_df
    .dropna(subset=['SYMPTOM1'])
    .replace({np.nan: None})
)

In [16]:
# Show the cleaned table; pandas abbreviates large frames to the first and last rows.
sym_df

Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOM2,SYMPTOM3,SYMPTOM4,SYMPTOM5
0,902418,Hypoaesthesia,Injection site hypoaesthesia,,,
1,902440,Headache,,,,
2,902446,Erythema,Feeling hot,Flushing,,
3,902464,Dizziness,Electrocardiogram normal,Hyperhidrosis,Laboratory test normal,Presyncope
4,902465,Dysgeusia,Oral pruritus,Paraesthesia,Paraesthesia oral,Parosmia
...,...,...,...,...,...,...
1363166,2776307,COVID-19,Drug ineffective,SARS-CoV-2 test,,
1363167,2776309,COVID-19,Drug ineffective,,,
1363168,2776310,COVID-19,SARS-CoV-2 test,Vaccination failure,,
1363169,2776312,Tinnitus,,,,


In [44]:
# Save the cleaned symptoms table to CSV.
clean_csv_path = '../Data/Clean/sympt_data.csv'
# to_csv does not create missing folders, so make sure the target directory
# exists first (otherwise a fresh checkout fails here with OSError).
os.makedirs(os.path.dirname(clean_csv_path), exist_ok=True)
# index=False keeps the DataFrame's row labels out of the file.
sym_df.to_csv(clean_csv_path, index=False)
