In [1]:
import numpy as np
import pandas as pd
import re 
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer
from num2words import num2words
import string
import warnings
import time
import itertools
warnings.filterwarnings("ignore")
pd.set_option("display.max_column", 250)
pd.set_option("display.max_rows", 250)
pd.set_option("display.max_colwidth", 250)

- Data retrieved from https://vaers.hhs.gov/data/datasets.html
    - Last pulled 2021-05-02
- VAERS data use guide: https://vaers.hhs.gov/docs/VAERSDataUseGuide_November2020.pdf
- Drug Names: https://www.fda.gov/drugs/drug-approvals-and-databases/drugsfda-data-files

## Covid-19

In [2]:
# FUNCTIONS
stop = stopwords.words("english")
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def read_and_lower(filepath, vaers = True):
    if vaers:
        df = pd.read_csv(filepath, encoding = "ISO-8859-1", converters={"VAERS_ID": lambda x: str(x)})
    else:
        df = pd.read_csv(filepath, encoding = "ISO-8859-1")
    df.columns = df.columns.str.lower()
    for col in df.columns:
        if df[col].dtypes not in ["float64", "int64"]:
            df[col] = df[col].str.lower()            
    return df

def convert_number(text): 
    try:
        return num2words(text)
    except: 
        return text 

def clean_text(text, remove_digits = True, remove_hyphens = True, remove_time = True,
               remove_colons = True, remove_periods = True, remove_slash = True): 
    text = text.astype(str)
    #replace all punctuations with spaces except ".", ":", and "-"  
    text = text.str.replace("[^\w\s\d/.:-]", " ")
    if remove_hyphens: 
        text = text.str.replace("[-]", " ")
    if remove_digits:
        text = text.str.replace("[\d]", " ")
    elif not remove_digits: 
        text = text.apply(lambda x: " ".join([convert_number(word) for word in x.split()]))
    if remove_hyphens: 
        text = text.str.replace("[-]", " ")
    if remove_colons: 
        text = text.str.replace("[:]", " ")
    if remove_time:
        text = text.str.replace("[\d]", " ")
    if remove_periods:
        text = text.str.replace("[.]", " ")
    text = text.str.replace("[^\w\s\d/.:-]", " ")
    #Lemmatize and stem words 
    text = text.apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))
    text = text.apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))
    #remove instances where there is only 1 character
    text = text.apply(lambda x: " ".join([word for word in x.split() if len(word) > 1 and not word.isnumeric()]))
    #take out extra spaces
    text = text.str.replace("\s+", " ")
    #strip leading and trailing spaces if any
    text = text.str.strip()
    #remove stopwords
    text = text.apply(lambda x: " ".join([word for word in x.split() if word not in stop]))
    return text


In [3]:
patients = read_and_lower("../../src/raw/data/2021VAERSDATA.csv")
symptoms = read_and_lower("../../src/raw/symptoms/2021VAERSSYMPTOMS.csv")
vax = read_and_lower("../../src/raw/vax/2021VAERSVAX.csv")

### Vax

In [4]:
vax.describe()

Unnamed: 0,vaers_id,vax_type,vax_manu,vax_lot,vax_dose_series,vax_route,vax_site,vax_name
count,109785,109785,109785,76767,108854,85325,84434,109785
unique,108190,43,16,4730,8,9,11,67
top,1215401,covid19,moderna,039k20a,1,im,la,covid19 (covid19 (moderna))
freq,10,109199,45515,2495,68705,59505,59507,45515


In [5]:
multi_id = vax["vaers_id"].value_counts()[vax["vaers_id"].value_counts() > 1].index
multi_id = vax[vax["vaers_id"].isin(multi_id)]
multi_id

Unnamed: 0,vaers_id,vax_type,vax_manu,vax_lot,vax_dose_series,vax_route,vax_site,vax_name
293,0916918,dtaphepbip,glaxosmithkline biologicals,,2,,lg,dtap + hepb + ipv (pediarix)
294,0916918,hibv,sanofi pasteur,,2,,lg,hib (acthib)
295,0916918,rv1,glaxosmithkline biologicals,,2,,lg,rotavirus (rotarix)
296,0916918,unk,unknown manufacturer,,2,,lg,vaccine not specified (other)
452,0917087,covid19,pfizer\biontech,,1,im,la,covid19 (covid19 (pfizer-biontech))
...,...,...,...,...,...,...,...,...
109468,1243574,covid19,pfizer\biontech,el1283,2,im,la,covid19 (covid19 (pfizer-biontech))
109626,1245083,covid19,janssen,unknown,unk,,,covid19 (covid19 (janssen))
109627,1245083,unk,unknown manufacturer,,unk,,,vaccine not specified (no brand name)
109783,1245985,covid19,pfizer\biontech,en6208,1,im,ra,covid19 (covid19 (pfizer-biontech))


Many instances where there are multiple records are when there is a second dose, take row with the latest dose

In [6]:
vax = vax.sort_values(["vaers_id", "vax_manu"])
vax_reduced = vax.groupby(["vaers_id", "vax_manu"]).last().reset_index()
vax_reduced

Unnamed: 0,vaers_id,vax_manu,vax_type,vax_lot,vax_dose_series,vax_route,vax_site,vax_name
0,0916600,moderna,covid19,037k20a,1,im,la,covid19 (covid19 (moderna))
1,0916601,moderna,covid19,025l20a,1,im,ra,covid19 (covid19 (moderna))
2,0916602,pfizer\biontech,covid19,el1284,1,im,la,covid19 (covid19 (pfizer-biontech))
3,0916603,moderna,covid19,unknown,unk,,,covid19 (covid19 (moderna))
4,0916604,moderna,covid19,,1,im,la,covid19 (covid19 (moderna))
...,...,...,...,...,...,...,...,...
108456,1245684,janssen,covid19,unknown,unk,,,covid19 (covid19 (janssen))
108457,1245685,janssen,covid19,unknown,unk,,,covid19 (covid19 (janssen))
108458,1245686,janssen,covid19,041a21a,unk,,,covid19 (covid19 (janssen))
108459,1245687,janssen,covid19,042a21a,unk,,,covid19 (covid19 (janssen))


In [7]:
multi_id_reduced = vax_reduced["vaers_id"].value_counts()[vax_reduced["vaers_id"].value_counts() > 1]
multi_id_reduced = vax_reduced[vax_reduced["vaers_id"].isin(multi_id_reduced.index)]
multi_id_reduced

Unnamed: 0,vaers_id,vax_manu,vax_type,vax_lot,vax_dose_series,vax_route,vax_site,vax_name
293,0916918,glaxosmithkline biologicals,rv1,,2,,lg,rotavirus (rotarix)
294,0916918,sanofi pasteur,hibv,,2,,lg,hib (acthib)
295,0916918,unknown manufacturer,unk,,2,,lg,vaccine not specified (other)
2641,0919620,pfizer\biontech,covid19,el0140,1,im,la,covid19 (covid19 (pfizer-biontech))
2642,0919620,unknown manufacturer,unk,el0140,1,,un,vaccine not specified (no brand name)
...,...,...,...,...,...,...,...,...
107863,1241260,unknown manufacturer,unk,,unk,,,vaccine not specified (no brand name)
107868,1241265,janssen,covid19,unknown,unk,,,covid19 (covid19 (janssen))
107869,1241265,unknown manufacturer,unk,,unk,,,vaccine not specified (no brand name)
108303,1245083,janssen,covid19,unknown,unk,,,covid19 (covid19 (janssen))


Remaining instances where there are multiple records are due to people taking 2 or more different vaccines, remove from dataset because can't trace back which side effects are associated with which vaccine 

In [8]:
vax_final = vax_reduced[~vax_reduced["vaers_id"].isin(multi_id_reduced["vaers_id"])]
#only looking at covid19 vaccines 
vax_final = vax_final[vax_final["vax_type"] == "covid19"]
vax_final.shape

(107657, 8)

### Symptoms

In [9]:
symptoms

Unnamed: 0,vaers_id,symptom1,symptomversion1,symptom2,symptomversion2,symptom3,symptomversion3,symptom4,symptomversion4,symptom5,symptomversion5
0,0916600,dysphagia,23.1,epiglottitis,23.1,,,,,,
1,0916601,anxiety,23.1,dyspnoea,23.1,,,,,,
2,0916602,chest discomfort,23.1,dysphagia,23.1,pain in extremity,23.1,visual impairment,23.1,,
3,0916603,dizziness,23.1,fatigue,23.1,mobility decreased,23.1,,,,
4,0916604,injection site erythema,23.1,injection site pruritus,23.1,injection site swelling,23.1,injection site warmth,23.1,,
...,...,...,...,...,...,...,...,...,...,...,...
148514,1245686,chills,23.1,headache,23.1,heart rate,23.1,heart rate increased,23.1,hyperhidrosis,23.1
148515,1245686,insomnia,23.1,lymphadenopathy,23.1,pain in extremity,23.1,pharyngeal hypoaesthesia,23.1,,
148516,1245687,body temperature,23.1,body temperature increased,23.1,dizziness,23.1,headache,23.1,nausea,23.1
148517,1245687,sars-cov-2 test,23.1,,,,,,,,


Symptom version is just the MedDra dictionary version where the MedDra term was created, doesn't lend much information so can drop them. 

In [10]:
#drop MedDra term versions 
symptoms_dropped = symptoms.drop(columns=["symptomversion1","symptomversion2", "symptomversion3", "symptomversion4", "symptomversion5"])
symptoms_dropped = symptoms_dropped.apply(lambda x: x.str.lower(), axis = 0)

#reduce columns to list of symptoms, removing NA values 
symptoms_list = symptoms_dropped.set_index("vaers_id").unstack().dropna().reset_index(
    name='symptoms')[["vaers_id", "symptoms"]].groupby("vaers_id").agg(lambda x: ' | '.join(x)).reset_index()
symptoms_list

Unnamed: 0,vaers_id,symptoms
0,0916600,dysphagia | epiglottitis
1,0916601,anxiety | dyspnoea
2,0916602,chest discomfort | dysphagia | pain in extremity | visual impairment
3,0916603,dizziness | fatigue | mobility decreased
4,0916604,injection site erythema | injection site pruritus | injection site swelling | injection site warmth
...,...,...
108184,1245684,dizziness | vertigo
108185,1245685,chills | lethargy | feeling abnormal | pain | headache | somnolence | hyperhidrosis | influenza like illness
108186,1245686,chills | insomnia | headache | lymphadenopathy | heart rate | pain in extremity | heart rate increased | pharyngeal hypoaesthesia | hyperhidrosis
108187,1245687,body temperature | sars-cov-2 test | body temperature increased | dizziness | headache | nausea


In [11]:
#remove death from symptoms list because will be predicting deaths from symptoms
symptoms_list["symptoms"] = symptoms_list["symptoms"].astype(str).str.replace(r"[\[\],']", "").str.replace("death", "")


### Patients

In [12]:
patients

Unnamed: 0,vaers_id,recvdate,state,age_yrs,cage_yr,cage_mo,sex,rpt_date,symptom_text,died,datedied,l_threat,er_visit,hospital,hospdays,x_stay,disable,recovd,vax_date,onset_date,numdays,lab_data,v_adminby,v_fundby,other_meds,cur_ill,history,prior_vax,splttype,form_vers,todays_date,birth_defect,ofc_visit,er_ed_visit,allergies
0,0916600,01/01/2021,tx,33.0,33.0,,f,,right side of epiglottis swelled up and hinder swallowing pictures taken benadryl tylenol taken,,,,,,,,,y,12/28/2020,12/30/2020,2.0,none,pvt,,none,none,none,,,2,01/01/2021,,y,,pcn and bee venom
1,0916601,01/01/2021,ca,73.0,73.0,,f,,"approximately 30 min post vaccination administration patient demonstrated sob and anxiousness. assessed at time of event: heart sounds normal, lung sounds clear. vitals within normal limits for patient. o2 91% on 3 liters nc continuous flow. 2 ...",,,,,,,,,y,12/31/2020,12/31/2020,0.0,,sen,,patient residing at nursing facility. see patients chart.,patient residing at nursing facility. see patients chart.,patient residing at nursing facility. see patients chart.,,,2,01/01/2021,,y,,"""dairy"""
2,0916602,01/01/2021,wa,23.0,23.0,,f,,"about 15 minutes after receiving the vaccine, the patient complained about her left arm hurting. she also complained of chest tightness and difficulty swallowing. patient also had vision changes. we gave the patient 1 tablet of benadryl 25 mg and...",,,,,,,,,u,12/31/2020,12/31/2020,0.0,,sen,,none,none,none,,,2,01/01/2021,,,y,shellfish
3,0916603,01/01/2021,wa,58.0,58.0,,f,,"extreme fatigue, dizziness,. could not lift my left arm for 72 hours",,,,,,,,,y,12/23/2020,12/23/2020,0.0,none,wrk,,none,kidney infection,"diverticulitis, mitral valve prolapse, osteoarthritis","got measles from measel shot, mums from mumps shot, headaches and nausea from flu shot",,2,01/01/2021,,,,"diclofenac, novacaine, lidocaine, pickles, tomatoes, milk"
4,0916604,01/01/2021,tx,47.0,47.0,,f,,"injection site swelling, redness, warm to the touch and itchy",,,,,,,,,n,12/22/2020,12/29/2020,7.0,,pub,,na,na,,,,2,01/01/2021,,,,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108185,1245684,04/23/2021,hi,73.0,,,m,,"head spinning/world spinning/ vertigo; slight dizzy; this spontaneous report received from a patient concerned a 73 year old male. the patient's weight was 165 pounds, and height was 175.26 centimeters. the patient's past medical history included...",,,,,,,,,y,,03/01/2021,,,oth,,losartan; norvasc,alcohol use (moderate to light.); essential hypertension; food allergy; nonsmoker,medical history/concurrent conditions: drug allergy; seasonal asthma; comments: the patient had no history of drug abuse.,,usjnjfoc20210347761,2,04/22/2021,,,,
108186,1245685,04/23/2021,tx,,,,m,,body aches; woke up sweaty and drenched; thought he had the flu; groggy; could not relax; lethargy/out of it all day; chills; headache; this spontaneous report received from a patient concerned a 58 year old male. the patient's weight was 185 pou...,,,,,,,,,n,,03/23/2021,,,oth,,,alcohol use (once a week.); non-smoker; seasonal allergy,"medical history/concurrent conditions: drug abuse (33 year ago.); comments: no illness at the time of vaccination. no past history of similar event. no adverse event after any previous vaccination. no history of allergy to vaccine drug, or food. ...",,usjnjfoc20210348673,2,04/22/2021,,,,
108187,1245686,04/23/2021,ny,25.0,,,f,,swollen lymph node; throat feeling a little numb; sweating a little; heart beating really fast; trouble sleeping; soreness in left arm; chills; headache; this spontaneous report received from a patient concerned a 25 year old female. the patient'...,,,,,,,,,n,,04/05/2021,,test date: 20210406; test name: heart rate; result unstructured data: 145 beats per minute; comments: highest rate; the patient had. baseline is 66-82 bpm; test date: 20210406; test name: heart rate; result unstructured data: 91 beats per minute;...,oth,,,pollen allergy,comments: the patient was not pregnant at the time of reporting and had no known drug allergies.,,usjnjfoc20210410875,2,04/14/2021,,,,
108188,1245687,04/23/2021,ia,18.0,,,f,,"dizziness; severe headache; nausea; temperature of 100; this spontaneous report received from a health care professional concerned an 18 year old female. the patient's height, and weight were not reported. the patient's concurrent conditions incl...",,,,,,,,,u,,04/01/2021,,test date: 202103; test name: sars-cov-2 pcr test; result unstructured data: positive; test date: 202104; test name: body temperature; result unstructured data: 100 f,oth,,,covid-19,comments: unknown,,usjnjfoc20210413238,2,04/09/2021,,,,


In [13]:
patients["vaers_id"].value_counts()

0916600    1
1055137    1
1055443    1
1055421    1
1055418    1
          ..
0956765    1
0956764    1
0956763    1
0956762    1
1245985    1
Name: vaers_id, Length: 108190, dtype: int64

All patient ids are unique

In [14]:
#Create a column to denote serious side effect, defined as if patient died, have life threatening event, had to go to the er,
#was hospitalized, had a prolonged hospitalization, had a disability, had a birth defect, or had an er/ed visit 

#check only values are Y or blank 
for i in ["died", "l_threat", "er_visit", "hospital", "x_stay", "disable", "birth_defect", "er_ed_visit"]:
    print(patients[i].unique())

[nan 'y']
[nan 'y']
[nan 'y']
[nan 'y']
[nan 'y']
[nan 'y']
[nan 'y']
[nan 'y']


In [15]:
patients["temp_list"] = patients[["died", "l_threat", "er_visit", "hospital", "x_stay", 
        "disable", "birth_defect", "er_ed_visit"]].apply(lambda x: np.array(x.values.tolist()), axis = 1)
patients["serious_se"] = patients.apply(lambda x: "died" if 0 in list(np.where(x["temp_list"] == "y")[0]) 
               #"If er_visit is the only option checked, the report is not considered serious"
               else "not serious" if x["er_visit"] == "y" and len(np.where(x["temp_list"] == "y")[0]) == 1 
               else "serious" if len(np.where(x["temp_list"] == "y")[0]) >= 1 
               else "not serious", axis = 1)
patients = patients.drop("temp_list", axis = 1)

In [16]:
#checking that er_visit serious column filled out correctly 
patients[patients["er_visit"] == "y"][["died", "l_threat", "er_visit", "hospital", "x_stay", 
        "disable", "birth_defect", "er_ed_visit", "serious_se"]]

Unnamed: 0,died,l_threat,er_visit,hospital,x_stay,disable,birth_defect,er_ed_visit,serious_se
2855,,,y,,,,,,not serious
8145,,,y,,,,,,not serious
9675,,,y,,,,,,not serious
11839,,,y,,,,,,not serious
13147,,,y,,,,,,not serious
18167,,,y,,,,,,not serious
24665,,,y,,,,,,not serious
30640,,,y,,,,,,not serious
34004,,,y,,,,,,not serious
35548,,,y,,,,,,not serious


In [17]:
for i in ["recvdate", "state", 'age_yrs', "cage_yr", "cage_mo", "sex", "rpt_date", "recovd", 
          "vax_date", "onset_date", "numdays", "lab_data", "v_adminby", "v_fundby", "other_meds", 
          "cur_ill", "history", "prior_vax", "splttype", "form_vers", "todays_date", "allergies"]:
    print(i, ":", patients[i].unique())

recvdate : ['01/01/2021' '01/02/2021' '01/03/2021' '01/04/2021' '01/05/2021'
 '01/06/2021' '01/07/2021' '01/08/2021' '01/09/2021' '01/10/2021'
 '01/11/2021' '01/12/2021' '01/13/2021' '01/14/2021' '01/15/2021'
 '01/16/2021' '01/17/2021' '01/18/2021' '01/19/2021' '01/20/2021'
 '01/21/2021' '01/22/2021' '01/23/2021' '01/24/2021' '01/25/2021'
 '01/26/2021' '01/27/2021' '01/28/2021' '01/29/2021' '01/30/2021'
 '01/31/2021' '02/01/2021' '02/02/2021' '02/03/2021' '02/04/2021'
 '02/05/2021' '02/06/2021' '02/07/2021' '02/08/2021' '02/09/2021'
 '02/10/2021' '02/11/2021' '02/12/2021' '02/13/2021' '02/14/2021'
 '02/15/2021' '02/16/2021' '02/17/2021' '02/18/2021' '02/19/2021'
 '02/20/2021' '02/21/2021' '02/22/2021' '02/23/2021' '02/24/2021'
 '02/25/2021' '02/26/2021' '02/27/2021' '02/28/2021' '03/01/2021'
 '03/02/2021' '03/03/2021' '03/04/2021' '03/05/2021' '03/06/2021'
 '03/07/2021' '03/08/2021' '03/09/2021' '03/10/2021' '03/11/2021'
 '03/12/2021' '03/13/2021' '03/14/2021' '03/15/2021' '03/16/2021'

todays_date : ['01/01/2021' '01/01/2020' nan '01/02/2021' '12/09/2020' '12/11/2020'
 '10/27/2020' '12/17/2020' '12/14/2020' '11/02/2020' '10/31/2020'
 '12/04/2020' '11/08/2020' '12/20/2020' '12/23/2020' '11/07/2020'
 '12/19/2020' '11/28/2020' '12/22/2020' '12/21/2020' '12/30/2020'
 '11/21/2020' '11/22/2020' '12/26/2020' '12/29/2020' '11/05/2019'
 '12/31/2020' '12/07/2020' '01/03/2021' '01/04/2021' '01/04/2020'
 '12/28/2020' '01/06/2021' '01/05/2021' '01/05/2020' '12/03/2020'
 '01/07/2021' '01/07/2020' '01/08/2021' '01/08/2020' '01/09/2021'
 '01/16/2021' '01/10/2021' '01/11/2021' '01/11/2011' '01/12/2021'
 '01/10/2020' '01/13/2021' '12/18/2020' '01/13/2020' '01/14/2021'
 '01/15/2021' '01/17/2021' '01/18/2021' '01/15/2020' '01/18/2020'
 '01/19/2021' '01/20/2021' '10/20/2020' '01/20/2020' '01/21/2021'
 '01/22/2021' '01/23/2021' '02/21/2021' '01/23/2020' '01/24/2021'
 '01/25/2021' '01/26/2021' '01/27/2021' '01/28/2021' '01/29/2021'
 '01/19/2020' '01/28/2015' '01/30/2021' '01/31/2021' '02/0

In [18]:
patients["lab_data"].sample(50).unique()
#contains a lot of dates and other test results that include slashes
patients["lab_data_cleaned"] = clean_text(patients["lab_data"], remove_slash = False, remove_digits = False)

In [19]:
#contains doses of medication
patients["other_meds"].sample(50).unique()
patients["other_meds_cleaned"] = clean_text(patients["other_meds"], remove_digits = False)

In [20]:
patients["cur_ill"].sample(50).unique()
patients["cur_ill_cleaned"] = clean_text(patients["cur_ill"])

In [21]:
patients["history"].sample(50).unique()
patients["history_cleaned"] = clean_text(patients["history"])

In [22]:
patients["prior_vax"].sample(100).unique()
patients["prior_vax_cleaned"] = clean_text(patients["prior_vax"])

In [23]:
patients["allergies"].sample(50).unique()
patients["allergies_cleaned"] = clean_text(patients["allergies"])

In [24]:
text_cols = ["lab_data", "other_meds", "cur_ill", "history", "prior_vax", "allergies"]
for col in text_cols: 
    patients[f'{col}_nchar'] = patients[col].astype(str).apply(lambda x: len(x))

In [25]:
final_df = vax_final.merge(symptoms_list, how="left", on = "vaers_id").merge(patients, how = "left", on = "vaers_id")
final_df

Unnamed: 0,vaers_id,vax_manu,vax_type,vax_lot,vax_dose_series,vax_route,vax_site,vax_name,symptoms,recvdate,state,age_yrs,cage_yr,cage_mo,sex,rpt_date,symptom_text,died,datedied,l_threat,er_visit,hospital,hospdays,x_stay,disable,recovd,vax_date,onset_date,numdays,lab_data,v_adminby,v_fundby,other_meds,cur_ill,history,prior_vax,splttype,form_vers,todays_date,birth_defect,ofc_visit,er_ed_visit,allergies,serious_se,lab_data_cleaned,other_meds_cleaned,cur_ill_cleaned,history_cleaned,prior_vax_cleaned,allergies_cleaned,lab_data_nchar,other_meds_nchar,cur_ill_nchar,history_nchar,prior_vax_nchar,allergies_nchar
0,0916600,moderna,covid19,037k20a,1,im,la,covid19 (covid19 (moderna)),dysphagia | epiglottitis,01/01/2021,tx,33.0,33.0,,f,,right side of epiglottis swelled up and hinder swallowing pictures taken benadryl tylenol taken,,,,,,,,,y,12/28/2020,12/30/2020,2.0,none,pvt,,none,none,none,,,2,01/01/2021,,y,,pcn and bee venom,not serious,none,none,none,none,,pcn bee venom,4,4,4,4,3,17
1,0916601,moderna,covid19,025l20a,1,im,ra,covid19 (covid19 (moderna)),anxiety | dyspnoea,01/01/2021,ca,73.0,73.0,,f,,"approximately 30 min post vaccination administration patient demonstrated sob and anxiousness. assessed at time of event: heart sounds normal, lung sounds clear. vitals within normal limits for patient. o2 91% on 3 liters nc continuous flow. 2 ...",,,,,,,,,y,12/31/2020,12/31/2020,0.0,,sen,,patient residing at nursing facility. see patients chart.,patient residing at nursing facility. see patients chart.,patient residing at nursing facility. see patients chart.,,,2,01/01/2021,,y,,"""dairy""",not serious,,patient resid nurs facil see patient chart,patient resid nurs facil see patient chart,patient resid nurs facil see patient chart,,dairi,3,57,57,57,3,7
2,0916602,pfizer\biontech,covid19,el1284,1,im,la,covid19 (covid19 (pfizer-biontech)),chest discomfort | dysphagia | pain in extremity | visual impairment,01/01/2021,wa,23.0,23.0,,f,,"about 15 minutes after receiving the vaccine, the patient complained about her left arm hurting. she also complained of chest tightness and difficulty swallowing. patient also had vision changes. we gave the patient 1 tablet of benadryl 25 mg and...",,,,,,,,,u,12/31/2020,12/31/2020,0.0,,sen,,none,none,none,,,2,01/01/2021,,,y,shellfish,serious,,none,none,none,,shellfish,3,4,4,4,3,9
3,0916603,moderna,covid19,unknown,unk,,,covid19 (covid19 (moderna)),dizziness | fatigue | mobility decreased,01/01/2021,wa,58.0,58.0,,f,,"extreme fatigue, dizziness,. could not lift my left arm for 72 hours",,,,,,,,,y,12/23/2020,12/23/2020,0.0,none,wrk,,none,kidney infection,"diverticulitis, mitral valve prolapse, osteoarthritis","got measles from measel shot, mums from mumps shot, headaches and nausea from flu shot",,2,01/01/2021,,,,"diclofenac, novacaine, lidocaine, pickles, tomatoes, milk",not serious,none,none,kidney infect,diverticul mitral valv prolaps osteoarthr,got measl measel shot mum mump shot headach nausea flu shot,diclofenac novacain lidocain pickl tomato milk,4,4,16,53,86,57
4,0916604,moderna,covid19,,1,im,la,covid19 (covid19 (moderna)),injection site erythema | injection site pruritus | injection site swelling | injection site warmth,01/01/2021,tx,47.0,47.0,,f,,"injection site swelling, redness, warm to the touch and itchy",,,,,,,,,n,12/22/2020,12/29/2020,7.0,,pub,,na,na,,,,2,01/01/2021,,,,na,not serious,,na,na,,,na,3,2,2,3,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107652,1245684,janssen,covid19,unknown,unk,,,covid19 (covid19 (janssen)),dizziness | vertigo,04/23/2021,hi,73.0,,,m,,"head spinning/world spinning/ vertigo; slight dizzy; this spontaneous report received from a patient concerned a 73 year old male. the patient's weight was 165 pounds, and height was 175.26 centimeters. the patient's past medical history included...",,,,,,,,,y,,03/01/2021,,,oth,,losartan; norvasc,alcohol use (moderate to light.); essential hypertension; food allergy; nonsmoker,medical history/concurrent conditions: drug allergy; seasonal asthma; comments: the patient had no history of drug abuse.,,usjnjfoc20210347761,2,04/22/2021,,,,,not serious,,losartan norvasc,alcohol use moder light essenti hypertens food allergi nonsmok,medic history/concurr condit drug allergi season asthma comment patient histori drug abus,,,3,17,81,121,3,3
107653,1245685,janssen,covid19,unknown,unk,,,covid19 (covid19 (janssen)),chills | lethargy | feeling abnormal | pain | headache | somnolence | hyperhidrosis | influenza like illness,04/23/2021,tx,,,,m,,body aches; woke up sweaty and drenched; thought he had the flu; groggy; could not relax; lethargy/out of it all day; chills; headache; this spontaneous report received from a patient concerned a 58 year old male. the patient's weight was 185 pou...,,,,,,,,,n,,03/23/2021,,,oth,,,alcohol use (once a week.); non-smoker; seasonal allergy,"medical history/concurrent conditions: drug abuse (33 year ago.); comments: no illness at the time of vaccination. no past history of similar event. no adverse event after any previous vaccination. no history of allergy to vaccine drug, or food. ...",,usjnjfoc20210348673,2,04/22/2021,,,,,not serious,,,alcohol use onc week non smoker season allergi,medic history/concurr condit drug abus year ago comment ill time vaccin past histori similar event advers event ani previou vaccin histori allergi vaccin drug food pre exist acut ill day prior vaccin histori hospit last day caus famili histori an...,,,3,3,56,559,3,3
107654,1245686,janssen,covid19,041a21a,unk,,,covid19 (covid19 (janssen)),chills | insomnia | headache | lymphadenopathy | heart rate | pain in extremity | heart rate increased | pharyngeal hypoaesthesia | hyperhidrosis,04/23/2021,ny,25.0,,,f,,swollen lymph node; throat feeling a little numb; sweating a little; heart beating really fast; trouble sleeping; soreness in left arm; chills; headache; this spontaneous report received from a patient concerned a 25 year old female. the patient'...,,,,,,,,,n,,04/05/2021,,test date: 20210406; test name: heart rate; result unstructured data: 145 beats per minute; comments: highest rate; the patient had. baseline is 66-82 bpm; test date: 20210406; test name: heart rate; result unstructured data: 91 beats per minute;...,oth,,,pollen allergy,comments: the patient was not pregnant at the time of reporting and had no known drug allergies.,,usjnjfoc20210410875,2,04/14/2021,,,,,not serious,test date twenti million two hundr ten thousand four hundr six test name heart rate result unstructur data one hundr forti five beat per minut comment highest rate patient baselin sixti six eighti two bpm test date twenti million two hundr ten th...,,pollen allergi,comment patient wa pregnant time report known drug allergi,,,314,3,14,96,3,3
107655,1245687,janssen,covid19,042a21a,unk,,,covid19 (covid19 (janssen)),body temperature | sars-cov-2 test | body temperature increased | dizziness | headache | nausea,04/23/2021,ia,18.0,,,f,,"dizziness; severe headache; nausea; temperature of 100; this spontaneous report received from a health care professional concerned an 18 year old female. the patient's height, and weight were not reported. the patient's concurrent conditions incl...",,,,,,,,,u,,04/01/2021,,test date: 202103; test name: sars-cov-2 pcr test; result unstructured data: positive; test date: 202104; test name: body temperature; result unstructured data: 100 f,oth,,,covid-19,comments: unknown,,usjnjfoc20210413238,2,04/09/2021,,,,,not serious,test date two hundr two thousand one hundr three test name sar cov two pcr test result unstructur data posit test date two hundr two thousand one hundr four test name bodi temperatur result unstructur data one hundr,,covid,comment unknown,,,166,3,8,17,3,3


In [26]:
#remove people who live in a state with low representation 
final_df = final_df[~final_df["state"].isin(final_df["state"].value_counts().tail(8).index)]
final_df["vax_dose_series"] = final_df["vax_dose_series"].apply(lambda x: "2+" if x in ["2", "3", "4", "5", "6", "7+"] else x)
final_df["vax_dose_series"] = final_df["vax_dose_series"].fillna("unk")
final_df["vax_lot"] = final_df["vax_lot"].fillna("")
final_df["vax_lot"] = final_df["vax_lot"].str.replace(",", "")
final_df["symptoms"] = final_df["symptoms"].fillna("")
final_df["age_yrs"] = final_df["age_yrs"].fillna(0)
final_df["numdays"] = final_df["numdays"].fillna(-1)
final_df["state"] = final_df["state"].fillna("unk")
for i in final_df.columns:
    if "_cleaned" in i:
        final_df[i] = final_df[i].fillna("none")
        final_df[i] = final_df[i].apply(lambda x: "none" if x == "" else x)

In [27]:
final_df = final_df[["vaers_id", "vax_manu", "vax_lot", "vax_dose_series", "symptoms", "state", "age_yrs", 
          "sex", "numdays", "lab_data", "lab_data_cleaned", "other_meds", "other_meds_cleaned", "cur_ill", "cur_ill_cleaned", 
           "history", "history_cleaned", "prior_vax", "prior_vax_cleaned", "allergies", "allergies_cleaned", "serious_se"]]

In [28]:
final_df.isnull().sum()

vaers_id                   0
vax_manu                   0
vax_lot                    0
vax_dose_series            0
symptoms                   0
state                      0
age_yrs                    0
sex                        0
numdays                    0
lab_data               62511
lab_data_cleaned           0
other_meds             40438
other_meds_cleaned         0
cur_ill                52792
cur_ill_cleaned            0
history                31874
history_cleaned            0
prior_vax             102495
prior_vax_cleaned          0
allergies              44067
allergies_cleaned          0
serious_se                 0
dtype: int64

In [31]:
final_df.to_csv("../../src/processed/covid_vaccine_symptoms.csv", index = False)

### Map Drugs to Active Ingredients and Group

In [22]:
final_df = pd.read_csv("../../src/processed/covid_vaccine_symptoms.csv")

In [33]:
drugnames = read_and_lower("../../src/raw/drugs/drugnames.csv", vaers= False)[["drugname", "activeingredient"]]
drug_stem = read_and_lower("../../src/processed/drug_stems_cleaned.csv", vaers = False)

In [34]:
drugnames["drugname_cleaned"] = clean_text(drugnames["drugname"])
drugnames["activeingredient_cleaned"] = clean_text(drugnames["activeingredient"])
drugnames_cleaned = drugnames[["drugname_cleaned", "activeingredient_cleaned"]].drop_duplicates()

#For short drugs, split other_meds_cleaned into words and need to match short drug name exactly 
short_drugs = drugnames_cleaned[drugnames_cleaned["drugname_cleaned"].apply(lambda x: len(x) <= 5)]
short_drugs = short_drugs[short_drugs["drugname_cleaned"] != ""]
short_drugs = dict(zip(short_drugs["drugname_cleaned"], short_drugs["activeingredient_cleaned"]))
#For long drugs, just search for long drug name in entire text to see if there's a match 
long_drugs = drugnames_cleaned[drugnames_cleaned["drugname_cleaned"].apply(lambda x: len(x) > 5)]
long_drugs = dict(zip(long_drugs["drugname_cleaned"], long_drugs["activeingredient_cleaned"]))


In [35]:
start = time.time()
print("Picking out and converting short drug names")
final_df["meds_list"] = final_df["other_meds_cleaned"].astype(str).str.split().apply(lambda x: np.array([short_drugs[i] if i in short_drugs.keys() else "" for i in x]))
print(f'Time elapsed: {round(time.time() - start, 1)} seconds')
start = time.time()
print("Picking out and converting long drug names")
final_df["meds_list2"] = final_df["other_meds_cleaned"].astype(str).apply(lambda x: np.unique(np.array([long_drugs[i] if i in x else "" for i in long_drugs.keys()])))
print(f'Time elapsed: {round(time.time() - start, 1)} seconds')
start = time.time()
print("Picking out active ingredients")
final_df["meds_list3"] = final_df["other_meds_cleaned"].astype(str).apply(lambda x: np.unique(np.array([i if i in x else "" for i in (list(short_drugs.values()) + list(long_drugs.values()))])))
print(f'Time elapsed: {round(time.time() - start, 1)} seconds')
start = time.time()
print("Combining lists and removing empty strings")
final_df["active_ingredients"] = final_df.apply(lambda x: list(np.unique(np.concatenate((x["meds_list"], x["meds_list2"], x["meds_list3"]), axis = 0))), axis = 1)
final_df["active_ingredients"] = final_df["active_ingredients"].apply(lambda x: list(filter(None, x)))
print(f'Time elapsed: {round(time.time() - start, 1)} seconds')

Picking out and converting short drug names
Time elapsed: 0.9 seconds
Picking out and converting long drug names
Time elapsed: 238.1 seconds
Picking out active ingredients
Time elapsed: 212.6 seconds
Combining lists and removing empty strings
Time elapsed: 5.1 seconds


In [36]:
#convert active ingredients to corresponding drug group based on active ingredient's name  
prefix_dict = dict(zip(drug_stem[drug_stem["pos"] == "prefix"]["stem"], drug_stem[drug_stem["pos"] == "prefix"]["definition"]))
suffix_dict = dict(zip(drug_stem[drug_stem["pos"] == "suffix"]["stem"], drug_stem[drug_stem["pos"] == "suffix"]["definition"]))
middle_dict = dict(zip(drug_stem[drug_stem["pos"] == "middle"]["stem"], drug_stem[drug_stem["pos"] == "middle"]["definition"]))

def dict_lookup(dict_, active_ing_list, type_):
    '''
    Determines which stems in the drug stem dictionary are present in a list of active ingredients
    
    Inputs:
        dict_ (dict): drug stem dictionary, contains the generic drug stem as the keys and the drug group as values 
        active_ing_list (list): list of active ingredients to iterate through and check if drug stems are present
        type_ (str): type of drug stem dictionary, i.e. prefix, middle, suffix 
    
    Outputs:
        lookup_keys (list): list of dictionary keys that are present in the active_ing_list to later convert to groups 
    '''
    keys = list(dict_.keys())
    #if the dictionary is a prefix drug stem, check to see if each item in active_ing_list starts with any of the keys
    if type_ == "prefix":
        key_bool = np.array([np.array([i.startswith(j) for j in keys]) for i in active_ing_list])
    #if the dictionary is a suffix drug stem, check to see if each item in active_ing_list ends with any of the keys 
    if type_ == "suffix": 
        key_bool = np.array([np.array([i.endswith(j) for j in keys]) for i in active_ing_list])
    #if the dictionary is a middle drug stem, check to see if each item in active_ing_list contains any of the keys
    if type_ == "middle": 
        key_bool = np.array([np.array([j in i[1:-1] for j in keys]) for i in active_ing_list])
    lookup_keys = list(itertools.chain(*[list(np.array(keys)[i]) for i in key_bool]))
    return lookup_keys

def translate_drug_group(active_ing_list): 
    '''
    Takes in a list of drug active ingredients and returns list of all drug groups the active ingredients are part of 
    
    Inputs: 
        active_ing_list (list): list of active ingredients to iterate through and list out drug groups ingredients belong to
    
    Outputs: 
        converted (list): list of all drug groups that are present in the active ingredients list 
    '''
    #get the drug stem keys for each prefix, middle, and suffix dictionary 
    prefix_keys = dict_lookup(prefix_dict, active_ing_list, "prefix")
    suffix_keys = dict_lookup(suffix_dict, active_ing_list, "suffix")
    middle_keys = dict_lookup(middle_dict, active_ing_list, "middle")
    #convert to drug group based off of drug stem keys 
    converted = [prefix_dict[key] for key in prefix_keys]
    converted.extend([suffix_dict[key] for key in suffix_keys])
    converted.extend([middle_dict[key] for key in middle_keys])
    return converted

In [37]:
final_df["drug_group"] = final_df["active_ingredients"].apply(lambda x: translate_drug_group(re.split(' ', ' '.join(x))) if len(x) > 0 else x)

* Could further process and pick out active ingredients by using fuzzywuzzy to handle spelling errors for long drug names, however this could introduce inaccurate matching and arduous cleaning process 

In [48]:
final_df["active_ingredients_cleaned"] = clean_text(final_df["active_ingredients"].apply(lambda x: ", ".join(i for i in x)))
final_df["drug_group_cleaned"] = clean_text(final_df["drug_group"].apply(lambda x: ", ".join(i for i in x)))


In [49]:
final_df.to_csv("../../src/processed/covid_vaccine_symptoms 2.csv", index = False)