In [1]:
# Imports 
import numpy as np
import pandas as pd
import pyodbc as db
import matplotlib.pyplot as plt
import os
%matplotlib inline
import requests
from requests.auth import HTTPBasicAuth
from io import StringIO
from sodapy import Socrata
from datetime import date
import datetime
import seaborn as sns
import configparser

# Constants 
stringVAERSDATA     = "VAERSDATA"
stringVAERSSYMPTOMS = "VAERSSYMPTOMS"
stringVAERSVAX      = "VAERSVAX"
inputDirectory      = '.\RAW_DATA'
outputDirectory     = '.\OUTPUT'
pklInputDirectory   = '.\INPUT'

usePickle           = 0

In [2]:
# Locals 
dfVAERSDATA_Initialized     = False
dfVAERSSYMPTOMS_Initialized = False
dfVAERSVAX_Initialized      = False

# @TODO Column datatypes - need to finish  - getting the following warning when reading in files:
# DtypeWarning: Columns (11,13,15,16,27) have mixed types.Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
dtype_VAERSDATA = {"VAERS_ID": np.int64, "RECVDATE": np.object, "STATE": np.str, "AGE_YRS": np.float64, "CAGE_YR": np.float64, "CAGE_MO": np.float64, "SEX": np.str, "RPT_VDATE": np.object, "SYMPTOM_TEXT": np.str, "DIED": np.str, "DATEDIED": np.object, "L_THREAD": np.str, "ER_VISIT": np.str}

# Iterate over files 
for filename in os.listdir(inputDirectory):

    # Create the file name 
    fullFilename = os.path.join(inputDirectory, filename)

    # Ensure its a file
    if os.path.isfile(fullFilename):

        # Print the file name to see where it breaks - if needed 
        # print(fullFilename)

        # Overall
        if(fullFilename.__contains__(stringVAERSDATA)):
            
            # Check if the dataframe has been initialized
            if(dfVAERSDATA_Initialized == False):
                
                # Read in the file to this dataframe directly 
                dfVAERSDATA = pd.read_csv(fullFilename, encoding='cp1252', dtype=dtype_VAERSDATA)

                # Set to init 
                dfVAERSDATA_Initialized = True

            else:

                # Add the new file to the dataframe that has been init'd
                dfVAERSDATA = pd.concat([dfVAERSDATA, pd.read_csv(fullFilename, encoding='cp1252', dtype=dtype_VAERSDATA)], ignore_index=True)
            
        # Symptoms
        elif(fullFilename.__contains__(stringVAERSSYMPTOMS)):
            
            # Check if the dataframe has been initialized
            if(dfVAERSSYMPTOMS_Initialized == False):
                
                # Read in the file to this dataframe directly 
                dfVAERSSYMPTOMS = pd.read_csv(fullFilename, encoding='cp1252')

                # Set to init 
                dfVAERSSYMPTOMS_Initialized = True

            else:

                # Add the new file to the dataframe that has been init'd
                dfVAERSSYMPTOMS = pd.concat([dfVAERSSYMPTOMS, pd.read_csv(fullFilename, encoding='cp1252')], ignore_index=True)

        # Vax
        elif(fullFilename.__contains__(stringVAERSVAX)):
            
            # Check if the dataframe has been initialized
            if(dfVAERSVAX_Initialized == False):
                
                # Read in the file to this dataframe directly 
                dfVAERSVAX = pd.read_csv(fullFilename, encoding='cp1252')

                # Set to init 
                dfVAERSVAX_Initialized = True

            else:

                # Add the new file to the dataframe that has been init'd
                dfVAERSVAX = pd.concat([dfVAERSVAX, pd.read_csv(fullFilename, encoding='cp1252')], ignore_index=True)
        
completeDataFrame = pd.merge(left = dfVAERSVAX, right = dfVAERSDATA, \
                      how = "inner", left_on = ["VAERS_ID"], \
                      right_on = ["VAERS_ID"])

# Create a pickle 
completeDataFrame.to_pickle(os.path.join(outputDirectory, "completeDataFrame.pkl"))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype_VAERSDATA = {"VAERS_ID": np.int64, "RECVDATE": np.object, "STATE": np.str, "AGE_YRS": np.float64, "CAGE_YR": np.float64, "CAGE_MO": np.float64, "SEX": np.str, "RPT_VDATE": np.object, "SYMPTOM_TEXT": np.str, "DIED": np.str, "DATEDIED": np.object, "L_THREAD": np.str, "ER_VISIT": np.str}
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype_VAERSDATA = {"VAERS_ID": np.int64, "RECVDATE": np.object, "STATE": np.str, "AGE_YRS": np.float64, "CAGE_YR": np.float64, "CAGE_MO": np.float64, "SEX": np.str, "RPT_VDATE": np.object, "SYMPTOM_TEXT": np.str, "DIED": np.str, "DATEDIED": np.object, "L_THREAD": np.str, "ER_VISIT": np.str}
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_no

In [13]:
# Use the pickle or not
if(usePickle == True):

    # Read in the pickle 
    rawVaxFrame = pd.read_pickle(os.path.join(pklInputDirectory, "completeDataFrame.pkl"))

else:

    # Just make a copy
    rawVaxFrame = completeDataFrame.copy()

# Show the columns 
# rawVaxFrame.info()

In [26]:
# Make a copy 
workingCovidFrame = rawVaxFrame.copy(deep=True)

# Only get the covid vaxes 
workingCovidFrame = workingCovidFrame[workingCovidFrame["VAX_TYPE"] == "COVID19"]

# Drop the duplicates 
workingCovidFrame = workingCovidFrame.drop_duplicates(subset=["VAERS_ID"])

# Get rid of all columns except those we want 
workingCovidFrame = workingCovidFrame[['VAX_MANU','VAX_LOT', 'VAX_DOSE_SERIES', 'STATE', 'AGE_YRS', 'CAGE_YR', 'SEX', 'VAX_DATE', 'ONSET_DATE', 'NUMDAYS', 'DIED']]

# Print this out 
# workingCovidFrame.to_csv(os.path.join(outputDirectory, "workingCovidFrame.csv"))

In [27]:
# If NA then assign as 0 else they died
workingCovidFrame["DIED"] = np.where(workingCovidFrame["DIED"].isna(), 0, 1)

In [28]:
# If NA then assign as 0 else they died
workingCovidFrame["AGE_YRS"] = np.where(workingCovidFrame["AGE_YRS"].isna(), workingCovidFrame["CAGE_YR"], workingCovidFrame["AGE_YRS"])

In [29]:
# If NA then assign as 0 else they died
workingCovidFrame["CAGE_YR"] = np.where(workingCovidFrame["CAGE_YR"].isna(), workingCovidFrame["AGE_YRS"], workingCovidFrame["CAGE_YR"])

In [34]:
# someData = workingCovidFrame[workingCovidFrame["AGE_YRS"].isna()].groupby("DIED").count()
workingCovidFrame = workingCovidFrame[workingCovidFrame["AGE_YRS"].isna() == False]

In [37]:
workingCovidFrame

Unnamed: 0,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,STATE,AGE_YRS,CAGE_YR,SEX,VAX_DATE,ONSET_DATE,NUMDAYS,DIED
1070834,PFIZER\BIONTECH,EH9899,1,NJ,56.0,56.0,F,12/15/2020,12/15/2020,0.0,0
1070853,PFIZER\BIONTECH,EH 9899,1,AZ,35.0,35.0,F,12/15/2020,12/15/2020,0.0,0
1070860,PFIZER\BIONTECH,EH9899,1,WV,55.0,55.0,F,12/15/2020,12/15/2020,0.0,0
1070880,PFIZER\BIONTECH,EH9899,UNK,LA,42.0,42.0,M,12/15/2020,12/15/2020,0.0,0
1070881,PFIZER\BIONTECH,EH9899,1,AR,60.0,60.0,F,12/15/2020,12/15/2020,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
2322235,PFIZER\BIONTECH,,2,FR,69.0,69.0,F,11/05/2021,11/05/2021,0.0,1
2322238,PFIZER\BIONTECH,FG4686,3,FR,67.0,67.0,M,12/07/2021,12/07/2021,0.0,1
2322244,PFIZER\BIONTECH,FF2382,3,FR,66.0,66.0,M,11/29/2021,11/29/2021,0.0,1
2322245,PFIZER\BIONTECH,PCA0008,1,FR,14.0,14.0,F,11/08/2021,11/29/2021,21.0,1
