# **Respiratory Virus Hospitalization Surveillance Network (RESP-NET) Data**

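RESP-NET monitors hospitalizations associated with COVID-19, respiratory syncytial virus (RSV), and influenza. This notebook downloads the RESP-NET data from the CDC data API, cleans it, and saves the result to a CSV file.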

Data Source: https://www.cdc.gov/resp-net/dashboard/index.html

Run on Python 3.12 | No Errors | No Warnings

In [24]:
# Import packages

# For data manipulation
import pandas as pd

# For making HTTP requests
import requests

# For adding delays in between HTTP requests
import time

# For creating in-memory text streams
from io import StringIO as sio

# For creating and displaying progress bars
from tqdm import tqdm

In [25]:
# Base API endpoint for the dataset, without a file extension:
# later cells append ".json" (record count) or ".csv" (data pages) to it
url = "https://data.cdc.gov/resource/kvib-3txy"

In [26]:
# Function to get total record count with error handling
def get_total_count():
    """Return the number of records available at the API endpoint.

    Sends a ``count(*)`` query to ``{url}.json`` and reads the count from the
    first row of the JSON response.

    Returns:
        int | None: The record count, or None if the request failed or the
        response could not be interpreted. Errors are printed, not raised.
    """
    try:
        # The timeout (seconds) keeps the cell from hanging indefinitely when the
        # server stops responding; a timeout is reported as a request error below
        response = requests.get(
            f"{url}.json",
            params={"$select": "count(*)"},
            timeout=30,
        )
        response.raise_for_status()  # Check if the request was successful
        data = response.json()
        return int(data[0]['count'])
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")  # Print the HTTP error
    except requests.exceptions.RequestException as req_err:
        print(f"Error occurred during the request: {req_err}")  # Print other request errors
    except Exception as err:
        print(f"An error occurred: {err}")  # Print any other errors
    return None  # Return None if there was an error

In [27]:
# Ask the API how many records exist (None means the lookup failed)
total_count = get_total_count()

# Report either the count or the failure
print(
    "Failed to retrieve the total record count."
    if total_count is None
    else f"Total record count: {total_count}"
)

Total record count: 43007


In [28]:
# Initialize parameters
params = {
    "$limit": 1000,   # Number of records per request
    "$offset": 0,     # Starting point for records
    # Explicit sort order so that $limit/$offset pages are stable and do not
    # overlap or skip rows; ":id" is the API's system row identifier
    "$order": ":id",
}

In [29]:
# Stop early with a clear message if the record count could not be retrieved;
# otherwise the loop condition below would fail with a TypeError on None
if total_count is None:
    raise RuntimeError("Total record count is unavailable; cannot download records.")

# Initialize an empty list to store the data
data_list = []

# Always start from the first page, so that re-running this cell downloads the
# data again instead of skipping the loop because of a leftover offset
params['$offset'] = 0

# Loop to fetch all records with progress bar
with tqdm(total=total_count, desc="Downloading Records", unit="record") as pbar:
    while params['$offset'] < total_count:
        # Request the next page; the timeout (seconds) prevents an indefinite hang
        response = requests.get(f"{url}.csv", params=params, timeout=60)

        # Check if the response is successful
        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            print(f"Response: {response.text}")
            break

        # Attempt to parse the response as CSV
        try:
            data = pd.read_csv(sio(response.text))
        except Exception as e:
            print(f"Error: Unable to parse response as CSV: {e}")
            break

        # An empty page means there is nothing left to download
        if data.empty:
            print("No data to fetch.")
            break

        # Append the data to the list
        data_list.append(data)

        # Advance the progress bar by the number of rows just received
        pbar.update(len(data))

        # Increment the offset for the next request
        params['$offset'] += params['$limit']

        # Delay between requests to handle rate limiting
        time.sleep(1)  # Delay for 1 second

Downloading Records: 100%|██████████| 43007/43007 [01:00<00:00, 712.23record/s]


In [30]:
# Stack the downloaded pages into one DataFrame with a fresh 0..n-1 index
df0 = pd.concat(objs=data_list, axis=0, ignore_index=True)

# Report how many rows were retrieved in total
print(f"Total records retrieved: {len(df0)}")

Total records retrieved: 43007


In [31]:
# Preview the first 5 rows of the raw data
df0.head(n=5)

Unnamed: 0,surveillance_network,season,mmwr_year,mmwr_week,age_group,sex,race_ethnicity,site,weekly_rate,cumulative_rate,_weekenddate,type
0,FluSurv-NET,2018-19,2018,40,Overall,Overall,"AI/AN, non-Hispanic",Overall,0.0,0.0,2018-10-06T00:00:00.000,Unadjusted Rate
1,FluSurv-NET,2018-19,2018,41,Overall,Overall,"AI/AN, non-Hispanic",Overall,0.0,0.0,2018-10-13T00:00:00.000,Unadjusted Rate
2,FluSurv-NET,2018-19,2018,42,Overall,Overall,"AI/AN, non-Hispanic",Overall,0.0,0.0,2018-10-20T00:00:00.000,Unadjusted Rate
3,FluSurv-NET,2018-19,2018,43,Overall,Overall,"AI/AN, non-Hispanic",Overall,0.0,0.0,2018-10-27T00:00:00.000,Unadjusted Rate
4,FluSurv-NET,2018-19,2018,44,Overall,Overall,"AI/AN, non-Hispanic",Overall,0.0,0.0,2018-11-03T00:00:00.000,Unadjusted Rate


In [32]:
# Summarize the raw DataFrame: column names, non-null counts, dtypes, and memory usage
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43007 entries, 0 to 43006
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   surveillance_network  43007 non-null  object 
 1   season                43007 non-null  object 
 2   mmwr_year             43007 non-null  int64  
 3   mmwr_week             43007 non-null  int64  
 4   age_group             43007 non-null  object 
 5   sex                   43007 non-null  object 
 6   race_ethnicity        43007 non-null  object 
 7   site                  43007 non-null  object 
 8   weekly_rate           42739 non-null  float64
 9   cumulative_rate       42739 non-null  float64
 10  _weekenddate          43007 non-null  object 
 11  type                  43007 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 3.9+ MB


In [33]:
# Count the missing values in each column
df0.isna().sum()

surveillance_network      0
season                    0
mmwr_year                 0
mmwr_week                 0
age_group                 0
sex                       0
race_ethnicity            0
site                      0
weekly_rate             268
cumulative_rate         268
_weekenddate              0
type                      0
dtype: int64

In [34]:
# Keep only the rows that have no missing value in any column
df0 = df0.dropna()

In [35]:
# Confirm that every column now has zero missing values
df0.isna().sum()

surveillance_network    0
season                  0
mmwr_year               0
mmwr_week               0
age_group               0
sex                     0
race_ethnicity          0
site                    0
weekly_rate             0
cumulative_rate         0
_weekenddate            0
type                    0
dtype: int64

In [36]:
# Remove the rows for selected age groups.

# Age-group labels to exclude (they overlap with other age groups)
# NOTE(review): '75+years' has no space, unlike the other labels — confirm it
# matches the value used in the data; a label that matches nothing filters nothing.
agegroups_to_drop = ['0-4 years', '18-49 years', '5-17 years', '65+ years', '75+years']

# Boolean mask: True for rows whose age group is in the exclusion list
mask = df0['age_group'].isin(agegroups_to_drop)

# Keep only the rows that are NOT flagged by the mask
df0 = df0.loc[~mask]

In [37]:
# Keep only the rows where both weekly_rate and cumulative_rate are non-zero
# (a row is dropped if either of the two rates equals 0)
df0 = df0[(df0['weekly_rate'] != 0) & (df0['cumulative_rate'] != 0)]

In [38]:
# Verify the records have been dropped: the entry count and per-column
# non-null counts reflect the rows that remain after filtering
df0.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35015 entries, 6 to 43006
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   surveillance_network  35015 non-null  object 
 1   season                35015 non-null  object 
 2   mmwr_year             35015 non-null  int64  
 3   mmwr_week             35015 non-null  int64  
 4   age_group             35015 non-null  object 
 5   sex                   35015 non-null  object 
 6   race_ethnicity        35015 non-null  object 
 7   site                  35015 non-null  object 
 8   weekly_rate           35015 non-null  float64
 9   cumulative_rate       35015 non-null  float64
 10  _weekenddate          35015 non-null  object 
 11  type                  35015 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 3.5+ MB


In [39]:
# Remove the MMWR year/week columns, which are not needed downstream
df0 = df0.drop(columns=['mmwr_year', 'mmwr_week'])

In [40]:
# Replace the abbreviated race/ethnicity labels with their display names
# (values not listed here are left unchanged)
df0['race_ethnicity'] = df0['race_ethnicity'].replace({
    'AI/AN, non-Hispanic': 'American Indian/Alaska Native',
    'A/PI, non-Hispanic': 'Asian/Pacific Islander',
    'Black, non-Hispanic': 'Black/African American',
    'White, non-Hispanic': 'White',
})

In [41]:
# Reword the children/adults age-group labels so the category name comes first
# (values not listed here are left unchanged)
df0['age_group'] = df0['age_group'].replace({
    '0-17 years (Children)': 'Children (0-17 years)',
    '18+ years (Adults)': 'Adults (18+ years)',
})

In [42]:
# Give the columns human-readable names
df0 = df0.rename(
    columns={
        'surveillance_network': 'Network',
        'season': 'Season',
        'age_group': 'Age Group',
        'sex': 'Sex',
        'race_ethnicity': 'Race/Ethnicity',
        'site': 'State',
        'weekly_rate': 'Weekly Rate',
        'cumulative_rate': 'Cumulative Rate',
        '_weekenddate': 'Week Ending',
        'type': 'Type',
    }
)

In [43]:
# Parse the 'Week Ending' strings (e.g. 2018-10-06T00:00:00.000) into datetime values
df0['Week Ending'] = pd.to_datetime(df0['Week Ending'])

In [44]:
# Verify the changes: the columns should carry their new names and
# 'Week Ending' should have a datetime dtype
df0.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35015 entries, 6 to 43006
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Network          35015 non-null  object        
 1   Season           35015 non-null  object        
 2   Age Group        35015 non-null  object        
 3   Sex              35015 non-null  object        
 4   Race/Ethnicity   35015 non-null  object        
 5   State            35015 non-null  object        
 6   Weekly Rate      35015 non-null  float64       
 7   Cumulative Rate  35015 non-null  float64       
 8   Week Ending      35015 non-null  datetime64[ns]
 9   Type             35015 non-null  object        
dtypes: datetime64[ns](1), float64(2), object(7)
memory usage: 2.9+ MB


In [45]:
# Preview the first 5 rows of the cleaned data
df0.head(n=5)

Unnamed: 0,Network,Season,Age Group,Sex,Race/Ethnicity,State,Weekly Rate,Cumulative Rate,Week Ending,Type
6,FluSurv-NET,2018-19,Overall,Overall,American Indian/Alaska Native,Overall,0.5,0.5,2018-11-17,Unadjusted Rate
9,FluSurv-NET,2018-19,Overall,Overall,American Indian/Alaska Native,Overall,1.0,1.5,2018-12-08,Unadjusted Rate
10,FluSurv-NET,2018-19,Overall,Overall,American Indian/Alaska Native,Overall,2.5,3.9,2018-12-15,Unadjusted Rate
11,FluSurv-NET,2018-19,Overall,Overall,American Indian/Alaska Native,Overall,5.4,9.3,2018-12-22,Unadjusted Rate
12,FluSurv-NET,2018-19,Overall,Overall,American Indian/Alaska Native,Overall,5.4,14.7,2018-12-29,Unadjusted Rate


In [46]:
# Save the cleaned data to 'cleaned_respnet_data.csv' (path is relative to the
# current working directory); index=False leaves the row index out of the file
df0.to_csv('cleaned_respnet_data.csv', index=False)