# Data extraction and cleaning for SeroTracker
### Author: Hailey Robertson
### Date: 2024-09-24
### Citations:
- Whelan, Mairead G, Harriet Ware, Himanshu Ranka, Sean Kenny, Sabah Shaikh, Yannik Roell, Shaila Akter, et al. “Arbotracker: A Multipathogen Dashboard and Data Platform for Arbovirus Seroprevalence Studies.” The Lancet Infectious Diseases, September 10, 2024. https://doi.org/10.1016/s1473-3099(24)00585-1


In [19]:
# Imports
import requests
import pandas as pd
from datetime import datetime
import os
import numpy as np

In [20]:
# GraphQL query for the SeroTracker API: selects the fields listed below
# from the `arbovirusEstimates` collection.
query = """
query {
  arbovirusEstimates {
  id
  estimateId
  pathogen
  serotype
  country
  countryAlphaTwoCode
  countryAlphaThreeCode
  sampleStartDate
  sampleEndDate
  seroprevalence
  city
  state
  latitude
  longitude
  unRegion
  whoRegion
  seroprevalenceStudy95CILower
  seroprevalenceStudy95CIUpper
  seroprevalenceCalculated95CILower
  seroprevalenceCalculated95CIUpper
  sampleSize
  sampleNumerator
  sampleFrame
  antibodies
  assay
  assayOther
  sameFrameTargetGroup
  antigen
  inclusionCriteria
  ageGroup
  ageMinimum
  ageMaximum
  pediatricAgeGroup
  sex
  url
  sourceSheetId
  sourceSheetName
  createdAt
  producer
  producerOther
  }
}
"""

# Backend URL
url = "https://iit-backend-v2.vercel.app/api/graphql"

# Seconds to wait for the backend before giving up. Without an explicit
# timeout, requests waits indefinitely on a stalled connection and the
# notebook hangs.
REQUEST_TIMEOUT_SECONDS = 60

# Request to the API
response = requests.post(
    url,
    json={'query': query},
    headers={'Content-Type': 'application/json'},
    timeout=REQUEST_TIMEOUT_SECONDS,
)

# Check if the request was successful
if response.status_code == 200:
    data = response.json()
    # A GraphQL server can answer HTTP 200 and still report query problems in
    # an `errors` list, so surface those instead of failing silently later.
    if isinstance(data, dict) and data.get('errors'):
        print(f"Query returned GraphQL errors: {data['errors']}")
else:
    # Always (re)bind `data` on failure: otherwise the next cell raises a
    # NameError on a fresh kernel, or silently reuses the payload of an earlier
    # successful run.
    data = {}
    print(f"Query failed with status code {response.status_code}: {response.text}")


In [21]:
# Extract data and create dataframe
# A GraphQL response may contain `"data": null` (for example when the query
# fails), so unwrap the payload defensively instead of assuming nested dicts.
payload = data.get('data') if isinstance(data, dict) else None
estimates = payload.get('arbovirusEstimates') if isinstance(payload, dict) else None

if estimates is not None:
    # Create DataFrame from the estimates list
    sero_df = pd.DataFrame(estimates)

    # Replace empty lists with NaN so they are written as missing values
    # rather than as the literal text "[]" in the CSV
    sero_df = sero_df.apply(
        lambda col: col.map(lambda x: np.nan if isinstance(x, list) and len(x) == 0 else x)
    )

    # Get today's date (local time) to stamp the output file
    today_date = datetime.now().strftime('%Y-%m-%d')

    # Define the directory to save the CSV file
    data_directory = '../data'  # Relative path from src to data directory

    # Create the directory if it is missing; to_csv does not create parent
    # directories and would otherwise fail with an OSError
    os.makedirs(data_directory, exist_ok=True)

    # Define the filename with today's date
    filename = f"serotracker_estimates_{today_date}.csv"

    # Create the full path for saving the CSV
    file_path = os.path.join(data_directory, filename)

    # Save the DataFrame as a CSV file
    sero_df.to_csv(file_path, index=False)

    print(f"DataFrame saved as {file_path}")

else:
    print("No data found in the response. Check query.")

DataFrame saved as ../data/serotracker_estimates_2024-09-24.csv
