In [11]:
import requests
import pandas as pd
from scipy import stats

# Step 1 – Load data from API
url = "https://api.covidtracking.com/v1/status.json"
response = requests.get(url)
data = response.json()

# Wrap single JSON object in a list to create DataFrame
df = pd.DataFrame([data])
print("✅ Step 1 – Data loaded:")
print(df.head(1))

# Step 2 – Rename columns for clarity
df = df.rename(columns={
    'fips': 'state_fips',
    'state': 'state_code',
    'positive': 'cases_total',
    'death': 'deaths_total',
    'totalTestResults': 'total_tests',
    'hospitalizedCurrently': 'hospitalized_now',
    'inIcuCurrently': 'icu_now',
    'onVentilatorCurrently': 'ventilator_now'
})
print("\n✅ Step 2 – Columns renamed:")
print(df.columns.tolist())

# Step 3 – Clean numeric data and check for 'date'
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(0).astype(int)

if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d', errors='coerce')
    print("\n✅ Step 3 – Date and numerics cleaned:")
    print(df[['date'] + list(numeric_cols)].head(1))
else:
    print("\n✅ Step 3 – Numeric columns cleaned (no date column available):")
    print(df[numeric_cols].head(1))

# Step 4 – Identify outliers using Z-score
z_scores = df[numeric_cols].apply(lambda x: stats.zscore(x), axis=0).fillna(0)

# If multiple rows, check for outliers row-wise
if len(df) > 1:
    outlier_flags = (z_scores.abs() > 3).any(axis=1)
    print("\n✅ Step 4 – Outlier detected? ", outlier_flags.any())
else:
    print("\n✅ Step 4 – Only one row: skipping outlier detection (Z-scores calculated)")
    print(z_scores)


# Step 5 – Fix text casing and strip whitespace
text_cols = df.select_dtypes(include='object').columns
for col in text_cols:
    df[col] = df[col].astype(str).str.strip().str.upper()

print("\n✅ Step 5 – Text fields cleaned:")
print(df[text_cols].head(1))

# Final – Print cleaned data
print("\n✅ Final Cleaned Dataset:")
print(df.head(1).to_string(index=False))


✅ Step 1 – Data loaded:
                  buildTime  production runNumber
0  2021-06-01T07:02:53.446Z        True      6244

✅ Step 2 – Columns renamed:
['buildTime', 'production', 'runNumber']

✅ Step 3 – Numeric columns cleaned (no date column available):
Empty DataFrame
Columns: []
Index: [0]

✅ Step 4 – Only one row: skipping outlier detection (Z-scores calculated)
Series([], dtype: float64)

✅ Step 5 – Text fields cleaned:
                  buildTime runNumber
0  2021-06-01T07:02:53.446Z      6244

✅ Final Cleaned Dataset:
               buildTime  production runNumber
2021-06-01T07:02:53.446Z        True      6244
