<a href="https://colab.research.google.com/github/kkipngenokoech/DA-course/blob/main/Lecture_6_Visualizations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports

import xarray as xr
from pathlib import Path
import glob
import numpy as np
import pandas as pd
import time
from matplotlib import pyplot as plt
import seaborn as sns
from datetime import date
from datetime import datetime, timedelta

In [None]:
# Download the daily US report CSVs from GitHub (one file per calendar day)
# and stack them into a single DataFrame, `df`.

base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/"

all_data = []
skipped_dates = []  # days whose file could not be fetched or parsed
start_date = datetime(2020, 4, 12)

# We're getting 1 year of data
end_date = datetime(2021, 4, 12)  # but can get data up until March 9th, 2023

current = start_date
while current <= end_date:
    date_str = current.strftime("%m-%d-%Y") + ".csv"
    try:
        df_temp = pd.read_csv(base_url + date_str)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Best effort: a missing or unreadable day is skipped, but it is
        # reported instead of being silently swallowed. urllib's HTTPError and
        # URLError are OSError subclasses, so 404s and network failures land
        # here. Unlike a bare `except`, this lets KeyboardInterrupt through,
        # so the loop can still be interrupted.
        skipped_dates.append(current.date())
        print(f"Skipping {date_str}: {err}")
    else:
        # Later cells pivot and animate on a 'Date' column. Derive it from the
        # file's date when the CSV does not carry one.
        # NOTE(review): if only some files ship their own 'Date' column, the
        # combined column will mix types -- confirm against the data.
        if "Date" not in df_temp.columns:
            df_temp["Date"] = current.date()
        all_data.append(df_temp)
    current += timedelta(days=1)

if not all_data:
    # pd.concat([]) would fail with a much less helpful message.
    raise RuntimeError(
        f"No daily reports could be downloaded from {base_url} "
        f"between {start_date:%Y-%m-%d} and {end_date:%Y-%m-%d}."
    )

print(f"Loaded {len(all_data)} daily reports; skipped {len(skipped_dates)}.")

df = pd.concat(all_data, ignore_index=True)


In [None]:
def add_region_and_filter(df):
    """Keep only rows for the 50 states + DC and tag each with its region.

    Parameters
    ----------
    df : DataFrame with a 'Province_State' column.

    Returns
    -------
    A filtered copy of ``df`` (the input is not modified) with an extra
    'Region' column holding one of 'Northeast', 'Midwest', 'South', 'West'.
    Rows whose 'Province_State' is not listed below are dropped.
    """

    # Region name -> member states
    members_by_region = {
        'Northeast': ['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire',
                      'Rhode Island', 'Vermont', 'New Jersey', 'New York', 'Pennsylvania'],
        'Midwest': ['Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin',
                    'Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska',
                    'North Dakota', 'South Dakota'],
        'South': ['Delaware', 'Florida', 'Georgia', 'Maryland', 'North Carolina',
                  'South Carolina', 'Virginia', 'West Virginia', 'District of Columbia',
                  'Alabama', 'Kentucky', 'Mississippi', 'Tennessee',
                  'Arkansas', 'Louisiana', 'Oklahoma', 'Texas'],
        'West': ['Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico',
                 'Utah', 'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington']
    }

    # Invert the mapping into a flat state -> region lookup.
    state_to_region = {}
    for region_name, member_states in members_by_region.items():
        for state_name in member_states:
            state_to_region[state_name] = region_name

    # Boolean mask of rows whose state appears in the lookup.
    is_listed_state = df['Province_State'].isin(list(state_to_region))

    kept = df.loc[is_listed_state].copy()
    kept['Region'] = kept['Province_State'].map(state_to_region)
    return kept

# Apply it: `df_clean` is a filtered copy of the raw `df` (50 states + DC only)
# with the extra 'Region' column; `df` itself is left unchanged.
df_clean = add_region_and_filter(df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Get the most recent data for each state.
# Use the filtered `df_clean` (50 states + DC) rather than the raw `df`, so that
# rows dropped by add_region_and_filter cannot end up on an axis labelled
# 'State'; this also matches how the map cell builds its per-state snapshot.
# NOTE: groupby(...).last() takes the last non-null value of each column per
# state, relying on the rows being in chronological (download) order.
df_latest = df_clean.groupby('Province_State').last().reset_index()

# nlargest already returns rows ordered from largest to smallest, so no extra
# sort is needed.
df_topn = df_latest.nlargest(25, 'Confirmed')

# Create figure
fig, ax = plt.subplots(figsize=(14, 8))

# Create barplot with seaborn
sns.barplot(data=df_topn,
            x='Province_State',
            y='Confirmed',
            ax=ax,
            edgecolor='black',
            linewidth=0.7)


# Formatting
ax.set_ylabel('Confirmed Cases', fontsize=8, fontweight='bold')
ax.set_xlabel('State', fontsize=8, fontweight='bold')
ax.set_title('Cases',
             fontsize=16,
             fontweight='bold',
             pad=20)

# Rotate the state names on this specific Axes so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=25, ha='right')
# Show counts with thousands separators instead of scientific notation.
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{int(x):,}'))
ax.grid(axis='y', alpha=0.3, linestyle='--')
ax.set_axisbelow(True)  # keep the grid behind the bars

fig.tight_layout()
plt.show()

Seaborn Heatmap documentation: https://seaborn.pydata.org/generated/seaborn.heatmap.html

In [None]:
import seaborn as sns

# Pivot for heatmap: one row per state, one column per date.
# pivot_table's default aggregation (mean) is applied if a state has several
# rows for the same date.
top_n = 25  # number of states shown; also used in the title so they stay in sync
df_pivot = df_clean.pivot_table(values='Confirmed',
                                index='Province_State',
                                columns='Date')
# Rank states by their value in the last date column and keep the top_n.
df_pivot = df_pivot.nlargest(top_n, df_pivot.columns[-1])

fig, ax = plt.subplots(figsize=(14, 8))
# NOTE(review): 'tab20b' is a qualitative colormap; a sequential one (e.g.
# 'viridis') is usually a better fit for a continuous count. Left unchanged.
sns.heatmap(df_pivot, cmap='tab20b', cbar_kws={'label': 'Confirmed Cases'}, ax=ax)
# The title previously said "Top 20" while 25 states were plotted.
ax.set_title(f'COVID-19 Cases Heatmap: Top {top_n} States Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('State')
fig.tight_layout()
plt.show()

# Mapping the Spread of COVID-19

In [None]:
# Show geographically how COVID-19 had spread in a year

Geopandas Information: https://geopandas.org/en/stable/getting_started/introduction.html

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd


# Read the US state outlines (GeoJSON) into a GeoDataFrame
url = 'https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json'
us_states = gpd.read_file(url)

# Latest per-state snapshot, plus deaths as a percentage of confirmed cases
df_latest = df_clean.groupby('Province_State').last().reset_index()
deaths_per_case = df_latest['Deaths'] / df_latest['Confirmed']
df_latest['Mortality_Rate'] = deaths_per_case * 100

# One figure / one Axes for the whole map
fig, ax = plt.subplots(figsize=(15, 10))

# Base layer: state polygons
us_states.plot(ax=ax, color='hotpink', edgecolor='lime', linewidth=3)

# Overlay: one marker per state, sized by deaths and coloured by mortality rate
marker_style = dict(cmap='Blues',
                    alpha=0.6,
                    edgecolors='black',
                    linewidth=0.5)
scatter = ax.scatter(df_latest['Long_'],
                     df_latest['Lat'],
                     s=df_latest['Deaths'] / 50,
                     c=df_latest['Mortality_Rate'],
                     **marker_style)

# Colour scale legend for the markers
fig.colorbar(scatter, ax=ax, label='Mortality Rate (%)')

# Title, and hide the lon/lat axes
ax.set_title('COVID-19 Deaths and Mortality Rate by State', fontsize=16, fontweight='bold')
ax.axis('off')

fig.tight_layout()
plt.show()

# Generate a GIF

Read more details about animations in Matplotlib here: https://matplotlib.org/stable/users/explain/animations/animations.html

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import matplotlib.animation as animation
from matplotlib.animation import PillowWriter
import numpy as np

# Load US states map ONCE, at module level, and pass it into the plotting
# functions below as an argument rather than reading it again for each frame.
url = 'https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json'
us_states = gpd.read_file(url)


### -----  !! Replace this stub with your own plotting function !! ---- ###
def plot_covid_map(ax, data, current_date, us_states, max_mortality):
    """Placeholder: draw the COVID map for one date onto ``ax``.

    As written this function draws nothing and returns None; it is meant to be
    replaced with a real implementation.

    Parameters (as passed by the single-frame check in this cell):
    ax : Axes from ``plt.subplots`` to draw on
    data : the COVID DataFrame (``df_clean``)
    current_date : a value taken from ``data['Date']``
    us_states : GeoDataFrame read from the US states GeoJSON
    max_mortality : presumably an upper bound for a shared colour scale
        across frames -- TODO confirm once implemented
    """
    # TODO: do something! (draw the map for `current_date` here)
    return



# ============================================
# TEST THE FUNCTION (single frame)
# ============================================

# `max_mortality` is otherwise only assigned inside create_covid_map_gif, so on
# a fresh kernel this cell raised NameError. Compute it here the same way that
# function does, so the test frame uses the same scale as the animation.
max_mortality = df_clean['Mortality_Rate'].max()

# Draw a single frame for the most recent date in the data.
fig, ax = plt.subplots(figsize=(15, 10))
plot_covid_map(ax, df_clean, df_clean['Date'].max(), us_states, max_mortality)
plt.tight_layout()
plt.show()



In [None]:

# ------ CREATE ANIMATED GIF -------- #

def create_covid_map_gif(df, us_states, filename='covid_map.gif', fps=5, step=7):
    """
    Create animated GIF of COVID map over time

    Each frame is drawn by the module-level ``plot_covid_map`` function, called
    as ``plot_covid_map(ax, df, current_date, us_states, max_mortality)``.

    Parameters:
    -----------
    df : DataFrame with COVID data (must have Date column as date type and a
         Mortality_Rate column)
    us_states : geopandas GeoDataFrame with US state boundaries
    filename : output filename for GIF
    fps : frames per second
    step : take every Nth day (7 = weekly, 14 = bi-weekly)

    Returns:
    --------
    None. The GIF is written to ``filename``.

    Raises:
    -------
    ValueError : if ``df`` contains no dates to animate
    """

    # Get dates to animate: every `step`-th distinct date, in ascending order
    unique_dates = sorted(df['Date'].unique())[::step]
    if not unique_dates:
        # Otherwise the date-range print below fails with an opaque IndexError.
        raise ValueError("No dates to animate: df['Date'] has no values.")

    print(f"Creating animation with {len(unique_dates)} frames...")
    print(f"Date range: {unique_dates[0]} to {unique_dates[-1]}")

    # Calculate max mortality once, for consistent scaling across all frames
    max_mortality = df['Mortality_Rate'].max()

    # Create figure (a single Axes that is cleared and redrawn for every frame)
    fig, ax = plt.subplots(figsize=(15, 10))

    def animate(frame_num):
        """Redraw the map for the date at position ``frame_num``."""
        ax.clear()
        current_date = unique_dates[frame_num]
        # Pass the same arguments, in the same order, as plot_covid_map's
        # signature; calling it with no arguments raised TypeError.
        # Edit this call if your function takes different arguments.
        plot_covid_map(ax, df, current_date, us_states, max_mortality)

        if (frame_num + 1) % 10 == 0:
            print(f"  Processed {frame_num + 1}/{len(unique_dates)} frames")

    # Create animation; keep a reference so it stays alive until it is saved
    ani = animation.FuncAnimation(fig, animate,
                                  frames=len(unique_dates),
                                  interval=1000/fps,
                                  repeat=True)

    # Save as GIF, then close this specific figure even if saving fails, so it
    # is neither shown inline nor left open.
    try:
        ani.save(filename, writer=PillowWriter(fps=fps))
    finally:
        plt.close(fig)

# CREATE THE GIF
# Writes 'covid_deaths_map.gif' to the working directory: every 14th distinct
# date in df_clean becomes one frame, played back at 5 frames per second.
create_covid_map_gif(df_clean, us_states,
                     filename='covid_deaths_map.gif',
                     fps=5,
                     step=14)  # Bi-weekly snapshots

In [None]:
# DISPLAY THE GIF
# Reads the file written by create_covid_map_gif above and renders it inline;
# the filename must match the one passed to that call.
from IPython.display import Image, display
display(Image(filename='covid_deaths_map.gif'))