# COVID-19 Global Data Tracker

## 1. Introduction
**Objective**: Analyze global COVID-19 trends using data from Our World in Data.

Key aspects to examine:
- Case and death trends over time
- Vaccination progress by country
- Comparative analysis between regions

In [None]:
# Fetch the complete Our World in Data COVID-19 table over HTTP
import pandas as pd

url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"

# Read the remote CSV into a DataFrame, then write it next to the notebook
# (row index omitted) so it can be reloaded without a network connection
df = pd.read_csv(filepath_or_buffer=url)
df.to_csv(path_or_buf='owid-covid-data.csv', index=False)

## 2. Data Loading & Initial Exploration

### Data Source
Using the [Our World in Data COVID-19 Dataset](https://ourworldindata.org/covid-cases):

- Updated daily
- Contains 200+ countries
- Includes cases, deaths, and vaccination metrics

In [None]:
# Load the dataset from the local CSV file
df = pd.read_csv('owid-covid-data.csv')

# Basic exploration: overall size, column names, and a preview of the rows
print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nFirst 5 rows:")
display(df.head())

# Check data types and missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nData types and missing values:")
df.info()

# Summary statistics for numerical columns
print("\nSummary statistics:")
print(df.describe())

## 3. Data Cleaning and Preprocessing

### Initial Data Quality Assessment
```python
print(f"Original dataset shape: {df.shape}")
print("\nMissing values per column:")
print(df.isnull().sum().sort_values(ascending=False)[:10])
```

In [None]:
# Clean the raw frame and derive per-country metrics used by the analysis.
# numpy is imported here because the death-rate step below uses np.inf / np.nan.
import numpy as np

# Convert date column to datetime so ordering and plotting are chronological
df['date'] = pd.to_datetime(df['date'])

# Select countries of interest (copied so derived columns do not touch df)
countries = ['Kenya', 'United States', 'India', 'Brazil', 'Germany']
df_filtered = df[df['location'].isin(countries)].copy()

# ffill() and diff() both depend on row order, so put every country's rows in
# date order before applying them
df_filtered = df_filtered.sort_values(['location', 'date'])

# Handle missing values - fill forward for time series data, within each
# country so values never leak from one location into the next
cols_to_fill = ['total_cases', 'total_deaths', 'total_vaccinations']
df_filtered[cols_to_fill] = df_filtered.groupby('location')[cols_to_fill].ffill()

# Calculate daily change metrics as the row-to-row difference of the
# cumulative totals (the first row of each country is NaN)
df_filtered['daily_cases'] = df_filtered.groupby('location')['total_cases'].diff()
df_filtered['daily_deaths'] = df_filtered.groupby('location')['total_deaths'].diff()

# Calculate death rate (with handling for zero cases): dividing by zero
# yields +/-inf, which is replaced by NaN so it cannot distort later plots
df_filtered['death_rate'] = df_filtered['total_deaths'] / df_filtered['total_cases']
df_filtered['death_rate'] = df_filtered['death_rate'].replace([np.inf, -np.inf], np.nan)

# 4. Exploratory Data Analysis (EDA)

This code creates three visualizations for COVID-19 data using Matplotlib and Seaborn.

## 1. Total Cases Over Time

```python
import matplotlib.pyplot as plt
import seaborn as sns

# Set the visual style
sns.set(style="whitegrid")
plt.figure(figsize=(12, 8))

# Plot total cases over time
plt.figure(figsize=(14, 7))
for country in countries:
    country_data = df_filtered[df_filtered['location'] == country]
    plt.plot(country_data['date'], country_data['total_cases'], label=country)

plt.title('Total COVID-19 Cases Over Time')
plt.xlabel('Date')
plt.ylabel('Total Cases')
plt.legend()
plt.xticks(rotation=45)
plt.show()
```

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the visual style. No figure is opened here: every chart below creates
# its own, so an extra plt.figure() call would only leave an empty figure.
sns.set(style="whitegrid")

# Plot total cases over time, one line per selected country
plt.figure(figsize=(14, 7))
for country in countries:
    country_data = df_filtered[df_filtered['location'] == country]
    plt.plot(country_data['date'], country_data['total_cases'], label=country)

plt.title('Total COVID-19 Cases Over Time')
plt.xlabel('Date')
plt.ylabel('Total Cases')
plt.legend()
plt.xticks(rotation=45)
plt.show()

# Plot new cases with 7-day rolling average
plt.figure(figsize=(14, 7))
for country in countries:
    country_data = df_filtered[df_filtered['location'] == country]
    # Index by date (sorted) so the rolling window runs in chronological order
    # and the dates become the x axis of the plotted series
    country_data = country_data.set_index('date').sort_index()
    plt.plot(country_data['daily_cases'].rolling(7).mean(), label=f'{country} (7-day avg)')

plt.title('Daily New COVID-19 Cases (7-day Average)')
plt.xlabel('Date')
plt.ylabel('New Cases')
plt.legend()
plt.show()

# Calculate and plot death rates.
# After sorting by date, groupby().last() keeps the latest non-null value of
# each column per country; the rate is scaled by 100 to show a percentage.
latest_data = df_filtered.sort_values('date').groupby('location').last()
plt.figure(figsize=(10, 6))
sns.barplot(x=latest_data.index, y=latest_data['death_rate']*100)
plt.title('COVID-19 Death Rates by Country (%)')
plt.ylabel('Death Rate (%)')
plt.xlabel('Country')
plt.xticks(rotation=45)
plt.show()

# COVID-19 Vaccination Progress Visualizations

## 5️⃣ Vaccination Progress Over Time

```python
# Plot vaccination progress
plt.figure(figsize=(14, 7))
for country in countries:
    country_data = df_filtered[df_filtered['location'] == country]
    plt.plot(country_data['date'], country_data['total_vaccinations_per_hundred'], label=country)

plt.title('COVID-19 Vaccinations per 100 People')
plt.xlabel('Date')
plt.ylabel('Vaccinations per 100 People')
plt.legend()
plt.show()
```

In [None]:
# Plot vaccination progress, one line per selected country
plt.figure(figsize=(14, 7))
for country in countries:
    country_data = df_filtered[df_filtered['location'] == country]
    # matplotlib breaks a line at every NaN, so rows without a reported value
    # are dropped; the remaining points are drawn in date order as one line
    country_data = (
        country_data
        .dropna(subset=['total_vaccinations_per_hundred'])
        .sort_values('date')
    )
    plt.plot(country_data['date'], country_data['total_vaccinations_per_hundred'], label=country)

plt.title('COVID-19 Vaccinations per 100 People')
plt.xlabel('Date')
plt.ylabel('Vaccinations per 100 People')
plt.legend()
plt.show()

# Compare current vaccination status.
# Sort by date first so groupby().last() returns each country's most recent
# non-null value rather than whatever row happens to come last.
latest_vax = (
    df_filtered
    .sort_values('date')
    .groupby('location')['people_fully_vaccinated_per_hundred']
    .last()
)
plt.figure(figsize=(10, 6))
latest_vax.plot(kind='bar')
plt.title('Percentage of Population Fully Vaccinated')
plt.ylabel('% Fully Vaccinated')
plt.xlabel('Country')
plt.xticks(rotation=45)
plt.show()

# 6️⃣ Global COVID-19 Choropleth Map Visualization

```python
import plotly.express as px

# Prepare latest global data
latest_global = df.sort_values('date').groupby('location').last().reset_index()

# Create choropleth map
fig = px.choropleth(latest_global, 
                    locations="iso_code",
                    color="total_cases_per_million",
                    hover_name="location",
                    hover_data=["total_cases", "total_deaths"],
                    color_continuous_scale=px.colors.sequential.Plasma,
                    title="Total COVID-19 Cases per Million People")
fig.show()
```

In [None]:
import plotly.express as px

# Last available value of every column per location, taken in date order,
# with `location` restored as a regular column for plotting
rows_by_date = df.sort_values('date')
latest_global = rows_by_date.groupby('location').last().reset_index()

# World map shaded by cumulative cases per million; hovering over a country
# shows its name together with the raw case and death totals
choropleth_options = dict(
    locations="iso_code",
    color="total_cases_per_million",
    hover_name="location",
    hover_data=["total_cases", "total_deaths"],
    color_continuous_scale=px.colors.sequential.Plasma,
    title="Total COVID-19 Cases per Million People",
)
fig = px.choropleth(latest_global, **choropleth_options)
fig.show()

# 7️⃣ COVID-19 Data Insights & Automated Reporting

```python
# Generate key insights programmatically
latest_global = df.sort_values('date').groupby('location').last()

# Insight 1: Countries with highest case rates
top_case_rates = latest_global['total_cases_per_million'].sort_values(ascending=False).head(5)
print("Countries with highest cases per million:")
print(top_case_rates)

# Insight 2: Vaccination leaders
top_vax = latest_global['people_fully_vaccinated_per_hundred'].sort_values(ascending=False).head(5)
print("\nCountries with highest vaccination rates:")
print(top_vax)

# Insight 3: Global death rate
global_death_rate = latest_global['total_deaths'].sum() / latest_global['total_cases'].sum()
print(f"\nGlobal death rate: {global_death_rate:.2%}")

# Insight 4: Vaccination vs. death rate correlation
print("\nCorrelation between vaccination rate and death rate:")
print(latest_global[['people_fully_vaccinated_per_hundred', 'death_rate']].corr())
```

In [None]:
# Generate key insights programmatically.
# latest_global holds, per location, the last non-null value of every column
# in date order; it is indexed by location name.
latest_global = df.sort_values('date').groupby('location').last()

# The correlation in Insight 4 needs a death_rate column on latest_global, so
# it is derived here. Locations with zero cases get NaN instead of a
# division-by-zero result.
reported_cases = latest_global['total_cases'].where(latest_global['total_cases'] > 0)
latest_global['death_rate'] = latest_global['total_deaths'] / reported_cases

# Insight 1: Countries with highest case rates
top_case_rates = latest_global['total_cases_per_million'].sort_values(ascending=False).head(5)
print("Countries with highest cases per million:")
print(top_case_rates)

# Insight 2: Vaccination leaders
top_vax = latest_global['people_fully_vaccinated_per_hundred'].sort_values(ascending=False).head(5)
print("\nCountries with highest vaccination rates:")
print(top_vax)

# Insight 3: Global death rate (total deaths over total cases, all rows)
# NOTE(review): if the dataset also carries aggregate rows (e.g. world or
# continent totals) they are included in these sums and rankings - confirm.
global_death_rate = latest_global['total_deaths'].sum() / latest_global['total_cases'].sum()
print(f"\nGlobal death rate: {global_death_rate:.2%}")

# Insight 4: Vaccination vs. death rate correlation
print("\nCorrelation between vaccination rate and death rate:")
print(latest_global[['people_fully_vaccinated_per_hundred', 'death_rate']].corr())