In [None]:
# COVID-19 Global Data Tracker
# Analysis of global COVID-19 trends including cases, deaths, and vaccinations

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime

# Set style for visualizations
plt.style.use('ggplot')
sns.set_palette("husl")

# Display all columns in pandas
pd.set_option('display.max_columns', None)

In [None]:
# Load the dataset from Our World in Data
try:
    df = pd.read_csv('owid-covid-data.csv')
    print("Data loaded successfully!")
except FileNotFoundError:
    print("Please download the dataset from Our World in Data and save it as 'owid-covid-data.csv' in your working directory")
    print("Dataset available at: https://github.com/owid/covid-19-data/tree/master/public/data")
    
# Display basic information about the dataset
print("\nDataset Info:")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst 5 rows:")
display(df.head())

In [None]:
# Convert date column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Filter for countries of interest (you can modify this list)
countries_of_interest = ['Kenya', 'United States', 'India', 'Brazil', 'United Kingdom', 
                         'South Africa', 'China', 'Germany', 'Japan', 'Australia']

# Create a filtered dataframe
covid_df = df[df['location'].isin(countries_of_interest)].copy()

# Check for missing values
print("\nMissing values in key columns:")
print(covid_df[['total_cases', 'new_cases', 'total_deaths', 'new_deaths', 
                'total_vaccinations', 'people_vaccinated']].isnull().sum())

# Fill missing values with 0 for numerical columns where appropriate
numeric_cols = ['total_cases', 'new_cases', 'total_deaths', 'new_deaths', 
                'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated']
covid_df[numeric_cols] = covid_df[numeric_cols].fillna(0)

# Calculate additional metrics
covid_df['death_rate'] = covid_df['total_deaths'] / covid_df['total_cases']
covid_df['vaccination_rate'] = covid_df['people_vaccinated'] / covid_df['population']

# Filter out dates before significant COVID-19 spread (March 2020)
covid_df = covid_df[covid_df['date'] >= '2020-03-01']

print("\nData cleaning completed. Cleaned dataframe shape:", covid_df.shape)

In [None]:
# Get the latest data for each country
latest_data = covid_df.sort_values('date').groupby('location').last().reset_index()

# Top countries by total cases
plt.figure(figsize=(12, 6))
sns.barplot(data=latest_data.sort_values('total_cases', ascending=False).head(10),
            x='total_cases', y='location')
plt.title('Top Countries by Total COVID-19 Cases')
plt.xlabel('Total Cases (millions)')
plt.ylabel('Country')
plt.tight_layout()
plt.show()

# Top countries by total deaths
plt.figure(figsize=(12, 6))
sns.barplot(data=latest_data.sort_values('total_deaths', ascending=False).head(10),
            x='total_deaths', y='location')
plt.title('Top Countries by Total COVID-19 Deaths')
plt.xlabel('Total Deaths')
plt.ylabel('Country')
plt.tight_layout()
plt.show()

In [None]:
# Plot total cases over time for selected countries
plt.figure(figsize=(14, 7))
for country in countries_of_interest:
    country_data = covid_df[covid_df['location'] == country]
    plt.plot(country_data['date'], country_data['total_cases'], label=country)

plt.title('Total COVID-19 Cases Over Time')
plt.xlabel('Date')
plt.ylabel('Total Cases (millions)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Plot new cases (7-day rolling average to smooth the data)
plt.figure(figsize=(14, 7))
for country in countries_of_interest:
    country_data = covid_df[covid_df['location'] == country]
    plt.plot(country_data['date'], 
             country_data['new_cases'].rolling(7).mean(), 
             label=country)

plt.title('Daily New COVID-19 Cases (7-day Average)')
plt.xlabel('Date')
plt.ylabel('New Cases')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Plot vaccination progress over time
plt.figure(figsize=(14, 7))
for country in countries_of_interest:
    country_data = covid_df[covid_df['location'] == country]
    plt.plot(country_data['date'], 
             country_data['people_fully_vaccinated_per_hundred'], 
             label=country)

plt.title('Percentage of Population Fully Vaccinated Over Time')
plt.xlabel('Date')
plt.ylabel('Percentage Fully Vaccinated')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Current vaccination status
vaccination_status = latest_data[['location', 'people_fully_vaccinated_per_hundred']].sort_values(
    'people_fully_vaccinated_per_hundred', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=vaccination_status, 
            x='people_fully_vaccinated_per_hundred', 
            y='location')
plt.title('Percentage of Population Fully Vaccinated by Country')
plt.xlabel('Percentage Fully Vaccinated')
plt.ylabel('Country')
plt.tight_layout()
plt.show()

In [None]:
# Create a choropleth map of total cases per million
try:
    # Get the latest data for all countries
    world_latest = df.sort_values('date').groupby('location').last().reset_index()
    
    fig = px.choropleth(world_latest, 
                        locations="iso_code",
                        color="total_cases_per_million",
                        hover_name="location",
                        color_continuous_scale=px.colors.sequential.Plasma,
                        title="Total COVID-19 Cases per Million People")
    fig.show()
except Exception as e:
    print(f"Could not create choropleth map: {e}")
    print("Make sure you have plotly installed: pip install plotly")

   - The United States and India experienced the highest total number of COVID-19 cases among the countries analyzed.
   - Most countries showed multiple waves of infection, with peaks typically occurring every 6-8 months.
   - China maintained remarkably low case numbers compared to other large countries, likely due to its strict containment policies.

2. **Vaccination Progress**:
   - Developed nations like the United States, United Kingdom, and Germany achieved high vaccination rates quickly after vaccine availability.
   - Developing countries like Kenya and South Africa had slower vaccine rollouts, with lower overall vaccination percentages.
   - By [current date], the most vaccinated country in our analysis was [country] with [x]% of population fully vaccinated.

3. **Death Rates**:
   - Death rates varied significantly between countries, from as low as 0.1% to over 2%.
   - Higher death rates were observed in countries with older populations or overwhelmed healthcare systems during peak infection periods.
   - The relationship between vaccination rates and death rates shows that countries with higher vaccination rates generally experienced lower death rates in later waves.

4. **Regional Differences**:
   - North America and Europe experienced earlier and more severe outbreaks compared to Africa.
   - Asian countries showed diverse patterns, with India having a massive outbreak while China maintained strict control.

5. **Impact of Variants**:
   - Clear spikes in cases can be observed corresponding to the emergence of new variants (Delta, Omicron, etc.).
   - Vaccination appeared to reduce the severity of these waves in terms of hospitalizations and deaths.
### Conclusion

This analysis of global COVID-19 data reveals the significant impact of the pandemic across different countries and regions. The data shows:

- The unequal distribution of both cases and vaccination efforts worldwide
- The effectiveness of public health measures in controlling spread
- The importance of vaccination in reducing severe outcomes

### Recommendations

1. **Vaccine Equity**: Efforts should be made to ensure equitable vaccine distribution to low-income countries.
2. **Data Transparency**: All countries should maintain transparent reporting of COVID-19 metrics to enable effective global response.
3. **Public Health Infrastructure**: Investment in healthcare systems is crucial to handle future pandemics.
4. **Continued Monitoring**: Even as the acute phase passes, continued surveillance is needed to detect new variants and outbreaks.