In [None]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np # Often used implicitly by pandas, good to have
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Optional: give every matplotlib / seaborn figure the same white-grid look.
# The matplotlib style sheet is applied first, then seaborn's own style on top.
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_style(style="whitegrid")

print("Libraries imported successfully!")


# Cell 2: Project Title and Description (Markdown Cell)
# COVID-19 Global Data Tracker

## Project Description
This notebook analyzes global COVID-19 trends, including cases, deaths, and vaccinations, using data from Our World in Data. It involves data cleaning, exploratory data analysis (EDA), and visualization to derive insights.

## Project Objectives
- ✅ Import and clean COVID-19 global data
- ✅ Analyze time trends (cases, deaths, vaccinations)
- ✅ Compare metrics across countries/regions
- ✅ Visualize trends with charts and maps
- ✅ Communicate findings



```python
# Cell 3: Data Loading and Initial Exploration
# Step 1 - Data collection: the CSV is expected to sit next to this notebook.
DATA_FILE = 'owid-covid-data.csv'

try:
    df_raw = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    print(f"Error: '{DATA_FILE}' not found. Make sure it's in the same directory as the notebook.")
    # Fall back to an empty frame so the `.empty` guards below skip cleanly.
    df_raw = pd.DataFrame()
else:
    print(f"Dataset '{DATA_FILE}' loaded successfully.")
    print(f"Shape of raw dataset: {df_raw.shape}")

dataset_loaded = not df_raw.empty
if dataset_loaded:
    # Step 2 - Data loading & exploration: a quick look at the raw frame.
    print("\nFirst 5 rows of the dataset:")
    print(df_raw.head())

    print("\nColumns in the dataset:")
    print(df_raw.columns)

    print("\nBasic information about the dataset:")
    df_raw.info()  # info() writes its report itself, so it is not wrapped in print()

    print("\nSum of missing values per column (showing top 15):")
    missing_per_column = df_raw.isnull().sum()
    print(missing_per_column.sort_values(ascending=False).head(15))

    # Reminder of the columns this project relies on:
    #   date, location, total_cases, total_deaths, new_cases, new_deaths,
    #   total_vaccinations, people_vaccinated, people_fully_vaccinated,
    #   new_cases_smoothed, new_deaths_smoothed, population, iso_code, continent

# Cell 4: Data Cleaning
if not df_raw.empty:
    df = df_raw.copy()  # Work on a copy so the raw frame stays untouched

    # Convert 'date' column to datetime objects
    df['date'] = pd.to_datetime(df['date'])
    print("\n'date' column converted to datetime format.")

    # Define countries of interest for focused analysis
    # You can change these or add more
    countries_of_interest = ['Kenya', 'United States', 'India', 'United Kingdom', 'Brazil', 'Germany']

    # Key numeric columns whose missing values are handled below.
    cols_to_clean = [
        'total_cases', 'total_deaths', 'new_cases', 'new_deaths',
        'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
        'new_vaccinations', 'new_cases_smoothed', 'new_deaths_smoothed',
        'people_fully_vaccinated_per_hundred', 'total_vaccinations_per_hundred',
        'icu_patients', 'hosp_patients' # Adding a few more potentially interesting ones
    ]

    # Daily increments must NOT be forward-filled: carrying yesterday's count
    # into an unreported day would count the same events twice. A missing
    # value in these columns is filled with 0 instead.
    daily_increment_cols = {'new_cases', 'new_deaths', 'new_vaccinations'}

    cols_present = [col for col in cols_to_clean if col in df.columns]
    cols_zero_fill = [col for col in cols_present if col in daily_increment_cols]
    cols_carry_forward = [col for col in cols_present if col not in daily_increment_cols]

    if cols_carry_forward:
        # Forward-fill within each location in chronological order (the sort
        # makes the result independent of the CSV's row order); whatever is
        # still missing afterwards (values before the first report) becomes 0.
        carried = (
            df.sort_values(['location', 'date'])
            .groupby('location')[cols_carry_forward]
            .ffill()
            .fillna(0)
        )
        for col in cols_carry_forward:
            # Series assignment aligns on the index, so df keeps its row order.
            df[col] = carried[col]

    if cols_zero_fill:
        df[cols_zero_fill] = df[cols_zero_fill].fillna(0)

    # Filter AFTER cleaning so the per-country subset and the full frame are
    # cleaned once, by the same rules (the fill is per location, so the
    # subset's values do not depend on the other locations).
    df_countries = df[df['location'].isin(countries_of_interest)].copy() # Use .copy()

    print(f"\nData filtered for countries: {', '.join(countries_of_interest)}")
    print(f"Shape of filtered dataset: {df_countries.shape}")

    print("\nMissing values handled for key columns in 'df_countries'.")
    print("Preview of cleaned 'df_countries':")
    print(df_countries[['date', 'location', 'total_cases', 'total_deaths', 'total_vaccinations']].head())

    # OWID aggregate rows (like 'World', 'Asia') are left in `df` here; the
    # latest-snapshot frames built in later cells filter them out by
    # 'iso_code' prefix 'OWID_' and missing 'continent'.

else:
    print("Skipping Data Cleaning as dataset was not loaded.")
    df = pd.DataFrame() # Ensure df exists
    df_countries = pd.DataFrame() # Ensure df_countries exists

# Cell 5: Exploratory Data Analysis (EDA)
if df_countries.empty:
    print("Skipping EDA as 'df_countries' is empty.")
else:
    print("\n--- Exploratory Data Analysis ---")

    # One line chart per spec: (column, title, y-axis label, use log scale?).
    # Log scale is used for the cumulative series because of their
    # exponential growth.
    trend_chart_specs = [
        ('total_cases',
         'Total COVID-19 Cases Over Time (Log Scale)',
         'Total Cases (Logarithmic Scale)',
         True),
        ('total_deaths',
         'Total COVID-19 Deaths Over Time (Log Scale)',
         'Total Deaths (Logarithmic Scale)',
         True),
        ('new_cases_smoothed',
         'Daily New COVID-19 Cases (7-day Smoothed)',
         'Daily New Cases (Smoothed)',
         False),
    ]
    for metric_col, chart_title, y_axis_label, use_log_scale in trend_chart_specs:
        plt.figure(figsize=(15, 7))
        sns.lineplot(data=df_countries, x='date', y=metric_col, hue='location', linewidth=2)
        plt.title(chart_title, fontsize=16)
        plt.xlabel('Date', fontsize=12)
        plt.ylabel(y_axis_label, fontsize=12)
        if use_log_scale:
            plt.yscale('log')
        plt.legend(title='Country', fontsize=10)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    # Current death rate per selected country:
    #   death rate (%) = total deaths / total cases * 100
    # computed on each country's most recent row.
    latest_row_labels = df_countries.groupby('location')['date'].idxmax()
    df_latest_selected = df_countries.loc[latest_row_labels].copy()
    # Zero case counts become NaN so the division cannot divide by zero.
    cases_without_zeros = df_latest_selected['total_cases'].replace(0, float('nan'))
    df_latest_selected['death_rate'] = (df_latest_selected['total_deaths'] / cases_without_zeros) * 100
    # Countries whose death rate is undefined are left out.
    df_latest_selected = df_latest_selected.dropna(subset=['death_rate'])

    death_rate_ranking = df_latest_selected.sort_values('death_rate', ascending=False)
    plt.figure(figsize=(10, 6))
    sns.barplot(data=death_rate_ranking, x='death_rate', y='location', palette='viridis')
    plt.title('Current COVID-19 Death Rate by Selected Country (%)', fontsize=16)
    plt.xlabel('Death Rate (%)', fontsize=12)
    plt.ylabel('Country', fontsize=12)
    plt.tight_layout()
    plt.show()

    # Top-N countries by total cases, taken from the full dataset.
    if df.empty:
        print("Skipping Top N Countries plot as full 'df' is empty.")
    else:
        df_all_countries_latest = df.loc[df.groupby('location')['date'].idxmax()].copy()
        # Drop OWID aggregate rows: iso_code starting with 'OWID_' (a missing
        # iso_code counts as an aggregate too) or no continent assigned.
        is_owid_aggregate = df_all_countries_latest['iso_code'].str.startswith('OWID_', na=True)
        has_continent = df_all_countries_latest['continent'].notna()
        df_all_countries_latest_filtered = df_all_countries_latest[~is_owid_aggregate & has_continent]

        top_n = 15
        df_top_n_cases = df_all_countries_latest_filtered.nlargest(top_n, 'total_cases')

        plt.figure(figsize=(12, 8))
        barplot_top_cases = sns.barplot(data=df_top_n_cases, x='total_cases', y='location', palette='mako')
        plt.title(f'Top {top_n} Countries by Total COVID-19 Cases (Latest Data)', fontsize=16)
        plt.xlabel('Total Cases', fontsize=12)
        plt.ylabel('Country', fontsize=12)
        # Show x-axis ticks as whole numbers with thousands separators.
        thousands_formatter = plt.FuncFormatter(lambda value, _pos: f"{int(value):,}")
        barplot_top_cases.xaxis.set_major_formatter(thousands_formatter)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

# Cell 6: Visualizing Vaccination Progress
if not df_countries.empty:
    print("\n--- Visualizing Vaccination Progress ---")

    # Plot cumulative vaccinations over time for selected countries
    plt.figure(figsize=(15, 7))
    sns.lineplot(data=df_countries, x='date', y='total_vaccinations', hue='location', linewidth=2)
    plt.title('Cumulative COVID-19 Vaccinations Over Time', fontsize=16)
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Total Vaccinations (Log Scale)', fontsize=12)
    plt.yscale('log') # May need log scale depending on numbers
    plt.legend(title='Country', fontsize=10)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Both charts below need the same column, so one guard covers the pair
    # (previously only the bar chart was guarded and the line chart would
    # raise on a missing column).
    fully_vaccinated_col = 'people_fully_vaccinated_per_hundred'
    if fully_vaccinated_col in df_countries.columns:
        # Compare % of population fully vaccinated over time
        plt.figure(figsize=(15, 7))
        sns.lineplot(data=df_countries, x='date', y=fully_vaccinated_col, hue='location', linewidth=2)
        plt.title('Percentage of Population Fully Vaccinated Over Time', fontsize=16)
        plt.xlabel('Date', fontsize=12)
        plt.ylabel('People Fully Vaccinated (per hundred)', fontsize=12)
        plt.legend(title='Country', fontsize=10)
        plt.xticks(rotation=45)
        plt.ylim(0, 100) # Percentage is 0-100
        plt.tight_layout()
        plt.show()

        # Bar chart of current % fully vaccinated for selected countries.
        # The latest row per country is taken here, directly from
        # df_countries, instead of reusing the death-rate frame from the EDA
        # cell: that frame has already dropped countries with an undefined
        # death rate, and it does not exist if the EDA cell was not run.
        df_latest_vaccination = df_countries.loc[df_countries.groupby('location')['date'].idxmax()]
        plt.figure(figsize=(10, 6))
        sns.barplot(data=df_latest_vaccination.sort_values(fully_vaccinated_col, ascending=False),
                    x=fully_vaccinated_col, y='location', palette='crest_r')
        plt.title('Current % of Population Fully Vaccinated (Selected Countries)', fontsize=16)
        plt.xlabel('People Fully Vaccinated (per hundred)', fontsize=12)
        plt.ylabel('Country', fontsize=12)
        plt.xlim(0, 100)
        plt.tight_layout()
        plt.show()
    else:
        print("Column 'people_fully_vaccinated_per_hundred' not found; skipping the % fully vaccinated charts.")
else:
    print("Skipping Vaccination Progress visualization as 'df_countries' is empty.")


# Cell 7: Optional - Build a Choropleth Map
if not df.empty:
    print("\n--- Choropleth Maps ---")
    # Prepare latest data for all countries from the main 'df'.
    # Reuse df_all_countries_latest_filtered if the EDA cell already built it.
    if 'df_all_countries_latest_filtered' not in locals() or df_all_countries_latest_filtered.empty:
        df_all_countries_latest = df.loc[df.groupby('location')['date'].idxmax()].copy()
        df_all_countries_latest_filtered = df_all_countries_latest[
            (~df_all_countries_latest['iso_code'].str.startswith('OWID_', na=True)) &
            (df_all_countries_latest['continent'].notna()) & # Ensure continent exists for filtering
            (df_all_countries_latest['iso_code'].notna()) # Ensure iso_code exists for mapping
        ]

    if not df_all_countries_latest_filtered.empty:
        # Choropleth map for Total Cases per Million
        cases_per_million_col = 'total_cases_per_million'
        if cases_per_million_col in df.columns:
            # This column is not among the cleaned columns, so a location's
            # newest row can hold NaN and the location would vanish from the
            # map. Use each location's last reported value instead
            # (GroupBy.last skips missing entries).
            # NOTE(review): assumes the newest rows are often blank for this
            # column in the OWID file - confirm against the data.
            last_known_cases_per_million = (
                df.sort_values('date')
                .groupby('location')[cases_per_million_col]
                .last()
            )
            df_map_cases_data = df_all_countries_latest_filtered.copy()
            df_map_cases_data[cases_per_million_col] = (
                df_map_cases_data['location'].map(last_known_cases_per_million)
            )
            df_map_cases_data = df_map_cases_data.dropna(subset=['iso_code', cases_per_million_col])

            if not df_map_cases_data.empty:
                # Only offer hover fields that actually exist in the frame.
                cases_hover_data = {'iso_code': False, cases_per_million_col: ':.0f'}
                for extra_col in ('total_cases', 'population'):
                    if extra_col in df_map_cases_data.columns:
                        cases_hover_data[extra_col] = ':,'

                fig_cases_map = px.choropleth(df_map_cases_data,
                                        locations="iso_code",
                                        color=cases_per_million_col,
                                        hover_name="location",
                                        hover_data=cases_hover_data,
                                        projection="natural earth",
                                        color_continuous_scale=px.colors.sequential.Plasma,
                                        title="Global COVID-19 Total Cases per Million (Latest Data)")
                fig_cases_map.show()
            else:
                print("No data available for total cases per million choropleth map after filtering.")
        else:
            print("Column 'total_cases_per_million' not found for cases map.")

        # Choropleth map for Vaccination Rates (% fully vaccinated per hundred)
        if 'people_fully_vaccinated_per_hundred' in df_all_countries_latest_filtered.columns:
            df_map_vacc_data = df_all_countries_latest_filtered.dropna(subset=['iso_code', 'people_fully_vaccinated_per_hundred'])
            if not df_map_vacc_data.empty:
                fig_vacc_map = px.choropleth(df_map_vacc_data,
                                        locations="iso_code",
                                        color="people_fully_vaccinated_per_hundred",
                                        hover_name="location",
                                        hover_data={'iso_code': False, 'people_fully_vaccinated_per_hundred': ':.2f'},
                                        projection="natural earth",
                                        color_continuous_scale=px.colors.sequential.Viridis_r,
                                        title="Global COVID-19 Full Vaccination Rate (% of Population, Latest Data)")
                fig_vacc_map.show()
            else:
                print("No data available for vaccination rate choropleth map after filtering.")
        else:
            print("Column 'people_fully_vaccinated_per_hundred' not found for vaccination map.")
    else:
        print("Skipping Choropleth maps as filtered latest data for all countries is empty.")
else:
    print("Skipping Choropleth maps as main 'df' is empty.")


```

```python
# Cell 8: End of Notebook (Optional Message)
# Closing banner, printed one line at a time.
closing_messages = (
    "\n--- COVID-19 Global Data Tracker Analysis Complete ---",
    "Please review the generated plots and insights.",
)
for closing_message in closing_messages:
    print(closing_message)