In [None]:
# -------------------------------------------------------
# 🧪 1. Data Collection & Loading
# -------------------------------------------------------

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Optional: for interactive world map
# import plotly.express as px

# Set seaborn style
sns.set(style='darkgrid')

# Load the dataset.
# A missing file is reported with a friendly hint and then re-raised: every later
# cell needs `df`, so swallowing the error would only resurface as a confusing
# NameError on `df.head()` below (or, on a warm kernel, silently reuse a stale `df`).
try:
    df = pd.read_csv('owid-covid-data.csv')
except FileNotFoundError:
    print("❌ File not found. Please make sure 'owid-covid-data.csv' is in your working directory.")
    raise
else:
    print("✅ Dataset loaded successfully!")

# Preview dataset
df.head()


In [None]:
# -------------------------------------------------------
# 🔍 2. Data Exploration
# -------------------------------------------------------

# View column names
print("Columns in dataset:")
print(df.columns)

# Check shape and missing values
print(f"\nDataset Shape: {df.shape}")
print("\nMissing values per column:")
print(df.isnull().sum())

# Focus on key columns.
# .copy() makes the narrowed frame an independent object, so later column
# assignments on `df` (e.g. converting `date`) operate on a real frame rather than
# on a slice of the full dataset, which can raise pandas' SettingWithCopyWarning.
columns_of_interest = ['date', 'location', 'total_cases', 'new_cases', 'total_deaths',
                       'new_deaths', 'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated']
df = df[columns_of_interest].copy()
df.head()


In [None]:
# -------------------------------------------------------
# 🧹 3. Data Cleaning
# -------------------------------------------------------

# Filter countries of interest
countries = ['Kenya', 'India', 'United States']

# Build the cleaned frame as a new object instead of mutating a filtered slice:
#   1. keep only the selected countries,
#   2. convert `date` to datetime (pd.to_datetime is a no-op on re-run),
#   3. order rows by country and date so the forward fill below is chronological.
df = (
    df[df['location'].isin(countries)]
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(['location', 'date'])
)

# Forward-fill gaps *within each country only*. A plain frame-wide ffill would carry
# the last values of one country into the leading missing rows of the next one
# (e.g. India's final vaccination totals into Kenya's first rows).
# Leading gaps of a country therefore stay NaN, which is the honest value.
value_columns = [col for col in df.columns if col not in ('location', 'date')]
df[value_columns] = df.groupby('location')[value_columns].ffill()

# Drop remaining rows with critical missing data
df = df.dropna(subset=['total_cases', 'total_deaths'])

# Confirm cleaning (columns other than total_cases/total_deaths may still report
# missing values for dates before a country started reporting them)
df.isnull().sum()


In [None]:
# -------------------------------------------------------
# 📊 4. Exploratory Data Analysis (EDA)
# -------------------------------------------------------

# Line charts of cumulative metrics over time, one figure per metric and one line
# per selected country. Each entry is (column, figure title, y-axis label).
cumulative_metrics = [
    ('total_cases', '📈 Total COVID-19 Cases Over Time', 'Total Cases'),
    ('total_deaths', '☠️ Total COVID-19 Deaths Over Time', 'Total Deaths'),
]

for metric, title, y_label in cumulative_metrics:
    fig, ax = plt.subplots(figsize=(12, 6))
    for country in countries:
        sns.lineplot(data=df[df['location'] == country], x='date', y=metric, label=country, ax=ax)

    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel(y_label)
    ax.legend()
    fig.tight_layout()
    plt.show()

# Death rate calculation: cumulative deaths divided by cumulative cases.
# Rows with zero cases are masked to NaN first so the ratio is NaN rather than inf,
# and `assign` returns a new frame instead of writing a column onto a filtered slice.
confirmed_cases = df['total_cases'].where(df['total_cases'] > 0)
df = df.assign(death_rate=df['total_deaths'] / confirmed_cases)
df.loc[df['location'] == 'Kenya', ['date', 'death_rate']].tail()


In [None]:
# -------------------------------------------------------
# 💉 5. Vaccination Progress
# -------------------------------------------------------

# Chart cumulative vaccinations over time, drawing one line per selected country
# onto a single explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 6))
for country in countries:
    country_rows = df[df['location'] == country]
    sns.lineplot(x='date', y='total_vaccinations', data=country_rows, label=country, ax=ax)

ax.set_title('💉 COVID-19 Vaccinations Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Vaccinations')
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# -------------------------------------------------------
# 🗺️ 6. Optional: Choropleth Map (Cases by Country)
# -------------------------------------------------------

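# Uncomment the next block if using plotly (make sure plotly is installed).
# Note: by this point `df` has been filtered to the three countries selected in the
# Data Cleaning section, so the map will only colour those countries.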

# import plotly.express as px

# latest_df = df[df['date'] == df['date'].max()]
# choropleth_df = latest_df.groupby('location').max().reset_index()

# fig = px.choropleth(choropleth_df,
#                     locations="location",
#                     locationmode="country names",
#                     color="total_cases",
#                     title="🌍 COVID-19 Total Cases by Country",
#                     color_continuous_scale="Reds")

# fig.show()


In [None]:
# -------------------------------------------------------
# 🧠 7. Insights & Reporting
# -------------------------------------------------------

# Write narrative insights as markdown in Jupyter, e.g. the text below.
# NOTE: this is a bare string expression in a code cell, so Jupyter echoes it as the
# cell's plain-text output instead of rendering it; paste the text into a Markdown
# cell to get formatted headings and lists.
# NOTE(review): insights 3 and 4 mention a global average and daily new cases, which
# none of the cells shown here compute or plot — verify them against the data before
# publishing.

"""
### 🔍 Insights

1. The United States had the highest number of total cases throughout the pandemic.
2. India's vaccination rollout ramped up significantly in 2021.
3. Kenya’s death rate remained lower than the global average.
4. Significant spikes in daily new cases were visible in early 2021 and 2022 across countries.
5. Vaccination coverage varied widely, impacting the pandemic trajectory.

You can expand these insights into a formal report or presentation.
"""
