In [3]:
import pandas as pd

# Read the dataset into a DataFrame.
# The CSV is expected to sit next to this notebook; otherwise set file_path
# to the full path of the file.
file_path = 'owid-covid-data.csv'
df = pd.read_csv(file_path)

# Divider printed after each report section to keep the text output legible.
section_break = "\n" + "=" * 50 + "\n"

# Column index, a preview of the first rows, and the per-column null counts,
# each printed under its own heading and followed by the divider.
inspection_reports = (
    ("Columns in the dataset:", df.columns),
    ("First 5 rows of the dataset:", df.head()),
    ("Missing values per column:", df.isnull().sum()),
)
for heading, report in inspection_reports:
    print(heading)
    print(report)
    print(section_break)

# General overview of the DataFrame (df.info() prints its own summary).
print("DataFrame Info:")
df.info()

Columns in the dataset:
Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinati

In [5]:
# Columns the rest of the analysis relies on; later cells reuse this list.
key_columns = [
    'date',
    'location',
    'total_cases',
    'total_deaths',
    'new_cases',
    'new_deaths',
    'total_vaccinations',
]

# Count the nulls in each key column across the full dataset.
key_column_nulls = df[key_columns].isnull().sum()
print("Missing values in key columns:")
print(key_column_nulls)

Missing values in key columns:
date                       0
location                   0
total_cases            17631
total_deaths           17631
new_cases              19276
new_deaths             18827
total_vaccinations    344018
dtype: int64


In [6]:
# Convert 'date' to datetime explicitly so that min()/max() below operate on
# real timestamps.
df['date'] = pd.to_datetime(df['date'])

# Countries to analyse.
# Note: the dataset uses 'United States' for USA.
countries_of_interest = ['Kenya', 'United States', 'India']

# Boolean mask of the rows belonging to the selected countries; .copy() gives
# an independent frame so later assignments do not raise SettingWithCopyWarning.
is_selected = df['location'].isin(countries_of_interest)
df_countries = df.loc[is_selected].copy()

# Divider printed between report sections.
divider = "\n" + "=" * 50 + "\n"

# Sanity-check the filter: which countries survived, the date span, the shape.
print(f"Selected countries: {df_countries['location'].unique()}")
print(f"Earliest date in filtered data: {df_countries['date'].min()}")
print(f"Latest date in filtered data: {df_countries['date'].max()}")
print(f"Shape of the filtered data: {df_countries.shape}")
print(divider)

# Null counts for the key columns within the filtered frame.
# 'key_columns' comes from the previous cell; when running this cell in a
# fresh session, redefine it first:
# key_columns = ['date', 'location', 'total_cases', 'total_deaths', 'new_cases', 'new_deaths', 'total_vaccinations']
print("Missing values in key columns for selected countries:")
print(df_countries[key_columns].isnull().sum())
print(divider)

# Head of the filtered data.
print("First 5 rows of the filtered dataset (df_countries):")
print(df_countries.head())
print(divider)

# Tail of the filtered data, to see recent rows and potential NaNs in vaccinations.
print("Last 5 rows of the filtered dataset (df_countries):")
print(df_countries.tail())

Selected countries: ['India' 'Kenya' 'United States']
Earliest date in filtered data: 2020-01-05 00:00:00
Latest date in filtered data: 2024-08-12 00:00:00
Shape of the filtered data: (5030, 67)


Missing values in key columns for selected countries:
date                     0
location                 0
total_cases              8
total_deaths             8
new_cases              450
new_deaths               8
total_vaccinations    2611
dtype: int64


First 5 rows of the filtered dataset (df_countries):
       iso_code continent location       date  total_cases  new_cases  \
173549      IND      Asia    India 2020-01-05          0.0        0.0   
173550      IND      Asia    India 2020-01-06          0.0        0.0   
173551      IND      Asia    India 2020-01-07          0.0        0.0   
173552      IND      Asia    India 2020-01-08          0.0        0.0   
173553      IND      Asia    India 2020-01-09          0.0        0.0   

        new_cases_smoothed  total_deaths  new_deaths 

In [8]:
# Sort by location and date so that the per-country forward fill below walks
# forward in time within each country.
df_countries = df_countries.sort_values(by=['location', 'date'])

# Daily counts: a missing report is treated as zero new cases/deaths.
for daily_column in ['new_cases', 'new_deaths']:
    df_countries[daily_column] = df_countries[daily_column].fillna(0)

# Cumulative counts: a gap must carry the last known total forward, never drop
# to zero. The order of the two fills matters:
#   1. ffill() within each location propagates the previous known value into
#      any later gap (intermittent or trailing missing reports).
#   2. fillna(0) then only touches what is still NaN, i.e. rows before the
#      first reported value for that location.
# Filling with 0 first would leave ffill() nothing to fill and would turn every
# unreported day into a spurious 0 in a cumulative series.
for cumulative_column in ['total_cases', 'total_deaths', 'total_vaccinations']:
    carried_forward = (
        df_countries
        .groupby('location', group_keys=False)[cumulative_column]
        .ffill()
    )
    df_countries[cumulative_column] = carried_forward.fillna(0)


# Re-check missing values for key columns in the cleaned data.
# key_columns was defined in a previous cell:
# key_columns = ['date', 'location', 'total_cases', 'total_deaths', 'new_cases', 'new_deaths', 'total_vaccinations']
print("Missing values in key columns after cleaning (df_countries):")
print(df_countries[key_columns].isnull().sum())
print("\n" + "="*50 + "\n")

# Display a sample of the data to verify changes.
# Look at India over a mid-January 2021 window to inspect the vaccination column.
print("Sample of data from India around early 2021:")
india_early_2021 = df_countries[
    (df_countries['location'] == 'India') &
    (df_countries['date'] >= '2021-01-10') &
    (df_countries['date'] <= '2021-01-25')
]
print(india_early_2021[['date', 'location', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths', 'total_vaccinations']].head(15))
print("\n" + "="*50 + "\n")

# The tail of each remaining country shows whether the last known vaccination
# total was carried forward to the most recent dates.
print("Last few rows for United States to check vaccination fill:")
print(df_countries[df_countries['location'] == 'United States'][['date', 'location', 'total_vaccinations']].tail())
print("\n" + "="*50 + "\n")

print("Last few rows for Kenya to check vaccination fill:")
print(df_countries[df_countries['location'] == 'Kenya'][['date', 'location', 'total_vaccinations']].tail())

Missing values in key columns after cleaning (df_countries):
date                  0
location              0
total_cases           0
total_deaths          0
new_cases             0
new_deaths            0
total_vaccinations    0
dtype: int64


Sample of data from India around early 2021:
             date location  total_cases  new_cases  total_deaths  new_deaths  \
173920 2021-01-10    India   10450284.0   126319.0      150999.0      1564.0   
173921 2021-01-11    India   10450284.0        0.0      150999.0         0.0   
173922 2021-01-12    India   10450284.0        0.0      150999.0         0.0   
173923 2021-01-13    India   10450284.0        0.0      150999.0         0.0   
173924 2021-01-14    India   10450284.0        0.0      150999.0         0.0   
173925 2021-01-15    India   10450284.0        0.0      150999.0         0.0   
173926 2021-01-16    India   10450284.0        0.0      150999.0         0.0   
173927 2021-01-17    India   10557985.0   107701.0      152274.0      1

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for the figure.
sns.set_style("whitegrid")

# One wide figure with a single axes; one line per country, coloured by location.
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(data=df_countries, x='date', y='total_cases', hue='location', ax=ax)

ax.set_title('Total COVID-19 Cases Over Time by Country')
ax.set_xlabel('Date')
ax.set_ylabel('Total Cases')  # Consider a log scale if the ranges are vast

# Optional: if the case numbers vary dramatically, a log scale can be helpful.
# ax.set_yscale('log')
# If using a log scale, update the label: ax.set_ylabel('Total Cases (Log Scale)')

ax.legend(title='Country')
# Rotate the x-axis tick labels for better readability.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()