In [27]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Let long reports (e.g. the per-column missing-value tables below) render
# without pandas truncating them to its default row limit.
pd.set_option('display.max_rows', 4000)

In [None]:
# Load the raw dataset from the CSV next to the notebook; later cells derive
# their working frames from `covid_df`.
covid_df = pd.read_csv('owid_covid_data.csv')

# Per-column null counts and the share of all rows they represent.
# (display.max_rows is already raised in the imports cell, so it is not
# repeated here.)
missing_values = covid_df.isnull().sum()
missing_percent = (missing_values / len(covid_df)) * 100
missing_report = pd.DataFrame({'Missing Values': missing_values, 'Percent Missing': missing_percent})

# Keep only the columns that actually have gaps; rows stay in the original
# column order of the CSV.
missing_report = missing_report.loc[missing_report['Missing Values'] > 0]
missing_report

Unnamed: 0,Missing Values,Percent Missing
continent,26525,6.176721
total_cases,17631,4.105627
new_cases,19276,4.488689
new_cases_smoothed,20506,4.775111
total_deaths,17631,4.105627
new_deaths,18827,4.384133
new_deaths_smoothed,20057,4.670555
total_cases_per_million,17631,4.105627
new_cases_per_million,19276,4.488689
new_cases_smoothed_per_million,20506,4.775111


In [19]:
# Re-read the raw CSV so this cell can be run on its own.
covid_df = pd.read_csv('owid_covid_data.csv')

# Columns carried into the working frame `df`, grouped by theme.
# The order of this list is the column order of `df`.
relevant_cols = [
    # identifiers
    'iso_code', 'continent', 'location', 'date',
    # population and age structure
    'population', 'population_density', 'median_age',
    'aged_65_older', 'aged_70_older',
    # economy and development
    'gdp_per_capita', 'extreme_poverty', 'human_development_index',
    # baseline health
    'life_expectancy', 'cardiovasc_death_rate', 'diabetes_prevalence',
    # lifestyle risk and infrastructure
    'female_smokers', 'male_smokers',
    'handwashing_facilities', 'hospital_beds_per_thousand',
    # policy response and vaccination progress
    'stringency_index',
    'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated',
]

# Working frame: only the selected columns, all rows.
df = covid_df[relevant_cols]


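Reasoning behind the columns kept (the identifier columns `iso_code`, `continent`, `location`, `date`, plus `stringency_index` and the three vaccination counts, are also kept but are not listed in this table)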

gdp_per_capita | Measures economic wealth and development
extreme_poverty | Shows poverty rates
population_density | Can influence disease spread and resource strain
median_age | Older populations have different risk profiles
aged_65_older, aged_70_older | Aging population affects healthcare demand
life_expectancy | General indicator of healthcare and living conditions
human_development_index | Composite index (income, education, life expectancy)
cardiovasc_death_rate | Underlying health risk
diabetes_prevalence | Another comorbidity linked to COVID severity
female_smokers, male_smokers | Lifestyle risk factors
hospital_beds_per_thousand | Health infrastructure availability
handwashing_facilities | Hygiene infrastructure quality
population | Needed to scale many indicators (per capita adjustments)

In [9]:
# Missing-value audit of the working frame `df`: null count per column and
# that count as a percentage of all rows.
# NOTE(review): the saved output of this cell lists columns that are not in
# `relevant_cols`, so it appears to predate the current column selection --
# re-run the notebook top to bottom to refresh it.
missing_values = df.isnull().sum()
missing_percent = missing_values / len(df) * 100

# Build the report and order it worst-first in one expression.
missing_report = pd.DataFrame(
    {
        'Missing Values': missing_values,
        'Percent Missing': missing_percent,
    }
).sort_values(by='Percent Missing', ascending=False)
missing_report

Unnamed: 0,Missing Values,Percent Missing
excess_mortality,416024,96.877059
excess_mortality_cumulative,416024,96.877059
people_fully_vaccinated,351374,81.822395
people_vaccinated,348303,81.107269
total_vaccinations,344018,80.109446
handwashing_facilities,267694,62.336326
stringency_index,233245,54.31439
extreme_poverty,217439,50.63374
male_smokers,185618,43.223771
female_smokers,182270,42.444142


In [23]:
# Filling in columns, part 1.
# Order rows by location, then date, so the forward fill below walks each
# location's rows in date order (assumes `date` sorts chronologically, e.g.
# ISO-formatted strings -- TODO confirm).
df = df.sort_values(['location', 'date'])

# Vaccination counts: carry the last reported value forward within a location.
# Rows before a location's first reported value have nothing to carry and
# stay NaN. One grouped call covers all three columns instead of rebuilding
# the groupby once per column.
vaccination_cols = ['total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated']
df[vaccination_cols] = df.groupby('location')[vaccination_cols].ffill()

col_to_fill = [
    'population', 'population_density', 'median_age', 'aged_65_older', 'aged_70_older',
    'gdp_per_capita', 'extreme_poverty', 'human_development_index',
    'life_expectancy', 'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers', 'handwashing_facilities', 'hospital_beds_per_thousand', 'stringency_index'
]

# Remaining gaps take the location's own mean of that column. A location with
# no observed value at all for a column has a NaN mean, so those rows stay NaN.
# NOTE(review): `stringency_index` presumably changes over time; filling it
# with a per-location mean flattens that variation -- confirm this is intended.
location_means = df.groupby('location')[col_to_fill].transform('mean')
df[col_to_fill] = df[col_to_fill].fillna(location_means)



In [28]:
# Re-audit `df` after the fill step.
missing_values = df.isnull().sum()
missing_percent = missing_values / len(df) * 100

# Column-level report, worst-first. It is built here but not displayed by this
# cell; only the per-location count below is shown.
missing_report = pd.DataFrame(
    {
        'Missing Values': missing_values,
        'Percent Missing': missing_percent,
    }
).sort_values(by='Percent Missing', ascending=False)

# For every location, the number of rows that still have at least one NaN.
missing_by_location = df.loc[df.isna().any(axis=1)].groupby('location').size()
missing_by_location

location
Afghanistan                         1674
Africa                              1674
Albania                             1674
Algeria                              593
American Samoa                      1674
Andorra                             1674
Angola                              1674
Anguilla                            1674
Antigua and Barbuda                 1674
Argentina                           1678
Armenia                             1674
Aruba                               1674
Asia                                1684
Australia                           1674
Austria                             1674
Azerbaijan                          1674
Bahamas                             1674
Bahrain                             1674
Bangladesh                           459
Barbados                            1674
Belarus                             1674
Belgium                             1674
Belize                              1674
Benin                                502
Bermuda

In [None]:
# Locations that still have at least one row with a NaN, in order of first
# appearance in `df`.
missing_locations = df.loc[df.isna().any(axis=1), 'location'].unique()

# Group once, then look each location's rows up from the grouping, instead of
# re-filtering the whole frame with a boolean mask for every location.
rows_by_location = df.groupby('location')

# Show which columns are missing for these locations
for loc in missing_locations:
    # Null count per column for this location; only columns with gaps are printed.
    missing_cols = rows_by_location.get_group(loc).isna().sum()
    print(f"Location: {loc}")
    print(missing_cols[missing_cols > 0])
    print("---")
# lots of systemic incompleteness

Location: Afghanistan
extreme_poverty            1674
female_smokers             1674
male_smokers               1674
total_vaccinations          414
people_vaccinated           414
people_fully_vaccinated     492
dtype: int64
---
Location: Africa
continent                     1674
population_density            1674
median_age                    1674
aged_65_older                 1674
aged_70_older                 1674
gdp_per_capita                1674
extreme_poverty               1674
human_development_index       1674
life_expectancy               1674
cardiovasc_death_rate         1674
diabetes_prevalence           1674
female_smokers                1674
male_smokers                  1674
handwashing_facilities        1674
hospital_beds_per_thousand    1674
stringency_index              1674
total_vaccinations             370
people_vaccinated              370
people_fully_vaccinated        395
dtype: int64
---
Location: Albania
handwashing_facilities     1674
total_vaccinations  