<a href="https://colab.research.google.com/github/kavi-ai-technosmith/kavi-python-browse/blob/main/AI_Model_Covid19_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



# Data source from real data bank from owid
# Load directly from URL

url = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'

# create data frane to read data from URL

df = pd.read_csv(url)

# Display data types and non-null counts
print("\n--- Data Info ---")
df.info()

# Display basic statistics for numerical columns
print("\n--- Numerical Column Statistics ---")
print(df.describe())

# Check unique locations (countries/regions)
print(f"\nTotal unique locations: {df['location'].nunique()}")
print("Sample locations:", df['location'].unique()[:10])

# Convert 'date' column to datetime objects
df['date'] = pd.to_datetime(df['date'])

# Sort data for easier processing later (by location and date)
df = df.sort_values(by=['location', 'date']).reset_index(drop=True)


# Look at vaccination columns specifically for a sample country (e.g., United States)
print("\n--- Sample Vaccination Data for United States ---")
us_data = df[df['location'] == 'United States'][['date', 'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated',
                                                 'people_fully_vaccinated_per_hundred', 'total_boosters']].head(10)
print(us_data)


print("\n--- Latest Vaccination Data for a few countries ---")
latest_data = df.groupby('location').tail(1).set_index('location')[['people_fully_vaccinated_per_hundred', 'population']].dropna()

# Filter out non-country locations (continents, 'World', etc.)
# Identify these non-country entities. Common ones are:

non_country_locations = ['World', 'European Union', 'Africa', 'Asia', 'Europe', 'North America', 'South America', 'Oceania']
df_filtered = df[~df['location'].isin(non_country_locations)].copy()


# Drop rows where 'continent' is NaN (these are usually the non-country entities we already filtered, but good to be explicit)
df_filtered.dropna(subset=['continent'], inplace=True)

# 1. Define the target date (e.g., end of 2022)
target_date = pd.to_datetime('2022-12-31')
print("2022-12-31--TARGET DATE", target_date)


# 2. Get each country's latest vaccination data up to the target date
# (This ensures we don't 'cheat' by looking into the future beyond our target)
df_target_snapshot = df_filtered[df_filtered['date'] <= target_date].groupby('location').tail(1).copy()



# 3. Define the target variable: 'high_vaccination_rate'
# Let's say 'high' means > 70% people fully vaccinated by the target date.
df_target_snapshot['high_vaccination_rate'] = (df_target_snapshot['people_fully_vaccinated_per_hundred'] >= 70).astype(int)
print(" 'high' means > 70% people fully vaccinated by the target date.")
print(df_target_snapshot)



# 4. Select relevant features for prediction (static country characteristics and early vaccination indicators)
# We'll use features that were largely stable or available early on.
features_for_model = [
    'location',
    'continent',
    'population',
    'population_density',
    'median_age',
    'aged_65_older',
    'aged_70_older',
    'gdp_per_capita',
    'cardiovasc_death_rate',
    'diabetes_prevalence',
    'handwashing_facilities',        #proxy for hygiene/development
    'hospital_beds_per_thousand',
    'life_expectancy',
    'human_development_index',
    'stringency_index',       # Average stringency over a period
                              # Let's add an 'early vaccination rate' as a feature
                              # For this, we'll need to go back and calculate it
]
print("Select relevant features for prediction (static country characteristics and early vaccination indicators")
print(features_for_model)

#5
# Calculate an 'early vaccination rate' feature: e.g., 'people_fully_vaccinated_per_hundred' at 3 months after the first reported vaccination date.
# This requires a bit more advanced grouping and shifting. Let's simplify for now
# and use a fixed early date, e.g., '2021-06-30' as an early snapshot point.

early_snapshot_date = pd.to_datetime('2021-06-30')
print("Calculate an 'early vaccination rate' feature: e.g., 'people_fully_vaccinated_per_hundred' at 3 months after the first reported vaccination date.")
df_early_snapshot = df_filtered[df_filtered['date'] <= early_snapshot_date].groupby('location').tail(1).copy()
df_early_snapshot = df_early_snapshot[['location', 'people_fully_vaccinated_per_hundred']].rename(columns={'people_fully_vaccinated_per_hundred': 'early_full_vax_per_hundred'})
print("\n")
print(df_target_snapshot)



#Merge early vaccination data into the target snapshot
print("Merge early vaccination data into the target snapshot")
df_model_data = pd.merge(df_target_snapshot[features_for_model + ['high_vaccination_rate']],
                         df_early_snapshot,
                         on='location',
                         how='left')

print(df_model_data)


# Drop the 'location' column as it's an identifier, not a feature
df_model_data.drop('location', axis=1, inplace=True)


# --- Handle Missing Values in the Model Data ---
print(f"\nShape of model data before final NaN handling: {df_model_data.shape}")
print("\nMissing values in model data:")
print(df_model_data.isnull().sum())



# Impute numerical columns with their median (more robust to outliers than mean)
for col in ['population', 'population_density', 'median_age', 'aged_65_older', 'aged_70_older',
            'gdp_per_capita', 'cardiovasc_death_rate', 'diabetes_prevalence',
            'handwashing_facilities', 'hospital_beds_per_thousand', 'life_expectancy',
            'human_development_index', 'stringency_index', 'early_full_vax_per_hundred']:
    if col in df_model_data.columns:
        df_model_data[col].fillna(df_model_data[col].median(), inplace=True)
        print("Impute numerical columns with their median (more robust to outliers than mean")
        print(col)

# For 'stringency_index', we need an average. Let's calculate the average stringency index
# for each country up to the target date.
# Re-calculate stringency_index for each country as its mean up to the target date for better representation


avg_stringency = df_filtered[df_filtered['date'] <= target_date].groupby('location')['stringency_index'].mean().reset_index()
df_model_data = pd.merge(df_model_data.drop('stringency_index', axis=1), avg_stringency, on='location', how='left') # Re-merge, dropping old stringency and re-merging
df_model_data['stringency_index'].fillna(df_model_data['stringency_index'].median(), inplace=True) # Impute remaining NaNs after merge (if any

# Drop any remaining rows with NaN in the target variable or critical features (if any after imputation)
df_model_data.dropna(inplace=True)

print(f"\nShape of model data after final NaN handling: {df_model_data.shape}")
print("\nMissing values in model data after imputation:")
print(df_model_data.isnull().sum().sum()) # Should be 0








#print(latest_data.head())


# Quick look at the data
#print(df.head(10))             #reading first 10 rows
#print(df.tail(5))              #reading last 5 rows
#print(df.columns)
#print(df.shape)




--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429435 entries, 0 to 429434
Data columns (total 67 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   iso_code                                    429435 non-null  object 
 1   continent                                   402910 non-null  object 
 2   location                                    429435 non-null  object 
 3   date                                        429435 non-null  object 
 4   total_cases                                 411804 non-null  float64
 5   new_cases                                   410159 non-null  float64
 6   new_cases_smoothed                          408929 non-null  float64
 7   total_deaths                                411804 non-null  float64
 8   new_deaths                                  410608 non-null  float64
 9   new_deaths_smoothed                         409378 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_model_data[col].fillna(df_model_data[col].median(), inplace=True)


KeyError: 'location'