In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Raise pandas' row-display cap so frames of up to 4000 rows are rendered in
# full instead of being truncated.
pd.set_option('display.max_rows', 4000)

# Read the dataset from a CSV file located in the working directory.
covid_df = pd.read_csv('owid_covid_data.csv')

def missing_report_gen(covid_df):
    """Summarise the columns of ``covid_df`` that contain missing values.

    Parameters
    ----------
    covid_df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with two columns: 'Missing Values' (the
        number of nulls in that column) and 'Percent Missing' (that count
        as a percentage of the number of rows). Columns without any nulls
        are left out.
    """
    null_counts = covid_df.isna().sum()
    n_rows = covid_df.shape[0]

    report = pd.DataFrame({
        'Missing Values': null_counts,
        'Percent Missing': null_counts.div(n_rows).mul(100),
    })

    # Keep only the columns that actually have something missing.
    has_nulls = report['Missing Values'].gt(0)
    return report.loc[has_nulls]

# Build the missing-value summary for the raw frame. The bare name on the last
# line makes the notebook render the summary as the cell's output.
missing_report = covid_df.pipe(missing_report_gen)
missing_report

#### Data Preprocessing
---

In [None]:
# Remove sparsely populated columns.
# Columns are grouped by their share of missing values: "high" is above 90%,
# "medium" is above 50% and up to 90%. Both groups are dropped, so only
# columns with at most 50% missing values are kept.
HIGH_MISSING_THRESHOLD = 0.90
MEDIUM_MISSING_THRESHOLD = 0.50

# Per-column fraction of nulls, computed once and reused for both masks.
missing_fraction = covid_df.isnull().mean()

high_missing_cols = covid_df.columns[missing_fraction > HIGH_MISSING_THRESHOLD]
medium_missing_cols = covid_df.columns[
    (missing_fraction > MEDIUM_MISSING_THRESHOLD)
    & (missing_fraction <= HIGH_MISSING_THRESHOLD)
]

# `drop` returns a new frame, so `covid_df` itself is left unchanged.
covid_df_clean = covid_df.drop(columns=[*high_missing_cols, *medium_missing_cols])

# Re-run the missing-value summary on the reduced frame and display it.
missing_report_clean = missing_report_gen(covid_df_clean)
missing_report_clean

#### Feature Selection and Derivation


In [None]:
# Derived features, added as new columns on covid_df_clean.

# case_fatality_rate = total_deaths / total_cases.
# A zero case count is mapped to NaN before dividing so the ratio comes out as
# NaN rather than +/-inf; infinite values would otherwise leak into the plots
# and correlations that use this column.
covid_df_clean['case_fatality_rate'] = (
    covid_df_clean['total_deaths'] / covid_df_clean['total_cases'].replace(0, np.nan)
)

# log1p(x) = log(1 + x): compresses the wide range of counts and stays finite
# when the count is 0.
covid_df_clean['log_total_cases'] = np.log1p(covid_df_clean['total_cases'])
covid_df_clean['log_total_deaths'] = np.log1p(covid_df_clean['total_deaths'])

### Exploratory Data Analysis
---

#### Visualizations

In [None]:
# Summary statistics for key variables
summary_columns = ['total_cases', 'total_deaths', 'gdp_per_capita', 'life_expectancy']
summary_stats = covid_df_clean[summary_columns].describe()
print(summary_stats)

# Histograms for the log-transformed total_cases and total_deaths columns
def _plot_log_histogram(values, color, title, xlabel):
    """Draw and show one histogram with a KDE overlay on its own figure.

    Parameters
    ----------
    values : pandas.Series
        Data to bin.
    color : str
        Matplotlib colour name used for the bars and the KDE line.
    title : str
        Figure title.
    xlabel : str
        Label for the x axis.
    """
    plt.figure(figsize=(10, 5))
    sns.histplot(values, kde=True, color=color)
    plt.title(title)
    plt.xlabel(xlabel)
    # NOTE(review): each bar counts rows of the frame; this label assumes one
    # row per country - confirm against the dataset's granularity.
    plt.ylabel('Number of Countries')
    # tight_layout must be *called*; a bare `plt.tight_layout` only references
    # the function and leaves the layout untouched.
    plt.tight_layout()
    plt.show()


_plot_log_histogram(
    covid_df_clean['log_total_cases'],
    color='lightgreen',
    title='Distribution of Log Total Cases',
    xlabel='Log of Total Cases',
)

_plot_log_histogram(
    covid_df_clean['log_total_deaths'],
    color='lightcoral',
    title='Distribution of Log Total Deaths',
    xlabel='Log of Total Deaths',
)

# Scatterplots
# GDP per capita against the log-transformed total case count.
gdp_ax = sns.scatterplot(data=covid_df_clean, x='gdp_per_capita', y='log_total_cases')
gdp_ax.set_title('GDP per Capita vs Total COVID-19 Cases (Log Scale)')
plt.show()

# Human development index against the derived case fatality rate.
hdi_ax = sns.scatterplot(data=covid_df_clean, x='human_development_index', y='case_fatality_rate')
hdi_ax.set_title('HDI vs Case Fatality Rate')
plt.show()


# Boxplot of the log-transformed total case count, one box per continent
continent_ax = sns.boxplot(data=covid_df_clean, x='continent', y='log_total_cases')
continent_ax.set_title('Log Total Cases by Continent')
# Tilt the category labels on the x axis by 45 degrees.
plt.xticks(rotation=45)
plt.show()


#### Correlation Analysis

In [None]:
# Columns entering the correlation matrix: COVID-19 outcome metrics first,
# then the socioeconomic / demographic indicators.
corr_vars = [
    'total_cases',
    'total_deaths',
    'case_fatality_rate',
    'gdp_per_capita',
    'life_expectancy',
    'population_density',
    'human_development_index',
    'median_age',
    'aged_65_older',
]

# Pairwise Pearson correlations, drawn as an annotated heatmap with two
# decimals per cell.
corr_matrix = covid_df_clean[corr_vars].corr(method='pearson')
heatmap_ax = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
heatmap_ax.set_title('Correlation Between COVID-19 Metrics and Socioeconomic Indicators')
plt.show()