# 🧠 Exploratory Data Analysis (EDA) on a Synthetic COVID-19 Dataset
This project demonstrates data cleaning, preprocessing, and exploratory data analysis using **Python**, **Pandas**, **NumPy**, **Matplotlib**, and **Seaborn**.

It uses a **synthetic COVID-like dataset** generated inside the notebook with a fixed random seed, so it runs fully offline, needs no external data files, and gives the same results on every run.

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide defaults: seaborn's white-grid look for every plot, and up to
# 20 columns shown whenever a DataFrame is displayed.
sns.set(style='whitegrid')
pd.options.display.max_columns = 20
print('✅ Libraries imported successfully!')

## Step 1. Generate Synthetic Dataset

In [ ]:
# Build a reproducible synthetic panel: one row per (country, day) holding the
# daily and running case/death counts plus a fixed per-country population.
np.random.seed(42)  # fixed seed so every run draws the same numbers
dates = pd.date_range(start='2020-01-01', periods=300)
countries = ['USA', 'India', 'Brazil', 'France', 'UK', 'Germany']
continents = ['North America', 'Asia', 'South America', 'Europe', 'Europe', 'Europe']
column_names = [
    'continent', 'location', 'date',
    'total_cases', 'new_cases',
    'total_deaths', 'new_deaths',
    'population',
]

data = []
for country, continent in zip(countries, continents):
    # One population draw per country, made before any daily draws.
    population = np.random.randint(30_000_000, 330_000_000)
    running_cases = 0
    running_deaths = 0
    for day in dates:
        # Daily count = Poisson baseline + uniform noise. The draw order
        # (cases, then deaths) is part of what the seed reproduces.
        day_cases = np.random.poisson(lam=1000) + np.random.randint(0, 5000)
        day_deaths = np.random.poisson(lam=50) + np.random.randint(0, 200)
        running_cases = running_cases + day_cases
        running_deaths = running_deaths + day_deaths
        data.append([
            continent, country, day,
            running_cases, day_cases,
            running_deaths, day_deaths,
            population,
        ])

df = pd.DataFrame(data, columns=column_names)
print('✅ Synthetic dataset created!')
print('Shape:', df.shape)
df.head()

## Step 2. Initial Data Exploration

In [ ]:
# First look at the frame: column dtypes and non-null counts, the number of
# missing values per column, and transposed summary statistics (one row per
# described column).
print('--- Basic Info ---')
df.info()

missing_per_column = df.isna().sum()
print('\n--- Missing Values ---')
print(missing_per_column)

print('\n--- Summary Statistics ---')
summary_stats = df.describe()
summary_stats.T

## Step 3. Data Cleaning & Preprocessing

In [ ]:
# Clean the raw frame, derive per-row rate metrics, and extract each
# location's most recent row as `latest_df`.

# Unparseable dates become NaT instead of raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Rows without a continent or location cannot be grouped, so drop them. The
# explicit copy makes `df` an independent frame, so the column assignments
# below cannot trigger SettingWithCopyWarning.
df = df.dropna(subset=['continent', 'location']).copy()

num_cols = ['total_cases', 'new_cases', 'total_deaths', 'new_deaths', 'population']
df[num_cols] = df[num_cols].fillna(0)

# Deaths per 100 cases; defined as 0 while a location has no recorded cases.
df['death_rate'] = np.where(
    df['total_cases'] > 0,
    (df['total_deaths'] / df['total_cases']) * 100,
    0,
)

# Cases per million inhabitants. A non-positive population (including missing
# values that were just filled with 0) is treated as unknown, so the result is
# NaN rather than inf.
known_population = df['population'].where(df['population'] > 0)
df['cases_per_million'] = (df['total_cases'] / known_population) * 1e6

# Latest snapshot per location. Sort chronologically first so "latest" means
# the most recent date rather than whichever row happens to come last, and
# take whole rows with tail(1): GroupBy.last() picks the last non-null value
# column by column and can combine values from different rows. NaT dates are
# sorted first so an undated row is only chosen when a location has no dated
# rows at all; the stable sort keeps the original order among equal dates.
latest_df = (
    df.sort_values('date', kind='stable', na_position='first')
    .groupby('location')
    .tail(1)
    .sort_values('location')
    .reset_index(drop=True)
)
# Keep 'location' as the leading column, matching the layout that
# groupby('location').last().reset_index() produces.
latest_df = latest_df[['location'] + [col for col in latest_df.columns if col != 'location']]
print('✅ Data preprocessing complete!')
latest_df.head()

## Step 4. Exploratory Data Analysis (EDA)

In [ ]:
# 1️⃣ Global trend
# Sum the daily counts over all locations so each date has one combined total,
# then draw both series on a single time axis.
global_trend = df.groupby('date', as_index=False)[['new_cases', 'new_deaths']].sum()
fig, ax = plt.subplots(figsize=(10, 5))
for column, series_label in (('new_cases', 'New Cases'), ('new_deaths', 'New Deaths')):
    ax.plot(global_trend['date'], global_trend[column], label=series_label)
ax.set_title('Global COVID-19 (Synthetic) - New Cases and Deaths Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Count')
ax.legend()
fig.tight_layout()
plt.show()

# 2️⃣ Top 5 countries by total cases
# Rank the per-country latest snapshot by cumulative cases and show the five
# largest as horizontal bars.
# NOTE(review): seaborn 0.13+ emits a FutureWarning when `palette` is passed
# without `hue`; left as-is because the replacement call is version-dependent.
top5_cases = latest_df[['location', 'total_cases']].nlargest(5, 'total_cases')
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=top5_cases, x='total_cases', y='location', palette='viridis', ax=ax)
ax.set_title('Top 5 Countries by Total COVID-19 Cases (Synthetic)')
ax.set_xlabel('Total Cases')
ax.set_ylabel('Country')
fig.tight_layout()
plt.show()

# 3️⃣ Top 5 countries by death rate
# Rank the per-country latest snapshot by death rate (%) and show the five
# highest as horizontal bars.
# NOTE(review): seaborn 0.13+ emits a FutureWarning when `palette` is passed
# without `hue`; left as-is because the replacement call is version-dependent.
top5_death_rate = latest_df[['location', 'death_rate']].nlargest(5, 'death_rate')
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=top5_death_rate, x='death_rate', y='location', palette='Reds_r', ax=ax)
ax.set_title('Top 5 Countries by Death Rate (Synthetic)')
ax.set_xlabel('Death Rate (%)')
ax.set_ylabel('Country')
fig.tight_layout()
plt.show()

# 4️⃣ Scatter plot: Cases per million vs Population
# One point per country from the latest snapshot; population is on a log axis
# so countries of very different sizes stay readable together.
fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(
    data=latest_df,
    x='population',
    y='cases_per_million',
    hue='location',
    s=80,
    ax=ax,
)
ax.set_title('Cases per Million vs Population (Synthetic Data)')
ax.set_xlabel('Population')
ax.set_ylabel('Cases per Million')
ax.set_xscale('log')
fig.tight_layout()
plt.show()

# 5️⃣ Death rate by continent
# Pool every country's final cumulative totals within its continent and
# compute deaths per 100 cases from the pooled figures. Taking .max() of
# total_cases and total_deaths over all rows of a continent is only correct
# for single-country continents: with several countries it pairs one
# country's peak deaths with possibly another country's peak cases and
# ignores the rest.
continent_stats = latest_df.groupby('continent')[['total_cases', 'total_deaths']].sum()
continent_stats['death_rate'] = (continent_stats['total_deaths'] / continent_stats['total_cases']) * 100

# Draw on a dedicated axes: Series.plot() without `ax` reuses the current
# axes when a figure is still open, which would put these bars on the
# previous (log-scaled) scatter plot outside an inline notebook backend.
fig, ax = plt.subplots()
continent_stats['death_rate'].plot(kind='bar', color='steelblue', ax=ax)
ax.set_title('Average Death Rate by Continent (Synthetic)')
ax.set_ylabel('Death Rate (%)')
fig.tight_layout()
plt.show()

## Step 5. Insights Summary

In [ ]:
# Headline figures taken from the per-country latest snapshot.
# Locate the rows holding the extreme values first, then read their country names.
worst_rate_row = latest_df['death_rate'].idxmax()
most_cases_row = latest_df['total_cases'].idxmax()

insights = {
    'Total countries analyzed': latest_df['location'].nunique(),
    'Average death rate (%)': round(latest_df['death_rate'].mean(), 2),
    'Highest death rate country': latest_df.loc[worst_rate_row, 'location'],
    'Highest total cases country': latest_df.loc[most_cases_row, 'location'],
}

print('\n--- 📊 Insights Summary ---')
for label, value in insights.items():
    print(f'{label}: {value}')