
# Foundation of Data Science Project
# COVID-19 Data Analysis 




In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
from sklearn.preprocessing import MinMaxScaler



# Plot appearance: apply the seaborn grid theme first, then layer the
# matplotlib size/font defaults on top of it.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.titlesize': 18,
})

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# pandas display options: show all columns, cap printed rows at 30, use wide
# lines, and format floats with two decimals.
pd.options.display.max_columns = None
pd.options.display.max_rows = 30
pd.options.display.width = 1000
pd.options.display.float_format = '{:.2f}'.format

## SECTION 3: DATA UNDERSTANDING AND PREPROCESSING

This section covers data loading, inspection, cleaning, and transformation.

In [None]:
def load_and_inspect_data(file_path):
    """
    Read a CSV file into a DataFrame and print a quick overview of it.

    The overview consists of the shape, the first five rows, the column
    dtypes and the ``describe()`` summary. Tables are rendered through the
    notebook's ``display`` function.

    Parameters:
    -----------
    file_path : str
        Path of the CSV file passed to ``pd.read_csv``

    Returns:
    --------
    pd.DataFrame
        The loaded dataframe, unmodified
    """
    print(f"Loading data from {file_path}...")
    df = pd.read_csv(file_path)

    # Size and a preview of the leading rows
    print("\n--- Dataset Overview ---")
    print(f"Shape: {df.shape}")
    print("\nFirst 5 rows:")
    preview = df.head()
    display(preview)

    # Column dtypes
    print("\nData types:")
    column_types = df.dtypes
    print(column_types)

    # Numeric summary
    print("\nSummary statistics:")
    summary = df.describe()
    display(summary)

    return df

# Read the confirmed-cases CSV (an overview is printed while loading)
covid_data = load_and_inspect_data('Datasets/confirmed.csv')

# Keep only the rows whose Country/Region is Nepal
is_nepal = covid_data['Country/Region'] == 'Nepal'
nepal_data = covid_data[is_nepal]

# Show the Nepal subset
print("\n--- Nepal Data ---")
display(nepal_data)

In [None]:
print("\n--- Data Structure Analysis ---")

# Distinct geographic identifiers
country_count = covid_data['Country/Region'].nunique()
print(f"Number of countries/regions: {country_count}")
province_count = covid_data['Province/State'].nunique()
print(f"Number of provinces/states: {province_count}")

# The fifth column (position 4) through the last column are read as dates
first_date, last_date = covid_data.columns[4], covid_data.columns[-1]
print(f"Date range: From {first_date} to {last_date}")

### 3.1 Types of data: Structured, unstructured, semi-structured

In [None]:
# Describe the kind of data held in the confirmed-cases table
data_type_notes = [
    "This dataset is STRUCTURED data in tabular format with:",
    "- Geographic identifiers (Country/Region, Province/State)",
    "- Geospatial coordinates (Lat, Long)",
    "- Time series data (daily confirmed cases)",
]
print("\n".join(data_type_notes))

### 3.2-3.4 Data Preprocessing & Cleaning

In [None]:
def preprocess_covid_data(df):
    """
    Reshape the wide confirmed-cases table and derive country-level series.

    Steps:
    1. Drop 'Province/State', 'Lat' and 'Long' (the input frame is untouched)
    2. Report the total number of missing cells
    3. Melt the remaining date columns from wide to long format
    4. Parse the date labels (format '%m/%d/%y', e.g. 01/22/20)
    5. Coerce 'Confirmed' to int; unparseable or missing values become 0
    6. Sum 'Confirmed' per country and date
    7. Derive daily new cases (first difference, negatives clipped to 0)
    8. Derive a per-country 7-day rolling mean of the daily new cases

    Parameters:
    -----------
    df : pd.DataFrame
        Wide table with 'Province/State', 'Country/Region', 'Lat', 'Long'
        and one column per date

    Returns:
    --------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``long_df`` with columns 'Country/Region', 'Date', 'Confirmed' and
        ``country_df`` with one row per country per date plus
        'Daily_New_Cases' and '7_Day_Avg_New_Cases'
    """
    print("Preprocessing COVID-19 data...")

    # drop() returns a new frame, so the caller's dataframe is not modified
    preprocessed_df = df.drop(columns=['Province/State', 'Lat', 'Long'])

    # 1. Report missing values: the first .sum() counts per column, the second
    #    adds those counts up. Nothing is imputed here; non-numeric / missing
    #    case counts are zero-filled in step 4.
    print(f"\nMissing values before cleaning:")
    print(preprocessed_df.isnull().sum().sum())

    # 2. Reshape data from wide to long format
    print("\nReshaping data from wide to long format...")
    geo_cols = ['Country/Region']
    date_cols = preprocessed_df.columns[1:]  # everything after 'Country/Region'
    long_df = pd.melt(
        preprocessed_df,
        id_vars=geo_cols,  # identifier columns carried over unchanged
        value_vars=date_cols,
        var_name='Date',
        value_name='Confirmed',
    )

    # 3. Convert date strings (01/22/20 style) to datetime objects
    long_df['Date'] = pd.to_datetime(long_df['Date'], format='%m/%d/%y')

    # 4. Ensure confirmed cases are integers: unparseable values -> NaN -> 0
    long_df['Confirmed'] = (
        pd.to_numeric(long_df['Confirmed'], errors='coerce').fillna(0).astype(int)
    )

    # 5. One row per country per date (provinces are summed together)
    print("\nCreating country-level aggregated dataset...")
    country_df = long_df.groupby(['Country/Region', 'Date'])['Confirmed'].sum().reset_index()

    # 6. Sort by country then date, and difference the cumulative counts.
    #    The first day of each country has no predecessor (-> 0); negative
    #    differences are clipped to 0.
    country_df = country_df.sort_values(['Country/Region', 'Date'])
    daily_change = country_df.groupby('Country/Region')['Confirmed'].diff().fillna(0)
    country_df['Daily_New_Cases'] = daily_change.clip(lower=0)

    # 7. Per-country rolling mean over up to 7 days. min_periods=1 means the
    #    first days average over however many days are available so far.
    country_df['7_Day_Avg_New_Cases'] = country_df.groupby('Country/Region')['Daily_New_Cases'].transform(
        lambda x: x.rolling(7, min_periods=1).mean()
    )

    print("\nData preprocessing completed!")
    print(f"Long format shape: {long_df.shape}")
    print(f"Country-level aggregated data shape: {country_df.shape}")

    return long_df, country_df

# Run the preprocessing on the raw wide-format table and keep both results:
# the long-format frame and the country-level aggregate.
covid_long_df, covid_country_df = preprocess_covid_data(covid_data)


In [None]:
# Preview the first rows of the long-format frame
print("--- Preprocessed Data Sample (Long Format) ---")
display(covid_long_df.head())

# Preview the first rows of the country-level frame for Nepal
print("\n--- Preprocessed Data Sample (Country-Level) - Nepal ---")
nepal_rows = covid_country_df['Country/Region'] == 'Nepal'
display(covid_country_df[nepal_rows].head())

### 3.5-3.6 Data Wrangling, Enrichment & Validation

In [None]:
def enrich_covid_data(df):
    """
    Enrich COVID-19 data with additional features:
    1. Year / Month / ISO Week / DayOfWeek extracted from 'Date'
    2. 'Infection_Rate': per-country 7-day rolling mean of daily new cases
       (full windows only; incomplete windows end up as 0)
    3. 'Growth_Rate': per-country day-over-day percentage change of 'Confirmed'
    4. 'Infection_Ratio': daily new cases divided by cumulative confirmed cases
    5. 'Infection_Level': No Cases / Low / Medium / High, from the 7-day average

    Parameters:
    -----------
    df : pd.DataFrame
        Preprocessed country-level COVID-19 dataframe with 'Country/Region',
        'Date' (datetime), 'Confirmed', 'Daily_New_Cases' and
        '7_Day_Avg_New_Cases'. Rows are expected to be in date order within
        each country. The input is not modified.

    Returns:
    --------
    pd.DataFrame
        Enriched copy of the dataframe; NaNs in numeric columns are set to 0
    """
    print("Enriching COVID-19 data with additional features...")

    # Work on a copy so the caller's dataframe is left untouched
    enriched_df = df.copy()

    # 1. Temporal features
    enriched_df['Year'] = enriched_df['Date'].dt.year
    enriched_df['Month'] = enriched_df['Date'].dt.month
    enriched_df['Week'] = enriched_df['Date'].dt.isocalendar().week
    enriched_df['DayOfWeek'] = enriched_df['Date'].dt.dayofweek

    # 2. Infection rate: 7-day rolling mean of new cases, computed inside each
    #    country. A window over the whole column would mix the last days of one
    #    country into the first days of the next. Unlike '7_Day_Avg_New_Cases'
    #    this uses full windows only, so each country's first six days are NaN
    #    here and become 0 in the final fill step.
    enriched_df['Infection_Rate'] = (
        enriched_df.groupby('Country/Region')['Daily_New_Cases']
        .transform(lambda cases: cases.rolling(7).mean())
    )

    # 3. Growth rate (percentage change of cumulative confirmed cases)
    def _growth_pct(confirmed):
        # Zeros become NaN so a division by zero cannot occur; the explicit
        # ffill() reproduces the forward-fill that pct_change() used to apply
        # implicitly (that default is deprecated), and fill_method=None keeps
        # the result the same across pandas versions.
        nonzero = confirmed.replace(0, np.nan).ffill()
        change = nonzero.pct_change(fill_method=None)
        # Infinite or undefined changes count as 0; scale to percent
        return change.replace([np.inf, -np.inf], np.nan).fillna(0) * 100

    # transform() returns values on the original index, so no index juggling
    # is needed to align the result with the dataframe.
    enriched_df['Growth_Rate'] = (
        enriched_df.groupby('Country/Region')['Confirmed'].transform(_growth_pct)
    )

    # 4. Infection ratio (new cases divided by total confirmed cases);
    #    a zero denominator is replaced by 1
    enriched_df['Infection_Ratio'] = enriched_df['Daily_New_Cases'] / enriched_df['Confirmed'].replace(0, 1)

    # 5. Categorical infection level based on the 7-day average of new cases
    weekly_avg = enriched_df['7_Day_Avg_New_Cases']
    conditions = [
        (weekly_avg == 0),
        (weekly_avg > 0) & (weekly_avg <= 100),
        (weekly_avg > 100) & (weekly_avg <= 1000),
        (weekly_avg > 1000),
    ]
    choices = ['No Cases', 'Low', 'Medium', 'High']
    enriched_df['Infection_Level'] = np.select(conditions, choices, default='Unknown')

    # 6. Validate data: report columns that still contain missing values
    missing_values = enriched_df.isnull().sum()
    print("\nMissing values after enrichment:")
    print(missing_values[missing_values > 0])

    # Fill missing values with 0 for numeric columns
    numeric_cols = enriched_df.select_dtypes(include=[np.number]).columns
    enriched_df[numeric_cols] = enriched_df[numeric_cols].fillna(0)

    print("\nData enrichment completed!")
    return enriched_df

# Derive the additional features from the country-level data
covid_enriched_df = enrich_covid_data(covid_country_df)

# Preview the first rows of the enriched frame
print("\n--- Enriched Data Sample ---")
display(covid_enriched_df.head())

# Preview the first enriched rows for Nepal only
print("\n--- Enriched Data Sample (Nepal) ---")
nepal_enriched = covid_enriched_df[covid_enriched_df['Country/Region'] == 'Nepal']
display(nepal_enriched.head())


### 3.7-3.8 Data Transformation & Dimensionality Reduction

In [None]:
# Select the 20 countries with the highest confirmed-case counts.
# 'Confirmed' is cumulative, so the per-country maximum is used as the total;
# sorting descending and taking the first 20 index labels gives the names.
peak_confirmed_by_country = covid_enriched_df.groupby('Country/Region')['Confirmed'].max()
top_countries = peak_confirmed_by_country.sort_values(ascending=False).head(20).index

# Boolean mask of rows belonging to those countries; the filtered rows are
# copied so later edits do not touch covid_enriched_df.
in_top_countries = covid_enriched_df['Country/Region'].isin(top_countries)
top_countries_df = covid_enriched_df[in_top_countries].copy()

print(f"Selected top 20 countries by total confirmed cases:")
print(top_countries.tolist())

# Normalize data for comparison across countries
print("\nNormalizing data for comparison across countries...")

In [None]:
def normalize_for_comparison(df, countries, target_columns):
    """
    Min-max scale each target column to the 0-1 range, separately per country

    For every country, a value ``v`` becomes ``(v - min) / (max - min)`` using
    that country's own minimum and maximum. NaN and infinite inputs are
    treated as 0 before scaling. A country whose values are all identical in a
    column gets 0.5 for every row of that column.

    Parameters:
    -----------
    df : pd.DataFrame
        Enriched dataframe with a 'Country/Region' column
    countries : list
        Countries to keep and normalize (any iterable accepted by ``isin``)
    target_columns : list
        List of numeric columns to normalize

    Returns:
    --------
    pd.DataFrame
        Copy of the rows for ``countries`` with one extra
        ``Normalized_<column>`` column per target column. The source columns
        and the input dataframe are left unchanged.
    """
    normalized_df = df[df['Country/Region'].isin(countries)].copy()

    # Group by plain arrays so the result does not depend on index alignment
    country_keys = normalized_df['Country/Region'].to_numpy()

    for column in target_columns:
        # Replace infinities and NaNs with 0 before looking for min / max
        cleaned = pd.Series(
            np.nan_to_num(
                normalized_df[column].to_numpy(dtype=float),
                nan=0.0, posinf=0.0, neginf=0.0,
            )
        )
        per_country = cleaned.groupby(country_keys)
        lowest = per_country.transform('min')
        value_range = per_country.transform('max') - lowest

        # A zero range would divide by zero: mask it to NaN, then use the
        # middle value 0.5 for those (constant) countries.
        scaled = (cleaned - lowest) / value_range.where(value_range > 0)
        normalized_df[f'Normalized_{column}'] = scaled.fillna(0.5).to_numpy()

    return normalized_df

# Columns to normalize
columns_to_normalize = ['Confirmed', 'Daily_New_Cases', '7_Day_Avg_New_Cases', 'Growth_Rate']

# Clean the source columns first: infinities become NaN, then NaN becomes 0
for column in columns_to_normalize:
    top_countries_df[column] = (
        top_countries_df[column].replace([np.inf, -np.inf], np.nan).fillna(0)
    )

# Apply normalization; on any failure report the error and fall back to the
# un-normalized data (which then has no Normalized_* columns).
try:
    normalized_df = normalize_for_comparison(
        top_countries_df, top_countries, columns_to_normalize
    )
    print("\n--- Normalized Data Sample ---")
    print(normalized_df.head().to_string())  # plain text instead of display
except Exception as e:
    print(f"Error during normalization: {e}")
    normalized_df = top_countries_df.copy()
    print("\n--- Data Sample (normalization failed) ---")
    print(normalized_df.head().to_string())

## SECTION 4: DATA ANALYSIS

This section covers descriptive analytics, exploratory data analysis, and data visualization.

### 4.1-4.2 Descriptive Analytics & Exploratory Data Analysis

In [None]:
def perform_descriptive_analysis(df, column_name='Confirmed'):
    """
    Print per-country descriptive statistics for one column

    Two tables are printed, each limited to the first five countries:
    the ``describe()`` summary (count, mean, std, min, quartiles, max) and
    the skewness / kurtosis computed with ``scipy.stats``.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame to analyze; must contain 'Country/Region'
    column_name : str
        Column to analyze

    Returns:
    --------
    None
    """
    # Imported here, once per call, because scipy is not imported at file level
    from scipy.stats import kurtosis, skew

    per_country = df.groupby('Country/Region')[column_name]

    print(f"\nDescriptive statistics for {column_name}:")
    stats_df = per_country.describe()
    print(stats_df.head().to_string())  # Using print instead of display

    # Additional statistics
    print("\nSkewness and Kurtosis (indicating distribution shape):")

    # Named aggregation gives one row per country with 'skew' and 'kurtosis'
    # columns, so head() shows five whole countries instead of the first five
    # entries of a stacked (country, statistic) Series.
    skew_kurt = per_country.agg(skew=skew, kurtosis=kurtosis)
    print(skew_kurt.head().to_string())  # Using print instead of display

# Descriptive analysis for cumulative confirmed cases, then daily new cases
for analysed_column in ('Confirmed', 'Daily_New_Cases'):
    perform_descriptive_analysis(covid_enriched_df, analysed_column)

In [None]:
# Temporal analysis: mean of daily new cases for each (Year, Month), taken
# over all country rows that fall in that month
print("\nTemporal analysis - Monthly averages:")
cases_by_month = covid_enriched_df.groupby(['Year', 'Month'])['Daily_New_Cases']
monthly_avg = cases_by_month.mean().reset_index()
display(monthly_avg.head(10))

### 4.3-4.5 Data Visualization

In [None]:
def plot_global_trend():
    """
    Plot the global COVID-19 trend over time

    Reads the module-level ``covid_enriched_df``. The left y-axis shows the
    worldwide daily new cases as bars plus their 7-day moving average; the
    right y-axis shows the summed confirmed cases.
    """
    daily_colour, total_colour = 'tab:blue', 'tab:red'

    # Sum over all countries for each date; reset_index() turns 'Date' back
    # into a regular column.
    by_date = covid_enriched_df.groupby('Date')
    global_daily = by_date['Daily_New_Cases'].sum().reset_index()
    global_total = by_date['Confirmed'].sum().reset_index()

    fig, ax1 = plt.subplots(figsize=(14, 8))

    # Left axis: daily new cases as translucent bars
    ax1.set_xlabel('Date', fontsize=14)
    ax1.set_ylabel('Daily New Cases', color=daily_colour, fontsize=14)
    ax1.bar(
        global_daily['Date'],
        global_daily['Daily_New_Cases'],
        alpha=0.3,
        color=daily_colour,
        label='Daily New Cases',
    )
    ax1.tick_params(axis='y', labelcolor=daily_colour)

    # 7-day moving average of the global daily counts (full windows only)
    global_daily['7_Day_Avg'] = global_daily['Daily_New_Cases'].rolling(7).mean()
    ax1.plot(
        global_daily['Date'],
        global_daily['7_Day_Avg'],
        color='navy',
        linewidth=2,
        label='7-Day Moving Average',
    )

    # Right axis: total confirmed cases
    ax2 = ax1.twinx()
    ax2.set_ylabel('Total Confirmed Cases', color=total_colour, fontsize=14)
    ax2.plot(
        global_total['Date'],
        global_total['Confirmed'],
        color=total_colour,
        linewidth=2,
        label='Total Confirmed Cases',
    )
    ax2.tick_params(axis='y', labelcolor=total_colour)

    # One legend holding the entries of both axes
    left_handles, left_labels = ax1.get_legend_handles_labels()
    right_handles, right_labels = ax2.get_legend_handles_labels()
    ax1.legend(left_handles + right_handles, left_labels + right_labels, loc='upper left')

    plt.title('Global COVID-19 Trend Over Time', fontsize=18)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    plt.show()

# Draw the global trend figure (reads covid_enriched_df)
plot_global_trend()

In [None]:
def plot_top_countries_comparison():
    """
    Plot comparison of COVID-19 cases across top countries

    Reads the module-level ``covid_enriched_df`` and draws a horizontal bar
    chart of the ten countries with the highest confirmed-case maximum.
    """
    # Highest confirmed value per country, largest first, ten entries
    peak_by_country = covid_enriched_df.groupby('Country/Region')['Confirmed'].max()
    top_10 = peak_by_country.sort_values(ascending=False).head(10)

    plt.figure(figsize=(14, 10))
    sns.barplot(x=top_10.values, y=top_10.index, palette='viridis')

    plt.title('Top 10 Countries by Total Confirmed COVID-19 Cases', fontsize=18)
    plt.xlabel('Total Confirmed Cases', fontsize=14)
    plt.ylabel('Country', fontsize=14)
    plt.grid(True, axis='x', alpha=0.3)
    # Plain numbers on the x-axis instead of scientific notation
    plt.ticklabel_format(style='plain', axis='x')

    plt.show()

# Draw the top-10 countries bar chart (reads covid_enriched_df)
plot_top_countries_comparison()

In [None]:
def plot_infection_curves():
    """
    Plot normalized infection curves for the top 10 countries

    Reads the module-level ``normalized_df`` and draws one line per country
    using its 'Normalized_Confirmed' column (0-1 scale).
    """
    # Rank by the highest confirmed count per country. Ranking by
    # value_counts() would count rows, which does not reflect case numbers.
    top_10_countries = (
        normalized_df.groupby('Country/Region')['Confirmed']
        .max()
        .sort_values(ascending=False)
        .head(10)
        .index
    )

    # Plot normalized confirmed cases
    plt.figure(figsize=(14, 8))
    for country in top_10_countries:
        country_data = normalized_df[normalized_df['Country/Region'] == country]
        plt.plot(country_data['Date'], country_data['Normalized_Confirmed'],
                 label=country, linewidth=2)

    plt.title('Normalized COVID-19 Infection Curves for Top 10 Countries', fontsize=18)
    plt.xlabel('Date', fontsize=14)
    plt.ylabel('Normalized Confirmed Cases (0-1 scale)', fontsize=14)
    plt.legend(loc='upper left')
    plt.grid(True, alpha=0.3)

    plt.show()

# Draw the normalized infection curves (reads normalized_df)
plot_infection_curves()

In [None]:
def plot_heatmap():
    """
    Plot heatmap of COVID-19 cases by month for top countries

    Reads the module-level ``covid_enriched_df``. Rows are the ten countries
    with the highest confirmed-case maximum, columns are (Year, Month) pairs
    and each cell is the mean of 'Daily_New_Cases' for that country and month.
    """
    # Ten countries with the largest confirmed-case maximum
    peak_by_country = covid_enriched_df.groupby('Country/Region')['Confirmed'].max()
    top_10_countries = peak_by_country.sort_values(ascending=False).head(10).index
    in_top_10 = covid_enriched_df['Country/Region'].isin(top_10_countries)
    monthly_cases = covid_enriched_df[in_top_10]

    # Countries x (Year, Month) table of average daily new cases
    heatmap_data = pd.pivot_table(
        monthly_cases,
        index='Country/Region',
        columns=['Year', 'Month'],
        values='Daily_New_Cases',
        aggfunc='mean',
    )

    plt.figure(figsize=(18, 10))
    sns.heatmap(heatmap_data, cmap='YlOrRd', annot=False, linewidths=.5)

    plt.title('Average Daily New COVID-19 Cases by Month and Country', fontsize=18)
    plt.xlabel('Year and Month', fontsize=14)
    plt.ylabel('Country', fontsize=14)

    plt.show()

# Draw the month-by-country heatmap (reads covid_enriched_df)
plot_heatmap()

### 4.6 Feature Engineering for Advanced Analysis

In [None]:
def extract_key_pandemic_phases(df):
    """
    Label every row with a pandemic phase derived from global downturns

    The global 7-day average of daily new cases is scanned for days on which
    it starts to fall. Of those days, the ones whose average lies above the
    75th percentile count as significant. If at least three exist, the dates
    of the three largest become phase boundaries; otherwise every row is
    labelled "Undetermined".

    Parameters:
    -----------
    df : pd.DataFrame
        Enriched COVID dataframe with 'Date' and 'Daily_New_Cases'.
        A 'Pandemic_Phase' column is added to this dataframe in place.

    Returns:
    --------
    pd.DataFrame
        The same dataframe object, now with 'Pandemic_Phase'
    """
    # Worldwide daily totals and their 7-day rolling mean (full windows only)
    daily_totals = df.groupby('Date')['Daily_New_Cases'].sum().reset_index()
    daily_totals['Global_7Day_Avg'] = daily_totals['Daily_New_Cases'].rolling(7).mean()

    # Day-to-day change of the smoothed curve and the days where its sign
    # differs from the previous day's sign (a simplified turning-point test)
    daily_totals['diff'] = daily_totals['Global_7Day_Avg'].diff()
    direction = np.sign(daily_totals['diff'])
    daily_totals['sign_change'] = direction != direction.shift(1)

    # A candidate peak is a sign change on a day where the curve is falling
    starts_falling = daily_totals['sign_change'] & (daily_totals['diff'] < 0)
    peaks = daily_totals[starts_falling]

    # Keep only candidates above the 75th percentile of candidate values
    cutoff = peaks['Global_7Day_Avg'].quantile(0.75)
    significant_peaks = peaks[peaks['Global_7Day_Avg'] > cutoff]

    print(f"\nIdentified {len(significant_peaks)} significant COVID-19 waves globally")
    print("Peak dates:")
    ranked_peaks = significant_peaks[['Date', 'Global_7Day_Avg']].sort_values(
        'Global_7Day_Avg', ascending=False
    )
    display(ranked_peaks.head())

    # Fewer than three significant peaks: no phases can be defined
    if len(significant_peaks) < 3:
        print("\nInsufficient data to identify multiple pandemic waves")
        df['Pandemic_Phase'] = "Undetermined"
        return df

    # Dates of the three largest peaks, in chronological order
    phase_boundaries = (
        significant_peaks.nlargest(3, 'Global_7Day_Avg')['Date'].sort_values().tolist()
    )
    wave_names = ("First Wave", "Second Wave", "Third Wave")

    def assign_phase(date):
        # The first boundary the date precedes decides its wave
        for boundary, wave_name in zip(phase_boundaries, wave_names):
            if date < boundary:
                return wave_name
        return "Later Waves"

    # The phase column is written onto the dataframe that was passed in
    df['Pandemic_Phase'] = df['Date'].apply(assign_phase)
    print("\nPandemic phases assigned based on global peaks")

    return df

# Add the 'Pandemic_Phase' column to the enriched dataframe
# (the function writes the column onto the frame it is given and returns it)
covid_enriched_df = extract_key_pandemic_phases(covid_enriched_df)

# Show how many rows fall into each pandemic phase
print("\nData distribution across pandemic phases:")
display(covid_enriched_df['Pandemic_Phase'].value_counts())

In [None]:
def calculate_peak_metrics(df):
    """
    Calculate peak metrics for each country

    Countries with fewer than 30 rows are skipped. For every remaining
    country the result holds the highest daily new cases and its date, the
    highest 7-day average and its date, the number of days between the first
    date with confirmed cases and the daily peak, and the maximum growth rate.

    Parameters:
    -----------
    df : pd.DataFrame
        Enriched COVID dataframe with 'Country/Region', 'Date', 'Confirmed',
        'Daily_New_Cases', '7_Day_Avg_New_Cases' and 'Growth_Rate'

    Returns:
    --------
    pd.DataFrame
        One row per country. The columns are always present, even when no
        country qualifies and the frame is empty.
    """
    metric_columns = [
        'Country/Region', 'Peak_Daily_Cases', 'Peak_Date', 'Peak_7Day_Avg',
        'Peak_Avg_Date', 'Days_To_Peak', 'Max_Growth_Rate',
    ]
    peak_metrics = []

    # A single groupby pass replaces filtering the whole frame once per
    # country; sort=False keeps countries in order of first appearance.
    for country, country_data in df.groupby('Country/Region', sort=False):
        # Skip countries with limited data
        if len(country_data) < 30:
            continue

        # Row with the highest daily new cases (first one on ties)
        peak_day = country_data.loc[country_data['Daily_New_Cases'].idxmax()]
        peak_daily = peak_day['Daily_New_Cases']
        peak_date = peak_day['Date']

        # Highest 7-day average and the date on which it occurred
        peak_avg = country_data['7_Day_Avg_New_Cases'].max()
        peak_avg_date = country_data.loc[country_data['7_Day_Avg_New_Cases'].idxmax(), 'Date']

        # Days from the first date with any confirmed case to the daily peak;
        # left undefined when either date is missing
        first_case_date = country_data.loc[country_data['Confirmed'] > 0, 'Date'].min()
        if pd.notna(first_case_date) and pd.notna(peak_date):
            days_to_peak = (peak_date - first_case_date).days
        else:
            days_to_peak = None

        peak_metrics.append({
            'Country/Region': country,
            'Peak_Daily_Cases': peak_daily,
            'Peak_Date': peak_date,
            'Peak_7Day_Avg': peak_avg,
            'Peak_Avg_Date': peak_avg_date,
            'Days_To_Peak': days_to_peak,
            'Max_Growth_Rate': country_data['Growth_Rate'].max(),
        })

    # Passing the columns explicitly keeps the schema when the list is empty
    peak_metrics_df = pd.DataFrame(peak_metrics, columns=metric_columns)

    print("\nCalculated peak metrics for each country:")
    display(peak_metrics_df.head())

    return peak_metrics_df

# Build the per-country peak metrics table from the enriched data
peak_metrics_df = calculate_peak_metrics(covid_enriched_df)

In [None]:
def plot_days_to_peak():
    """
    Plot days to peak for top countries

    Reads the module-level ``peak_metrics_df``, takes the 20 countries with
    the highest 'Peak_Daily_Cases' and draws their 'Days_To_Peak' as
    horizontal bars ordered from fewest to most days.
    """
    # 20 countries with the highest single-day case count
    by_peak_size = peak_metrics_df.sort_values('Peak_Daily_Cases', ascending=False)
    top_20_countries = by_peak_size.head(20)

    # Bars ordered by the number of days to the peak
    ordered_for_plot = top_20_countries.sort_values('Days_To_Peak')

    plt.figure(figsize=(14, 8))
    sns.barplot(
        x='Days_To_Peak',
        y='Country/Region',
        data=ordered_for_plot,
        palette='coolwarm',
    )

    plt.title('Days from First Case to Peak COVID-19 Outbreak by Country', fontsize=18)
    plt.xlabel('Number of Days', fontsize=14)
    plt.ylabel('Country', fontsize=14)
    plt.grid(True, axis='x', alpha=0.3)

    plt.show()

# Draw the days-to-peak bar chart (reads peak_metrics_df)
plot_days_to_peak()

## JOINING WITH WORLD HAPPINESS REPORT DATASET

In [None]:



def load_and_preprocess_happiness_data(file_path='Datasets/worldwide_happiness_report.csv'):
    """
    Load the World Happiness Report CSV and print its first rows

    No cleaning or transformation is applied; the file is returned as read.

    Parameters:
    -----------
    file_path : str
        Path to the World Happiness Report CSV

    Returns:
    --------
    pd.DataFrame
        World Happiness dataframe exactly as loaded by ``pd.read_csv``
    """
    print("\nLoading World Happiness Report data...")

    # Read the file; any read error propagates to the caller
    happiness_df = pd.read_csv(file_path)
    print("Successfully loaded World Happiness Report data")

    # Plain-text preview of the first five rows
    print("\n--- World Happiness Report Data Sample ---")
    preview_text = happiness_df.head().to_string()
    print(preview_text)

    return happiness_df

# Load the World Happiness Report from its default path
happiness_df = load_and_preprocess_happiness_data()

In [None]:
def join_covid_and_happiness_data(covid_df, happiness_df, peak_metrics_df):
    """
    Build one row per country combining COVID-19 summary metrics, optional
    peak metrics and the World Happiness Report columns

    Country names are passed through a small rename table before matching.
    Only countries present on both sides are kept (inner join).

    Parameters:
    -----------
    covid_df : pd.DataFrame
        Enriched COVID-19 dataframe with 'Country/Region', 'Confirmed',
        'Daily_New_Cases' and '7_Day_Avg_New_Cases'
    happiness_df : pd.DataFrame
        World Happiness Report dataframe with a 'Country' column
    peak_metrics_df : pd.DataFrame or None
        Peak metrics by country; skipped when None

    Returns:
    --------
    pd.DataFrame
        Joined dataframe keyed by 'Country_Std'
    """
    print("\nJoining COVID-19 data with World Happiness Report data...")

    # Report the overlap of the raw (not yet renamed) country names
    covid_countries = set(covid_df['Country/Region'].unique())
    happiness_countries = set(happiness_df['Country'].unique())
    shared_countries = covid_countries.intersection(happiness_countries)

    print(f"\nCOVID-19 dataset has {len(covid_countries)} countries")
    print(f"World Happiness Report has {len(happiness_countries)} countries")
    print(f"Common countries: {len(shared_countries)}")

    # Rename table for country names that differ between the datasets
    country_mapping = {
        'US': 'United States',
        'Korea, South': 'South Korea',
        'United Kingdom': 'UK',
        # Add more mappings as needed
    }

    def standardize(name):
        # Names without an entry in the table are kept as they are
        return country_mapping.get(name, name)

    # Per-country maxima of the COVID metrics under the standardized name
    covid_named = covid_df.copy()
    covid_named['Country_Std'] = covid_named['Country/Region'].map(standardize)
    summary_columns = ['Confirmed', 'Daily_New_Cases', '7_Day_Avg_New_Cases']
    covid_summary = covid_named.groupby('Country_Std')[summary_columns].max().reset_index()

    # Attach the peak metrics when they were supplied
    if peak_metrics_df is not None:
        peak_metrics_copy = peak_metrics_df.copy()
        peak_metrics_copy['Country_Std'] = peak_metrics_copy['Country/Region'].map(standardize)
        peak_columns = ['Country_Std', 'Peak_7Day_Avg', 'Days_To_Peak', 'Max_Growth_Rate']

        covid_summary = covid_summary.merge(
            peak_metrics_copy[peak_columns],
            on='Country_Std',
            how='left',
        )

    # The same rename table is applied to the happiness country names
    happiness_copy = happiness_df.copy()
    happiness_copy['Country_Std'] = happiness_copy['Country'].map(standardize)

    # Keep only countries found in both datasets
    joined_df = covid_summary.merge(
        happiness_copy,
        left_on='Country_Std',
        right_on='Country_Std',
        how='inner',
    )

    # If the merge produced suffixed 'Country' columns, keep a single one
    has_suffixed_country = (
        'Country_x' in joined_df.columns and 'Country_y' in joined_df.columns
    )
    if has_suffixed_country:
        joined_df = joined_df.rename(columns={'Country_y': 'Country'})
        joined_df = joined_df.drop(columns=['Country_x'])

    print(f"\nSuccessfully joined data for {joined_df.shape[0]} countries")
    print("\n--- Joined Data Sample ---")
    print(joined_df.head().to_string())

    return joined_df

# Join the enriched COVID data, the happiness data and the peak metrics
# into one per-country table
joined_df = join_covid_and_happiness_data(covid_enriched_df, happiness_df, peak_metrics_df)

In [None]:

# Section banner printed before the correlation output
print("CORRELATION ANALYSIS")

def calculate_correlations(df):
    """
    Correlate the COVID-19 metrics with the happiness factors

    Only the expected columns that are actually present in ``df`` are used.
    The COVID-vs-happiness part of the matrix is printed.

    Parameters:
    -----------
    df : pd.DataFrame
        Joined dataframe with COVID and happiness data

    Returns:
    --------
    pd.DataFrame
        Full correlation matrix over the selected COVID and happiness columns
    """
    print("\nCalculating correlations between COVID-19 metrics and happiness factors...")

    # Candidate columns on each side of the comparison
    wanted_covid = ('Confirmed', 'Daily_New_Cases', 'Peak_7Day_Avg',
                    'Days_To_Peak', 'Max_Growth_Rate')
    wanted_happiness = ('Score', 'GDP_per_capita', 'Social_support',
                        'Healthy_life_expectancy', 'Freedom_to_make_life_choices',
                        'Generosity', 'Perceptions_of_corruption')

    # Drop the candidates the dataframe does not have
    present = set(df.columns)
    covid_cols = list(filter(present.__contains__, wanted_covid))
    happiness_cols = list(filter(present.__contains__, wanted_happiness))

    # Pairwise correlation over the selected columns
    selected = df[covid_cols + happiness_cols]
    correlation_df = selected.corr()

    # Print only the block with COVID metrics as rows, happiness as columns
    print("\n--- Correlation Matrix ---")
    cross_block = correlation_df.loc[covid_cols, happiness_cols]
    print(cross_block.to_string())

    return correlation_df

# Compute the correlation matrix from the joined per-country table
correlation_df = calculate_correlations(joined_df)

In [None]:
def plot_correlation_heatmap(corr_df):
    """
    Plot a heatmap of correlations between COVID metrics and happiness factors

    Rows are the COVID metrics and columns the happiness factors, limited to
    those present in ``corr_df``. Prints a warning and returns without
    plotting when either side is empty.

    Parameters:
    -----------
    corr_df : pd.DataFrame
        Correlation matrix such as the one returned by
        ``calculate_correlations``
    """
    # Select relevant sections of the correlation matrix
    covid_cols = ['Confirmed', 'Daily_New_Cases', 'Peak_7Day_Avg', 'Days_To_Peak', 'Max_Growth_Rate']
    # 'Score' is the happiness score. 'Healthy_life_expectancy' is the spelling
    # used by calculate_correlations; the variant with a stray space is kept
    # only as a fallback for matrices that were built with that name.
    happiness_cols = ['Score', 'GDP_per_capita', 'Social_support',
                      'Healthy_life_expectancy', 'Healthy _life_expectancy',
                      'Freedom_to_make_life_choices', 'Generosity', 'Perceptions_of_corruption']

    # Ensure all columns exist in the dataframe
    covid_cols = [col for col in covid_cols if col in corr_df.index]
    happiness_cols = [col for col in happiness_cols if col in corr_df.columns]

    if not covid_cols or not happiness_cols:
        print("Warning: Not enough data for correlation heatmap")
        return

    plot_corr = corr_df.loc[covid_cols, happiness_cols]

    # Diverging colour map centred on 0 and fixed to the full -1..1 range
    plt.figure(figsize=(14, 10))
    sns.heatmap(plot_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)

    plt.title('Correlation between COVID-19 Metrics and Socioeconomic Factors', fontsize=18)
    plt.tight_layout()

    plt.show()

def plot_scatter_relationships(df):
    """
    Plot scatter plots (2x2 grid) for key COVID-vs-happiness relationships

    Each panel shows one (x, y) column pair with a fitted straight line.
    Pairs with a missing column are skipped and their panel is hidden.
    Prints a warning and returns without plotting when fewer than four of
    the needed columns exist.

    Parameters:
    -----------
    df : pd.DataFrame
        Joined dataframe with COVID and happiness data
    """
    # Use the 'Healthy_life_expectancy' spelling that the correlation step
    # uses; fall back to the variant with a stray space only if that is the
    # name the dataframe really has.
    life_col = 'Healthy_life_expectancy'
    if life_col not in df.columns and 'Healthy _life_expectancy' in df.columns:
        life_col = 'Healthy _life_expectancy'

    # Define plot pairs (x, y, title)
    plot_pairs = [
        ('GDP_per_capita', 'Confirmed', 'GDP per Capita vs Total COVID-19 Cases'),
        (life_col, 'Days_To_Peak', 'Life Expectancy vs Days to Peak Infection'),
        ('Social_support', 'Max_Growth_Rate', 'Social Support vs Maximum Growth Rate'),
        ('Score', 'Daily_New_Cases', 'Happiness Score vs Peak Daily Cases'),
    ]

    # Check how many of the needed columns exist in the dataframe
    required_columns = [col for x_col, y_col, _ in plot_pairs for col in (x_col, y_col)]
    available_columns = [col for col in required_columns if col in df.columns]

    if len(available_columns) < 4:  # Need at least 4 columns for 2x2 plot
        print("Warning: Not enough data columns for scatter plots")
        return

    # Create a subplot grid
    fig, axes = plt.subplots(2, 2, figsize=(16, 14))
    axes = axes.flatten()  # Flatten for easier iteration

    # Create only the plots where we have both columns
    plot_idx = 0
    for x_col, y_col, title in plot_pairs:
        if x_col not in df.columns or y_col not in df.columns:
            continue
        ax = axes[plot_idx]

        # Create scatter plot
        sns.scatterplot(x=x_col, y=y_col, data=df, ax=ax, alpha=0.7)

        # Add title and labels
        ax.set_title(title, fontsize=14)
        ax.set_xlabel(x_col.replace('_', ' '), fontsize=12)
        ax.set_ylabel(y_col.replace('_', ' '), fontsize=12)

        # Regression line: fit only rows where both values are present
        # (a NaN in either column would spoil the fit) and only when x varies.
        complete = df[[x_col, y_col]].dropna()
        if len(complete) > 2 and complete[x_col].nunique() > 1:
            try:
                x = complete[x_col].to_numpy(dtype=float)
                y = complete[y_col].to_numpy(dtype=float)
                trend = np.poly1d(np.polyfit(x, y, 1))
                x_sorted = np.unique(x)
                ax.plot(x_sorted, trend(x_sorted), color='red')
            except (TypeError, ValueError, np.linalg.LinAlgError) as e:
                print(f"Warning: Could not draw regression line for {title}: {e}")

        plot_idx += 1

    # Hide panels that were not used so the figure shows no empty axes
    for unused_ax in axes[plot_idx:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Draw both figures. A failure in one is reported and does not stop the other.
for plot_label, plot_function, plot_input in (
    ("correlation heatmap", plot_correlation_heatmap, correlation_df),
    ("scatter relationships", plot_scatter_relationships, joined_df),
):
    try:
        plot_function(plot_input)
    except Exception as e:
        print(f"Error plotting {plot_label}: {e}")