In [7]:
import pandas as pd
import numpy as np

# ============================================================
# 1. File paths (adjust if needed)
# ============================================================
COVID_CONFIRMED_PATH = "/Users/jannatrahman/your_project/data/covid19_Confirmed_dataset.csv"
COVID_DEATHS_PATH    = "/Users/jannatrahman/your_project/data/covid19_deaths_dataset.csv"
HAPPY_PATH           = "/Users/jannatrahman/your_project/data/worldwide_happiness_report.csv"

# ============================================================
# 2. Load Data
# ============================================================
covid_confirmed = pd.read_csv(COVID_CONFIRMED_PATH)
covid_deaths    = pd.read_csv(COVID_DEATHS_PATH)
happy           = pd.read_csv(HAPPY_PATH)

print("Confirmed shape:", covid_confirmed.shape)
print("Deaths shape:", covid_deaths.shape)
print("Happiness shape:", happy.shape)

# ============================================================
# 3. Reshape COVID Data (wide → long)
# ============================================================

covid_confirmed_long = covid_confirmed.melt(
    id_vars=["Country/Region"],  # keep only country
    value_vars=covid_confirmed.columns[4:],  # start from column 5 (dates only)
    var_name="date",
    value_name="confirmed"
)

covid_deaths_long = covid_deaths.melt(
    id_vars=["Country/Region"],
    value_vars=covid_deaths.columns[4:],  # start from column 5 (dates only)
    var_name="date",
    value_name="deaths"
)

# Convert date strings → datetime
covid_confirmed_long["date"] = pd.to_datetime(covid_confirmed_long["date"], errors="coerce")
covid_deaths_long["date"]    = pd.to_datetime(covid_deaths_long["date"], errors="coerce")

# Merge confirmed + deaths
covid_long = pd.merge(
    covid_confirmed_long,
    covid_deaths_long,
    on=["Country/Region", "date"],
    how="outer"
)

# ============================================================
# 4. Prepare Happiness Data
# ============================================================
# Add year column (report is 2020 only)
happy["year"] = 2020  

# Rename columns to snake_case for consistency
happy = happy.rename(columns=lambda x: x.strip().lower().replace(" ","_"))

print("Happiness cols:", happy.columns)

# ============================================================
# 5. Normalize Country Names
# ============================================================
def normalize_country(s):
    return str(s).strip().lower().replace(" ","")

covid_yearly["country_clean"] = covid_yearly["Country/Region"].map(normalize_country)
happy["country_clean"]        = happy["country_or_region"].map(normalize_country)

# ============================================================
# 6. Merge COVID + Happiness
# ============================================================
merged = pd.merge(
    covid_yearly,
    happy,
    left_on=["country_clean","year"],
    right_on=["country_clean","year"],
    how="inner"
)

print("Merged shape:", merged.shape)
print(merged.head(10))


Confirmed shape: (266, 104)
Deaths shape: (266, 104)
Happiness shape: (156, 9)
Happiness cols: Index(['overall_rank', 'country_or_region', 'score', 'gdp_per_capita',
       'social_support', 'healthy_life_expectancy',
       'freedom_to_make_life_choices', 'generosity',
       'perceptions_of_corruption', 'year'],
      dtype='object')
Merged shape: (143, 14)
  Country/Region  year  confirmed  deaths country_clean  overall_rank  \
0    Afghanistan  2020      28462     889   afghanistan           154   
1        Albania  2020      17864     819       albania           107   
2        Algeria  2020      74325    9521       algeria            88   
3      Argentina  2020      84105    3640     argentina            47   
4        Armenia  2020      40610     539       armenia           116   
5      Australia  2020     224354    2007     australia            11   
6        Austria  2020     502063   12289       austria            10   
7     Azerbaijan  2020      37281     466    azerbaija

  covid_confirmed_long["date"] = pd.to_datetime(covid_confirmed_long["date"], errors="coerce")
  covid_deaths_long["date"]    = pd.to_datetime(covid_deaths_long["date"], errors="coerce")
