# Import Modules

In [None]:
# Import modules
import pandas as pd
import os

# Load Data

In [None]:
# Set up paths
# Both are relative paths, so they resolve against the current working
# directory. RAW_PATH is the directory the input CSVs are read from;
# PROCESSED_PATH is the directory the merged outputs are written to.
RAW_PATH = "../data/raw/"
PROCESSED_PATH = "../data/processed/"

In [None]:
# Load and clean yield.csv, area_harvested.csv and production_quantity.csv
def _load_value_table(filename, value_name):
    """Read one raw CSV and return its cleaned key and value columns.

    Args:
        filename: Name of the CSV file inside ``RAW_PATH``.
        value_name: Name given to the cleaned ``Value`` column.

    Returns:
        A DataFrame with the columns ``Area``, ``Item``, ``Year`` and
        ``value_name``. Rows whose ``Value`` cannot be parsed as a number
        are dropped.
    """
    raw = pd.read_csv(os.path.join(RAW_PATH, filename))
    table = raw[['Area', 'Item', 'Year', 'Value']].assign(
        # Unparseable entries become NaN so the dropna below removes them.
        Value=lambda t: pd.to_numeric(t['Value'], errors='coerce')
    )
    return table.dropna(subset=['Value']).rename(columns={'Value': value_name})


yield_df = _load_value_table("yield.csv", "Yield")
area_df = _load_value_table("area_harvested.csv", "Area_Harvested")
prod_df = _load_value_table("production_quantity.csv", "Production_Quantity")

# Merge Data

In [None]:
# Harmonize country names
# Keys are the labels to look for in the 'Area' column; values are the
# labels they are replaced with. Labels not listed here are left unchanged.
area_rename_map = {
    "Netherlands (Kingdom of the)": "Netherlands",
    "Republic of Moldova": "Moldova",
    "Slovakia": "Slovak Republic",
    "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
}
# Apply the same renaming to each of the three crop tables in place.
for frame in (yield_df, area_df, prod_df):
    harmonized_area = frame['Area'].replace(area_rename_map)
    frame['Area'] = harmonized_area

In [None]:
# Merge the three main datasets
# Inner joins: only (Area, Item, Year) combinations present in all three
# tables survive.
join_keys = ['Area', 'Item', 'Year']
merged_df = (
    yield_df
    .merge(area_df, on=join_keys, how='inner')
    .merge(prod_df, on=join_keys, how='inner')
)

In [None]:
# Drop obsolete countries
# Rows whose 'Area' is one of these labels are removed from the merged table.
drop_countries = [
    "Belgium-Luxembourg",
    "Czechoslovakia",
    "Serbia and Montenegro",
    "USSR",
    "Yugoslav SFR",
]
is_dropped_area = merged_df['Area'].isin(drop_countries)
merged_df = merged_df.loc[~is_dropped_area]

In [None]:
# Keep only rows with Year up to and including 2022
# (per the original note, 2023 is not present in all datasets).
within_year_range = merged_df['Year'].le(2022)
merged_df = merged_df.loc[within_year_range]

In [None]:
# Load and process features.csv
features_df = pd.read_csv(os.path.join(RAW_PATH, "features.csv"))
# Reshape wide -> long: every column other than the three id columns becomes
# one (Year, Value) row. Column headers that are not numeric turn into NaN
# years and are dropped together with rows that have no Value.
features_long = (
    pd.melt(
        features_df,
        id_vars=["REF_AREA_LABEL", "INDICATOR", "INDICATOR_LABEL"],
        var_name="Year",
        value_name="Value",
    )
    .rename(columns={"REF_AREA_LABEL": "Area"})
    .assign(Year=lambda long_df: pd.to_numeric(long_df['Year'], errors='coerce'))
    .dropna(subset=['Year', 'Value'])
    .astype({'Year': int})
)

In [None]:
# Pivot features
# One row per (Area, Year) and one column per INDICATOR value; if several
# rows land in the same cell, the first one is used.
features_wide = pd.pivot_table(
    features_long,
    values="Value",
    index=["Area", "Year"],
    columns="INDICATOR",
    aggfunc="first",
)
# Turn the (Area, Year) index back into ordinary columns for merging.
features_pivot = features_wide.reset_index()

In [None]:
# Merge with features
# Left join: every row of merged_df is kept; feature columns are left empty
# (NaN) where features_pivot has no matching (Area, Year).
final_df = merged_df.merge(features_pivot, how="left", on=["Area", "Year"])

# Save Merged Dataset

In [None]:
# Uppercase headers and save
upper_headers = [header.upper() for header in final_df.columns]
final_df.columns = upper_headers
# Create the output directory; do nothing if it already exists.
os.makedirs(PROCESSED_PATH, exist_ok=True)

In [None]:
# Save to CSV
# index=False: the row index is not written as a column.
csv_path = os.path.join(PROCESSED_PATH, "merged_dataset.csv")
final_df.to_csv(csv_path, index=False)

In [None]:
# Save to Excel
# index=False: the row index is not written as a column.
excel_path = os.path.join(PROCESSED_PATH, "merged_dataset.xlsx")
final_df.to_excel(excel_path, index=False)

In [None]:
# Preview
# Show the first rows of the merged table.
# NOTE(review): `display` is not imported in this file; presumably it is
# supplied by the notebook (IPython) environment. Confirm before running
# this as a plain script.
preview_rows = final_df.head()
print("Final merged dataset preview:")
display(preview_rows)