In [6]:
import polars as pl

In [7]:
# Timestamp parsed from the first 10 characters of `system:index`, which are
# expected to look like YYYY_MM_DD; stored with millisecond precision.
observation_date = (
    pl.col("system:index")
    .str.slice(0, 10)
    .str.strptime(pl.Datetime("ms"), "%Y_%m_%d")
    .alias("date")
)

# Keep only the columns used downstream; `histogram` is stored as a JSON
# string in the CSV and is decoded here.
igbp_data = pl.read_csv("data/IGBPPerYearAndCountry.csv").select(
    observation_date,
    pl.col("histogram").str.json_decode(),
    pl.col("shapeGroup"),
    pl.col("shapeType"),
)

# Per-shapeGroup table; later cells read its `shapeGroup` and `area` columns.
country_sizes = pl.read_csv("data/Roughness.csv")

In [9]:
# Land-cover groups of interest: group name -> numeric class codes it contains.
class_mapping = {
    "Forests": [1, 2, 3, 4, 5],
    "Shrubs": [6, 7],
    "Savannas": [8, 9],
    "Grasslands": [10],
    "Wetlands": [11],
    "Agricultural Areas": [12],
    "Urban Areas": [13],
    "Water Areas": [17],
}

# Inverse lookup: numeric class code -> group name.
reversed_mapping = dict(
    (code, group)
    for group, codes in class_mapping.items()
    for code in codes
)

# Codes that are not listed in `class_mapping` receive the sentinel "none",
# which is filtered out again below.
class_label = pl.col("class").replace_strict(
    reversed_mapping,
    default="none",
    return_dtype=pl.String,
)

classes = (
    igbp_data
    # One row per histogram entry ...
    .explode("histogram")
    # ... whose two positions are exposed as the columns `class` and `count`.
    .with_columns(
        pl.col("histogram")
        .list.to_struct(fields=["class", "count"])
        .struct.unnest()
    )
    .with_columns(class_label.alias("class_name"))
    # Drop unmapped classes and empty counts.
    .filter((pl.col("class_name") != "none") & (pl.col("count") != 0))
    # Total count per country, land-cover group and date.
    .group_by("shapeGroup", "class_name", "date")
    .agg(pl.col("count").sum())
)

In [10]:
# Preview the aggregated counts per shapeGroup, class_name and date
# (polars renders a truncated table).
classes

shapeGroup,class_name,date,count
str,str,datetime[ms],f64
"""YEM""","""Wetlands""",2019-01-01 00:00:00,12.109804
"""GMB""","""Grasslands""",2016-01-01 00:00:00,21293.466667
"""MYS""","""Forests""",2018-01-01 00:00:00,897912.917647
"""KIR""","""Agricultural Areas""",2018-01-01 00:00:00,1.0
"""MDV""","""Wetlands""",2001-01-01 00:00:00,105.870588
…,…,…,…
"""KAZ""","""Wetlands""",2020-01-01 00:00:00,84194.662745
"""MEX""","""Water Areas""",2008-01-01 00:00:00,65456.639216
"""VEN""","""Savannas""",2004-01-01 00:00:00,1.0586e6
"""124""","""Shrubs""",2003-01-01 00:00:00,1.141176


In [11]:
# Share of each land-cover group within its country for a given date.
# NOTE(review): the denominator only sums rows present in `classes`, so the
# share is relative to the classes retained there, not necessarily to the
# country's full pixel count -- confirm this is intended.
share_in_country = pl.col("count") / pl.col("count").sum().over(["shapeGroup", "date"])

# Country area apportioned to the land-cover group via the share above.
apportioned_area = pl.col("area") * pl.col("country_weight")

# Country's contribution to the total apportioned area of that group and date.
share_in_world = pl.col("country_land_cover_area") / pl.col(
    "country_land_cover_area"
).sum().over(["class_name", "date"])

country_weights = (
    classes
    # Inner join: shapeGroups without a row in `country_sizes` are dropped.
    .join(
        country_sizes.select("shapeGroup", "area"),
        on="shapeGroup",
        how="inner",
    )
    # The three steps stay separate because each one reads the column
    # created by the previous step.
    .with_columns(share_in_country.alias("country_weight"))
    .with_columns(apportioned_area.alias("country_land_cover_area"))
    .with_columns(
        share_in_world.alias("global_weight"),
        pl.col("date").dt.year().alias("year"),
    )
)

In [16]:
# Sanity check: `global_weight` is normalised per class_name and date, so the
# weights of one class in one year are expected to add up to ~1.
(
    country_weights
    .filter((pl.col("class_name") == "Wetlands") & (pl.col("year") == 2001))
    .get_column("global_weight")
    .sum()
)

1.0000000000000009

In [13]:
# Preview the joined table with the derived weight columns
# (polars renders a truncated table).
country_weights

shapeGroup,class_name,date,count,area,country_weight,country_land_cover_area,global_weight,year
str,str,datetime[ms],f64,f64,f64,f64,f64,i32
"""YEM""","""Wetlands""",2019-01-01 00:00:00,12.109804,453322.221762,0.000039,17.707385,0.000011,2019
"""GMB""","""Grasslands""",2016-01-01 00:00:00,21293.466667,10717.453419,0.468979,5026.265637,0.000117,2016
"""MYS""","""Forests""",2018-01-01 00:00:00,897912.917647,329134.877105,0.679208,223550.939547,0.01029,2018
"""KIR""","""Agricultural Areas""",2018-01-01 00:00:00,1.0,924.598674,0.000284,0.262429,1.7999e-8,2018
"""MDV""","""Wetlands""",2001-01-01 00:00:00,105.870588,161.403794,0.168714,27.231141,0.000008,2001
…,…,…,…,…,…,…,…,…
"""KAZ""","""Wetlands""",2020-01-01 00:00:00,84194.662745,2.7253e6,0.003589,9780.571175,0.006296,2020
"""MEX""","""Water Areas""",2008-01-01 00:00:00,65456.639216,1.9564e6,0.007186,14058.292021,0.003808,2008
"""VEN""","""Savannas""",2004-01-01 00:00:00,1.0586e6,911914.529103,0.284003,258986.218188,0.009384,2004
"""124""","""Shrubs""",2003-01-01 00:00:00,1.141176,52.476817,0.003912,0.205282,9.8464e-9,2003


In [14]:
# Persist the weights as parquet; `class_name` is exported as `landCover`.
(
    country_weights
    .rename({"class_name": "landCover"})
    .write_parquet("data/weights.pq")
)