In [14]:
import polars as pl
from datetime import datetime

In [3]:
# Load datasets
# UN country table; try_parse_dates asks Polars to parse date-like columns while reading.
un = pl.read_csv(
    "data/un_basic.csv",
    try_parse_dates=True,
)
# Forest-area table from the Our World in Data folder.
forest_area = pl.read_csv(
    "data/our_world_in_data/forest-area-km.csv",
)
# Weather records (Florence / Meteostat, per the file name).
weather = pl.read_parquet(
    "data/florence-meteostat.parquet",
)

In [4]:
# Solution load-cities
# Read the world-cities table from Parquet; the bare name on the last line
# makes the notebook render a preview of the frame.
cities = pl.read_parquet(
    "data/worldcities.parquet",
)
cities

city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
str,str,f64,f64,str,str,str,str,str,i64,i64
"""Tokyo""","""Tokyo""",35.6897,139.6922,"""Japan""","""JP""","""JPN""","""Tōkyō""","""primary""",37732000,1392685764
"""Jakarta""","""Jakarta""",-6.175,106.8275,"""Indonesia""","""ID""","""IDN""","""Jakarta""","""primary""",33756000,1360771077
"""Delhi""","""Delhi""",28.61,77.23,"""India""","""IN""","""IND""","""Delhi""","""admin""",32226000,1356872604
"""Guangzhou""","""Guangzhou""",23.13,113.26,"""China""","""CN""","""CHN""","""Guangdong""","""admin""",26940000,1156237133
"""Mumbai""","""Mumbai""",19.0761,72.8775,"""India""","""IN""","""IND""","""Mahārāshtra""","""admin""",24973000,1356226629
…,…,…,…,…,…,…,…,…,…,…
"""Munha-dong""","""Munha-dong""",39.3813,127.2517,"""Korea, North""","""KP""","""PRK""","""Kangwŏn""",,,1408979215
"""Sil-li""","""Sil-li""",39.488,125.464,"""Korea, North""","""KP""","""PRK""","""P’yŏngnam""",,,1408767958
"""Muan""","""Muan""",34.9897,126.4714,"""Korea, South""","""KR""","""KOR""","""Jeonnam""","""admin""",,1410001061
"""Hongseong""","""Hongseong""",36.6009,126.665,"""Korea, South""","""KR""","""KOR""","""Chungnam""","""admin""",,1410822139


In [5]:
# Solution world-map
# Scatter every city at its (longitude, latitude) position.
cities.plot.scatter(
    x="lng",
    y="lat",
    # The following arguments are optional
    title="Cities of the World",
    width=1000,
    height=500,
    grid=True,
    legend=False,
    color="country",
    hover_cols=["city"],
)

%opts magic unavailable (pyparsing cannot be imported)
%compositor magic unavailable (pyparsing cannot be imported)


In [6]:
# Solution ten-smallest
# Ascending sort puts the smallest populations first, so the first ten rows
# are the ten least populous entries.
sorted_population = un.sort(by="population")
ten_smallest = sorted_population.head(n=10)
ten_smallest.plot.bar(
    x="iso3",
    y="population",
    color="region",
    hover_cols=["country"],
)

In [7]:
# Solution energy-it
# infer_schema_length=5000 makes Polars look at more rows before fixing column dtypes.
el_source = pl.read_csv(
    "data/our_world_in_data/electricity-source.csv",
    infer_schema_length=5000,
)
# Keep only the rows whose country is Italy.
el_source_italy = el_source.filter(pl.col("country") == "Italy")
el_source_italy

country,iso3,year,renewables,nuclear,hydro,fossil
str,str,i64,f64,f64,f64,f64
"""Italy""","""ITA""",1960,3.772102,0.0,81.90034,14.327559
"""Italy""","""ITA""",1961,3.831551,0.0,69.19009,26.978363
"""Italy""","""ITA""",1962,3.6586688,0.0,60.354095,35.987236
"""Italy""","""ITA""",1963,3.5944583,0.454763,64.46794,31.482838
"""Italy""","""ITA""",1964,3.4573596,3.1467478,51.10418,42.291714
…,…,…,…,…,…,…
"""Italy""","""ITA""",2011,12.353317,0.0,15.241412,71.36984
"""Italy""","""ITA""",2012,16.935196,0.0,14.085194,67.964806
"""Italy""","""ITA""",2013,20.57525,0.0,18.330097,60.03633
"""Italy""","""ITA""",2014,22.341038,0.0,21.05057,55.489075


In [8]:
# Solution energy-it (1)
# Stacked area chart: one band per electricity source, by year.
el_source_italy.plot.area(
    x="year",
    y=["nuclear", "hydro", "fossil", "renewables"],
    stacked=True,
)

In [9]:
# Solution energy-it (2)
# Same four source columns against year, drawn with the default plot kind
# instead of a stacked area.
el_source_italy.plot(
    x="year",
    y=["nuclear", "hydro", "fossil", "renewables"],
)

In [10]:
# Solution founding-members
# The earliest admission date in the table identifies the founding members.
first_date = un["admission_date"].min()
founding_members = un.filter(pl.col("admission_date") == first_date)
founding_members

iso3,country,population,area,admission_date,region,subregion
str,str,i64,f64,date,str,str
"""ARG""","""Argentina""",46234830,2.7804e6,1945-10-24,"""Americas""","""Latin America and the Caribbea…"
"""BLR""","""Belarus""",9228071,207630.0,1945-10-24,"""Europe""","""Eastern Europe"""
"""BRA""","""Brazil""",215313498,8.51577e6,1945-10-24,"""Americas""","""Latin America and the Caribbea…"
"""CHL""","""Chile""",19603733,756700.0,1945-10-24,"""Americas""","""Latin America and the Caribbea…"
"""CHN""","""China""",1412175000,9.59696e6,1945-10-24,"""Asia""","""Eastern Asia"""
…,…,…,…,…,…,…
"""SYR""","""Syria""",22125249,185180.0,1945-10-24,"""Asia""","""Western Asia"""
"""TUR""","""Türkiye""",84979913,785350.0,1945-10-24,"""Asia""","""Western Asia"""
"""UKR""","""Ukraine""",38000000,603550.0,1945-10-24,"""Europe""","""Eastern Europe"""
"""GBR""","""United Kingdom""",66971395,243610.0,1945-10-24,"""Europe""","""Northern Europe"""


In [11]:
# Solution forest-change
# Relative change in forest area per entity between its earliest and latest year.
# The values are ordered by "Year" inside each group so that first()/last()
# do not depend on the row order of the CSV file.
forest_area.group_by("Entity").agg(
    area_first=pl.col("Forest area").sort_by("Year").first(),
    area_last=pl.col("Forest area").sort_by("Year").last(),
).with_columns(
    area_diff=pl.col("area_last") - pl.col("area_first"),
).with_columns(
    rel_diff=pl.col("area_diff") / pl.col("area_first"),
).filter(
    # Drops inf/NaN ratios produced by a zero starting area.
    pl.col("rel_diff").is_finite()
).drop_nulls().sort("rel_diff")

Entity,area_first,area_last,area_diff,rel_diff
str,f64,f64,f64,f64
"""Micronesia""",169920.0,64420.0,-105500.0,-0.62088
"""Nicaragua""",6.39932e6,3.40753e6,-2.99179e6,-0.467517
"""Niger""",1.945e6,1.0797e6,-865300.0,-0.444884
"""Gambia""",414660.0,242670.0,-171990.0,-0.414774
"""Paraguay""",2.554586e7,1.610226e7,-9.4436e6,-0.369672
…,…,…,…,…
"""Kuwait""",3450.0,6250.0,2800.0,0.811594
"""Uruguay""",798000.0,2.031e6,1.233e6,1.545113
"""Cape Verde""",15380.0,45720.0,30340.0,1.972692
"""Iceland""",17070.0,51350.0,34280.0,2.008202


In [12]:
# Solution forest-change
# Step 1: forest area in each entity's earliest and latest year. Values are
# ordered by "Year" inside each group so that first()/last() do not depend
# on the row order of the CSV file.
first_and_last_forest_area = forest_area.group_by("Entity").agg(
    area_first=pl.col("Forest area").sort_by("Year").first(),
    area_last=pl.col("Forest area").sort_by("Year").last(),
)
# Step 2: relative change; non-finite ratios (zero starting area) are removed.
relative_change = first_and_last_forest_area.select(
    "Entity",
    rel_diff=(pl.col("area_last") - pl.col("area_first")) / pl.col("area_first"),
).filter(pl.col("rel_diff").is_finite())
relative_change.sort("rel_diff")

Entity,rel_diff
str,f64
"""Micronesia""",-0.62088
"""Nicaragua""",-0.467517
"""Niger""",-0.444884
"""Gambia""",-0.414774
"""Paraguay""",-0.369672
…,…
"""Kuwait""",0.811594
"""Uruguay""",1.545113
"""Cape Verde""",1.972692
"""Iceland""",2.008202


In [15]:
# Solution hottest-night
# Keep only observations after 2014-01-01 00:00.
recent_weather = weather.filter(pl.col("time") > datetime(2014, 1, 1))
# group_by_dynamic needs the index column in ascending order. Sorting
# establishes that order instead of merely asserting it with set_sorted,
# which would give wrong windows if the file were not time-ordered.
min_daily_temperatures = (
    recent_weather.sort("time")
    .group_by_dynamic("time", every="1d")
    # NaN readings are ignored when taking the daily minimum.
    .agg(min_temp=pl.col("temp").drop_nans().min())
    # Days without any usable reading yield a null minimum and are removed.
    .drop_nulls()
)
# The ten days with the highest daily minimum temperature.
top_nights = min_daily_temperatures.sort("min_temp", descending=True).head(10)
top_nights

time,min_temp
datetime[ns],f64
2023-08-22 00:00:00,23.9
2016-06-24 00:00:00,23.2
2021-07-28 00:00:00,23.1
2017-08-06 00:00:00,22.7
2017-08-08 00:00:00,22.7
2019-08-23 00:00:00,22.6
2017-08-09 00:00:00,22.2
2019-08-11 00:00:00,22.1
2016-06-23 00:00:00,22.0
2015-07-17 00:00:00,21.9


In [16]:
# Solution million-cities
# Cities with more than one million inhabitants.
million_cities = cities.filter(pl.col("population") > 1e6)
# Attach the UN country columns (region, subregion, ...) via the ISO-3 code.
million_cities_with_country = million_cities.join(
    un,
    on="iso3",
    how="inner",
)
# Number of such cities per (region, subregion), largest count first.
million_cities_per_region = (
    million_cities_with_country.group_by("region", "subregion")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)
million_cities_per_region


region,subregion,count
str,str,u32
"""Asia""","""Eastern Asia""",356
"""Asia""","""Southern Asia""",81
"""Africa""","""Sub-Saharan Africa""",68
"""Americas""","""Latin America and the Caribbea…",59
"""Americas""","""Northern America""",54
…,…,…
"""Africa""","""Northern Africa""",10
"""Europe""","""Southern Europe""",9
"""Europe""","""Northern Europe""",7
"""Oceania""","""Australia and New Zealand""",6


In [17]:
# Solution million-cities (bonus)
# Sort the million-plus cities by population (largest first) before joining.
million_cities = (
    cities
    .filter(pl.col("population") > 1e6)
    .sort("population", descending=True)
)
million_cities_with_country = million_cities.join(
    un,
    on="iso3",
    how="inner",
)
# NOTE(review): taking first() as "the largest city" assumes the join keeps
# the rows in the sorted order established above - confirm for the Polars
# version in use.
million_cities_per_region = (
    million_cities_with_country.group_by("region", "subregion")
    .agg(
        count=pl.col("population").count(),
        city=pl.col("city").first(),
        population=pl.col("population").first(),
    )
    .sort("count", descending=True)
)
million_cities_per_region


region,subregion,count,city,population
str,str,u32,str,i64
"""Asia""","""Eastern Asia""",356,"""Tokyo""",37732000
"""Asia""","""Southern Asia""",81,"""Delhi""",32226000
"""Africa""","""Sub-Saharan Africa""",68,"""Lagos""",16637000
"""Americas""","""Latin America and the Caribbea…",59,"""São Paulo""",23086000
"""Americas""","""Northern America""",54,"""New York""",18908608
…,…,…,…,…
"""Europe""","""Western Europe""",10,"""Paris""",11060000
"""Europe""","""Southern Europe""",9,"""Madrid""",6211000
"""Europe""","""Northern Europe""",7,"""London""",11262000
"""Asia""","""Central Asia""",6,"""Tashkent""",2956384


In [26]:
# Solution forest-region
# Attach the UN columns to every forest-area row by matching "Code" to "iso3".
forest_area_with_region = forest_area.join(
    un,
    left_on="Code",
    right_on="iso3",
    how="inner",
)
# One row per year, one column per region, summing the forest area of all
# countries in that region.
forest_area_by_region = forest_area_with_region.pivot(
    values="Forest area",
    index="Year",
    columns="region",
    aggregate_function="sum",
)
forest_area_by_region.plot.area(stacked=True, x="Year")