In [6]:
# cell to add at index 0
import pandas as pd
from typing import Optional

CSV_PATH = "assets/worldcities.csv"


def load_worldcities(path: str = CSV_PATH) -> pd.DataFrame:
    """Load the worldcities CSV and guarantee a numeric ``population`` column.

    The population source column is chosen as follows:

    1. a column named exactly ``population``, if present;
    2. otherwise the first alias, in the order ``population``, ``pop``,
       ``pop_est``, ``pop2019``, that matches a header after stripping
       surrounding whitespace and lower-casing it.

    Args:
        path: Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns:
        The loaded frame with ``population`` holding the chosen column
        converted by ``pandas.to_numeric``; values that cannot be parsed
        become NaN.

    Raises:
        ValueError: If no header matches any of the accepted names.
    """
    aliases = ("population", "pop", "pop_est", "pop2019")
    df = pd.read_csv(path)

    # Normalised header -> original header; the first occurrence wins so the
    # result does not depend on later, similarly named columns.
    by_alias = {}
    for col in df.columns:
        by_alias.setdefault(str(col).strip().lower(), col)

    if "population" in df.columns:
        # An exact match must never be overwritten by a mere alias such as
        # "pop" that happens to appear earlier in the file.
        pop_col = "population"
    else:
        pop_col = next((by_alias[name] for name in aliases if name in by_alias), None)

    if pop_col is None:
        raise ValueError(f"No population-like column found in {path}. Columns: {list(df.columns)}")

    df["population"] = pd.to_numeric(df[pop_col], errors="coerce")
    return df

def filter_by_population(
    df: pd.DataFrame,
    min_pop: Optional[float] = None,
    max_pop: Optional[float] = None,
    top_n: Optional[int] = None,
    dropna: bool = True,
    sort_desc: bool = True
) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``population`` passes the given filters.

    Args:
        df: Frame with a ``population`` column. It is not modified.
        min_pop: Inclusive lower bound, or None for no lower bound.
        max_pop: Inclusive upper bound, or None for no upper bound.
        top_n: Keep only the N most populous rows among those that passed the
            thresholds. None keeps every row.
        dropna: Drop rows whose population is missing before applying the
            thresholds. Rows with a missing population never satisfy a
            threshold, so they are removed anyway when a bound is given.
        sort_desc: Sort the result by population, largest first. When False
            the original row order is kept, also when ``top_n`` is given.

    Returns:
        A new DataFrame with the selected rows.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n is not None and top_n < 0:
        # DataFrame.head(-n) means "all but the last n", which is never what
        # a "top N" request intends.
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    cond = pd.Series(True, index=df.index)
    if dropna:
        cond &= df["population"].notna()
    if min_pop is not None:
        cond &= df["population"] >= min_pop
    if max_pop is not None:
        cond &= df["population"] <= max_pop
    out = df[cond].copy()

    if sort_desc:
        # Stable sort: rows with equal population keep their original order,
        # so a top_n cut through a tie is deterministic.
        out = out.sort_values("population", ascending=False, kind="mergesort")
        if top_n is not None:
            out = out.head(top_n)
    elif top_n is not None:
        # Unsorted output: head() would merely return the first N rows, so
        # pick the N largest by rank and keep the original row order.
        # Missing populations rank last; ties go to the earlier row.
        ranks = out["population"].rank(method="first", ascending=False, na_option="bottom")
        # Positional boolean mask, so a non-unique index cannot confuse it.
        out = out[(ranks <= top_n).to_numpy()]
    return out

# Example usage: load the dataset, filter it, and report the resulting row count.
if __name__ == "__main__":
    df = load_worldcities()
    # Keep cities with at least 200,000 inhabitants.
    # NOTE(review): the name `million_plus` suggests a 1,000,000 threshold, but
    # the value actually passed is 200_000 — confirm which one is intended.
    million_plus = filter_by_population(
        df,
        min_pop=200_000,
    )
    # Number of rows left in the filtered frame.
    print(len(million_plus.index))

3302
