In [1]:
import os

import numpy as np
import pandas
import matplotlib.pyplot as plt

In [2]:
# Read the raw ranking and keep only the label-sliced columns "Name" through
# "Population"; the trailing bare name displays the resulting frame.
major_cities = (
    pandas.read_csv("./raw/most-populous-cities.csv")
    .loc[:, "Name":"Population"]
)
major_cities

Unnamed: 0,Name,Country,Population
0,Tokyo,Japan,37339804
1,Delhi,India,31181376
2,Shanghai,China,27795702
3,Sao Paulo,Brazil,22237472
4,Mexico City,Mexico,21918936
...,...,...,...
1165,Kolwezi,DR Congo,501375
1166,Kabinda,DR Congo,501054
1167,Jianyang,China,500925
1168,Douai-Lens,France,500921


### Classification
Based on: https://en.wikipedia.org/wiki/Settlement_hierarchy

However, the thresholds are chosen so that the ratio between consecutive categories stays roughly consistent, which respects the power-law distribution of city sizes.

- **Megacities** ("megalopolis") - More than 8m (The convention is 10m, but I wanted slightly more balanced categories)
- **Large city** ("conurbation") - More than 3m 
- **Medium city** ("metropolis") - More than 1m
- **Small city** ("regiopolis") - More than 300k
- **Tiny city** - More than 100k
- **Town** - More than 10k

In this first dataset, we only go down to small cities (at just above the 500k mark).

In [3]:
# Lower population bound of each settlement class, largest class first.
# Values are floats (number of inhabitants).
MEGA_CITY_THRESHOLD = 8_000_000.0  # "megacity" is one word; the extra _ keeps the names consistent
LARGE_CITY_THRESHOLD = 3_000_000.0
MEDIUM_CITY_THRESHOLD = 1_000_000.0
SMALL_CITY_THRESHOLD = 300_000.0
TINY_CITY_THRESHOLD = 100_000.0
TOWN_THRESHOLD = 10_000.0

In [4]:
def get_cities_within_range(cities, min_=0, max_=np.inf):
    """
    Select the cities whose population lies in the closed range [min_, max_].

    Rows are chosen by comparing the "Population" values directly, so the
    result does not depend on the frame's index labels or on the rows being
    sorted by population.

    :param cities: a pandas DataFrame of cities with their populations in a
        column titled "Population"
    :param min_: smallest population to keep (inclusive)
    :param max_: largest population to keep (inclusive)
    :return: the rows of ``cities`` with min_ <= Population <= max_, in their
        original order and with their original index labels. Rows whose
        population is missing (NaN) are not returned.
    :raises KeyError: if ``cities`` has no "Population" column
    """
    # Both bounds are inclusive (the default of Series.between), so a city
    # sitting exactly on a threshold shared by two adjacent ranges is returned
    # for both of them.
    in_range = cities["Population"].between(min_, max_)
    return cities.loc[in_range, :]
            

In [5]:
# Split the ranking into one frame per size class. Each class spans from its
# own threshold up to the threshold of the next-larger class; megacities have
# no upper bound.
mega_cities = get_cities_within_range(major_cities, min_=MEGA_CITY_THRESHOLD)
large_cities = get_cities_within_range(
    major_cities, min_=LARGE_CITY_THRESHOLD, max_=MEGA_CITY_THRESHOLD
)
medium_cities = get_cities_within_range(
    major_cities, min_=MEDIUM_CITY_THRESHOLD, max_=LARGE_CITY_THRESHOLD
)
small_cities = get_cities_within_range(
    major_cities, min_=SMALL_CITY_THRESHOLD, max_=MEDIUM_CITY_THRESHOLD
)

In [6]:
# Print every size class under its heading, largest class first.
for heading, frame in (
    ("MEGACITIES\n", mega_cities),
    ("LARGE_CITIES\n", large_cities),
    ("MEDIUM_CITIES\n", medium_cities),
    ("SMALL_CITIES\n", small_cities),
):
    print(heading, frame, "\n")

MEGACITIES
                 Name         Country  Population
0              Tokyo           Japan    37339804
1              Delhi           India    31181376
2           Shanghai           China    27795702
3          Sao Paulo          Brazil    22237472
4        Mexico City          Mexico    21918936
5              Dhaka      Bangladesh    21741090
6              Cairo           Egypt    21322750
7            Beijing           China    20896820
8             Mumbai           India    20667656
9              Osaka           Japan    19110616
10           Karachi        Pakistan    16459472
11         Chongqing           China    16382376
12          Istanbul          Turkey    15415197
13      Buenos Aires       Argentina    15257673
14           Kolkata           India    14974073
15          Kinshasa        DR Congo    14970460
16             Lagos         Nigeria    14862111
17            Manila     Philippines    14158573
18           Tianjin           China    13794450
19      

In [7]:
# Create the output directory if it is missing so the exports below do not
# fail on a fresh checkout; exist_ok=True keeps re-runs harmless.
os.makedirs("./cleaned", exist_ok=True)

# Write one CSV per size class.
mega_cities.to_csv("./cleaned/mega.csv")
large_cities.to_csv("./cleaned/large.csv")
medium_cities.to_csv("./cleaned/medium.csv")
small_cities.to_csv("./cleaned/small.csv")