# Our Country: Colombia

In [131]:
### Uncomment this to download
# !pip install wbdata
import numpy as np
import pandas as pd
import wbdata
import re
import plotly.offline as py
import plotly.graph_objs as go

# Population Statistics

In [2]:
def get_indicators_dict(src):
    indicators = wbdata.get_indicator(source = src)
    
    indicator_dict = {}
    for indicator in indicators:
        ID = indicator['id']
        name = indicator['name']
        indicator_dict[name] = ID
        
    return indicator_dict
    

def population(year, sex, age_range, place, whole_df = False, mute=True):
    src = 40
    
    indicator_dict = get_indicators_dict(src)
    
    try:
        try: # For queries like 'Population ages 10-14, female'
            label = f'Population ages {age_range[0]}-{age_range[1]}, {sex.lower()}'
            variable_labels = {(indicator_dict[label]): label}
            
        except: # For queries like 'Population ages 65 and above, female'
            label = f'Population ages {age_range[0]} and {age_range[1]}, {sex.lower()}'
            variable_labels = {(indicator_dict[label]): label}            

        try:
            df = wbdata.get_dataframe(variable_labels, country=place)
            df.index = df.index.astype(int)
            if not whole_df:
                return df.loc[year, label]
            else: ### Returns the whole df if no year specified
                return df
        except:
            if not mute:
                print('Invalid Country: Use find_country() to find valid countries')
            return 'Invalid Country'
        
    except:
        valid_queries = get_indicators_dict(40).keys()
        valid_queries = [q for q in valid_queries if q.find('Population ages') != -1 & q.find('(% of') == -1]
        error_message = f'''
Invalid Query:
Valid Queries:
{valid_queries}
        '''
        
        if not mute:
            print(error_message)
        return 'Invalid Query'

def find_country(country):
    print(wbdata.search_countries())

In [3]:
eg = population(2020, 'male', ('00','04'), 'COL')
eg

1873332.0

# Unit Tests

In [4]:
def unit_test(case):
    if case:
        print('Test Case Passed')
        return case
    else:
        print('Test Case Failed')
        return case

### Test the Value Returns Correct Answers

In [5]:
n = 1

### Test the Unit Test Works
case = population(2020, 'total', ('15','64'), 'COL') < 0
print(f'Test {n}:')
unit_test(~(case))
n += 1

### Basic Functionality
case = population(2020, 'total', ('15','64'), 'COL') > 1e6
print(f'\nTest {n}:')
unit_test(case)
n += 1

### Test the year hyperparameter works
case = population(2020, 'total', ('15','64'), 'COL') != population(2021, 'total', ('15','64'), 'COL')
print(f'\nTest {n}:')
unit_test(case)
n += 1

### Test the sex hyperparameter works
case = population(2020, 'total', ('15','64'), 'COL') > population(2020, 'male', ('15','64'), 'COL')
print(f'\nTest {n}:')
unit_test(case)
n += 1

### Test the place hyperparameter works
case = population(2020, 'total', ('15','64'), 'COL') != population(2020, 'total', ('15','64'), 'USA')
print(f'\nTest {n}:')
unit_test(case)
n += 1

### Test the age_range hyperparameter works w/ '-'
case = population(2020, 'male', ('15','19'), 'COL') != population(2020, 'male', ('0','14'), 'COL')
print(f'\nTest {n}:')
unit_test(case)
n += 1

### Test the age_range hyperparameter works w/ '-'
case = type(population(2020, 'male', ('80','above'), 'COL')) == int
print(f'\nTest {n}:')
unit_test(case)
n += 1

Test 1:
Test Case Passed

Test 2:
Test Case Passed

Test 3:
Test Case Passed

Test 4:
Test Case Passed

Test 5:
Test Case Passed

Test 6:
Test Case Passed

Test 7:
Test Case Failed


### Test the Error Messages Work

In [6]:
n = 1

### Test the Invalid Country Message Works
case = population(2020, 'total', ('15','64'), 'SPAM', mute=True) == 'Invalid Country'
print(f'Test {n}:')
unit_test(~(case))
n += 1

### Test the Invalid Query Message Works
case = population(2020, 'total', ('15','SPAM'), 'COL', mute=True) == 'Invalid Query'
print(f'\nTest {n}:')
unit_test(case)
n += 1

Test 1:
Test Case Passed

Test 2:
Test Case Passed


# Population Dataframes

In [9]:
all_countries = wbdata.get_country()

country_list = []
for country in all_countries:
    c = country['id']
    country_list += [c]

In [29]:
def get_valid_ranges(total = False):
    if total:
        valid_queries = get_indicators_dict(40).keys()
        valid_queries = [q for q in valid_queries if (q.find('Population ages') != -1) 
                         & (q.find(', total') != -1) & (q.find('(% of') == -1)]
    else:
        valid_queries = get_indicators_dict(40).keys()
        valid_queries = [q for q in valid_queries if (q.find('Population ages') != -1) 
                         & (q.find(', female') != -1) & (q.find('(% of') == -1)]
    
    valid_ranges = []
    for q in valid_queries:
        m = re.search(r'(\d+-\d+|\d+ and above)', q)
        age_range = m.group(1)
        valid_ranges += [age_range]

    valid_ranges = [(re.split(r"[- | '']", r)[0], 
                     re.split(r"[- | '']", r)[-1]) for r in valid_ranges]

    return valid_ranges
        

def population_dataframe(place):   
    data_dict = {}
    sexes = ['male', 'female']

    valid_ranges = get_valid_ranges()

    ## Need to iterate for male & female
    for s in sexes:
    ### Need to iterate for each age_range
        for r in valid_ranges:
            if r[1] == 'above':
                col_name = f'{s.capitalize()}s, Ages {r[0]} and {r[1]}'
            else:
                col_name = f'{s.capitalize()}s, Ages {r[0]}-{r[1]}'

            pop_df = population(2020, s, r, place, whole_df = True)

            try:
                data_dict['Year'] = pop_df.index
                data_dict[col_name] = pop_df.iloc[:, 0]
                data_dict['Country'] = [place] * len(data_dict['Year'])
            except: # Skips if query fails
                pass

    ### For the Totals Columns
    valid_ranges = get_valid_ranges(total = True)
    
    for r in valid_ranges:
        if r[1] == 'above':
            col_name = f'Totals, Ages {r[0]} and {r[1]}'
        else:
            col_name = f'Totals, Ages {r[0]}-{r[1]}'

        pop_df = population(2020, s, r, place, whole_df = True)

        try:
            data_dict[col_name] = pop_df.iloc[:, 0]
        except: # Skips if query fails
            pass   
    
    try:
        cdf = pd.DataFrame(data_dict)
        cdf.set_index(['Year', 'Country'], inplace = True)
        return cdf
    except: # Skips if df does not exist (i.e. Only Invalid Queries/Countries)
        return 'Could Not Return Dataframe'

In [42]:
df = wbdata.get_dataframe(variables,country="WLD")
df

NameError: name 'variables' is not defined

In [30]:
eg = population_dataframe('COL')
eg

Unnamed: 0_level_0,Unnamed: 1_level_0,"Males, Ages 00-04","Males, Ages 0-14","Males, Ages 05-09","Males, Ages 10-14","Males, Ages 15-19","Males, Ages 15-64","Males, Ages 20-24","Males, Ages 25-29","Males, Ages 30-34","Males, Ages 35-39",...,"Females, Ages 55-59","Females, Ages 60-64","Females, Ages 65-69","Females, Ages 65 and above","Females, Ages 70-74","Females, Ages 75-79","Females, Ages 80 and above","Totals, Ages 0-14","Totals, Ages 15-64","Totals, Ages 65 and above"
Year,Country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2022,COL,1863450.0,5647955.0,1879870.0,1904636.0,2044762.0,17865331.0,2211820.0,2271699.0,2176677.0,1980767.0,...,1464314.0,1243123.0,960937.0,2605831.0,696194.0,472134.0,476565.0,5423484.0,18269102.0,2605831.0
2021,COL,1872904.0,5675317.0,1880579.0,1921834.0,2085844.0,17749414.0,2237669.0,2272080.0,2145863.0,1947363.0,...,1443643.0,1201652.0,922318.0,2501582.0,665106.0,452538.0,461621.0,5450187.0,18149552.0,2501582.0
2020,COL,1873332.0,5695619.0,1880087.0,1942201.0,2113654.0,17526007.0,2239826.0,2247557.0,2098935.0,1903638.0,...,1414124.0,1157392.0,883473.0,2397042.0,634029.0,432973.0,446568.0,5469585.0,17924448.0,2397042.0
2019,COL,1868705.0,5716525.0,1881583.0,1966236.0,2131193.0,17230258.0,2228019.0,2207686.0,2045197.0,1854395.0,...,1375497.0,1109581.0,842826.0,2284866.0,600957.0,411437.0,429646.0,5489346.0,17633894.0,2284866.0
2018,COL,1860651.0,5736828.0,1884949.0,1991227.0,2134154.0,16856472.0,2198986.0,2150158.0,1984049.0,1798082.0,...,1330663.0,1060247.0,799790.0,2170200.0,569642.0,389033.0,411735.0,5508007.0,17267700.0,2170200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,COL,1688662.0,4255437.0,1406775.0,1159999.0,919903.0,4363247.0,732719.0,587877.0,498942.0,419034.0,...,200371.0,156923.0,123563.0,310012.0,88759.0,57667.0,40023.0,4137052.0,4444896.0,310012.0
1963,COL,1649980.0,4122271.0,1358786.0,1113505.0,884553.0,4233603.0,706668.0,574609.0,488153.0,408355.0,...,193886.0,153687.0,120213.0,300078.0,86701.0,55023.0,38140.0,4011118.0,4319248.0,300078.0
1962,COL,1607957.0,3984776.0,1311023.0,1065797.0,853571.0,4112817.0,682476.0,563376.0,477068.0,397490.0,...,188221.0,150961.0,116467.0,290189.0,84973.0,52200.0,36549.0,3881925.0,4201114.0,290189.0
1961,COL,1563136.0,3848515.0,1263980.0,1021399.0,822609.0,3996507.0,660764.0,552624.0,466079.0,386731.0,...,183074.0,148274.0,112844.0,280704.0,83135.0,49499.0,35224.0,3753417.0,4086991.0,280704.0


# Population Pyramids

In [136]:
year = 2020
place = 'COL'

country_df = population_dataframe(place)

plot_df = eg[eg.index == (year, place)]

plot_df = plot_df.drop(columns=plot_df.filter(regex="Totals").columns) # Removes the Total Columns

valid_ranges = get_valid_ranges()

valid_ranges_int = [(int(r[0]), r[1]) for r in valid_ranges]
valid_ranges_int

desired_ranges = []

prior = -1

stop = False

for r in valid_ranges_int:
    if not stop:
        if r[0] != prior and r[0] != 65: # We don't want duplicates
            desired_ranges += [r]
            prior = r[0]
        if r[1] == 'above': # We don't want to continue adding after we get an above
            stop = True
desired_ranges += [(65, 'above')]
        
vals = plot_df.filter(regex="Male").values[0]

range_dict = dict(zip(valid_ranges_int, vals))

x_m = [range_dict[r] for r in desired_ranges] 
y_m = [int(r[0]) for r in desired_ranges]

vals = plot_df.filter(regex="Male").values[0]

range_dict = dict(zip(valid_ranges_int, vals))

x_f = [range_dict[r] for r in desired_ranges] 
y_f = [int(r[0]) for r in desired_ranges]

x_f = [-x for x in x_f]

py.init_notebook_mode(connected=True)

layout = go.Layout(barmode='overlay',
                   yaxis=go.layout.YAxis(range=[0, 90], title='Age'),
                   xaxis=go.layout.XAxis(title='Number'))

bins = [go.Bar(x = x_m,
               y = y_m,
               orientation='h',
               name='Men',
               marker=dict(color='purple'),
               hoverinfo='skip'
               ),

        go.Bar(x = x_f,
               y= y_f,
               orientation='h',
               name='Women',
               marker=dict(color='pink'),
               hoverinfo='skip',
               )
        ]



py.iplot(dict(data=bins, layout=layout))

In [44]:
valid_ranges = get_valid_ranges()

[]

In [81]:
plot_df.columns
[r[0] for r in valid_ranges]

['00',
 '0',
 '05',
 '10',
 '15',
 '15',
 '20',
 '25',
 '30',
 '35',
 '40',
 '45',
 '50',
 '55',
 '60',
 '65',
 '65',
 '70',
 '75',
 '80']

In [118]:
valid_ranges_int = [(int(r[0]), r[1]) for r in valid_ranges]
valid_ranges_int

desired_ranges = []

prior = -1

stop = False

for r in valid_ranges_int:
    if not stop:
        if r[0] != prior and r[0] != 65: # We don't want duplicates
            desired_ranges += [r]
            prior = r[0]
        if r[1] == 'above': # We don't want to continue adding after we get an above
            stop = True
desired_ranges += [(65, 'above')]
        
vals = plot_df.filter(regex="Male").values[0]

range_dict = dict(zip(valid_ranges_int, vals))

x = [range_dict[r] for r in desired_ranges] 
y = [int(r[0]) for r in desired_ranges]

In [119]:
y

[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65]