In [4]:
# Start writing code here...
!pip install wbgapi

Collecting wbgapi
  Downloading wbgapi-1.0.5-py3-none-any.whl (35 kB)
Installing collected packages: wbgapi
Successfully installed wbgapi-1.0.5
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m


In [5]:
import csv
import numpy as np
import pandas as pd
import wbgapi as wb
import matplotlib.pyplot as plt

%matplotlib inline

In [7]:
def get_all_indicator_codes():
    '''
    (Helper function) Get list of code/id's for all indicators.
    '''
    return [row['id'] for row in wb.series.list(db=2)] # 2 specifies WDI

def get_iso_code(country):
    '''
    (Helper function) Get iso code for country.

    country (string) -- country to get iso code for
    '''
    iso_code = wb.economy.coder(country)

    if iso_code is None:
        print("ERROR: get_iso_code could not resolve country name.")
        return None
    else:
        return iso_code

def get_specific_indicator_code(keyword):
    '''
    (Helper function) Get specific codes for indicators related to a keyword

    keyword (string) -- keyword to get codes for
    '''

    indicator_codes = wb.series.info(q=keyword)

    if indicator_codes is None:
        print("ERROR: indicator_codes could not resolve keyword argument.")
        return None
    else:
        return indicator_codes
        
def get_indicators_for_country(country, min_year=None, max_year=None):
    '''
    Get set (generator) of indicators for specific country.
    This is a generator function, as it returns a generator object.

    country (string) -- country specified to get indicators for
    min_year (4-digit int) -- start year of data coverage consideration (only
                              used if max_year also available)
    max_year (4-digit int) -- end year of data coverage consideration (only
                              used if max_year also available)   
    '''
    joined_indicator_codes = ';'.join(get_all_indicator_codes()) # joined for refetch fxn
    country_iso_code = get_iso_code(country)
    
    if country_iso_code is None:
        return [] # no error print needed as one will be printed in get_iso_code

    if min_year is not None and max_year is not None:
        ind_generator = wb.refetch('sources/{source}/series/{series}/country/{economy}', \
            ['series', 'economy'], source=2, series=joined_indicator_codes, \
            economy=country_iso_code, time=range(min_year, max_year, 1))
    else:
       ind_generator = wb.refetch('sources/{source}/series/{series}/country/{economy}', \
            ['series', 'economy'], source=2, series=joined_indicator_codes, \
            economy=country_iso_code) 

    return ind_generator

def filter_indicators_by_coverage(ind_generator, threshold=0.0):
    '''
    Get set of indicators filtered by data coverage. Indicators that
    have data coverage equal to or above 'threshold' will be included.

    Deciding against converting the generator into a dataframe and just iterating
    due to immense size of data; it will probably be more efficient iterating
    than manipulating the large dataframe for each unique indicator.

    ind_generator (generator) -- list of indicators and their respective values
    threshold (float) -- minimum data coverage amount                         
    '''
    filtered_ind = set()
    prev_ind_code = None
    num_nan = 0
    num_total = 0

    for row in ind_generator:
        curr_ind_code = row['variable'][0]['id']
        curr_value = row['value']

        # calculate stats for prev indicator
        if curr_ind_code != prev_ind_code and prev_ind_code is not None:

            coverage_percentage = (num_total - num_nan) / num_total
            if coverage_percentage >= threshold:
                filtered_ind.add(prev_ind_code)

            num_nan = 0
            num_total = 0

        if curr_value is None:
            num_nan += 1
        num_total += 1
        prev_ind_code = curr_ind_code

    return list(filtered_ind)

def export_array(arr, filename):
    '''
    Exports array of values into a CSV file.

    arr (array) -- array of values to export
    filename (string) -- name of file to export to, including ".csv"
    '''
    # reshape file for export formatting
    len_arr = len(arr)

    if len_arr == 0:
        print("ERROR: export_array cannot convert empty data.")
        return

    reshaped_arr = np.array(arr).reshape((len_arr, 1))

    file = open(filename, 'w+', newline ='')

    with file:     
        write = csv.writer(file) 
        write.writerows(reshaped_arr)
    file.close()

    
if __name__ == "__main__":
    '''
    The Main Function of this file, where execution starts.
    '''
    fetched_ind = get_indicators_for_country('Colombia', 1980, 2010)
    filtered_ind = filter_indicators_by_coverage(fetched_ind, 0.90)
    print(len(filtered_ind))
    export_array(filtered_ind, 'NEW_col_data_90_threshold.csv')

300


In [8]:
csv_filepath = '/work/DSCI400-Project/NEW_col_data_90_threshold.csv' #set the filepath of the csv you want to read
thresholded_country_data = pd.read_csv(csv_filepath)



In [9]:
print(thresholded_country_data)

        NV.IND.TOTL.KN
0    SP.POP.1014.FE.5Y
1    NV.MNF.MTRN.ZS.UN
2    MS.MIL.XPND.GD.ZS
3       NE.GDI.STKB.CN
4       NY.GDP.PCAP.CN
..                 ...
294  EN.ATM.CO2E.GF.KT
295     NE.GDI.FTOT.ZS
296     NE.CON.PRVT.CN
297        SP.URB.TOTL
298  AG.LND.ARBL.HA.PC

[299 rows x 1 columns]


In [12]:
for row in thresholded_country_data.iterrows():
    print(row[1])


NV.IND.TOTL.KN    SP.POP.1014.FE.5Y
Name: 0, dtype: object
NV.IND.TOTL.KN    NV.MNF.MTRN.ZS.UN
Name: 1, dtype: object
NV.IND.TOTL.KN    MS.MIL.XPND.GD.ZS
Name: 2, dtype: object
NV.IND.TOTL.KN    NE.GDI.STKB.CN
Name: 3, dtype: object
NV.IND.TOTL.KN    NY.GDP.PCAP.CN
Name: 4, dtype: object
NV.IND.TOTL.KN    SP.POP.0014.FE.IN
Name: 5, dtype: object
NV.IND.TOTL.KN    TM.VAL.MRCH.WR.ZS
Name: 6, dtype: object
NV.IND.TOTL.KN    NV.IND.MANF.CD
Name: 7, dtype: object
NV.IND.TOTL.KN    NY.GDP.PCAP.KD.ZG
Name: 8, dtype: object
NV.IND.TOTL.KN    SP.POP.4549.FE.5Y
Name: 9, dtype: object
NV.IND.TOTL.KN    SP.POP.1519.MA.5Y
Name: 10, dtype: object
NV.IND.TOTL.KN    SP.ADO.TFRT
Name: 11, dtype: object
NV.IND.TOTL.KN    FM.AST.NFRG.CN
Name: 12, dtype: object
NV.IND.TOTL.KN    NE.CON.PRVT.PC.KD.ZG
Name: 13, dtype: object
NV.IND.TOTL.KN    FM.AST.CGOV.ZG.M3
Name: 14, dtype: object
NV.IND.TOTL.KN    SP.POP.3034.FE.5Y
Name: 15, dtype: object
NV.IND.TOTL.KN    NY.GNP.MKTP.CD
Name: 16, dtype: object
NV.IND.T

In [None]:
def pearson_correlation_feature_selection(csv_filepath, target_indicator_code, corr_threshold):
    '''
    Generates the pearson correlation matrix between a target indicator and all other indicators for a country.
    Then, remove the indicators that fail to meet a pre-determined correlation threshold.

    csv_filepath (string) -- the filepath of the csv file in which the list of indicators for a country is stored.
    target_indicator_code (string) -- The specific indicator code for the target indicator.
    corr_threshold (float) -- a correlation threshold that an indicator must meet with the target indicator in order to not be removed.
    
    '''


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=72b1e015-02d7-48b2-beb6-dcae4caaa79e' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>