In [None]:
# Start writing code here...
!pip install wbgapi

Collecting wbgapi
  Downloading wbgapi-1.0.5-py3-none-any.whl (35 kB)
Installing collected packages: wbgapi
Successfully installed wbgapi-1.0.5
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
import csv
import numpy as np
import pandas as pd
import re
import wbgapi as wb
import matplotlib.pyplot as plt
import seaborn as sns
import DataCoverage as dc


%matplotlib inline

In [None]:
def indicator_dataframe(country, start_year, end_year, coverage_threshold=0.9):
    '''
    Downloads the World Bank indicators of one country that have enough data coverage over a span of years.

    country (string) -- The country of interest.
    start_year (int) -- The earliest year of interest (inclusive).
    end_year (int) -- The latest year of interest (inclusive).
    coverage_threshold -- The required indicator coverage threshold. For example, if it is 0.9, then there must exist data for 90% of the selected years.

    Returns -- the result of wb.data.DataFrame for the indicators that pass the coverage filter,
    requested with columns='series' (presumably one column per indicator code -- confirm against wbgapi).
    '''
    fetched_ind = dc.get_indicators_for_country(country, start_year, end_year)
    filtered_ind = dc.filter_indicators_by_coverage(fetched_ind, coverage_threshold)
    country_code = dc.get_iso_code(country)

    # range() excludes its stop value, so add 1 to keep end_year in the request.
    years = range(start_year, end_year + 1)

    return wb.data.DataFrame(list(filtered_ind.index), country_code, time=years, skipBlanks=True, columns='series')

In [None]:
def pearson_correlation_feature_selection(country, target_indicator_code, start_year, end_year, coverage_threshold = 0.9, corr_threshold = 0.8):
    '''
    Keeps the indicators of a country whose Pearson correlation with a target indicator is strong enough.

    The correlation of every fetched indicator with the target indicator is computed,
    and only the indicators whose absolute correlation exceeds corr_threshold are kept.

    country (string) -- The country of interest.
    target_indicator_code (string) -- The specific indicator code for the target indicator.
    start_year (int) -- The earliest year of interest.
    end_year (int) -- The latest year of interest.
    coverage_threshold (float) -- The required indicator coverage threshold. For example, if it is 0.9, then there must exist data for 90% of the selected years.
    corr_threshold (float) -- The absolute correlation an indicator must exceed with the target indicator in order to be kept.

    Returns -- a Series indexed by indicator code holding the absolute correlation of each kept indicator
    with the target indicator, or None (after printing an error) when corr_threshold lies outside [-1, 1].
    '''
    threshold_out_of_range = abs(corr_threshold) > 1.0
    if threshold_out_of_range:
        print("ERROR: Correlation threshold must be between -1 and 1")
        return None

    indicators = indicator_dataframe(country, start_year, end_year, coverage_threshold)

    # Column of the full correlation matrix that belongs to the target indicator.
    target_corr = indicators.corr()[target_indicator_code]

    # Only the strength of the relationship matters, not its sign.
    strength = target_corr.abs()
    passes_threshold = strength > corr_threshold
    return strength[passes_threshold]



    


In [None]:
# Demo: indicators most strongly correlated with Colombia's total population.
# NOTE(review): a 2011-2013 window yields at most three yearly observations per
# indicator, so correlations computed from it are unreliable; widen the window
# for a meaningful selection.
pearson_correlation_feature_selection(
    country='Colombia',
    target_indicator_code='SP.POP.TOTL',
    start_year=2011,
    end_year=2013,
    coverage_threshold=1,
    corr_threshold=0.9,
)

AG.CON.FERT.PT.ZS          1.0
AG.CON.FERT.ZS             1.0
AG.LND.AGRI.K2             1.0
AG.LND.AGRI.ZS             1.0
AG.LND.ARBL.HA             1.0
                          ... 
per_si_allsi.cov_q1_tot    1.0
per_si_allsi.cov_q2_tot    1.0
per_si_allsi.cov_q3_tot    1.0
per_si_allsi.cov_q4_tot    1.0
per_si_allsi.cov_q5_tot    1.0
Name: SP.POP.TOTL, Length: 1010, dtype: float64

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=72b1e015-02d7-48b2-beb6-dcae4caaa79e' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>