<a href="https://colab.research.google.com/github/hikikae/worldBank/blob/main/PipeLineWB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import requests
import plotly.express as px

# ------------------------        Extraction          --------------------------
def extract_api_data(indicator_code):
    try:
        url = f"http://api.worldbank.org/v2/country/all/indicator/{indicator_code}?format=json" 
        data = [] 
        response = requests.get(url)
        n = response.json()[0]['pages']#-299
        for x in range(1, n+1):
            response = requests.get(f"{url}&page={x}")   
            for item in response.json()[1]:
                data.append(item)
        return data
    except Exception as e:
        print(e)

# ---------------------------        Data Lake        --------------------------
def convert_to_dataframe(data):
    df = pd.DataFrame(data)
    #this could be the datalake
    df = df.to_json('./raw_data.json')


# ---------------------------      Data Warehouse     --------------------------
def transform_data():
    df = pd.read_json('./raw_data.json')    
    #flat json
    df= pd.concat([pd.json_normalize(df['indicator']).add_prefix('indicator.'),
                     pd.json_normalize(df['country']).add_prefix('country.'),
                     df.drop(['indicator', 'country'], axis=1)], axis=1)     
    #filling the nulls
    df['country.id'].fillna(df['countryiso3code'], inplace=True)
    df['countryiso3code'].fillna(df['country.id'], inplace=True)
    df=df[['country.value','countryiso3code','date','value']]
    df = df.rename(columns={'country.value': 'country',
                            'countryiso3code': 'country_code', 'date': 'year',
                            'value': 'population'})
    #this could be the datawarehouse
    df.to_csv('./clean_data.csv', index=False)


# ----------------------      Country Analysis  (Text)     ---------------------

def country_dataframe():
    # select only those that are countries
    df = pd.read_csv('./clean_data.csv')
    df_countries = df.iloc[3038:]
    df_countries.to_csv('./country_data.csv', index=False)

def average_population_by_years():
    df_countries=pd.read_csv('./country_data.csv')
    #average population by country over the years
    average_population = df_countries.groupby("country")["population"].mean().reset_index().sort_values(by='population', ascending=False).head(10)
    average_population['population'] = average_population[['population']].applymap('{:,.0f}'.format)
    print('average population by country over the years')
    print(average_population)
    return average_population

def population_country_2021():
    #population of each country in 2021
    df_countries=pd.read_csv('./country_data.csv')
    population_2021 = df_countries.loc[df_countries['year'] == 2021, ['year', 'country', 'population']].reset_index().sort_values(by='population', ascending=False).head(10)
    population_2021['population'] = population_2021[['population']].applymap('{:,.0f}'.format)
    print('population of each country in 2021')
    print(population_2021)
    return population_2021

def growth_percentage_country():
    #growth percentage by country over the years
    df_countries=pd.read_csv('./country_data.csv')
    df_countries_percentage = df_countries.copy()
    df_countries_percentage.loc[:,'last'] = df_countries_percentage.groupby('country')['population'].shift(1)
    df_countries_percentage['growth'] = (df_countries_percentage['population'] / df_countries_percentage['last']) - 1
    growth_percentage = (df_countries_percentage.groupby('country')['growth'].mean()*100).reset_index().rename(columns={'growth': 'percentage'}).sort_values(by='percentage', ascending=False).head(10)
    print('growth percentage by country over the years')
    print(growth_percentage)
    return(growth_percentage)

# ---------------------      Country Analysis (Plot)          ------------------
def plot_average_population_by_years(average_population):
    #average_population=average_population_by_years()
    fig = px.bar(average_population, x='country', y='population', 
             color='population', color_continuous_scale='PuBuGn',
             title='average population by country over the years')
    fig.update_layout(xaxis_title='country', yaxis_title='population',
                  font=dict(size=15, family='Franklin Gothic'),template='plotly_dark')
    fig.show()

def plot_population_2021(population_2021):
    fig = px.scatter(population_2021, x='country', y='population', 
             color='population', color_continuous_scale='PuBuGn',
             title='population of each country in 2021')
    fig.update_layout(xaxis_title='country', yaxis_title='population',
                 font=dict(size=15, family='Franklin Gothic'),template='plotly_dark')
    fig.show()

def plot_growth_percentage(growth_percentage):
    fig = px.bar(growth_percentage, x='country', y='percentage', 
             color='percentage', color_continuous_scale='PuBuGn',
             title='growth percentage by country over the years')
    fig.update_layout(xaxis_title='country', yaxis_title='percentage_growth',
                  font=dict(size=15, family='Franklin Gothic'),template='plotly_dark')
    fig.show()

# ---------------------      Country Analysis (ML)          ------------------
from sklearn.linear_model import LinearRegression

def population_estimates(country, future_year):
    df_countries=pd.read_csv('./country_data.csv')
    df_country_estimate = df_countries[df_countries['country'] == country]
    X = df_country_estimate['year'].values.reshape(-1, 1)
    y = df_country_estimate['population'].values.reshape(-1, 1)
    reg = LinearRegression().fit(X, y)
    future_population = int(reg.predict([[future_year]])[0][0])
    future_population_str = '{:,.0f}'.format(future_population)
    print(f'Projected population for {country} in {future_year}: {future_population_str}')
    return future_population
    
def main():
    indicator= 'SP.POP.TOTL'
    data= extract_api_data(indicator)
    convert_to_dataframe(data)
    transform_data()
    country_dataframe()
    average_population=average_population_by_years()
    plot_average_population_by_years(average_population)
    population_2021= population_country_2021()
    plot_population_2021(population_2021)
    growth_percentage=growth_percentage_country()
    plot_growth_percentage(growth_percentage)
    future_population = population_estimates(country='China', future_year=2023)
   

if __name__ == '__main__':
    main()

average population by country over the years
                country     population
41                China  1,099,373,145
89                India    899,719,861
206       United States    256,301,023
90            Indonesia    182,159,909
26               Brazil    148,560,342
161  Russian Federation    139,932,931
149            Pakistan    124,917,221
98                Japan    118,308,800
15           Bangladesh    108,516,768
144             Nigeria    107,223,178


population of each country in 2021
     index  year             country     population
41    2542  2021               China  1,412,360,000
89    5518  2021               India  1,407,563,842
206  12772  2021       United States    331,893,745
90    5580  2021           Indonesia    273,753,191
149   9238  2021            Pakistan    231,402,117
26    1612  2021              Brazil    214,326,223
144   8928  2021             Nigeria    213,401,323
15     930  2021          Bangladesh    169,356,251
161   9982  2021  Russian Federation    143,449,286
127   7874  2021              Mexico    126,705,138


growth percentage by country over the years
                    country  percentage
181     St. Kitts and Nevis    0.292228
29                 Bulgaria    0.223298
109                  Latvia    0.199061
48                  Croatia    0.102979
87                  Hungary    0.046198
24   Bosnia and Herzegovina    0.014191
115               Lithuania   -0.007830
72                  Georgia   -0.020491
203                 Ukraine   -0.036549
168                  Serbia   -0.053907


Projected population for China in 2023: 1,521,024,389
