In [None]:
import pandas as pd
import numpy as np
#Load common functions
import ipynb.fs.full.common as common

In [None]:
# Column of the MML output that this notebook compares against the truth data.
MML_VARIABLE_ANALYZED = "POP"
# Region key used in the data path and in the region lookups done through `common`.
REGION = "LAM"

In [None]:
# Annual total population at mid-year, expressed in thousands.
population_xlsx = f"data/{REGION}/AnnualTotPopMidYear-20221027061003.xlsx"
df = pd.read_excel(population_xlsx, sheet_name='Data')
df.head()

In [None]:
# Drop the identifier/annotation columns that the rest of the notebook never reads.
# errors='ignore' keeps this cell re-runnable: the previous `del df[...]` raised
# KeyError on a second execution because the columns were already gone.
df = df.drop(columns=['ISO 3166-1 numeric code', 'Note'], errors='ignore')
# Normalise every column label to str so the year columns can later be selected
# with string labels.
df.columns = df.columns.astype(str)
# Remove surrounding whitespace from the country names before matching them.
df['Location'] = df['Location'].str.strip()


In [None]:
def fix_countries_for_region(df, region=REGION):
    '''Match the country names in the dataset with the MML dict of country names for the region.

    Unfortunately this is a manual job: we know which names are missing according
    to MML, but we don't know how the truth dataset names that country (if it is
    present at all).

    Only the 'DEVELOPED' and 'LAM' regions have a fix routine; for any other
    value of ``region`` nothing is done.
    '''
    if region == 'DEVELOPED':
        fix_countries_for_developed(df)
    elif region == 'LAM':
        fix_countries_for_lam(df)
def fix_countries_for_developed(df):
    '''Apply the DEVELOPED-region country renames to the 'Location' column of ``df``.

    Each pair in the table is forwarded, in order, to
    ``common.replace_value_in_column`` as its third and fourth arguments.
    '''
    renames = (
        ('Tajikistan', 'Tayikistan'),
        ('Lebanon', 'Libano'),
        ('Bosnia and Herzegovina', 'Bosnia y Herzegovina'),
        ('Japan', 'Japon'),
        ('Kyrgyzstan', 'Kirguistan'),
        ('Ukraine', 'Ucrania'),
        ('Czechia', 'Republica Checa'),
        ('Finland', 'Finlandia'),
        ('Denmark', 'Dinamarca'),
        ('Spain', 'España'),
        ('United States of America', 'EEUU'),
        ('Norway', 'Noruega'),
        ('Belgium', 'Belgica'),
        ('Hungary', 'Hungria'),
        ('Russian Federation', 'Rusia'),
        ('Switzerland', 'Suiza'),
        ('France', 'Francia'),
        ('Poland', 'Polonia'),
        ('Ireland', 'Irlanda'),
        ('Slovenia', 'Eslovenia'),
        ('Greece', 'Grecia'),
        ('Romania', 'Rumania'),
        ('Sweden', 'Suecia'),
        ('Croatia', 'Croacia'),
        ('Netherlands', 'Paises Bajos'),
        ('Latvia', 'Letonia'),
        ('Kazakhstan', 'Kazajistan'),
        ('Lithuania', 'Lituania'),
        ('United Kingdom', 'Reino Unido'),
        ('TFYR Macedonia', 'Macedonia'),
        ('Germany', 'Alemania'),
        ('Italy', 'Italia'),
        ('New Zealand', 'Nueva Zelanda'),
        ('Belarus', 'Bielorrusia'),
        ('Republic of Moldova', 'Moldavia'),
        ('Slovakia', 'Eslovaquia'),
        ('Azerbaijan', 'Azerbaiyan'),
    )
    for source_name, target_name in renames:
        common.replace_value_in_column(df, 'Location', source_name, target_name)
    
def fix_countries_for_lam(df):
    '''Apply the LAM-region country renames to the 'Location' column of ``df``.

    Each pair in the table is forwarded, in order, to
    ``common.replace_value_in_column`` as its third and fourth arguments.
    '''
    renames = (
        ('Venezuela (Bolivarian Republic of)', 'Venezuela'),
        ('Bolivia (Plurinational State of)', 'Bolivia'),
        ('Dominican Republic', 'Republica Dominicana'),
        ('Trinidad and Tobago', 'Trinidad y Tobago'),
    )
    for source_name, target_name in renames:
        common.replace_value_in_column(df, 'Location', source_name, target_name)

In [None]:
# Countries reported as missing are usually present under a different name,
# so show the list before and after normalising the names.
print("Missing countries for region")
missing_before = common.get_missing_countries_for_region(df, 'Location', REGION)
print(missing_before)
fix_countries_for_region(df)
print("Missing countries for region after fix")
missing_after = common.get_missing_countries_for_region(df, 'Location', REGION)
print(missing_after)

In [None]:
df_truth = common.get_countries_for_region(df, 'Location', REGION)
expected_countries = common.regions[REGION]['countries']
# The truth data must have exactly as many rows as the region has countries.
assert len(df_truth) == len(expected_countries), 'MISSING COUNTRIES FOR REGION'

In [None]:
# Even years from 1970 to 2018 (the stop value 2020 is exclusive).
years_filtered = np.arange(1970, 2020, 2)
# 'Location' followed by the selected years, used as the column selection.
columns_filter = np.append(['Location'], years_filtered)

df_truth = df_truth[columns_filter]
missing_cells = df_truth.isna().sum().sum()
assert missing_cells == 0, "MISSING VALUES. CONSIDER FILL STRATEGY"  # 0 means no missing values
has_zero_cells = (df_truth == 0).any().any()
assert not has_zero_cells, "CELLS WITH 0"

We now have our truth dataframe. Next, load the MML data.

In [None]:
df_mml = common.get_mml_data(REGION)
# According to the truth dataset, population values are expressed in thousands
# (1 in truth is actually 1000), so the MML values are divided by 1000.
# The selection, filter and scaling are written as one chain ending in .assign so
# the scaled column lands in a fresh frame; assigning a column on the filtered
# slice, as before, triggers pandas' SettingWithCopyWarning.
df_mml = (
    df_mml[['Year', MML_VARIABLE_ANALYZED]]
    .loc[lambda frame: frame['Year'] <= 2020]
    .assign(**{MML_VARIABLE_ANALYZED: lambda frame: frame[MML_VARIABLE_ANALYZED] / 1000})
)
df_mml.head(3)

Ok, that was easy. Back to the truth dataframe.
We need to transpose it so that each row is a year (only even years were kept above) and then sum the population of all countries in the region for each year.

In [None]:
# Transpose so that each row is a year and each column a country; the former
# column labels come back from reset_index() as the "index" column.
df_T = (
    df_truth
    .set_index('Location')
    .T
    .reset_index()
    .rename(columns={"index": "Year"})
)

df_truth = df_T.astype({'Year': 'int64'})
# Column 0 is 'Year'; the next len(countries) columns are summed row by row.
n_countries = len(common.regions[REGION]['countries'])
country_values = df_truth.iloc[:, 1:n_countries + 1]
df_truth['sum'] = country_values.sum(axis=1)
df_truth_aggs = df_truth[['Year', 'sum']]

In [None]:
# Join MML and truth on 'Year' and give both value columns neutral names.
# The MML column is looked up through MML_VARIABLE_ANALYZED rather than a
# hard-coded "POP": with the literal, changing the analysed variable would
# silently skip the rename and leave no 'mml_value' column for the later cells.
df = (
    pd.merge(df_mml, df_truth_aggs, on='Year')
    .rename(columns={MML_VARIABLE_ANALYZED: "mml_value", "sum": "truth_value"})
)
df.head(3)

In [None]:

# Graph of MML vs. truth per year; "*1000" reflects the scaling applied to the MML values.
common.compare_mml_vs_truth_line_plot(
    df,
    'Year',
    'mml_value',
    'truth_value',
    f"{MML_VARIABLE_ANALYZED}*1000",
    REGION,
)

As the graph shows, from 1988 onwards the MML value stops growing, while the truth continues its original trend.

### 1970 - 1988

So, let's take only from 1970-1988

Mean seems to be quite accurate in that interval

### Error visualization

In [None]:
common.compute_porcentual_diff_between_truth_and_mml(df, 'mml_value', 'truth_value')
diff_steps = np.arange(-20, 21, 5)  # -20, -15, ..., 15, 20
common.porcentual_diff_plot(df, MML_VARIABLE_ANALYZED, REGION, years_filtered, diff_steps)

In [None]:
# Hand the comparison frame to the shared export helper for this variable and region.
common.save_porcentual_diff_to_excel_for_variable(
    df,
    MML_VARIABLE_ANALYZED,
    REGION,
)