# Organize the dtypes of each dataset into a dataframe

In [1]:
from glob import glob
from os import path
import os
import pandas as pd
import numpy as np

# Single source of truth for where the OWID datasets live; the glob pattern
# and the loader both derive from it so they cannot drift apart.
DATASETS_DIR = "../owid-datasets/datasets/"
DATASETS = os.path.join(DATASETS_DIR, "*")

def load_data(ds, datasets_dir=DATASETS_DIR):
    """Read one dataset's CSV into a DataFrame.

    The file is expected at ``<datasets_dir>/<ds>/<ds>.csv``.

    Parameters
    ----------
    ds : str
        Dataset name; used both as the folder name and the CSV base name.
    datasets_dir : str, optional
        Directory that contains one folder per dataset. Defaults to
        ``DATASETS_DIR``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV, with dtypes as inferred by ``pd.read_csv``.
    """
    csv_path = os.path.join(datasets_dir, ds, ds + ".csv")
    return pd.read_csv(csv_path)

In [2]:
# Consolidated helper code for resolving column dtypes via a dtype hierarchy.


# create an empty df to hold the dtypes of each dataset's columns
from itertools import chain 


def get_ds_dict():
    """Collect the inferred column dtypes of every dataset.

    Every path matched by the ``DATASETS`` glob is reduced to its base name,
    loaded with ``load_data`` and summarised by its ``.dtypes`` Series.

    Returns
    -------
    dict
        Maps dataset name -> Series of dtypes indexed by column name.
    """
    dataset_names = (os.path.basename(ds_path) for ds_path in glob(DATASETS))
    return {name: load_data(name).dtypes for name in dataset_names}

        
def get_df_cols(ds_dict):
    """Tabulate which dtype each dataset uses for each column.

    Parameters
    ----------
    ds_dict : dict
        Maps dataset name -> Series of dtypes indexed by column name
        (the structure returned by ``get_ds_dict``).

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame with one row per dataset and one column per field
        seen in any dataset. A cell holds the dtype of that field in that
        dataset, or NaN when the dataset does not have the field. Columns
        appear in first-seen order.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; a plain set
    # would make the column order depend on the string hash seed of the run.
    all_fields = list(
        dict.fromkeys(chain.from_iterable(dtypes.index for dtypes in ds_dict.values()))
    )
    df_cols = pd.DataFrame(index=list(ds_dict), columns=all_fields).astype('object')

    # fill one row per dataset; fields the dataset lacks stay NaN
    for ds, cols in ds_dict.items():
        cols = cols.astype('object')
        df_cols.loc[ds, cols.index] = cols
    return df_cols


def get_col_to_uniq(df_cols):
    """Find the columns whose dtype is not the same in every dataset.

    Parameters
    ----------
    df_cols : pandas.DataFrame
        Dataset-by-column table of dtypes (as built by ``get_df_cols``).

    Returns
    -------
    dict
        Maps column name -> array of the distinct values found in that
        column of ``df_cols``, for every column with at least two distinct
        non-NaN values. The array itself may also contain NaN (datasets that
        lack the column). Keys follow the column order of ``df_cols``.
    """
    col_to_uniq = {}
    for col in df_cols.columns:
        values = df_cols.loc[:, col]
        # nunique() ignores NaN, so a column is a conflict only when two or
        # more real dtypes occur; it also works on a frame with no columns,
        # where describe() would raise.
        if values.nunique() > 1:
            col_to_uniq[col] = values.unique()
    return col_to_uniq
        
def get_col_to_dtype(col_to_uniq, hierarchy=None):
    """Pick one dtype per conflicting column using a dtype hierarchy.

    Parameters
    ----------
    col_to_uniq : dict
        Maps column name -> iterable of the dtypes (and possibly NaN) seen
        for that column; each value is compared by its ``str()`` form.
    hierarchy : list, optional
        Dtype names ordered from most general to most specific; the entry
        with the lowest position among a column's values wins. Defaults to
        ``["object", "float64", "int64", "nan"]``.

    Returns
    -------
    dict
        Maps column name -> the winning entry of ``hierarchy``.
    """
    if hierarchy is None:
        hierarchy = ["object", "float64", "int64", "nan"]

    # position of each dtype name; setdefault keeps the first position when a
    # name is repeated, matching list.index
    rank = {}
    for position, name in enumerate(hierarchy):
        rank.setdefault(str(name), position)

    # apply hierarchy to conflict columns
    col_to_dtype = {}
    for col, uniq in col_to_uniq.items():
        # A dtype missing from the hierarchy (e.g. "bool") is ranked as the
        # most general entry rather than raising ValueError.
        winner = min(rank.get(str(v), 0) for v in uniq)
        col_to_dtype[col] = hierarchy[winner]
    return col_to_dtype

def apply_data_hierarchy(df, col_to_dtype, overwrite=None):
    """Cast the columns of ``df`` to the dtypes chosen by the hierarchy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be cast.
    col_to_dtype : dict
        Maps column name -> target dtype. Not modified.
    overwrite : dict, optional
        Extra column name -> dtype entries that take precedence over
        ``col_to_dtype``. Not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when none of its columns has a target dtype, otherwise
        the result of ``df.astype`` restricted to the columns that do.
    """
    # Merge into a fresh dict so that neither argument is mutated and the
    # overwrites of one call cannot leak into the next one.
    targets = dict(col_to_dtype)
    if overwrite:
        targets.update(overwrite)

    # Only columns present in df may be passed to astype. The "nothing to
    # do" check happens after merging so an overwrite is never skipped.
    casts = {col: targets[col] for col in df.columns if col in targets}
    if not casts:
        return df

    return df.astype(casts)

    

In [3]:
# Collect the dtypes of every dataset, then tabulate them
# (rows: datasets, columns: every field seen in any dataset).
ds_dict = get_ds_dict()
df_cols = get_df_cols(ds_dict)

In [4]:
# Find the columns whose dtype differs between datasets, then choose a single
# dtype for each of them using the default hierarchy.
col_to_uniq = get_col_to_uniq(df_cols)
col_to_dtype = get_col_to_dtype(col_to_uniq)

In [5]:
# Columns whose dtype is forced regardless of what the hierarchy selected.
overwrite = {'Forest Transition Phase': 'object'}

# Left-join every dataset onto the first one returned by glob, keyed on
# (Entity, Year). `df` starts as None so the first dataset goes through the
# same loop body as the others instead of a counter-guarded copy of it.
# NOTE(review): glob order is not sorted, so which dataset comes first (and
# therefore which rows survive the left joins) may differ between machines.
df = None
for ds_path in glob(DATASETS):
    ds = os.path.basename(ds_path)
    ds_df = load_data(ds).set_index(["Entity", "Year"])
    ds_df = apply_data_hierarchy(ds_df, col_to_dtype, overwrite=overwrite)

    if df is None:
        # first dataset: its rows form the left side of every later merge
        df = ds_df
        continue

    try:
        df = pd.merge(df, ds_df, how='left', on=["Entity", "Year"])
    except ValueError:
        # best effort: report the dataset that could not be merged, keep going
        print("MergeError:", ds)

MergeError: Cumulative share of marriages ending in divorce (England and Wales, UK ONS)


In [6]:
# Snapshot of the merged frame (presumably a backup taken before the retry
# cells that follow).
df_back = df.copy()

In [7]:
# Reload the one dataset that the merge loop reported as "MergeError".
df2 = load_data('Cumulative share of marriages ending in divorce (England and Wales, UK ONS)')

In [8]:
# Retry the failed merge with "Year" forced to object. Unlike in the loop,
# df2 still has Entity/Year as ordinary columns here (no set_index), so the
# hierarchy and the overwrite can act on them.
retry_df = apply_data_hierarchy(df2, col_to_dtype, overwrite={"Year": "object"})
# The merged frame is only displayed; it is not assigned back to `df`.
pd.merge(df, retry_df, how='left', on=["Entity", "Year"])

Unnamed: 0,Entity,Year,"Net investment (Blum, Ducoing, McLaughlin (2017))","Green investment (Blum, Ducoing, McLaughlin (2017))","Genuine saving (Blum, Ducoing, McLaughlin (2017))","GSTFP (Blum, Ducoing, McLaughlin (2017))","GDP (Blum, Ducoing, McLaughlin (2017))","Green carbon (Blum, Ducoing, McLaughlin (2017))",Other social policy areas,Nominal rates for domestic letters (United States Postal Service (2018)),...,MDG4.A: child mortality rate,MDG5.A: maternal mortality rate,MDG5.B: share of pregnant women receiving antenatal care,MDG5.B: share of married women using contraceptives,0 to 4 (UN Population Division (2015)),5 to 14 (UN Population Division (2015)),15 to 24 (UN Population Division (2015)),25 to 60 (UN Population Division (2015)),70+ (UN Population Division (2015)),"Cumulative share of marriages ending in divorce (England and Wales, UK ONS)"
0,Argentina,1900,68.78,67.61,69.36,611.86,2915.58,,,,...,,,,,,,,,,
1,Argentina,1901,124.01,121.35,123.28,684.19,3077.47,,,,...,,,,,,,,,,
2,Argentina,1902,84.41,82.08,83.99,556.85,2930.59,,,,...,,,,,,,,,,
3,Argentina,1903,105.50,103.17,105.54,671.40,3259.57,,,,...,,,,,,,,,,
4,Argentina,1904,156.33,153.95,156.84,781.99,3509.50,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1207,World,1996,662.06,373.26,942.80,,16061.67,153.08,,,...,,,,,614734.981,1219785.321,1041917.085,2701248.227,171023.894,
1208,World,1997,820.14,542.90,1110.82,,16527.22,318.31,,,...,,,,,608791.033,1232641.789,1049282.855,2757918.339,177082.696,
1209,World,1998,819.21,595.87,1174.11,,16944.97,366.52,,,...,,,,,606915.726,1240140.794,1057989.717,2813894.264,183377.746,
1210,World,1999,756.54,508.43,1108.51,,17354.72,274.76,,,...,,,,,606605.033,1242631.471,1069561.326,2870423.471,189622.989,
