In [1]:
import os
import timeit
from datetime import datetime, timezone, timedelta
from functools import wraps
from time import time

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [2]:
def timenow(utc_offset_hours=2):
    """Return the current time of day as an ``HH:MM:SS`` string.

    The time is read in UTC and shifted by a fixed number of hours.

    Parameters
    ----------
    utc_offset_hours : int or float, default 2
        Hours added to UTC before formatting. The default reproduces the
        original hard-coded +2h shift.

    Returns
    -------
    str
        The shifted time formatted as ``'%H:%M:%S'``.
    """
    # datetime.utcnow() is deprecated; an aware UTC timestamp formats identically.
    # The local is not called `time` so it cannot shadow the imported time().
    now = datetime.now(timezone.utc) + timedelta(hours=utc_offset_hours)
    return now.strftime('%H:%M:%S')

In [3]:
def timer_func(func):
    """Decorator that prints the wall-clock duration of every call to ``func``.

    Parameters
    ----------
    func : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper that forwards all arguments to ``func``, prints
        ``Function '<name>' executed in <seconds>s`` and returns ``func``'s
        result. Nothing is printed if ``func`` raises.
    """
    @wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def wrap_func(*args, **kwargs):
        t1 = time()
        result = func(*args, **kwargs)
        t2 = time()
        print(f'Function {func.__name__!r} executed in {(t2-t1):.4f}s')
        return result
    return wrap_func

In [4]:
# Folder holding one joined file per department.
# The BDNB_JOINED_DIR environment variable, when set, overrides the default
# location so the notebook can run on another machine without editing this cell.
folder_loc = os.environ.get(
    "BDNB_JOINED_DIR",
    r"C:\Users\laraujo\Music\Work_august\dados\_Joined_per_dept",
)

In [5]:
def dept_nb_from_path(path):
    """Extract the department code from a department file path.

    The code is the part of the file name after its last underscore and
    before its first following dot, e.g. ``".../joined_01.gpkg" -> "01"``.

    Parameters
    ----------
    path : str
        Path (or bare file name) of a per-department file.

    Returns
    -------
    str
        The department code.
    """
    # Only look at the file name: directory names may contain underscores
    # (e.g. "_Joined_per_dept") and must not leak into the result.
    file_name = os.path.basename(path)
    return file_name.split("_")[-1].split('.')[0]

@timer_func
def read_1dept(file_path):
    """Read one department file and return it as a GeoDataFrame.

    The department code derived from ``file_path`` is printed first, so the
    timing line emitted by ``timer_func`` can be matched to a department.
    """
    print(dept_nb_from_path(file_path))
    return gpd.read_file(file_path)
    
# os.listdir returns entries in arbitrary order; sort them so that the slice
# below always selects the same departments.
dept_file_names = sorted(os.listdir(folder_loc))
dept_file_paths = [os.path.join(folder_loc, file) for file in dept_file_names]

# take out the :3 to read all
charged_gdfs = {dept_nb_from_path(dept_file_path): read_1dept(dept_file_path) for dept_file_path in dept_file_paths[:3]}


01
Function 'read_1dept' executed in 91.2248s
02
Function 'read_1dept' executed in 95.4405s
03
Function 'read_1dept' executed in 61.8393s


In [6]:
# Working dict for the rest of the notebook. dict.copy() is shallow: rebinding
# keys in `gdfs` leaves `charged_gdfs` untouched, but both dicts initially
# reference the same frame objects.
gdfs = charged_gdfs.copy()

In [7]:
# Preview the first two rows of department 01.
gdfs["01"].head(2)

Unnamed: 0,bnb_id,altitude_sol,adr_fiabilite_niv_1,adr_fiabilite_niv_2,config_adr,geombui_area,cerffo2020_l_adresse,adedpe202006_logtype_min_classe_ener_ges,adedpe202006_logtype_coherence_data_methode_dpe,adedpe202006_logtype_is_3cl,...,TYPO_count,TYPO_unique,TYPO_min,Part-prop_mean,Part-Pauvr_mean,INSEE_DEP_min,Ter_P1_min,Ter_P2-7_min,TYPO_M,geometry
0,01005000AB0110_bc15409b9aa9d1c,,problème de géocodage,batiment non géocodée au numéro,batiment sans adr correctement géocodée,,"{""5084 LE BOURG""}",,,,...,,,,0.517241,0.103448,1,FRANCE_TERRE_TUILE,FRANCE_TUILE,,"MULTIPOLYGON (((847384.100 6546081.977, 847367..."
1,01007000ZY0289_0d009a5ddaa18e9,,problème de géocodage,batiment non géocodée au numéro,batiment sans adr correctement géocodée,,"{""5383 TERRE LABBE""}",,,,...,,,,,,1,FRANCE_TERRE_TUILE,FRANCE_TUILE,,"MULTIPOLYGON (((879809.261 6548057.337, 879807..."


In [8]:
# List every column name available for department 01.
list(gdfs["01"].columns)

['bnb_id',
 'altitude_sol',
 'adr_fiabilite_niv_1',
 'adr_fiabilite_niv_2',
 'config_adr',
 'geombui_area',
 'cerffo2020_l_adresse',
 'adedpe202006_logtype_min_classe_ener_ges',
 'adedpe202006_logtype_coherence_data_methode_dpe',
 'adedpe202006_logtype_is_3cl',
 'adedpe202006_logtype_ratio_ges_conso',
 'adedpe202006_logtype_ch_gen_lib',
 'adedpe202006_logtype_ecs_gen_lib',
 'adedpe202006_min_conso_ener',
 'adedpe202006_min_estim_ges',
 'adedpe202006_max_conso_ener',
 'adedpe202006_max_estim_ges',
 'adedpe202006_l_ch_gen_princ',
 'adedpe202006_l_ecs_gen_princ',
 'anarnc202012_nb_log',
 'anarnc202012_nb_lot_garpark',
 'anarnc202012_nb_lot_tertiaire',
 'anarnc202012_nb_lot_tot',
 'anarnc202012_l_nom_copro',
 'cerdvf2021v5_min_valeur_fonc_par_m2_hab',
 'cerdvf2021v5_max_valeur_fonc_par_m2_hab',
 'cerdvf2021v5_med_valeur_fonc_par_m2_hab',
 'cerdvf2021v5_mean_valeur_fonc_par_m2_hab_t1',
 'cerdvf2021v5_mean_valeur_fonc_par_m2_hab_t2',
 'cerdvf2021v5_mean_valeur_fonc_par_m2_hab_t3',
 'cerdvf20

### Reliable address

In [9]:
# Percentage of reliable addresses per department

def dept_adr_reliable(df):
    """Return, per address-reliability class, its share of rows in percent.

    The result is a one-column frame (column ``bnb_id``) indexed by the values
    of ``adr_fiabilite_niv_1``; each cell is the number of non-null ``bnb_id``
    in that class divided by the total number of rows of ``df``, times 100.
    """
    per_class = df.groupby("adr_fiabilite_niv_1")[["bnb_id"]].count()
    total_rows = len(df)
    return per_class / total_rows * 100

def all_adr_reliable(gdfs):
    """Build a table of address-reliability shares, one column per department.

    ``gdfs`` maps a department code to its frame. Each column is named
    ``"<code> (%)"`` and values are rounded to one decimal.
    """
    per_dept = []
    for dept_code, dept_df in gdfs.items():
        shares = dept_adr_reliable(dept_df)
        per_dept.append(shares.rename(columns={'bnb_id': f"{dept_code} (%)"}))
    return pd.concat(per_dept, axis=1).round(decimals=1)

# Display the reliability table for all loaded departments.
all_adr_reliable(gdfs)

Unnamed: 0_level_0,01 (%),02 (%),03 (%)
adr_fiabilite_niv_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bâtiment fiable,69.0,86.3,70.7
bâtiment fiable à la tup,1.6,1.3,2.0
bâtiment moyennement fiable,5.0,2.7,2.0
problème de géocodage,24.4,9.7,25.2


# Formatting

## Important columns first

In [10]:
def change_cols_position(dep):
    """Return a copy of ``dep`` with the two "Part-*" columns moved to the front.

    ``Part-prop_mean`` and ``Part-Pauvr_mean`` become the first two columns, in
    that order; every other column keeps its relative position. ``dep`` itself
    is not modified.
    """
    leading = ["Part-prop_mean", "Part-Pauvr_mean"]
    front = dep[leading]
    remainder = dep.drop(columns=leading)
    return pd.concat([front, remainder], axis=1)

# Apply the column reordering to every department.
gdfs = {dept_code: change_cols_position(frame) for dept_code, frame in gdfs.items()}

## Functions to generate Danube entries

### Period

In [11]:
# Inspect the construction-year column that the period is derived from.
gdfs["01"]['cerffo2020_annee_construction'] 

0            NaN
1            NaN
2         2019.0
3            NaN
4         2008.0
           ...  
225149    1963.0
225150    1964.0
225151    1958.0
225152    1960.0
225153    1959.0
Name: cerffo2020_annee_construction, Length: 225154, dtype: float64

In [12]:
def create_period(dep):
    """Transform BDNB 'cerffo2020_annee_construction' into Danube periods.

    Adds a ``period`` column as the first column of ``dep`` (in place):

    ====== =====================
    period construction year
    ====== =====================
    P1     <= 1948
    P2     1949 - 1973
    P3     1974 - 1981
    P4     1982 - 1989
    P5     1990 - 2000
    P6     2001 - 2012
    P7     > 2012
    ====== =====================

    Rows whose year is missing match no range and get ``None``.

    Parameters
    ----------
    dep : pandas.DataFrame
        Must contain ``cerffo2020_annee_construction``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dep`` object, so the function works both as an in-place
        step and inside ``{k: create_period(v) ...}`` comprehensions.
    """
    year = dep['cerffo2020_annee_construction']

    # Label -> boolean mask. The ranges do not overlap, so order is irrelevant.
    period_relations = {
        'P1': year <= 1948,
        'P2': (year > 1948) & (year <= 1973),
        'P3': (year > 1973) & (year <= 1981),
        'P4': (year > 1981) & (year <= 1989),
        'P5': (year > 1989) & (year <= 2000),
        'P6': (year > 2000) & (year <= 2012),
        'P7': year > 2012,
    }

    dep['period'] = np.select(
        list(period_relations.values()),
        list(period_relations.keys()),
        default=None,
    )

    # put into first column
    dep.insert(0, 'period', dep.pop('period'))
    return dep

### Territory

In [13]:
def create_territory_1dep(dep):
    """Add the Danube ``territory`` column as the first column of ``dep``.

    For each row:

    * missing ``period``  -> ``None``
    * ``period == "P1"``  -> value of ``Ter_P1_min``
    * any other period    -> value of ``Ter_P2-7_min``

    Parameters
    ----------
    dep : pandas.DataFrame
        Must contain ``period``, ``Ter_P1_min`` and ``Ter_P2-7_min``.
        Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dep`` object.
    """
    period = dep["period"]

    # Vectorised selection: a row-wise apply(axis=1) builds a Series per row
    # and is far slower on frames with many rows and columns.
    territory = np.where(period == "P1", dep["Ter_P1_min"], dep["Ter_P2-7_min"])
    dep['territory'] = np.where(period.isnull(), None, territory)

    # put into first column
    dep.insert(0, 'territory', dep.pop('territory'))
    return dep

### Usage

In [14]:
def create_usage_1dep(dep):
    """Transform BDNB usages into Danube usages.

    Adds a ``usage`` column as the first column of ``dep`` (in place). Each row
    receives the first Danube usage whose condition matches, tested in the
    order of ``relations`` below; rows matching none get ``None``.

    The following Danube usages are never produced by this mapping:
    'BATIMENT AGRICOLE', 'BATIMENT RELIGIEUX', 'CHATEAU', 'LOCAL NON CHAUFFE'.

    Parameters
    ----------
    dep : pandas.DataFrame
        Must contain ``cerffo2020_usage_niveau_1_txt``,
        ``cerffo2020_usage_niveau_2_txt`` and ``cerffo2020_usage_niveau_3_txt``.
        Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dep`` object. Returning it (instead of ``None``) keeps
        ``{k: create_usage_1dep(v) for ...}`` from replacing every frame
        with ``None``.
    """
    level1 = dep['cerffo2020_usage_niveau_1_txt']
    level2 = dep['cerffo2020_usage_niveau_2_txt']
    level3 = dep['cerffo2020_usage_niveau_3_txt']

    # Danube usage -> boolean mask. Order matters: np.select keeps the first match.
    relations = {
        'BATIMENT INDUSTRIEL':
            (level2 == 'Industrie')
            | (level3 == 'Etablissement industriel autre que carrière'),

        'SERRE AGRICOLE':
            (level3 == 'Serre'),

        'BATIMENT D ENSEIGNEMENT':
            (level2 == 'Enseignement'),

        'BATIMENT DE SANTE':
            (level2 == 'Centre de santé'),

        'COMMERCE':
            (level2 == 'Commerce'),

        'HABITAT':
            # 'Résidentiel collectif' is not described in BDNB methodology
            level1.isin(['Résidentiel individuel', 'Résidentiel collectif'])
            | (level3 == 'Maison exceptionnelle'),

        'TERTIAIRE':
            level2.isin(['Bureau', 'Hôtel']),

        'BATIMENT SPORTIF':
            level3.isin(['Espace sportif', 'Espace loisir']),
    }

    dep['usage'] = np.select(list(relations.values()), list(relations.keys()), default=None)

    # put into first column
    dep.insert(0, 'usage', dep.pop('usage'))
    return dep

# The function works in place; looping (instead of rebuilding the dict from
# its return value) keeps the frames in `gdfs` whatever the function returns.
for dep in gdfs.values():
    create_usage_1dep(dep)

### Typology

#### Mapuce

In [15]:
# TODO: decide whether the 'local' class should be dropped here or passed on to the final typology.
def create_typo_mapuce_S_1dep(dep):
    """Add ``TYPO_M_S``, a simplified version of the Mapuce typology ``TYPO_M``.

    Mapping (any other value, including missing, gives ``None``):

    * 'pcif', 'pcio', 'pd', 'psc' -> 'P'
    * 'icif', 'icio', 'id'        -> 'I'
    * 'ba'                        -> 'BA'
    * 'bgh'                       -> 'IGH'
    * 'local'                     -> 'local'

    Parameters
    ----------
    dep : pandas.DataFrame
        Must contain ``TYPO_M``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dep`` object, so the function can also be used in
        ``{k: f(v) for ...}`` comprehensions.
    """
    typo = dep['TYPO_M']

    conditions_mapuce = [
        typo.isin(['pcif', 'pcio', 'pd', 'psc']),
        typo.isin(['icif', 'icio', 'id']),
        typo == 'ba',
        typo == 'bgh',
        typo == 'local',
    ]
    values_mapuce = ['P', 'I', 'BA', 'IGH', "local"]

    dep['TYPO_M_S'] = np.select(conditions_mapuce, values_mapuce, default=None)
    return dep

# The function works in place; looping (instead of rebuilding the dict from
# its return value) keeps the frames in `gdfs` whatever the function returns.
for dep in gdfs.values():
    create_typo_mapuce_S_1dep(dep)

#### BDNB

In [16]:
def create_typo_bdnb_S_1dep(dep):
    """Add ``TYPO_BDNB_S``, a simplified typology derived from BDNB usages.

    Each row receives the first matching class, tested in this order:
    'P', 'I', 'BA', then 'IGH' (height >= 39); rows matching none get ``None``.

    Parameters
    ----------
    dep : pandas.DataFrame
        Must contain ``cerffo2020_usage_niveau_2_txt``,
        ``cerffo2020_usage_niveau_3_txt`` and ``igntop202103_bat_hauteur``.
        Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dep`` object, so the function can also be used in
        ``{k: f(v) for ...}`` comprehensions.
    """
    level2 = dep['cerffo2020_usage_niveau_2_txt']
    level3 = dep['cerffo2020_usage_niveau_3_txt']

    conditions_typo_bdnb = [
        # P
        # NOTE(review): 'Maison exceptionnelle' is tested on level 2 here but on
        # level 3 in create_usage_1dep — confirm which level carries it.
        level2.isin(['Maison individuelle', 'Maisons groupées', 'Maison exceptionnelle']),

        # I
        level2.isin([
            'Immeuble collectif',
            'Résidentiel collectif autre',
            'Hôtel',
            'Bureau',
            'Centre de santé',
            'Enseignement',
        ])
        | level3.isin([
            'Magasin sans accès à la rue avec surface < 400m²',
            'Magasin de centre commercial avec surface < 400m²',
            'Magasin sur rue avec surface < 400m²',
        ]),

        # BA
        (level2 == 'Industrie')
        | level3.isin([
            'Magasin grande surface (entre 400m² et 2499m²)',
            'Magasin très grande surface (> 2500m²)',
            'Serre',
            'Espace sportif',
        ]),

        # IGH
        # NOTE(review): being last, this only applies to rows that matched none
        # of the usage-based classes above — confirm that tall buildings with a
        # known usage are not meant to be 'IGH'.
        dep['igntop202103_bat_hauteur'] >= 39,
    ]

    # value assigned for each condition, in the same order
    values_typo_bdnb = ['P', 'I', 'BA', 'IGH']

    dep['TYPO_BDNB_S'] = np.select(conditions_typo_bdnb, values_typo_bdnb, default=None)
    return dep

# The function works in place; looping (instead of rebuilding the dict from
# its return value) keeps the frames in `gdfs` whatever the function returns.
for dep in gdfs.values():
    create_typo_bdnb_S_1dep(dep)

### Combine both typos

In [17]:
def combine_typo_mapuce_bdnb(dep):
    """Add the final ``typology`` column as the first column of ``dep``.

    The Mapuce value ``TYPO_M_S`` is used when present; otherwise the BDNB
    value ``TYPO_BDNB_S`` is used (which may itself be missing).

    Parameters
    ----------
    dep : pandas.DataFrame
        Must contain ``TYPO_M_S`` and ``TYPO_BDNB_S``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dep`` object, so the function can also be used in
        ``{k: f(v) for ...}`` comprehensions.
    """
    mapuce = dep["TYPO_M_S"]
    # A single where is enough: "not notnull" is exactly "isnull".
    dep['typology'] = np.where(mapuce.notnull(), mapuce, dep["TYPO_BDNB_S"])

    # put into first column
    dep.insert(0, 'typology', dep.pop('typology'))
    return dep

# The function works in place; looping (instead of rebuilding the dict from
# its return value) keeps the frames in `gdfs` whatever the function returns.
for dep in gdfs.values():
    combine_typo_mapuce_bdnb(dep)

# Create all entries

In [20]:
@timer_func
def create_danube_entries(dep):
    """Create every Danube entry column on ``dep``, in place.

    Runs the individual steps in dependency order: the territory needs the
    period, and the combined typology needs both simplified typologies.
    Each step inserts its result as the first column, so the resulting frame
    starts with ``typology, usage, territory, period``.

    Parameters
    ----------
    dep : pandas.DataFrame
        Department frame holding the source columns each step reads.

    Returns
    -------
    pandas.DataFrame
        The same ``dep`` object.
    """
    steps = (
        create_period,
        create_territory_1dep,
        create_usage_1dep,
        create_typo_mapuce_S_1dep,
        create_typo_bdnb_S_1dep,
        combine_typo_mapuce_bdnb,
    )
    for step in steps:
        step(dep)
    return dep


In [21]:
# Build the Danube entry columns on every department frame (modified in place).
for dep in gdfs.values():
    create_danube_entries(dep)

Function 'create_danube_entries' executed in 93.5422s
Function 'create_danube_entries' executed in 26.2264s
Function 'create_danube_entries' executed in 11.4773s


### Checking output

In [22]:
# `dep1` is a reference to department 01's frame (not a copy).
dep1 = gdfs["01"]
dep1.head(2)

Unnamed: 0,typology,usage,territory,period,Part-prop_mean,Part-Pauvr_mean,bnb_id,altitude_sol,adr_fiabilite_niv_1,adr_fiabilite_niv_2,...,TYPO_count,TYPO_unique,TYPO_min,INSEE_DEP_min,Ter_P1_min,Ter_P2-7_min,TYPO_M,geometry,TYPO_M_S,TYPO_BDNB_S
0,BA,BATIMENT INDUSTRIEL,,,0.517241,0.103448,01005000AB0110_bc15409b9aa9d1c,,problème de géocodage,batiment non géocodée au numéro,...,,,,1,FRANCE_TERRE_TUILE,FRANCE_TUILE,,"MULTIPOLYGON (((847384.100 6546081.977, 847367...",,BA
1,,,,,,,01007000ZY0289_0d009a5ddaa18e9,,problème de géocodage,batiment non géocodée au numéro,...,,,,1,FRANCE_TERRE_TUILE,FRANCE_TUILE,,"MULTIPOLYGON (((879809.261 6548057.337, 879807...",,


In [23]:
# Rows with a period, next to the construction year it was derived from.
dep1.loc[dep1["period"].notnull(), ["period", "cerffo2020_annee_construction"]].head()

Unnamed: 0,period,cerffo2020_annee_construction
2,P7,2019.0
4,P6,2008.0
6,P7,2017.0
7,P5,1995.0
13,P1,1800.0


In [24]:
# Rows with a territory, next to the period and the two candidate territory columns.
dep1.loc[dep1["territory"].notnull(), ["territory", "period", "Ter_P1_min", "Ter_P2-7_min"]].head()

Unnamed: 0,territory,period,Ter_P1_min,Ter_P2-7_min
2,FRANCE_TUILE,P7,FRANCE_TERRE_TUILE,FRANCE_TUILE
4,FRANCE_TUILE,P6,FRANCE_TERRE_TUILE,FRANCE_TUILE
6,FRANCE_TUILE,P7,FRANCE_TERRE_TUILE,FRANCE_TUILE
7,FRANCE_TUILE,P5,FRANCE_TERRE_TUILE,FRANCE_TUILE
13,FRANCE_TERRE_TUILE,P1,FRANCE_TERRE_TUILE,FRANCE_TUILE


In [25]:
# Compare the derived usage / typology with the source columns they come from.
dep1[["usage",
        "typology",
        "TYPO_M" ,
        "TYPO_M_S",
        "cerffo2020_usage_niveau_1_txt" ,
        "cerffo2020_usage_niveau_2_txt",
        "cerffo2020_usage_niveau_3_txt", 
        "cerffo2020_l_usage_niveau_3_txt"]].head(10)

Unnamed: 0,usage,typology,TYPO_M,TYPO_M_S,cerffo2020_usage_niveau_1_txt,cerffo2020_usage_niveau_2_txt,cerffo2020_usage_niveau_3_txt,cerffo2020_l_usage_niveau_3_txt
0,BATIMENT INDUSTRIEL,BA,,,Secondaire,Industrie,Etablissement industriel,"{""Etablissement industriel""}"
1,,,,,Secondaire,Energie,Transformateur,{Transformateur}
2,HABITAT,P,,,Résidentiel individuel,Maison individuelle,Maison individuelle,"{""Maison individuelle""}"
3,,,,,Tertiaire & Autres,Tertiaire autre & Divers,Dépot couvert,"{""Dépot couvert""}"
4,HABITAT,P,,,Résidentiel individuel,Maison individuelle,Maison individuelle,"{""Maison individuelle""}"
5,,,,,Dépendance,Garage / Parking / Box,Garage,{Garage}
6,HABITAT,P,,,Résidentiel individuel,Maison individuelle,Maison individuelle,"{""Maison individuelle""}"
7,HABITAT,P,,,Résidentiel individuel,Maison individuelle,Maison individuelle,"{""Maison individuelle""}"
8,,,,,Secondaire,Energie,Transformateur,{Transformateur}
9,,,,,Dépendance,Garage / Parking / Box,Garage,{Garage}


# Mixed usage

## For dept 1

In [26]:
# Number of comma-separated entries in each usage list, per level.
# .str.len() keeps missing values as NaN, whereas .map(len) raises a TypeError on them.
dep1["L1"] = dep1['cerffo2020_l_usage_niveau_1_txt'].str.split(",", expand=False).str.len()
dep1["L2"] = dep1['cerffo2020_l_usage_niveau_2_txt'].str.split(",", expand=False).str.len()
dep1["L3"] = dep1['cerffo2020_l_usage_niveau_3_txt'].str.split(",", expand=False).str.len()

In [27]:
# Percentage of buildings listing more than one level-3 usage.
len(dep1.loc[dep1["L3"] > 1]) / len(dep1) * 100

4.309494834646508

In [28]:
# Percentage of buildings with several level-3 usages for which `usage` is still defined.
len(dep1.loc[(dep1["L3"] > 1) & dep1["usage"].notnull()]) / len(dep1) * 100

3.6499462590049476

In [29]:
# Percentage of buildings with several level-3 usages for which `usage` is NOT defined
# (the mapping does not handle multiple usages).
len(dep1.loc[(dep1["L3"] > 1) & dep1["usage"].isnull()]) / len(dep1) * 100

0.6595485756415609

In [30]:
# Percentage of buildings with several level-3 usages for which `typology` is still defined.
len(dep1.loc[(dep1["L3"] > 1) & dep1["typology"].notnull()]) / len(dep1) * 100

3.645060714000196

In [31]:
# Percentage of buildings with several level-3 usages for which `typology` is NOT defined
# (the mapping does not handle multiple usages).
len(dep1.loc[(dep1["L3"] > 1) & dep1["typology"].isnull()]) / len(dep1) * 100

0.6644341206463132

## Generalize 

In [32]:
def create_col_num_multiple_usage(dep):
    """Add ``L1``, ``L2`` and ``L3``: number of usages listed at each BDNB level.

    Each count is the number of comma-separated pieces in the corresponding
    ``cerffo2020_l_usage_niveau_<n>_txt`` string. A missing string gives a
    missing count (NaN) instead of raising.

    Parameters
    ----------
    dep : pandas.DataFrame
        Must contain the three ``cerffo2020_l_usage_niveau_<n>_txt`` columns.
        Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dep`` object.
    """
    def count_multiple_usage(col):
        # .str.len() propagates NaN, whereas .map(len) raises TypeError on it.
        return dep[col].str.split(",", expand=False).str.len()

    for level in (1, 2, 3):
        dep[f"L{level}"] = count_multiple_usage(f'cerffo2020_l_usage_niveau_{level}_txt')
    return dep


In [33]:
# Add the L1/L2/L3 usage-count columns to every department frame (in place).
for dep in gdfs.values():
    create_col_num_multiple_usage(dep)

In [34]:
# Preview department 02 to check that the new columns are present.
gdfs["02"].head()

Unnamed: 0,typology,usage,territory,period,Part-prop_mean,Part-Pauvr_mean,bnb_id,altitude_sol,adr_fiabilite_niv_1,adr_fiabilite_niv_2,...,INSEE_DEP_min,Ter_P1_min,Ter_P2-7_min,TYPO_M,geometry,TYPO_M_S,TYPO_BDNB_S,L1,L2,L3
0,P,HABITAT,FRANCE_PIERRE_CALCAIRE_ARDOISE,P1,0.882353,0.0,020710000A0660_a92bde7e860b68c,,bâtiment fiable,batiment sans voisin à l'adresse,...,2,FRANCE_PIERRE_CALCAIRE_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((710145.238 6923292.828, 710143...",,P,1,1,1
1,P,HABITAT,FRANCE_ARDOISE,P7,0.479924,0.055303,02073000AC0276_2cbc17fa8347b0d,,problème de géocodage,batiment géocodage mauvais score,...,2,FRANCE_PIERRE_CALCAIRE_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((765493.239 6921987.923, 765486...",,P,1,1,1
2,,,,,1.0,0.0,020830000A2129_349a8ee5b9bea7b,,problème de géocodage,batiment non géocodée au numéro,...,2,FRANCE_PIERRE_CALCAIRE_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((735756.052 6893854.709, 735752...",,,1,1,1
3,BA,BATIMENT INDUSTRIEL,,,,,02097000ZE0015_f51fd9c7bd1b7a2,,problème de géocodage,batiment non géocodée au numéro,...,2,FRANCE_PIERRE_CALCAIRE_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((770812.839 6949184.577, 770806...",,BA,1,1,1
4,P,HABITAT,FRANCE_PIERRE_CALCAIRE_ARDOISE,P1,0.875,0.125,022380000G0001_f9099794cfa7abb,81.0,problème de géocodage,batiment non-géocodé,...,2,FRANCE_PIERRE_CALCAIRE_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((734252.200 6946102.700, 734261...",,P,1,1,1


# Percentage of data

In [44]:
# Percentage of typology data
@timer_func
def dept_perc_typology(df):
    """Compute data-availability indicators for one department, in percent.

    Every indicator is a number of rows divided by ``len(df)``, times 100:

    * one "not null" share per source / derived column (see the keys below);
    * ``all_danube_entries``: rows where ``period``, ``usage`` and ``typology``
      are all present (``territory`` is not part of this test);
    * ``bdnb_all_multiple_usage``: rows with more than one level-3 usage
      (``L3 > 1``), then that subset split by whether ``usage`` / ``typology``
      is defined (``*_OK``) or not (``*_KO``).

    Parameters
    ----------
    df : pandas.DataFrame
        Department frame with the Danube entry columns and the ``L3`` count.

    Returns
    -------
    pandas.DataFrame
        One row per indicator, a single column named ``0``.
    """
    n_rows = len(df)

    def percent(mask):
        # Counting True values avoids materialising a filtered copy of the frame.
        return mask.sum() / n_rows * 100

    def percent_not_null(column):
        return percent(df[column].notnull())

    has_usage = df["usage"].notnull()
    has_typology = df["typology"].notnull()
    has_all_entries = df["period"].notnull() & has_usage & has_typology
    multi_usage = df["L3"] > 1

    percent_typo = {
        "dpe_3cl_u_mur": percent_not_null("adedpe202006_logtype_mur_u_ext"),
        "all_danube_entries": percent(has_all_entries),
        "period": percent_not_null("period"),
        "territory": percent_not_null("territory"),
        "usage": percent(has_usage),
        "typology": percent(has_typology),
        "mapuce_typo_tot": percent_not_null("TYPO_count"),
        "mapuce_typo_uni": percent_not_null("TYPO_M"),
        "mapuce_typo_S": percent_not_null("TYPO_M_S"),
        "bdnb_typo_S": percent_not_null("TYPO_BDNB_S"),
        "bdnb_usage_tot": percent_not_null("cerffo2020_usage_niveau_3_txt"),
        # all multiple usages in bdnb
        "bdnb_all_multiple_usage": percent(multi_usage),
        # usage is defined even though there are multiple usages
        "mult_usage_OK": percent(multi_usage & has_usage),
        # usage is NOT defined: the mapping does not handle multiple usages
        "mult_usage_KO": percent(multi_usage & ~has_usage),
        # typology is defined even though there are multiple usages
        "mult_typo_OK": percent(multi_usage & has_typology),
        # typology is NOT defined: the mapping does not handle multiple usages
        "mult_typo_KO": percent(multi_usage & ~has_typology),
    }

    return pd.DataFrame.from_dict(percent_typo, orient='index')


def all_perc_typology(gdfs):
    """Build the indicator table for all departments, one column per department.

    ``gdfs`` maps a department code to its frame; each column of the result is
    named after the code. No rounding is applied here.
    """
    per_dept = []
    for dept_code, dept_df in gdfs.items():
        indicators = dept_perc_typology(dept_df)
        per_dept.append(indicators.rename(columns={0: dept_code}))
    return pd.concat(per_dept, axis=1)

# Indicator table: one row per indicator, one column per department.
perc = all_perc_typology(gdfs)
perc

Function 'dept_perc_typology' executed in 23.2163s
Function 'dept_perc_typology' executed in 4.2611s
Function 'dept_perc_typology' executed in 2.4319s


Unnamed: 0,01,02,03
dpe_3cl_u_mur,7.163986,4.563447,4.307571
all_danube_entries,87.345994,88.468592,87.842282
period,87.79502,88.774093,88.201544
territory,87.79502,88.774093,88.201544
usage,92.660135,92.81778,92.792712
typology,92.820025,93.37794,92.616954
mapuce_typo_tot,12.281372,10.128283,0.0
mapuce_typo_uni,10.366682,6.35242,0.0
mapuce_typo_S,10.366682,6.35242,0.0
bdnb_typo_S,92.368335,92.671158,92.616954


In [46]:
# Display department 03 (pandas truncates the rendering of large frames).
gdfs["03"]

Unnamed: 0,typology,usage,territory,period,Part-prop_mean,Part-Pauvr_mean,bnb_id,altitude_sol,adr_fiabilite_niv_1,adr_fiabilite_niv_2,...,INSEE_DEP_min,Ter_P1_min,Ter_P2-7_min,TYPO_M,geometry,TYPO_M_S,TYPO_BDNB_S,L1,L2,L3
0,,,,,0.875000,0.187500,03010000ZL0002_9f6013853a4269a,,problème de géocodage,batiment non-géocodé,...,03,FRANCE_PIERRE_GRES_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((667570.286 6597358.233, 667569...",,,1,1,1
1,,,,,0.882353,0.058824,03030000XK0003_b37a5211663729a,,bâtiment fiable,batiment sans voisin à l'adresse,...,03,FRANCE_PIERRE_GRES_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((724469.358 6553805.252, 724464...",,,1,1,1
2,P,HABITAT,FRANCE_ARDOISE,P7,0.847458,0.050847,03033000AM0042_0787d51d1accbc3,270.0,bâtiment fiable,batiment sans voisin à l'adresse,...,03,FRANCE_PIERRE_GRES_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((737565.600 6565112.200, 737549...",,P,1,1,1
3,,,,,0.580645,0.225806,030640000A0437_bdaaa9fd2bcfdcd,,bâtiment fiable,batiment sans voisin à l'adresse,...,03,FRANCE_PIERRE_GRES_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((702440.462 6629423.421, 702443...",,,1,1,1
4,,,,,0.809524,0.428571,030660000A0448_314b96c1a93ff4e,,problème de géocodage,batiment non géocodée au numéro,...,03,FRANCE_PIERRE_GRES_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((754805.910 6556760.213, 754806...",,,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167839,P,HABITAT,FRANCE_ARDOISE,P6,0.888889,0.000000,03321000CN0289_9d22cf6b02bd635,242.0,bâtiment fiable,batiment sans voisin à l'adresse,...,03,FRANCE_PIERRE_GRES_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((725883.700 6609000.700, 725892...",,P,1,1,1
167840,I,HABITAT,,,0.695652,0.043478,03321000CN0288_3b41e8b205037ec,239.0,bâtiment fiable,batiment sans voisin à l'adresse,...,03,FRANCE_PIERRE_GRES_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((725841.900 6608941.900, 725838...",,I,1,1,1
167841,P,HABITAT,FRANCE_PIERRE_GRES_ARDOISE,P1,0.695652,0.043478,03321000CN0274_ee00bb685f8f54a,233.0,bâtiment fiable,batiment sans voisin à l'adresse,...,03,FRANCE_PIERRE_GRES_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((725713.500 6608998.600, 725713...",,P,1,1,1
167842,P,HABITAT,,,0.750000,0.062500,03321000CN0270_db473e741b3eb44,225.0,bâtiment fiable,batiment sans voisin à l'adresse,...,03,FRANCE_PIERRE_GRES_ARDOISE,FRANCE_ARDOISE,,"MULTIPOLYGON (((725526.500 6609248.800, 725526...",,P,1,1,1


In [43]:
# NOTE(review): the saved output under this cell shows one decimal place,
# which does not match decimals=10 — confirm which precision is intended.
perc.round(decimals=10)

Unnamed: 0,01,02,03
dpe_3cl_u_mur,7.2,4.6,4.3
all_danube_entries,87.3,88.5,87.8
period,87.8,88.8,88.2
territory,87.8,88.8,88.2
usage,92.7,92.8,92.8
typology,92.8,93.4,92.6
mapuce_typo_tot,12.3,10.1,0.0
mapuce_typo_uni,10.4,6.4,0.0
mapuce_typo_S,10.4,6.4,0.0
bdnb_typo_S,92.4,92.7,92.6


In [41]:
# Summary statistics of each indicator across departments: transpose so that
# describe() runs per indicator, transpose back, then round to one decimal.
perc_description = perc.T.describe().T.round(decimals=1)
perc_description

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
dpe_3cl_u_mur,3.0,5.4,1.6,4.3,4.4,4.6,5.9,7.2
all_danube_entries,3.0,87.9,0.6,87.3,87.6,87.8,88.2,88.5
period,3.0,88.3,0.5,87.8,88.0,88.2,88.5,88.8
territory,3.0,88.3,0.5,87.8,88.0,88.2,88.5,88.8
usage,3.0,92.8,0.1,92.7,92.8,92.8,92.8,92.8
typology,3.0,92.9,0.4,92.6,92.7,92.8,93.1,93.4
mapuce_typo_tot,3.0,7.5,6.6,0.0,5.0,10.1,11.2,12.3
mapuce_typo_uni,3.0,5.6,5.2,0.0,3.2,6.4,8.4,10.4
mapuce_typo_S,3.0,5.6,5.2,0.0,3.2,6.4,8.4,10.4
bdnb_typo_S,3.0,92.6,0.2,92.4,92.5,92.6,92.6,92.7


# Render the summary with one decimal place, without changing the global display options.
with pd.option_context('display.float_format', '{:.1f}'.format):
    display(perc_description)