In [1]:
import os
from time import time
    
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import timeit

from gdfs_utils import open_gdfs
from pool_create_danube_entries import pool_create_danube_entries

In [2]:
# Folder scanned for the department files to load.
# NOTE(review): absolute, user-specific path — consider making it configurable.
folder_loc = r"C:\Users\laraujo\Music\Work_august\dados\_Joined_per_dept"
# os.listdir returns entries in arbitrary order, so sort before slicing to make
# the "first three files" selection deterministic across runs and machines.
dept_file_paths = [os.path.join(folder_loc, file) for file in sorted(os.listdir(folder_loc))[:3]]

In [3]:
# read dept files
start_time = time()
loaded_gdfs = open_gdfs(dept_file_paths)
# Keep a space between "in" and the duration so the message is readable.
print(f"open_gdfs executed in {(time()-start_time):.4f}s")

open_gdfs executed in150.4784s


In [4]:
# Presumably keeps `loaded_gdfs` around so the slow load step does not have to
# be repeated while experimenting on `gdfs`.
# NOTE(review): if `loaded_gdfs` is a plain dict, .copy() is shallow — the
# frames inside are shared, not duplicated. Confirm that is sufficient.
gdfs = loaded_gdfs.copy()

In [5]:
# Preview the first row stored under key "01" before the formatting step.
gdfs["01"].head(1)

Unnamed: 0,bnb_id,altitude_sol,adr_fiabilite_niv_1,adr_fiabilite_niv_2,config_adr,geombui_area,cerffo2020_l_adresse,adedpe202006_logtype_min_classe_ener_ges,adedpe202006_logtype_coherence_data_methode_dpe,adedpe202006_logtype_is_3cl,...,TYPO_count,TYPO_unique,TYPO_min,Part-prop_mean,Part-Pauvr_mean,INSEE_DEP_min,Ter_P1_min,Ter_P2-7_min,TYPO_M,geometry
0,01005000AB0110_bc15409b9aa9d1c,,problème de géocodage,batiment non géocodée au numéro,batiment sans adr correctement géocodée,,"{""5084 LE BOURG""}",,,,...,,,,0.517241,0.103448,1,FRANCE_TERRE_TUILE,FRANCE_TUILE,,"MULTIPOLYGON (((847384.100 6546081.977, 847367..."


# Formatting

In [6]:
start_time = time()
gdfs = pool_create_danube_entries(gdfs)
# Keep a space between "in" and the duration so the message is readable.
print(f"pool_create_danube_entries executed in {(time()-start_time):.4f}s")

pool_create_danube_entries executed in59.2049s


In [7]:
# Preview the first row stored under key "01" after pool_create_danube_entries.
gdfs["01"].head(1)

Unnamed: 0,typology,usage,territory,period,bnb_id,altitude_sol,adr_fiabilite_niv_1,adr_fiabilite_niv_2,config_adr,geombui_area,...,Ter_P1_min,Ter_P2-7_min,TYPO_M,geometry,TYPO_M_S,TYPO_BDNB_S,archetype,L1,L2,L3
0,BA,BATIMENT INDUSTRIEL,,,01005000AB0110_bc15409b9aa9d1c,,problème de géocodage,batiment non géocodée au numéro,batiment sans adr correctement géocodée,,...,FRANCE_TERRE_TUILE,FRANCE_TUILE,,"MULTIPOLYGON (((847384.100 6546081.977, 847367...",,BA,,1,1,1


# Percentage of data

In [9]:
# Percentage of typology data
def dept_perc_typology(df):
    """Summarise data availability in one frame as percentages of its rows.

    Three groups of indicators are computed, in this order:

    * share of rows with a non-null value in the archetype-related columns
      ("archetype", "period", "territory", "usage", "typology", ...);
    * share of rows where ``df["L3"] > 1`` (treated here as "multiple usages"),
      overall and split by whether "usage" / "typology" is filled in;
    * share of rows with a non-null value in each "adedpe202006_*" column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column referenced below; a missing column raises
        ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        One row per indicator name and a single column labelled ``0`` holding
        the percentage (0-100). For a frame with no rows every value is NaN.
    """

    def _percent(mask):
        # Percentage of True values in a boolean Series. An empty frame has no
        # meaningful share, so report NaN instead of dividing by zero.
        if len(mask) == 0:
            return float("nan")
        return mask.sum() / len(mask) * 100

    def percent_not_null(column):
        return _percent(df[column].notnull())

    dict_perc = {
        "all_danube_entries": percent_not_null("archetype"),
        "period": percent_not_null("period"),
        "territory": percent_not_null("territory"),
        "usage": percent_not_null("usage"),
        "typology": percent_not_null("typology"),
        "mapuce_typo_uni": percent_not_null("TYPO_M"),
        "mapuce_typo_S": percent_not_null("TYPO_M_S"),
        "bdnb_usage_tot": percent_not_null("cerffo2020_usage_niveau_3_txt"),
        "bdnb_typo_S": percent_not_null("TYPO_BDNB_S"),  # TODO: add usage
    }

    # Rows flagged as having multiple usages, combined with whether the
    # derived "usage" / "typology" could still be determined for them.
    # Counting the boolean masks avoids materialising a filtered frame per key.
    cond_mult = df["L3"] > 1
    has_usage = df["usage"].notnull()
    has_typology = df["typology"].notnull()
    dict_perc.update({
        # all rows with multiple usages
        "bdnb_all_multiple_usage": _percent(cond_mult),
        # usage is defined even though there are multiple usages
        "mult_usage_OK": _percent(cond_mult & has_usage),
        # usage is NOT defined and there are multiple usages
        "mult_usage_KO": _percent(cond_mult & ~has_usage),
        # typology is defined even though there are multiple usages
        "mult_typo_OK": _percent(cond_mult & has_typology),
        # typology is NOT defined and there are multiple usages
        "mult_typo_KO": _percent(cond_mult & ~has_typology),
    })

    # Output key -> source column for the "adedpe202006_*" fields.
    dpe_columns = {
        "dpe_classe_conso": "adedpe202006_logtype_classe_conso_ener",
        "dpe_mean_conso": "adedpe202006_mean_conso_ener",
        "dpe_baie_vitrage": "adedpe202006_logtype_baie_type_vitrage",
        "dpe_baie_u": "adedpe202006_logtype_baie_u",
        "dpe_mur_mat": "adedpe202006_logtype_mur_mat_ext",
        "dpe_mur_pos_is": "adedpe202006_logtype_mur_pos_isol_ext",
        "dpe_mur_u": "adedpe202006_logtype_mur_u_ext",
        "dpe_pb_mat": "adedpe202006_logtype_pb_mat",
        "dpe_pb_pos_is": "adedpe202006_logtype_pb_pos_isol",
        "dpe_pb_u": "adedpe202006_logtype_pb_u",
        "dpe_ph_mat": "adedpe202006_logtype_ph_mat",
        "dpe_ph_pos_is": "adedpe202006_logtype_ph_pos_isol",
        "dpe_ph_u": "adedpe202006_logtype_ph_u",
    }
    dict_perc.update(
        {key: percent_not_null(column) for key, column in dpe_columns.items()}
    )

    return pd.DataFrame.from_dict(dict_perc, orient='index')


def all_perc_typology(gdfs):
    """Build one availability table covering every frame in ``gdfs``.

    Parameters
    ----------
    gdfs : mapping
        Key -> frame. Each frame is summarised with ``dept_perc_typology`` and
        its single result column is renamed to the mapping key.

    Returns
    -------
    pandas.DataFrame
        Indicators as rows, one column per key, values rounded to 2 decimals.
        An empty mapping yields an empty DataFrame.
    """
    perc_typologies = [
        dept_perc_typology(one_gdf).rename(columns={0: dept})
        for dept, one_gdf in gdfs.items()
    ]
    if not perc_typologies:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()
    return pd.concat(perc_typologies, axis=1).round(decimals=2)

# Availability table for every frame currently in `gdfs` (one column per key).
perc = all_perc_typology(gdfs)

In [11]:
# Name the output file after the smallest and largest key of `gdfs`.
dept_keys = sorted(gdfs)
first_dept = dept_keys[0]
last_dept = dept_keys[-1]
name_perc_nb_depts_from_to = f"percent_depts_{first_dept}_to_{last_dept}.csv"
# NOTE(review): absolute, user-specific path — consider making it configurable.
folder = r"C:\Users\laraujo\Music\Work_august\dados\_output_code\percentage_of_data_tables"
# os.path.join picks the platform separator instead of appending a literal "/"
# to a backslash-separated path.
perc.to_csv(os.path.join(folder, name_perc_nb_depts_from_to))

In [None]:
# NOTE(review): `perc_description` is not assigned in any cell visible here, so
# this raises NameError on a fresh kernel — define it (or drop the cell) before
# running the notebook top to bottom.
perc_description

# Show the table with floats formatted to one decimal place; the option only
# applies inside this `with` block.
with pd.option_context('display.float_format', lambda x: '%.1f' % x):
    display(perc_description)

### Archetypes

# NOTE(review): `df` is only assigned in a later cell (df = gdfs["01"]); run
# that assignment first, or move it above this point.
# Label layout: period-typology-usage-territory. Every part goes through
# astype(str), so a missing value ends up as text inside the label.
df["archetype"] =  df['period'].astype(str) + "-" + df['typology'].astype(str) + "-" + df['usage'].astype(str) + "-" + df['territory'].astype(str)

archs = list(df.archetype.unique())
archs

# Drop labels containing the text "None", i.e. built from a missing part.
acrh_vals  = [arc for arc in archs if "None" not in arc ]
sorted(acrh_vals)

# One sub-frame per remaining archetype label, taken from the same frame the
# labels were computed on.
archs_gdfs_dict = {arch : df[df.archetype == arch] for arch in acrh_vals}
archs_gdfs_dict['P7-P-TERTIAIRE-FRANCE_TUILE']

len(archs_gdfs_dict)

# Write each archetype's rows to archetypes/<label>.csv (relative to the
# current working directory).
for key, arch_gdf in archs_gdfs_dict.items():
    arch_gdf.to_csv(fr"archetypes/{key}.csv")

# A row only gets an archetype when all four components are present.
required_cols = ['period', 'territory', 'usage', 'typology']
cond_have_all_entries = df[required_cols].notnull().all(axis=1)

# Label layout: period-typology-usage-territory, built left to right.
archetype_labels = df['period'].astype(str)
for part in ('typology', 'usage', 'territory'):
    archetype_labels = archetype_labels + "-" + df[part].astype(str)

# Incomplete rows receive None instead of a partially filled label.
df["archetype"] = np.where(cond_have_all_entries, archetype_labels, None)

In [12]:
# Shorthand for the frame stored under key "01"; this is a reference, not a copy.
df = gdfs["01"]

In [13]:
# Show the four archetype components next to the resulting label.
df[["period","territory","usage","typology","archetype"]]

Unnamed: 0,period,territory,usage,typology,archetype
0,,,BATIMENT INDUSTRIEL,BA,
1,,,,,
2,P7,FRANCE_TUILE,HABITAT,P,P7-P-HABITAT-FRANCE_TUILE
3,,,,,
4,P6,FRANCE_TUILE,HABITAT,P,P6-P-HABITAT-FRANCE_TUILE
...,...,...,...,...,...
225149,P2,FRANCE_TUILE,HABITAT,P,P2-P-HABITAT-FRANCE_TUILE
225150,P2,FRANCE_TUILE,HABITAT,P,P2-P-HABITAT-FRANCE_TUILE
225151,P2,FRANCE_TUILE,HABITAT,P,P2-P-HABITAT-FRANCE_TUILE
225152,P2,FRANCE_TUILE,HABITAT,P,P2-P-HABITAT-FRANCE_TUILE


In [16]:
# Distinct archetype labels in this frame.
list(df.archetype.unique()) # TODO: exclude the None entry later

[None,
 'P7-P-HABITAT-FRANCE_TUILE',
 'P6-P-HABITAT-FRANCE_TUILE',
 'P5-P-HABITAT-FRANCE_TUILE',
 'P1-P-HABITAT-FRANCE_TERRE_TUILE',
 'P2-P-HABITAT-FRANCE_TUILE',
 'P4-P-HABITAT-FRANCE_TUILE',
 'P3-P-HABITAT-FRANCE_TUILE',
 'P1-I-HABITAT-FRANCE_TERRE_TUILE',
 'P6-I-HABITAT-FRANCE_TUILE',
 'P2-I-HABITAT-FRANCE_TUILE',
 'P5-I-HABITAT-FRANCE_TUILE',
 'P7-I-HABITAT-FRANCE_TUILE',
 'P1-I-TERTIAIRE-FRANCE_TERRE_TUILE',
 'P3-I-HABITAT-FRANCE_TUILE',
 'P4-I-HABITAT-FRANCE_TUILE',
 'P1-I-COMMERCE-FRANCE_TERRE_TUILE',
 'P2-I-COMMERCE-FRANCE_TUILE',
 'P5-BA-COMMERCE-FRANCE_TUILE',
 'P6-I-COMMERCE-FRANCE_TUILE',
 'P4-I-TERTIAIRE-FRANCE_TUILE',
 'P1-BA-COMMERCE-FRANCE_TERRE_TUILE',
 'P6-I-TERTIAIRE-FRANCE_TUILE',
 'P1-BA-BATIMENT SPORTIF-FRANCE_TERRE_TUILE',
 'P5-I-COMMERCE-FRANCE_TUILE',
 'P4-I-COMMERCE-FRANCE_TUILE',
 'P7-I-TERTIAIRE-FRANCE_TUILE',
 'P2-I-TERTIAIRE-FRANCE_TUILE',
 'P3-I-BATIMENT D ENSEIGNEMENT-FRANCE_TUILE',
 'P5-I-TERTIAIRE-FRANCE_TUILE',
 'P7-BA-BATIMENT SPORTIF-FRANCE_TUILE',


In [None]:
def split_per_archetype(dep):
    """Split a frame into one sub-frame per complete archetype label.

    Parameters
    ----------
    dep : pandas.DataFrame
        Frame with an "archetype" column.

    Returns
    -------
    dict
        Archetype label -> rows of ``dep`` carrying that label, with keys in
        sorted order. Rows whose archetype is missing are skipped, as are
        labels containing the text "None" (a label built from a stringified
        missing part).
    """
    # Drop real nulls first: `"None" not in None` would raise TypeError.
    labels = dep["archetype"].dropna().unique()
    complete_labels = sorted(label for label in labels if "None" not in label)
    return {label: dep[dep["archetype"] == label] for label in complete_labels}



In [None]:
# Split every frame in `gdfs` by archetype and keep the result per key so it
# can be inspected or exported afterwards.
archetypes_per_dept = {dept: split_per_archetype(dep) for dept, dep in gdfs.items()}

In [None]:
def split_per_archetype(dep):
    """Write a "period-typology-usage-territory" label into ``dep["archetype"]``.

    Each component column is converted with ``astype(str)`` and the pieces are
    joined with "-". ``dep`` is modified in place and nothing is returned.

    NOTE(review): this reuses the name of an earlier ``split_per_archetype``
    definition in this notebook; once this cell runs, the name refers to this
    version. Confirm which of the two is meant to survive.
    """
    label = dep['period'].astype(str)
    for column in ('typology', 'usage', 'territory'):
        label = label + "-" + dep[column].astype(str)
    dep["archetype"] = label

In [None]:
# NOTE(review): `a` is not defined in any cell visible here — looks like
# leftover debugging; confirm before keeping this cell.
type(a)