In [1]:
import os
import timeit
from datetime import datetime, timezone, timedelta
from functools import wraps
from time import time

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [2]:
def timenow(utc_offset_hours=2):
    """Return the current time as an 'HH:MM:SS' string at a fixed offset from UTC.

    Parameters
    ----------
    utc_offset_hours : int or float, default 2
        Hours added to the current UTC time before formatting.

    Returns
    -------
    str
        The shifted time formatted with '%H:%M:%S'.
    """
    # datetime.utcnow() is deprecated; take an aware UTC "now" instead.
    # The local is not called `time` so it no longer shadows `from time import time`.
    shifted_now = datetime.now(timezone.utc) + timedelta(hours=utc_offset_hours)
    return shifted_now.strftime('%H:%M:%S')

In [3]:
def timer_func(func):
    """Decorator that prints the wall-clock duration of every call to `func`.

    Parameters
    ----------
    func : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper that forwards all arguments to `func`, prints
        "Function '<name>' executed in <seconds>s" and returns `func`'s result.
        Nothing is printed when `func` raises.
    """
    # wraps() keeps the decorated function's __name__ and docstring visible.
    @wraps(func)
    def wrap_func(*args, **kwargs):
        t1 = time()
        result = func(*args, **kwargs)
        t2 = time()
        print(f'Function {func.__name__!r} executed in {(t2-t1):.4f}s')
        return result
    return wrap_func

In [4]:
# Folder holding one joined file per department.
# The default is a machine-specific path; set the BDNB_JOINED_DIR environment
# variable to point somewhere else without editing the notebook.
folder_loc = os.environ.get(
    "BDNB_JOINED_DIR",
    r"C:\Users\laraujo\Music\Work_august\dados\_Joined_per_dept",
)

In [None]:
def dept_nb_from_path(path):
    """Extract the department number from a department file path.

    The number is the part of the file name after the last underscore and
    before the first following dot, e.g. '.../bdnb_01.gpkg' -> '01'.

    Parameters
    ----------
    path : str
        File path; both '/' and '\\' are accepted as separators.

    Returns
    -------
    str
        The department identifier, kept as text (leading zeros preserved).
    """
    # Work on the file name only: the parent folder may itself contain
    # underscores, which would leak into the result for a name such as '01.gpkg'.
    file_name = path.replace("\\", "/").rsplit("/", 1)[-1]
    return file_name.split("_")[-1].split('.')[0]

@timer_func
def read_1dept(file_path):
    """Load one department file as a GeoDataFrame.

    The department number parsed from `file_path` is printed first, so the
    timing line printed by the decorator can be matched to a department.
    """
    print(dept_nb_from_path(file_path))
    return gpd.read_file(file_path)
    
# os.listdir returns entries in arbitrary order; sort so departments are
# always loaded (and stored in the dict) in the same order.
dept_file_names = sorted(os.listdir(folder_loc))
dept_file_paths = [os.path.join(folder_loc, file) for file in dept_file_names]

# Reads every department. For a quick trial run, slice the list, e.g. dept_file_paths[:3].
charged_gdfs = {dept_nb_from_path(dept_file_path): read_1dept(dept_file_path) for dept_file_path in dept_file_paths}


01
Function 'read_1dept' executed in 77.3730s
02
Function 'read_1dept' executed in 78.7939s
03
Function 'read_1dept' executed in 57.7151s
04
Function 'read_1dept' executed in 30.6920s
05
Function 'read_1dept' executed in 23.6137s
06
Function 'read_1dept' executed in 75.0290s
07
Function 'read_1dept' executed in 58.3135s
08
Function 'read_1dept' executed in 40.6946s
09
Function 'read_1dept' executed in 34.6514s
10
Function 'read_1dept' executed in 43.8002s
11
Function 'read_1dept' executed in 67.8359s
12
Function 'read_1dept' executed in 57.1585s
13
Function 'read_1dept' executed in 153.3890s
14
Function 'read_1dept' executed in 99.3483s
15
Function 'read_1dept' executed in 32.1569s
16
Function 'read_1dept' executed in 69.9321s
17
Function 'read_1dept' executed in 136.0533s
18
Function 'read_1dept' executed in 62.3808s
19
Function 'read_1dept' executed in 51.6538s
21
Function 'read_1dept' executed in 72.0981s
22


In [None]:
# Shallow copy: a new dict that still points at the same GeoDataFrame objects,
# so in-place column changes made through `gdfs` are visible in `charged_gdfs` too.
gdfs = dict(charged_gdfs)

In [None]:
# Preview the first two rows of department 01.
gdfs["01"].head(2)

In [None]:
# List every column name available for department 01.
list(gdfs["01"].columns)

### Reliable address

In [None]:
# Percentage of reliable addresses per department

def dept_adr_reliable(df):
    """Percentage of rows of `df` in each 'adr_fiabilite_niv_1' class.

    Non-null 'bnb_id' values are counted per class and divided by the total
    number of rows of `df`. The result is a DataFrame indexed by class with a
    single 'bnb_id' column holding the percentages.
    """
    id_and_class = df[["bnb_id", "adr_fiabilite_niv_1"]]
    count_per_class = id_and_class.groupby("adr_fiabilite_niv_1").count()
    return count_per_class / len(df) * 100

def all_adr_reliable(gdfs):
    """Build one table of address-reliability percentages for all departments.

    `gdfs` maps a department key to its frame. Each department contributes one
    column named '<key> (%)'; values are rounded to one decimal.
    """
    per_dept = []
    for dept_key, dept_frame in gdfs.items():
        shares = dept_adr_reliable(dept_frame)
        per_dept.append(shares.rename(columns={'bnb_id': f"{dept_key} (%)"}))
    return pd.concat(per_dept, axis=1).round(decimals=1)

# Keep the table in a variable and display that table. The original discarded
# the result and displayed `perc_description`, which is only created much
# further down the notebook (NameError on a fresh kernel).
adr_reliable = all_adr_reliable(gdfs)
with pd.option_context('display.float_format', lambda x: '%.1f' % x):
    display(adr_reliable)

# Formatting

## Important columns first

def change_cols_position(dep):
    """Return a copy of `dep` with 'Part-prop_mean' and 'Part-Pauvr_mean' as the first columns.

    The remaining columns keep their relative order; `dep` itself is not modified.
    """
    leading = ["Part-prop_mean", "Part-Pauvr_mean"]
    front = dep[leading]
    rest = dep.drop(columns=leading)
    return pd.concat([front, rest], axis=1)

# Rebuild the dict with the reordered frame of every department.
gdfs = {dept_key: change_cols_position(dept_frame) for dept_key, dept_frame in gdfs.items()}

## Functions to generate Danube entries

### Period

In [None]:
# Inspect the construction-year column of department 01 (input of create_period).
gdfs["01"]['cerffo2020_annee_construction'] 

In [None]:
def create_period(dep):
    """Add a Danube 'period' column (P1..P7) computed from 'cerffo2020_annee_construction'.

    Ranges: P1 <= 1948 < P2 <= 1973 < P3 <= 1981 < P4 <= 1989 < P5 <= 2000
    < P6 <= 2012 < P7. Rows matching no range (e.g. missing year) get None.
    `dep` is modified in place, the new column is moved to the first position,
    and nothing is returned.
    """
    year = dep['cerffo2020_annee_construction']

    # Inclusive upper bound of every closed period, in chronological order.
    upper_bounds = {'P1': 1948, 'P2': 1973, 'P3': 1981, 'P4': 1989, 'P5': 2000, 'P6': 2012}

    labels, masks = [], []
    previous_upper = None
    for label, upper in upper_bounds.items():
        in_range = year <= upper
        if previous_upper is not None:
            in_range = (year > previous_upper) & in_range
        labels.append(label)
        masks.append(in_range)
        previous_upper = upper

    # The last period is open-ended.
    labels.append('P7')
    masks.append(year > previous_upper)

    dep['period'] = np.select(masks, labels, default=None)

    # Move the new column to the front.
    dep.insert(0, 'period', dep.pop('period'))

### Territory

In [None]:
def create_territory_1dep(dep):
    """Add a 'territory' column chosen from 'Ter_P1_min' or 'Ter_P2-7_min' according to 'period'.

    Rows whose period is None get None, rows in period 'P1' take 'Ter_P1_min',
    all other rows take 'Ter_P2-7_min'. `dep` is modified in place, the new
    column is moved to the first position, and nothing is returned.
    """
    def pick_territory(row):
        period = row["period"]
        if period is None:
            return None
        if period == "P1":
            return row["Ter_P1_min"]
        return row["Ter_P2-7_min"]

    # Evaluated row by row.
    dep['territory'] = dep.apply(pick_territory, axis=1)

    # Move the new column to the front.
    dep.insert(0, 'territory', dep.pop('territory'))

### Usage

In [None]:
def create_usage_1dep(dep):
    """Add a Danube 'usage' column derived from the BDNB usage columns.

    Reads 'cerffo2020_usage_niveau_1_txt', '..._2_txt' and '..._3_txt'.
    All Danube usages are produced except the ones below, which are ignored:
    'BATIMENT AGRICOLE', 'BATIMENT RELIGIEUX', 'CHATEAU', 'LOCAL NON CHAUFFE'.
    Rows matching no rule get None.

    `dep` is modified in place and the new column is moved to the first position.

    Returns
    -------
    The same `dep` object, so the function also works in expressions such as
    ``{k: create_usage_1dep(v) for k, v in gdfs.items()}`` (it used to return
    None, which wiped the frames out of such a dict).
    """
    level_1 = dep['cerffo2020_usage_niveau_1_txt']
    level_2 = dep['cerffo2020_usage_niveau_2_txt']
    level_3 = dep['cerffo2020_usage_niveau_3_txt']

    # np.select keeps the FIRST matching rule, so the order of this dict matters.
    relations = {
        'BATIMENT INDUSTRIEL':
            (level_2 == 'Industrie')
            | (level_3 == 'Etablissement industriel autre que carrière'),

        'SERRE AGRICOLE':
            (level_3 == 'Serre'),

        'BATIMENT D ENSEIGNEMENT':
            (level_2 == 'Enseignement'),

        'BATIMENT DE SANTE':
            (level_2 == 'Centre de santé'),

        'COMMERCE':
            (level_2 == 'Commerce'),

        'HABITAT':
            (level_1 == 'Résidentiel individuel')
            | (level_1 == 'Résidentiel collectif')  # not described in BDNB methodology
            | (level_3 == 'Maison exceptionnelle'),

        'TERTIAIRE':
            (level_2 == 'Bureau')
            | (level_2 == 'Hôtel'),

        'BATIMENT SPORTIF':
            (level_3 == 'Espace sportif')
            | (level_3 == 'Espace loisir'),
    }

    usages = list(relations.keys())
    conditions = list(relations.values())

    dep['usage'] = np.select(conditions, usages, default=None)

    # Move the new column to the front.
    dep.insert(0, 'usage', dep.pop('usage'))
    return dep

# create_usage_1dep works in place on each frame, so loop over the frames.
# Rebuilding `gdfs` from the return values would replace every frame with
# None whenever the function returns nothing.
for dep in gdfs.values():
    create_usage_1dep(dep)

### Typology

#### Mapuce

In [None]:
# The MApUCE 'local' class is intentionally left unmapped (it gets None).
def create_typo_mapuce_S_1dep(dep):
    """Add 'TYPO_M_S', a simplified typology derived from the 'TYPO_M' column.

    Mapping: pcif/pcio/pd/psc -> 'P', icif/icio/id -> 'I', ba -> 'BA',
    bgh -> 'IGH'. Any other value (including 'local' and missing) -> None.

    `dep` is modified in place.

    Returns
    -------
    The same `dep` object, so the function also works in expressions such as
    ``{k: create_typo_mapuce_S_1dep(v) for k, v in gdfs.items()}`` (it used to
    return None, which wiped the frames out of such a dict).
    """
    typo = dep['TYPO_M']

    conditions_mapuce = [
        typo.isin(['pcif', 'pcio', 'pd', 'psc']),
        typo.isin(['icif', 'icio', 'id']),
        typo == 'ba',
        typo == 'bgh',
    ]
    values_mapuce = ['P', 'I', 'BA', 'IGH']

    dep['TYPO_M_S'] = np.select(conditions_mapuce, values_mapuce, default=None)
    return dep

# create_typo_mapuce_S_1dep works in place on each frame, so loop over the
# frames. Rebuilding `gdfs` from the return values would replace every frame
# with None whenever the function returns nothing.
for dep in gdfs.values():
    create_typo_mapuce_S_1dep(dep)

#### BDNB

In [None]:
def create_typo_bdnb_S_1dep(dep):
    """Add 'TYPO_BDNB_S', a simplified typology ('P', 'I', 'BA', 'IGH') derived from BDNB columns.

    Reads 'cerffo2020_usage_niveau_2_txt', 'cerffo2020_usage_niveau_3_txt' and
    'igntop202103_bat_hauteur'. Rows matching no rule get None.

    `dep` is modified in place.

    Returns
    -------
    The same `dep` object, so the function also works in expressions such as
    ``{k: create_typo_bdnb_S_1dep(v) for k, v in gdfs.items()}`` (it used to
    return None, which wiped the frames out of such a dict).
    """
    level_2 = dep['cerffo2020_usage_niveau_2_txt']
    level_3 = dep['cerffo2020_usage_niveau_3_txt']

    # np.select keeps the FIRST matching rule, so the order of this list matters.
    # NOTE(review): the height rule comes last, so a building already matched by
    # a usage rule is never labelled 'IGH' — confirm this is intended.
    conditions_typo_bdnb = [
        # 'P'
        # NOTE(review): 'Maison exceptionnelle' is tested on level 2 here but on
        # level 3 in create_usage_1dep — confirm which level carries this label.
        level_2.isin(['Maison individuelle', 'Maisons groupées', 'Maison exceptionnelle']),

        # 'I'
        level_2.isin([
            'Immeuble collectif',
            'Résidentiel collectif autre',
            'Hôtel',
            'Bureau',
            'Centre de santé',
            'Enseignement',
        ])
        | level_3.isin([
            'Magasin sans accès à la rue avec surface < 400m²',
            'Magasin de centre commercial avec surface < 400m²',
            'Magasin sur rue avec surface < 400m²',
        ]),

        # 'BA'
        (level_2 == 'Industrie')
        | level_3.isin([
            'Magasin grande surface (entre 400m² et 2499m²)',
            'Magasin très grande surface (> 2500m²)',
            'Serre',
            'Espace sportif',
        ]),

        # 'IGH'
        (dep['igntop202103_bat_hauteur'] >= 39),
    ]

    # Value assigned for each condition above, in the same order.
    values_typo_bdnb = ['P', 'I', 'BA', 'IGH']

    dep['TYPO_BDNB_S'] = np.select(conditions_typo_bdnb, values_typo_bdnb, default=None)
    return dep

# create_typo_bdnb_S_1dep works in place on each frame, so loop over the
# frames. Rebuilding `gdfs` from the return values would replace every frame
# with None whenever the function returns nothing.
for dep in gdfs.values():
    create_typo_bdnb_S_1dep(dep)

### Combine both typos

In [None]:
def combine_typo_mapuce_bdnb(dep):
    """Add the final 'typology' column: 'TYPO_M_S' when present, otherwise 'TYPO_BDNB_S'.

    Rows where both sources are missing stay missing. `dep` is modified in
    place and the new column is moved to the first position.

    Returns
    -------
    The same `dep` object, so the function also works in expressions such as
    ``{k: combine_typo_mapuce_bdnb(v) for k, v in gdfs.items()}`` (it used to
    return None, which wiped the frames out of such a dict).
    """
    mapuce = dep['TYPO_M_S']
    # One np.where is enough: wherever the MApUCE value is null, take the BDNB one.
    dep['typology'] = np.where(mapuce.notnull(), mapuce, dep['TYPO_BDNB_S'])

    # Move the new column to the front.
    dep.insert(0, 'typology', dep.pop('typology'))
    return dep

# combine_typo_mapuce_bdnb works in place on each frame, so loop over the
# frames. Rebuilding `gdfs` from the return values would replace every frame
# with None whenever the function returns nothing.
for dep in gdfs.values():
    combine_typo_mapuce_bdnb(dep)

# Create all entries

In [None]:
@timer_func
def create_danube_entries(dep):
    """Run every Danube-entry step on `dep`, in order, modifying it in place.

    The order matters: the territory step reads 'period', and the final
    combination step reads the two simplified typology columns.
    """
    steps = (
        create_period,
        create_territory_1dep,
        create_usage_1dep,
        create_typo_mapuce_S_1dep,
        create_typo_bdnb_S_1dep,
        combine_typo_mapuce_bdnb,
    )
    for step in steps:
        step(dep)


In [None]:
# Create all Danube entries for every department; each call prints its duration.
for dep in gdfs.values():
    create_danube_entries(dep)

### Checking output

In [None]:
# `dep1` is the same object as gdfs["01"] (no copy): columns added to one appear in the other.
dep1 = gdfs["01"]
dep1.head(2)

In [None]:
# Check 'period' against the construction year it was derived from.
dep1[dep1.period.notnull()][["period","cerffo2020_annee_construction"]].head()

In [None]:
# Check 'territory' against the period and the two source territory columns.
dep1[dep1.territory.notnull()][["territory","period","Ter_P1_min" , "Ter_P2-7_min",]].head()

In [None]:
# Check 'usage' and 'typology' side by side with the columns they were derived from.
dep1[["usage",
        "typology",
        "TYPO_M" ,
        "TYPO_M_S",
        "cerffo2020_usage_niveau_1_txt" ,
        "cerffo2020_usage_niveau_2_txt",
        "cerffo2020_usage_niveau_3_txt", 
        "cerffo2020_l_usage_niveau_3_txt"]].head(10)

# Mixed usage

## For dept 1

In [None]:
# Number of comma-separated usages listed at each BDNB level (L1, L2, L3).
# `.str.len()` leaves a missing list as NaN, whereas `.map(len)` raises a
# TypeError on the first missing value.
for level in (1, 2, 3):
    dep1[f"L{level}"] = (
        dep1[f'cerffo2020_l_usage_niveau_{level}_txt']
        .str.split(",", expand=False)
        .str.len()
    )

In [None]:
# Percentage of department 01 rows that list more than one level-3 usage.
len(dep1[dep1["L3"]>1]) / len(dep1) * 100

In [None]:
# Percentage of rows with several level-3 usages for which 'usage' is nevertheless defined.
len(dep1[(dep1["L3"]>1) &
     (dep1["usage"].notnull())]) / len(dep1) * 100

In [None]:
# Percentage of rows with several level-3 usages for which 'usage' is NOT defined
# (the mapping rules do not cover these multiple-usage cases).
len(dep1[(dep1["L3"]>1) &
     (dep1["usage"].isnull())]) / len(dep1) * 100

In [None]:
# Percentage of rows with several level-3 usages for which 'typology' is nevertheless defined.
len(dep1[(dep1["L3"]>1) &
     (dep1["typology"].notnull())]) / len(dep1) * 100

In [None]:
# Percentage of rows with several level-3 usages for which 'typology' is NOT defined
# (the mapping rules do not cover these multiple-usage cases).

len(dep1[(dep1["L3"]>1) &
     (dep1["typology"].isnull())]) / len(dep1) * 100

## Generalize 

In [None]:
def create_col_num_multiple_usage(dep):
    """Add 'L1', 'L2' and 'L3': the number of comma-separated usages at each BDNB level.

    Each count comes from the matching 'cerffo2020_l_usage_niveau_<n>_txt'
    column. A row with a missing usage list gets NaN instead of raising.
    `dep` is modified in place and nothing is returned.
    """
    def count_multiple_usage(col):
        # `.str.len()` propagates missing values as NaN; `.map(len)` would
        # raise a TypeError on the first missing entry.
        return dep[col].str.split(",", expand=False).str.len()

    for level in (1, 2, 3):
        dep[f"L{level}"] = count_multiple_usage(f'cerffo2020_l_usage_niveau_{level}_txt')


In [None]:
# Add the L1/L2/L3 usage-count columns to every department, in place.
for dep in gdfs.values():
    create_col_num_multiple_usage(dep)

In [None]:
# Spot-check another department after the in-place updates.
gdfs["02"].head()

# Percentage of data

In [None]:
# Percentage of typology data
@timer_func
def dept_perc_typology(df):
    """Compute the data-availability indicators (in % of rows) for one department.

    Returns a one-column DataFrame (column 0) indexed by indicator name:
    the share of non-null values of selected columns, the share of rows where
    'period', 'usage' and 'typology' are all defined, and the shares related
    to rows listing more than one level-3 usage (column 'L3' > 1).
    """
    n_rows = len(df)

    def filled_share(column):
        """Percentage of rows where `column` is not null."""
        return df[column].notnull().sum() / n_rows * 100

    def row_share(mask):
        """Percentage of rows selected by the boolean `mask`."""
        return len(df[mask]) / n_rows * 100

    indicators = {"dpe_3cl_u_mur": filled_share("adedpe202006_logtype_mur_u_ext")}

    # 'territory' is not part of this completeness criterion.
    has_all_entries = (
        df['period'].notnull()
        & df['usage'].notnull()
        & df['typology'].notnull()
    )
    indicators["all_danube_entries"] = row_share(has_all_entries)

    # Indicator name -> column whose non-null share is reported.
    column_of_indicator = {
        "period": "period",
        "territory": "territory",
        "usage": "usage",
        "typology": "typology",
        "mapuce_typo_tot": "TYPO_count",
        "mapuce_typo_uni": "TYPO_M",
        "mapuce_typo_S": "TYPO_M_S",
        "bdnb_usage_tot": "cerffo2020_usage_niveau_3_txt",
        "bdnb_typo_S": "TYPO_BDNB_S",  # TODO from the original: add usage
    }
    for indicator, column in column_of_indicator.items():
        indicators[indicator] = filled_share(column)

    # Rows listing more than one level-3 usage in BDNB.
    multiple_usage = df["L3"] > 1

    # All rows with multiple usages.
    indicators["bdnb_all_multiple_usage"] = row_share(multiple_usage)
    # 'usage' is defined even though there are multiple usages.
    indicators["mult_usage_OK"] = row_share(multiple_usage & df["usage"].notnull())
    # 'usage' is not defined (rule not applicable or ignored case) and there are multiple usages.
    indicators["mult_usage_KO"] = row_share(multiple_usage & df["usage"].isnull())
    # 'typology' is defined even though there are multiple usages.
    indicators["mult_typo_OK"] = row_share(multiple_usage & df["typology"].notnull())
    # 'typology' is not defined (rule not applicable or ignored case) and there are multiple usages.
    indicators["mult_typo_KO"] = row_share(multiple_usage & df["typology"].isnull())

    return pd.DataFrame.from_dict(indicators, orient='index')


def all_perc_typology(gdfs):
    """Build the indicator table for all departments.

    `gdfs` maps a department key to its frame. The result has one row per
    indicator and one column per department key, rounded to one decimal.
    """
    per_dept = []
    for dept_key, dept_frame in gdfs.items():
        indicators = dept_perc_typology(dept_frame)
        per_dept.append(indicators.rename(columns={0: dept_key}))
    return pd.concat(per_dept, axis=1).round(decimals=1)

# Indicator table: one row per indicator, one column per department.
perc = all_perc_typology(gdfs)

# NOTE(review): original reminder, meaning unclear: check departments 04-05 in case 'TYPO_count' is missing (confirm).

In [None]:
# Show every department column without truncation.
with pd.option_context("display.max_columns", None):
    display(perc)

In [None]:
# Summary statistics of each indicator across departments (one row per indicator).
perc_description = perc.T.describe().T.round(decimals=1)
perc_description[["mean","min", "max"]]

In [None]:
# NOTE(review): 3.7 and 4.3 look copied by hand from a table above; confirm which indicators they are.
3.7/4.3
# Do not take the multiple-usage cases into account for the analysis.

In [None]:
# Show the summary with one decimal. The bare `perc_description` expression that
# preceded this was dropped: it was not the last statement of the cell, so it
# displayed nothing.
with pd.option_context('display.float_format', lambda x: '%.1f' % x):
    display(perc_description)