In [1]:
# standard libs
import os
import sys
import logging

# project lib
PROJECT_SRC_PATH = os.path.join(os.path.abspath(''), '..', 'src')
sys.path.append(PROJECT_SRC_PATH)

import utils
import dataset
import visualizations
import energy_modeling
from prediction_age import AgePredictor, AgeClassifier, AgeClassifierComparison
from preprocessing import *

# external libs
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import geopandas as gpd
from shapely import wkt

from xgboost import XGBRegressor, XGBClassifier


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.[0m
[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.[0m
[1mThe 'nopython' keyword argument was not sup

In [2]:
# configure the root logger: records at INFO level and above are emitted as
# "<timestamp> | <level> : <message>"
logging.basicConfig(format='%(asctime)s | %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
%load_ext autoreload
%autoreload 2

## Data

In [None]:
# load the France dataset and draw a sample of it for the experiments below
# NOTE(review): geo=False presumably skips the geometry columns, and 0.1 is presumably
# the fraction of cities kept by utils.sample_cities — confirm against utils
df_fr = utils.load_data('france', geo=False)
df_fr_sample = utils.sample_cities(df_fr, 0.1)


In [127]:
# load the Netherlands dataset and draw a sample of it for the experiments below
# NOTE(review): geo=False presumably skips the geometry columns, and 0.1 is presumably
# the fraction of cities kept by utils.sample_cities — confirm against utils
df_nl = utils.load_data('netherlands', geo=False)
df_nl_sample = utils.sample_cities(df_nl, 0.1)

## Preprocessing

#### Extract heating demand from `TABULA_parameters_harmonized.csv` and save it as a formatted CSV

In [6]:
# tabula_params = tabula_params[tabula_params['country'].isin(['Netherlands', 'Spain', 'France'])]

def set_label(df):
    """Relabel the age bins of one group from its own bin edges.

    The distinct ``age_min`` / ``age_max`` values of the group are collected as
    sorted bin edges, and the labels ``utils.generate_labels`` derives from
    these edges are written to the ``age_bin`` column.

    Args:
        df: rows of a single group (the notebook groups by country and
            residential type) with ``age_min`` and ``age_max`` columns.
            NOTE(review): the labels are assigned positionally, so the rows are
            presumably expected in ascending bin order, one row per bin — confirm.

    Returns:
        A copy of ``df`` with ``age_bin`` overwritten; the input is not modified.
    """
    # work on a copy: pandas does not support functions that mutate the group
    # object handed to them by ``groupby(...).apply``
    labelled = df.copy()
    bins = sorted(np.unique(labelled[['age_min', 'age_max']].values))
    labelled['age_bin'] = utils.generate_labels(bins)
    return labelled

# NOTE(review): this cell raises NameError on a fresh kernel because `tabula_params`
# is only loaded in the next cell, which also repeats this groupby after loading
tabula_params = tabula_params.groupby(['country', 'residential_type']).apply(set_label)

NameError: name 'tabula_params' is not defined

In [4]:
def set_label(df):
    """Relabel the age bins of one group from its own bin edges.

    The distinct ``age_min`` / ``age_max`` values of the group are collected as
    sorted bin edges, and the labels ``utils.generate_labels`` derives from
    these edges are written to the ``age_bin`` column.

    Args:
        df: rows of a single group (the notebook groups by country and
            residential type) with ``age_min`` and ``age_max`` columns.
            NOTE(review): the labels are assigned positionally, so the rows are
            presumably expected in ascending bin order, one row per bin — confirm.

    Returns:
        A copy of ``df`` with ``age_bin`` overwritten; the input is not modified.
    """
    # work on a copy: pandas does not support functions that mutate the group
    # object handed to them by ``groupby(...).apply``
    labelled = df.copy()
    bins = sorted(np.unique(labelled[['age_min', 'age_max']].values))
    labelled['age_bin'] = utils.generate_labels(bins)
    return labelled

# load data
tabula_params_path = os.path.join('..', 'metadata', 'TABULA_parameters_harmonized.csv')
tabula_params = pd.read_csv(tabula_params_path)

# harmonize column names (reassigned rather than renamed in place)
tabula_params = tabula_params.rename(columns={'Country': 'country', 'BuildingType': 'residential_type', 'Age': 'age_bin', 'q_h_nd': 'heating_demand'})

# filter for relevant countries; the explicit copy detaches the result from the unfiltered
# table so the column assignments below do not trigger SettingWithCopyWarning
tabula_params = tabula_params[tabula_params['country'].isin(['France', 'Netherlands', 'Spain'])].copy()

# add columns for upper and lower bound of age bin: the first and the last four characters
# of the label are parsed as years, so a single-year label yields age_min == age_max
tabula_params['age_min'] = tabula_params['age_bin'].str[:4].astype(int)
tabula_params['age_max'] = tabula_params['age_bin'].str[-4:].astype(int)

# single-year labels are treated as open-ended bins: after 1990 they run up to 2050,
# before 1990 they start at 0
mask_upper_bound = (tabula_params['age_min'] == tabula_params['age_max']) & (tabula_params['age_max'] > 1990)
mask_lower_bound = (tabula_params['age_min'] == tabula_params['age_max']) & (tabula_params['age_max'] < 1990)
tabula_params.loc[mask_upper_bound, 'age_max'] = 2050
tabula_params.loc[mask_lower_bound, 'age_min'] = 0

# every age_max is shifted by +1 below; the open lower-bound bins are decremented first
# so that the shift leaves their year unchanged
tabula_params.loc[mask_lower_bound, 'age_max'] = tabula_params['age_max'] - 1
tabula_params['age_max'] = tabula_params['age_max'] + 1

# harmonize bin labels with labels generated by classifier
# group_keys=False keeps the original row index on pandas >= 2.0 as well, so the grouping
# keys stay plain columns instead of also being added as index levels
tabula_params = tabula_params.groupby(['country', 'residential_type'], group_keys=False).apply(set_label)

# save relevant TABULA data
tabula_heating_path = os.path.join('..', 'metadata', 'TABULA_heating_demand.csv')
tabula_heating = tabula_params[['country', 'residential_type', 'age_bin', 'age_min', 'age_max', 'heating_demand']]
tabula_heating.to_csv(tabula_heating_path, index=False)

In [3]:
# bin start years every group is expected to provide as an `age_min`
expected_years = {1970, 1980, 1990, 2000, 2010}


def replace_missing_years(group):
    """Snap bin starts of one group onto the expected start years.

    For every year in ``expected_years`` that does not occur in the group's
    ``age_min`` column (processed in ascending order), the smallest ``age_min``
    that is still greater than the missing year is overwritten with the missing
    year. Missing years with no later ``age_min`` are skipped.

    The set of missing years is determined once, before any replacement; an
    expected year that disappears because of an earlier replacement is not
    revisited.

    Args:
        group: DataFrame with an ``age_min`` column (one group of a groupby).

    Returns:
        A copy of ``group`` with the adjusted ``age_min`` values; the input is
        not modified.
    """
    # work on a copy: pandas does not support functions that mutate the group
    # object handed to them by ``groupby(...).apply``
    harmonized = group.copy()
    missing_years = expected_years - set(harmonized['age_min'])

    for missing_year in sorted(missing_years):
        later_starts = harmonized.loc[harmonized['age_min'] > missing_year, 'age_min']
        if later_starts.empty:
            # nothing starts after the missing year, so there is no bin to move
            continue
        harmonized.loc[harmonized['age_min'] == later_starts.min(), 'age_min'] = missing_year
    return harmonized

In [6]:
### Alternative to be used when working with RCA data

# load data
tabula_params_path = os.path.join('..', 'metadata', 'TABULA_parameters_harmonized.csv')
tabula_params = pd.read_csv(tabula_params_path)

# rename columns (reassigned rather than renamed in place)
tabula_params = tabula_params.rename(columns={'Country': 'country', 'BuildingType': 'residential_type', 'Age_Harmonized': 'age_bin', 'q_h_nd': 'heating_demand'})

# delete < and s out of age labels so that the first / last four characters parse as years
tabula_params['age_bin'] = tabula_params['age_bin'].str.replace('<', '', regex=False)
tabula_params['age_bin'] = tabula_params['age_bin'].str.replace('s', '', regex=False)

# add columns for upper and lower bound of age bin; a single-year label yields age_min == age_max
tabula_params['age_min'] = tabula_params['age_bin'].str[:4].astype(int)
tabula_params['age_max'] = tabula_params['age_bin'].str[-4:].astype(int)
mask_upper_bound = (tabula_params['age_min'] == tabula_params['age_max']) & (tabula_params['age_max'] > 1990)
mask_lower_bound = (tabula_params['age_min'] == tabula_params['age_max']) & (tabula_params['age_max'] < 1990)
# Calculate the lowest age_min and the highest age_max for each country-residential type pair
country_type_groups = tabula_params.groupby(['country', 'residential_type'])
min_age_per_group = country_type_groups['age_min'].transform('min')
max_age_per_group = country_type_groups['age_max'].transform('max')
# Create masks for rows holding the lowest age_min / highest age_max of their pair
mask_lowest_age_min = (tabula_params['age_min'] == min_age_per_group)
mask_highest_age_max = (tabula_params['age_max'] == max_age_per_group)
# Combine the single-year masks with the per-group extreme masks
combined_mask_min = mask_lower_bound | mask_lowest_age_min
combined_mask_max = mask_upper_bound | mask_highest_age_max
# Open the outermost bins: age_min becomes 0 and age_max becomes 2050
tabula_params.loc[combined_mask_min, 'age_min'] = 0
tabula_params.loc[combined_mask_max, 'age_max'] = 2050
# shift every age_max by one year
tabula_params['age_max'] = tabula_params['age_max'] + 1

# Apply the function to each country-residential_type group
tabula_params = tabula_params.groupby(['country', 'residential_type']).apply(replace_missing_years).reset_index(drop=True)

# harmonize bin labels with labels generated by classifier
# a '1900' that is not followed by '-' gets its '<' prefix back, and '2010' becomes '2010-2050'
tabula_params['age_bin'] = tabula_params['age_bin'].str.replace(r'1900(?!-)', '<1900', regex=True)
tabula_params['age_bin'] = tabula_params['age_bin'].str.replace('2010', '2010-2050', regex=False)
# collapse rows sharing the same bin, country, type and bounds into their mean heating demand
tabula_params = tabula_params.groupby(['age_bin', 'country', 'residential_type', 'age_min', 'age_max'])['heating_demand'].mean().reset_index()

# calculate the average heating demand by age bin and residential type across all countries;
# the bounds are taken from the first row of each group
average_heating_demand_by_group = tabula_params.groupby(['age_bin', 'residential_type']).agg({'heating_demand': 'mean', 'age_min': 'first', 'age_max': 'first'}).reset_index()

# create a new DataFrame for Europe with average heating demand
europe_data = average_heating_demand_by_group.copy()
europe_data['country'] = 'Europe'

# append Europe data to the main DataFrame
# pd.concat replaces DataFrame.append, which is deprecated and no longer exists in pandas 2.0
tabula_heating = tabula_params[['country', 'residential_type', 'age_bin', 'age_min', 'age_max', 'heating_demand']]
tabula_heating = pd.concat([tabula_heating, europe_data], ignore_index=True)

# filter for relevant countries
tabula_heating = tabula_heating[tabula_heating['country'].isin(['Italy', 'Austria', 'Belgium', 'Bulgaria','France', 'Germany','Greece', 'Ireland', 'Netherlands', 'Slovenia', 'Spain', 'Europe'])]

# save relevant TABULA data
tabula_heating_path = os.path.join('..', 'metadata', 'TABULA_heating_demand.csv')
tabula_heating.to_csv(tabula_heating_path, index=False)

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


#### Defining age bins

In [16]:
# load data
tabula_params_path = os.path.join('..', 'metadata', 'TABULA_parameters_harmonized.csv')
# the raw table gets its own name: overwriting `tabula_params` here would replace the
# preprocessed frame (columns 'country', 'age_min', 'age_max') that the experiment cells
# below read, and break them on a top-to-bottom run
tabula_raw = pd.read_csv(tabula_params_path)

tabula_raw = tabula_raw[tabula_raw['Country'].isin(['Italy', 'Austria', 'Belgium', 'Bulgaria','France', 'Germany','Greece', 'Ireland', 'Netherlands', 'Slovenia', 'Spain'])]

# distinct four-character prefixes of the harmonized age labels
tabula_combined_bins = sorted(tabula_raw['Age_Harmonized'].str[:4].unique())
print(tabula_combined_bins)
# distinct four-character prefixes of the original age labels, all selected countries
tabula_combined_bins = sorted(tabula_raw['Age'].str[:4].unique())
print(tabula_combined_bins)
# distinct four-character prefixes of the original age labels, Netherlands only
tabula_combined_bins = sorted(tabula_raw[tabula_raw['Country'] == 'Netherlands']['Age'].str[:4].unique())
print(tabula_combined_bins)

# compare original and harmonized labels for Spain (first 30 rows)
tabula_raw[tabula_raw['Country'] == 'Spain'][['Country', 'BuildingType', 'Age', 'Age_Harmonized', 'q_h_nd']][:30]

['1900', '1945', '1970', '1980', '1990', '2000', '2010', '<190']
['1860', '1900', '1901', '1915', '1918', '1919', '1921', '1930', '1937', '1945', '1946', '1949', '1950', '1958', '1960', '1961', '1965', '1967', '1968', '1969', '1971', '1975', '1976', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1990', '1991', '1992', '1994', '1995', '1999', '2000', '2001', '2002', '2005', '2006', '2007', '2009', '2010', '2011', '2012', '2013', '2015', '2016']
['1965', '1975', '1992', '2006', '2015']


Unnamed: 0,Country,BuildingType,Age,Age_Harmonized,q_h_nd
187,Spain,AB,1901,1900-1944,11.2
188,Spain,AB,1901-1936,1900-1944,12.2
189,Spain,AB,1937-1959,1945-1969,10.5
190,Spain,AB,1960-1979,1970-1979,5.9
191,Spain,AB,1980-2006,1990-1999,3.1
192,Spain,AB,2007,2000-2009,2.9
193,Spain,MFH,1901,1900-1944,17.6
194,Spain,MFH,1901-1936,1900-1944,10.9
195,Spain,MFH,1937-1959,1945-1969,16.9
196,Spain,MFH,1960-1979,1970-1979,14.3


## Experiments

In [147]:
# build one age regressor per country sample (France first, then Netherlands);
# every predictor gets its own XGBRegressor and its own preprocessing list
regressor_fr, regressor_nl = (
    AgePredictor(
        model=XGBRegressor(),
        df=country_sample,
        test_training_split=split_80_20,
        preprocessing_stages=[remove_buildings_pre_1900],
    )
    for country_sample in (df_fr_sample, df_nl_sample)
)

# report the model error per country, in the same order
regressor_fr.print_model_error()
regressor_nl.print_model_error()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
2022-06-24 16:44:12,779 | INFO : Dataset length: 4441
2022-06-24 16:44:12,785 | INFO : Dataset allocated memory: 4 MB
2022-06-24 16:44:12,789 | INFO : Dataset standard deviation: 62.68525292516289
2022-06-24 16:44:12,790 | INFO : Dataset mean age: 1936.0225174510244
2022-06-24 16:44:12,791 | INFO : Training dataset length: 3552
2022-06-24 16:44:12,792 | INFO : Test dataset length: 889
2022-06-24 16:44:12,806 | INFO : Test dataset standard deviation after preprocessing: 33.72756817834984
2022-06-24 16:44:12,807 | INFO : Test dataset mean age after preprocessing: 1963.8566978193146

MAE: 18.55 y
RMSE: 25.70 y
R2: 0.4186
MAE: 11.92 y
RMSE: 18.21 y
R2: 0.4999


In [148]:
# bin edges per country: the sorted distinct age_min / age_max values of its TABULA rows
bins_fr, bins_nl = (
    sorted(np.unique(tabula_params.loc[tabula_params['country'] == country_name, ['age_min', 'age_max']].values))
    for country_name in ('France', 'Netherlands')
)

# build one age classifier per country sample (France first, then Netherlands);
# every classifier gets its own XGBClassifier and its own preprocessing list
classifier_fr, classifier_nl = (
    AgeClassifier(
        model=XGBClassifier(),
        df=country_sample,
        test_training_split=split_80_20,
        mitigate_class_imbalance=True,
        bins=country_bins,
        preprocessing_stages=[remove_buildings_pre_1900],
    )
    for country_sample, country_bins in ((df_fr_sample, bins_fr), (df_nl_sample, bins_nl))
)

# report the model error per country, in the same order
classifier_fr.print_model_error()
classifier_nl.print_model_error()


[autoreload of prediction failed: Traceback (most recent call last):
  File "/usr/local/Caskroom/miniconda/base/envs/uf-ml/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 257, in check
    superreload(m, reload, self.old_objects)
  File "/usr/local/Caskroom/miniconda/base/envs/uf-ml/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 480, in superreload
    update_generic(old_obj, new_obj)
  File "/usr/local/Caskroom/miniconda/base/envs/uf-ml/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 377, in update_generic
    update(a, b)
  File "/usr/local/Caskroom/miniconda/base/envs/uf-ml/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 329, in update_class
    if update_generic(old_obj, new_obj):
  File "/usr/local/Caskroom/miniconda/base/envs/uf-ml/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 377, in update_generic
    update(a, b)
  File "/usr/local/Caskroom/miniconda/base/envs/uf-ml/lib/pytho

Classification report:
               precision    recall  f1-score  support
<1915          0.527778  0.606383  0.564356       94
1915-1948      0.519481  0.444444  0.479042       90
1949-1967      0.474138  0.533981  0.502283      103
1968-1974      0.382979  0.400000  0.391304       45
1975-1981      0.589744  0.500000  0.541176       92
1982-1989      0.470588  0.588235  0.522876       68
1990-1999      0.440000  0.372881  0.403670       59
2000-2005      0.382353  0.351351  0.366197       37
2006-2012      0.352941  0.333333  0.342857       36
2013-2050      0.461538  0.333333  0.387097       18
accuracy       0.481308  0.481308  0.481308        0
macro avg      0.460154  0.446394  0.450086      642
weighted avg   0.482573  0.481308  0.479015      642
Cohen’s kappa: 0.4101
Matthews correlation coefficient (MCC): 0.4107
Classification report:
               precision    recall  f1-score  support
<1965          0.740095  0.846590  0.789769     4413
1965-1974      0.858111  0.789957  

In [167]:
# display the bin edges currently stored for France
bins_fr

[0, 1915, 1949, 1968, 1975, 1982, 1990, 2000, 2006, 2013, 2051]

In [160]:
# bin edges per country: the sorted distinct age_min / age_max values of its TABULA rows
bins_fr, bins_nl = (
    sorted(np.unique(tabula_params.loc[tabula_params['country'] == country_name, ['age_min', 'age_max']].values))
    for country_name in ('France', 'Netherlands')
)

# one experiment per country, each with its own sample and bins
comparison_config = {
    'France': {'df': df_fr_sample, 'bins': bins_fr},
    'Netherlands': {'df': df_nl_sample, 'bins': bins_nl},
}

# has to be aligned with TABULA bins for heating demand
grid_comparison_config = {
    '': {},
    'peter': {'bins': [1900, 1945, 1970, 1980, 1990, 2000, 2010]},
    # fixed-width configurations, keyed by their step size
    **{
        str(step): {'bins': [], 'bin_config': (1900, 2025, step)}
        for step in (25, 10, 5)
    },
}

# data and bins are provided through the comparison configs, hence df and bin_config are None
comparison = AgeClassifierComparison(
    model=XGBClassifier(tree_method='hist'),
    df=None,
    cross_validation_split=cross_validation,
    preprocessing_stages=[remove_buildings_pre_1900],
    bin_config=None,
    mitigate_class_imbalance=True,
    include_baseline=False,
    comparison_config=comparison_config,
    grid_comparison_config=grid_comparison_config,
)

2022-06-24 18:03:22,760 | INFO : Starting experiment France__seed_0...
2022-06-24 18:03:22,778 | INFO : Generated bins: [0, 1915, 1949, 1968, 1975, 1982, 1990, 2000, 2006, 2013, 2051]
2022-06-24 18:03:22,779 | INFO : Generated bins with the following labels: ['<1915', '1915-1948', '1949-1967', '1968-1974', '1975-1981', '1982-1989', '1990-1999', '2000-2005', '2006-2012', '2013-2050']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
2022-06-24 18:03:22,812 | INFO : Dataset length: 4441
2022-06-24 18:03:22,819 | INFO : Dataset allocated memory: 4 MB
2022-06-24 18:03:22,825 | INFO : Training dataset length: 2960
2022-06-24 18:03:22

In [165]:
# evaluate the comparison experiments with the energy-error evaluation switched off
comparison.evaluate(include_energy_error=False)

Note that pos_label (set to 0) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 2) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 3) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 4) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 5) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 6) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_label] to specify a single positive class.
Note that pos_label (set to 7) is ignored when average != 'binary' (got 'macro'). You may use labels=[pos_

Unnamed: 0,name,MCC,F1,Recall_<1915,Recall_1915-1948,Recall_1949-1967,Recall_1968-1974,Recall_1975-1981,Recall_1982-1989,Recall_1990-1999,Recall_2000-2005,Recall_2006-2012,Recall_2013-2050,Recall_<1965,Recall_1965-1974,Recall_1975-1991,Recall_1992-2005,Recall_2006-2014,Recall_2015-2050
0,France__seed_0,0.411497,0.455026,0.588372,0.52439,0.547573,0.246914,0.51417,0.49863,0.425868,0.427885,0.347594,0.361446,,,,,,
1,Netherlands__seed_0,0.707859,0.743114,,,,,,,,,,,0.840732,0.779944,0.765113,0.72185,0.65476,0.679245


In [150]:
# tag the auxiliary test variables of every predictor with the country it was trained on
for country, predictors in (
    ('Netherlands', (classifier_nl, regressor_nl)),
    ('France', (classifier_fr, regressor_fr)),
):
    for predictor in predictors:
        predictor.aux_vars_test['country'] = country

In [152]:
# energy error per predictor, with the auxiliary variables passed as a separate frame
for predictor in (regressor_fr, classifier_fr, regressor_nl, classifier_nl):
    # only classifiers are expected to carry `labels`; fall back to None otherwise
    labels = getattr(predictor, 'labels', None)
    footprint = predictor.X_test[['FootprintArea']]
    # inner join keeps only the rows present in both frames
    aux_vars = pd.concat([predictor.aux_vars_test, footprint], axis=1, join="inner")
    energy_modeling.calculate_energy_error(
        predictor.y_test,
        predictor.y_predict,
        aux_vars,
        labels=labels,
    )

2022-06-24 16:53:12,682 | INFO : R2: 0.1065
2022-06-24 16:53:12,685 | INFO : MAPE: 0.5981
2022-06-24 16:53:12,745 | INFO : R2: 0.2321
2022-06-24 16:53:12,745 | INFO : MAPE: 0.4172
2022-06-24 16:53:13,224 | INFO : R2: 0.3351
2022-06-24 16:53:13,225 | INFO : MAPE: 0.4475
2022-06-24 16:53:13,671 | INFO : R2: 0.3200
2022-06-24 16:53:13,672 | INFO : MAPE: 0.3725


In [157]:
# energy error per predictor, with the auxiliary variables joined onto truth and prediction
for predictor in (regressor_fr, classifier_fr, regressor_nl, classifier_nl):
    # only classifiers are expected to carry `labels`; fall back to None otherwise
    labels = getattr(predictor, 'labels', None)
    # inner joins keep only the rows present in all three frames
    y_true = pd.concat(
        [predictor.y_test, predictor.aux_vars_test, predictor.X_test[['FootprintArea']]],
        axis=1,
        join="inner",
    )
    y_pred = pd.concat(
        [predictor.y_predict, predictor.aux_vars_test, predictor.X_test[['FootprintArea']]],
        axis=1,
        join="inner",
    )

    energy_modeling.calculate_energy_error(y_true, y_pred, labels=labels)

2022-06-24 17:37:48,276 | INFO : R2: 0.1828
2022-06-24 17:37:48,277 | INFO : MAPE: 0.4995

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
2022-06-24 17:37:48,374 | INFO : R2: 0.3481
2022-06-24 17:37:48,374 | INFO : MAPE: 0.3235
2022-06-24 17:37:48,885 | INFO : R2: 0.3351
2022-06-24 17:37:48,886 | INFO : MAPE: 0.4475
2022-06-24 17:37:49,327 | INFO : R2: 0.3200
2022-06-24 17:37:49,328 | INFO : MAPE: 0.3725
