In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
# Read the raw CSV into df_orig; the bare last expression previews its first rows.
cbecs_csv = "cbecs2018_final_public.csv"
df_orig = pd.read_csv(cbecs_csv)
df_orig.head()

Unnamed: 0,PUBID,REGION,CENDIV,PBA,PUBCLIM,SQFT,SQFTC,WLCNS,RFCNS,RFCOOL,...,ZMFBTU,ZMFEXP,ZELCNS,ZELEXP,ZNGCNS,ZNGEXP,ZFKCNS,ZFKEXP,ZDHBTU,ZDHEXP
0,1,3,5,2,3,210000,8,1,4,2,...,0,0,0,0,9,9,1,1,0,0
1,2,4,9,2,4,28000,5,1,6,1,...,0,0,0,0,0,0,9,9,9,9
2,3,3,5,8,4,2100,2,1,4,2,...,0,0,0,0,9,9,9,9,9,9
3,4,3,7,5,5,240000,8,2,6,1,...,0,0,0,0,1,1,9,9,9,9
4,5,1,2,5,3,295000,8,3,6,2,...,0,0,0,0,0,0,9,9,9,9


In [None]:
# Show the PUBID/PBA pairs of rows with PBA == 2, PUBCLIM == 4 and
# strictly positive PUBID and SQFT values.
row_mask = (
    (df_orig['PUBID'] > 0)
    & (df_orig['PBA'] == 2)
    & (df_orig['PUBCLIM'] == 4)
    & (df_orig['SQFT'] > 0)
)
display(df_orig.loc[row_mask, ['PUBID', 'PBA']])

In [None]:
# Work on an independent (deep) copy so df_orig stays untouched,
# then preview the first two rows.
df_wk = df_orig.copy(deep=True)
df_wk.head(2)

#### Create the df_wk working dataset from selected df_orig columns

In [3]:
# Positional indexes (into df_orig) of the columns kept for the analysis.
# The order of this list is the column order of df_wk, so it is deliberately
# not sorted (569 comes before 567/568).
wk_col_positions = [
    0, 3, 4, 5, 7, 8, 11, 12, 13, 18, 21, 77, 103, 104, 113, 114, 133,
    134, 139, 140, 141, 295, 378, 379, 425, 561, 562, 566, 569, 567, 568, 570, 571,
]
df_wk = df_orig.iloc[:, wk_col_positions]
df_wk.head()

Unnamed: 0,PUBID,PBA,PUBCLIM,SQFT,WLCNS,RFCNS,BLDSHP,GLSSPC,NFLOOR,NELVTR,...,NGWATR,NGOTH,HDD65,CDD65,ELBTU,NGBTU,ELCNS,ELEXP,NGCNS,NGEXP
0,1,2,3,210000,1,4,6,3,994,8.0,...,2,2,4463,1759,18708970.0,,5483285.0,775800.0,,
1,2,2,4,28000,1,6,11,3,5,1.0,...,1,2,2424,189,1528667.0,201988.0,448027.0,77933.0,1946.0,4097.0
2,3,8,4,2100,1,4,2,3,1,,...,2,2,3218,2403,52387.0,,15354.0,3472.0,,
3,4,5,5,240000,2,6,2,2,1,,...,1,2,1045,3041,1974255.0,1211520.0,578621.0,41197.0,11672.0,9862.0
4,5,5,3,295000,3,6,1,1,1,,...,2,2,5076,1316,1290564.0,3386445.0,378243.0,33688.0,32625.0,35297.0


#### commit

#### Add cooling degree day (CDD) and heating degree day (HDD) low/high ranges and averages to df_wk.
- The code cell below creates the df_clim dataframe.
- The data is from "Climate Data for Building Design Standards".

In [25]:
# Degree-day lookup table, one record per climate zone (PUBCLIM).
# Each *_AVG value equals the midpoint of the matching *_LOW / *_HI pair.
clim_columns = ['PUBCLIM', 'CDD_LOW', 'CDD_HI', 'HDD_LOW', 'HDD_HI', 'CDD_AVG', 'HDD_AVG']
zone_rows = [
    (2, 1800, 6300, 5400, 7200, 4050, 6300),
    (3, 2700, 6300, 3600, 5400, 4500, 4500),
    (4, 4500, 6300, 1800, 3600, 5400, 2700),
]

# Transpose the per-zone records into the column -> list-of-values mapping.
data = {name: list(values) for name, values in zip(clim_columns, zip(*zone_rows))}

df_clim = pd.DataFrame(data)
print(df_clim)

   PUBCLIM  CDD_LOW  CDD_HI  HDD_LOW  HDD_HI  CDD_AVG  HDD_AVG
0        2     1800    6300     5400    7200     4050     6300
1        3     2700    6300     3600    5400     4500     4500
2        4     4500    6300     1800    3600     5400     2700


- This code cell left-joins df_clim onto df_wk on the shared PUBCLIM column, so rows in climate zones 2, 3, and 4 receive the degree-day values (rows in any other zone get NaN in those columns).

In [27]:
# Left-join the per-zone degree-day table onto the working frame by PUBCLIM.
# Rows whose PUBCLIM has no match in df_clim keep NaN in the joined columns.
#
# Degree-day columns left over from an earlier run of this cell are dropped
# first; without that, re-running the cell would merge onto an already-merged
# frame and produce duplicated *_x / *_y columns.
clim_value_cols = [col for col in df_clim.columns if col != 'PUBCLIM']
df_wk = pd.merge(
    df_wk.drop(columns=clim_value_cols, errors='ignore'),
    df_clim,
    on='PUBCLIM',
    how='left',
    validate='many_to_one',  # fail loudly if df_clim ever lists a zone twice
)
print(df_wk)

      PUBID  PBA  PUBCLIM    SQFT  WLCNS  RFCNS  BLDSHP  GLSSPC  NFLOOR  \
0         1    2        3  210000      1      4       6       3     994   
1         2    2        4   28000      1      6      11       3       5   
2         3    8        4    2100      1      4       2       3       1   
3         4    5        5  240000      2      6       2       2       1   
4         5    5        3  295000      3      6       1       1       1   
...     ...  ...      ...     ...    ...    ...     ...     ...     ...   
6431   6432   14        2  130000      1      1       2       3       3   
6432   6433    1        4    1050      1      5       1       3       1   
6433   6434    2        4  122000      2      6      11       5       5   
6434   6435   23        4   15000      3      5       2       4       1   
6435   6436   14        7  188000      1      6       4       2       7   

      NELVTR  ...      ELCNS     ELEXP    NGCNS    NGEXP  CDD_LOW  CDD_HI  \
0        8.0  ...  548

#### end commit

#### commit

#### Filter df_wk to rows where the principal building activity (PBA) is 2 = office building and the building is in climate zone PUBCLIM = 2, 3, or 4.

In [39]:
# Keep only rows with PBA == 2 that fall in climate zones 2, 3 or 4.
is_office = df_wk['PBA'] == 2
in_target_zone = df_wk['PUBCLIM'].isin([2, 3, 4])
df_wk_filtered = df_wk[is_office & in_target_zone]
print(df_wk_filtered.shape)
print(df_wk_filtered)

(1018, 39)
      PUBID  PBA  PUBCLIM    SQFT  WLCNS  RFCNS  BLDSHP  GLSSPC  NFLOOR  \
0         1    2        3  210000      1      4       6       3     994   
1         2    2        4   28000      1      6      11       3       5   
12       13    2        3   30500      1      6       2       5       3   
18       19    2        4  184000      1      6       3       3       3   
20       21    2        3  240000      6      5       2       6       4   
...     ...  ...      ...     ...    ...    ...     ...     ...     ...   
6420   6421    2        3  430000      1      7       2       4     995   
6421   6422    2        2  385000      1      2       5       5       4   
6427   6428    2        2  265000      2      8       1       5     995   
6429   6430    2        4   54000      1      6       2       3       3   
6433   6434    2        4  122000      2      6      11       5       5   

      NELVTR  ...      ELCNS     ELEXP     NGCNS    NGEXP  CDD_LOW  CDD_HI  \
0        8

- The row index was reset so the new index is sequential.

In [40]:
# Give df_wk_filtered a fresh 0..n-1 row index. With drop=False (the default)
# the previous row labels are kept as a new leading 'index' column, which is
# why the column count grows by one.
# NOTE: run this cell once per filter; re-running it on its own output would
# insert yet another column of old labels.
df_wk_filtered = df_wk_filtered.reset_index(drop=False)
print(df_wk_filtered.shape)
print(df_wk_filtered)

(1018, 40)
      index  PUBID  PBA  PUBCLIM    SQFT  WLCNS  RFCNS  BLDSHP  GLSSPC  \
0         0      1    2        3  210000      1      4       6       3   
1         1      2    2        4   28000      1      6      11       3   
2        12     13    2        3   30500      1      6       2       5   
3        18     19    2        4  184000      1      6       3       3   
4        20     21    2        3  240000      6      5       2       6   
...     ...    ...  ...      ...     ...    ...    ...     ...     ...   
1013   6420   6421    2        3  430000      1      7       2       4   
1014   6421   6422    2        2  385000      1      2       5       5   
1015   6427   6428    2        2  265000      2      8       1       5   
1016   6429   6430    2        4   54000      1      6       2       3   
1017   6433   6434    2        4  122000      2      6      11       5   

      NFLOOR  ...      ELCNS     ELEXP     NGCNS    NGEXP  CDD_LOW  CDD_HI  \
0        994  ...  548

#### end commit

In [None]:
# (rows, columns) of the working frame.
df_wk.shape

In [None]:
# Count the missing (NaN) entries in each column of the working frame.
missing_values = df_wk.isnull().sum(axis=0)
print(missing_values)

In [None]:
# DataFrame.info() writes its report (dtypes, non-null counts, memory use)
# straight to stdout and returns None, so it must not be wrapped in print():
# doing so only appends a stray "None" line after the report.
df_wk.info()

In [None]:
# Summary statistics of the working frame, rounded to zero decimals.
summary_stats = df_wk.describe()
print(summary_stats.round(decimals=0))

In [None]:
# Count how many rows share each (PBA, WLCNS) combination.
pba_wlcns_counts = df_wk[['PBA', 'WLCNS']].value_counts()
print(pba_wlcns_counts)

#### function for commands

In [30]:
def run_commands(df):
    """Print the dimensions of ``df`` under a one-line heading.

    Parameters
    ----------
    df : object exposing a ``shape`` attribute (e.g. a pandas DataFrame)
        The frame whose ``shape`` is reported.

    Returns
    -------
    None
        The heading and the shape are written to stdout on separate lines.
    """
    heading = 'The number of rows and columns is:'
    # One print call, newline-separated: the same two output lines as printing each in turn.
    print(heading, df.shape, sep='\n')

# Report the dimensions of the working frame.
run_commands(df_wk)

The number of rows and columns is:
(6436, 39)


In [None]:
# calc'd column
# zone 2:  cdd_low <= CDD value <= cdd_hi