In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import pandasql as pdsql
from datetime import datetime, timedelta

# import matplotlib as mpl
import matplotlib.pyplot as plt
# Setup matplotlib to display in notebook:
%matplotlib inline

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)         # initiate notebook for offline plot


### 1. Read in the data 

In [2]:
df = pd.read_csv("../data/NYC Open Data - Electric_Consumption_And_Cost__2010_-__June_2018_.csv", low_memory=False)

In [3]:
df.shape

(313147, 27)

#### Check the number of empty values in each column

In [4]:
df.isnull().sum()

Development Name         146
Borough                  146
Account Name             146
Location                9041
Meter AMR                187
Meter Scope           296588
TDS #                   1717
EDP                      146
RC Code                  146
Funding Source           146
AMP #                   1657
Vendor Name              146
UMIS BILL ID             146
Revenue Month            146
Service Start Date       146
Service End Date         146
# days                   146
Meter Number             146
Estimated                146
Current Charges          146
Rate Class               146
Bill Analyzed            146
Consumption (KWH)        146
KWH Charges              146
Consumption (KW)         146
KW Charges               146
Other charges            146
dtype: int64

## Part I - General Data Cleaning

### 2. Remove empty rows

In [5]:
mask = (pd.isna(df['Account Name']) == True)
df.drop(mask[mask == True].index, axis = 0, inplace = True)

### 3. Remove rows where electricity charges were estimated

In [6]:
df.drop(df.loc[df['Estimated'] == 'Y         '].index, axis = 0, inplace = True)

#### Check data types of columns

In [7]:
df.dtypes

Development Name       object
Borough                object
Account Name           object
Location               object
Meter AMR              object
Meter Scope            object
TDS #                 float64
EDP                   float64
RC Code                object
Funding Source         object
AMP #                  object
Vendor Name            object
UMIS BILL ID          float64
Revenue Month          object
Service Start Date     object
Service End Date       object
# days                float64
Meter Number           object
Estimated              object
Current Charges        object
Rate Class             object
Bill Analyzed          object
Consumption (KWH)     float64
KWH Charges            object
Consumption (KW)       object
KW Charges             object
Other charges          object
dtype: object

#### Change column names for easy reference

In [8]:
df.columns = ['Development_Name', 'Borough', 'Account_Name', 'Location', 'Meter_AMR',
       'Meter_Scope', 'TDS #', 'EDP', 'RC_Code', 'Funding_Source', 'AMP #',
       'Vendor_Name', 'UMIS_BILL_ID', 'Revenue_Month', 'Service_Start_Date',
       'Service_End_Date', '# days', 'Meter_Number', 'Estimated',
       'Current_Charges', 'Rate_Class', 'Bill_Analyzed', 'Consumption_KWH',
       'KWH_Charges', 'Consumption_KW', 'KW_Charges', 'Other_Charges']

### 4. Data Type Converstion

1. Change the following fields from string to numerical:
    - "Consumption_KW", "Current_Charges", "KWH_Charges", "KW_Charges", "Other_Charges"

In [9]:
df["Consumption_KW"] = df["Consumption_KW"].apply(lambda x: x.replace(",","") if type(x) == str else str(x))
df["Consumption_KW"] = df["Consumption_KW"].astype(float)

In [10]:
df["Current_Charges"] = df["Current_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["Current_Charges"] = df["Current_Charges"].astype(float)

In [11]:
df["KWH_Charges"] = df["KWH_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["KWH_Charges"] = df["KWH_Charges"].astype(float, inplace = True)

In [12]:
df["KW_Charges"] = df["KW_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["KW_Charges"] = df["KW_Charges"].astype(float, inplace = True)

In [13]:
df["Other_Charges"] = df["Other_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["Other_Charges"] = df["Other_Charges"].astype(float, inplace = True)

##### More than 25% of the values for all except "Curent Charges" are 0, which seem unusual

In [14]:
df[["Consumption_KWH",  "Consumption_KW", "Current_Charges", "KWH_Charges", "KW_Charges", "Other_Charges"]].describe()

Unnamed: 0,Consumption_KWH,Consumption_KW,Current_Charges,KWH_Charges,KW_Charges,Other_Charges
count,261252.0,261252.0,261252.0,261252.0,261252.0,261252.0
mean,32565.72,68.239881,4510.758176,1673.941416,1084.801607,1672.128898
std,53071.71,122.266314,6630.228947,2921.443192,1807.808563,3626.935491
min,0.0,0.0,-243.15,0.0,0.0,-59396.43
25%,0.0,0.0,384.3425,0.0,0.0,0.0
50%,11360.0,31.51,2536.945,555.02,448.37,896.51
75%,48160.0,98.5,6052.3575,2359.8925,1601.4225,2636.255
max,1779600.0,16135.46,329800.37,195575.86,78782.96,134224.51


2. Unify the format of "Meter_Number" field (some values exists in both numerical and string

In [15]:
df['Meter_Number'] = df['Meter_Number'].apply(lambda x: str(x) if type(x) == int else x)

### 5.1 Convert Revenue_Month and Two dates to datetime type

In [16]:
df["Revenue_Month"] = df["Revenue_Month"].map(lambda x: datetime.strptime(x.split(" ")[0], '%m/%d/%Y'))
df['Service_Start_Date'] = df['Service_Start_Date'].map(lambda x: datetime.strptime(x, '%m/%d/%Y'))
df['Service_End_Date'] = df['Service_End_Date'].map(lambda x: datetime.strptime(x, '%m/%d/%Y'))

#### In some cases the Revenue_Month is not in the same revenue_year as the Service Start and End dates when those two are

In [17]:
df['start_date_year'] = df['Service_Start_Date'].apply(lambda x: datetime(x.year, 1, 1))

df['end_date_year'] = df['Service_End_Date'].apply(lambda x: datetime(x.year, 1, 1))

df['revenue_month_year'] = df['Revenue_Month'].apply(lambda x: datetime(x.year, 1, 1))

mask = ((df['end_date_year'] == df['start_date_year']) & (df['revenue_month_year'] != df['end_date_year']))

In [18]:
mask.value_counts()

False    261237
True         15
dtype: int64

In [19]:
df[mask][['Revenue_Month', 'Service_Start_Date', 'Service_End_Date', 'Meter_Number']].sort_values(['Revenue_Month', 'Service_Start_Date', 'Meter_Number'])

Unnamed: 0,Revenue_Month,Service_Start_Date,Service_End_Date,Meter_Number
44361,2011-10-01,2010-09-22,2010-10-22,5934193
44362,2011-10-01,2010-09-22,2010-10-22,6439093
44363,2011-10-01,2010-09-22,2010-10-22,6443262
44364,2011-10-01,2010-09-22,2010-10-22,6443337
44365,2011-10-01,2010-09-22,2010-10-22,6443449
44366,2011-10-01,2010-09-22,2010-10-22,6443450
44367,2011-10-01,2010-09-22,2010-10-22,6443473
44368,2011-10-01,2010-09-22,2010-10-22,6443512
44369,2011-10-01,2010-09-22,2010-10-22,6443519
44370,2011-10-01,2010-09-22,2010-10-22,6443527


#### Correct the cases where Revenue_Month is in the wrong year

In [20]:
df.loc[mask, "Revenue_Month"] = datetime.strptime('10/01/2010', '%m/%d/%Y')

### 5.2 Clean up the Meter_Number field
- remove the leadng zeros 
- remove white spaces
- standardize the format for meter_numbers of the similar pattern

In [21]:
df['Meter_Number'] = df['Meter_Number'].apply(lambda x: x.lstrip("0").strip(" "))

In [22]:
df['Meter_Length'] = df['Meter_Number'].apply(lambda x: len(x))

In [23]:
df['Meter_Length'].value_counts()

7     257942
8       1847
12       456
5        427
6        292
18       287
10         1
Name: Meter_Length, dtype: int64

In [24]:
df[df['Meter_Length'] == 12]['Meter_Number'].value_counts()

1860113_7500    68
7860113_7500    68
1860113_1600    66
7860113_1600    66
8096662-58.5    35
1096662-58.5    35
8096662-41.5    35
1096662-41.5    35
8096662 58-5    12
1096662 58-5    12
1096662 41-5    12
8096662 41-5    12
Name: Meter_Number, dtype: int64

In [25]:
df.loc[df['Meter_Number'] == '1096662 41-5', 'Meter_Number'] = '1096662-41.5'

df.loc[df['Meter_Number'] == '1096662 58-5', 'Meter_Number'] = '1096662-58.5'

df.loc[df['Meter_Number'] == '8096662 41-5', 'Meter_Number'] = '8096662-41.5'

df.loc[df['Meter_Number'] == '8096662 58-5', 'Meter_Number'] = '8096662-58.5'

#### Check "Meter Scope": Do the row with a range value represent a "Master Meter" (i.e. its value is the sum of other related rows)? - No

df['Meter Scope'].value_counts()

df[(df['TDS #'] == 118) & (df["Revenue_Month"] == '2010-02-01')][["Location", "Meter Scope", "Revenue_Month", "Current_Charges"]]

df[(df['Meter Scope'] == 'Community Center')].groupby('Location').mean()

df[(df['Meter Scope'] == 'BLD 1 - 9')].groupby('Location').mean()

### 6. Create an unique identifier for each building and remove unnecessary fields

In [26]:
# the combination of TDS# and Location uniquely determines a buildling
# Use EDP or RC Code when TDS# is not available
df['Building_ID'] = df['TDS #'].combine_first(df['EDP']).map(str).combine_first(df['RC_Code']) \
                    + " - " + df['Location'].map(lambda x: 'NA' if pd.isna(x) else x)

In [27]:
# Building_ID alone is not the primary key of the data
df.groupby(['Building_ID', 'Revenue_Month']).count().shape[0]/df.shape[0]

0.6323396567299007

In [28]:
# the combination of Building_ID, meter number and revenue month is still not a primary key
df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month']).count().shape[0]/df.shape[0]

0.9987636458285487

In [29]:
# Define a list of columns of interest
cols = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
        'Revenue_Month', 'Service_Start_Date', 'Service_End_Date', '# days', 
       'Current_Charges','Consumption_KWH', 'KWH_Charges',
       'Consumption_KW', 'KW_Charges', 'Other_Charges']
df = df[cols]

In [30]:
# the combination of Building_ID, meter number and revenue month is almost a primary key
df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).count().shape[0]/df.shape[0]

0.9994870852663329

### 7. Drop Duplicated rows and clean up to format of Meter_Number field

In [31]:
df = df.drop_duplicates()

### 8. Check which combinations of the 5 fields (Building_ID, Meter, Month, StartDate, EndDate) has multiple rows and why

In [32]:
idx = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).count()['Account_Name'].reset_index()
idx.columns = ['Building_ID', 'Meter_Number', 'Revenue_Month','Service_Start_Date', 'Service_End_Date', 'Counts']
idx = idx[idx['Counts'] > 1]

dupRows = idx.sort_values('Counts', ascending = False)

a = pd.merge(dupRows.iloc[:, 0:3], df[cols], on = \
         ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner')[cols]\
        .sort_values(['Building_ID', 'Meter_Number', 'Revenue_Month'])

#### half of these problematic rows has zero values in the numerical fields of charges and consumptions

In [33]:
a

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Service_Start_Date,Service_End_Date,# days,Current_Charges,Consumption_KWH,KWH_Charges,Consumption_KW,KW_Charges,Other_Charges
0,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2012-12-01,2012-11-21,2012-12-24,33.0,0.00,0.0,0.00,0.00,0.00,0.00
1,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2012-12-01,2012-11-21,2012-12-24,33.0,0.00,0.0,0.00,54.43,1109.09,-1109.09
36,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-01-01,2012-12-24,2013-01-24,31.0,0.00,0.0,0.00,0.00,0.00,0.00
37,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-01-01,2012-12-24,2013-01-24,31.0,0.00,0.0,0.00,52.08,1105.73,-1105.73
40,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-02-01,2013-01-24,2013-02-25,32.0,0.00,0.0,0.00,0.00,0.00,0.00
41,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-02-01,2013-01-24,2013-02-25,32.0,0.00,0.0,0.00,52.94,1166.15,-1166.15
42,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-03-01,2013-02-25,2013-03-26,29.0,0.00,0.0,0.00,0.00,0.00,0.00
43,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-03-01,2013-02-25,2013-03-26,29.0,0.00,0.0,0.00,50.93,1169.81,-1169.81
44,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-04-01,2013-03-26,2013-04-24,29.0,0.00,0.0,0.00,0.00,0.00,0.00
45,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-04-01,2013-03-26,2013-04-24,29.0,0.00,0.0,0.00,51.46,1146.50,-1146.50


#### remove those rows from the dataset

In [34]:
df = df[~((df['Current_Charges'] == 0) & (df['KWH_Charges'] == 0) & (df['KW_Charges'] == 0) \
  & (df['Other_Charges'] == 0) & (df['Consumption_KWH'] == 0) & (df['Consumption_KW'] == 0))]

#### we also don't care about entries that only has other_charges not equal to zero

In [35]:
df = df[~((df['Other_Charges'] != 0) & (df['KWH_Charges'] == 0) & (df['KW_Charges'] == 0) \
  & (df['Consumption_KWH'] == 0) & (df['Consumption_KW'] == 0))]

In [36]:
idx = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).count()['Account_Name'].reset_index()
idx.columns = ['Building_ID', 'Meter_Number', 'Revenue_Month','Service_Start_Date', 'Service_End_Date', 'Counts']
idx = idx[idx['Counts'] > 1]

dupRows = idx.sort_values('Counts', ascending = False)

a = pd.merge(dupRows.iloc[:, 0:3], df[cols], on = \
         ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner')[cols]\
        .sort_values(['Building_ID', 'Meter_Number', 'Revenue_Month'])

#### Only 2 rows left, seems a case of rebilling

In [37]:
a

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Service_Start_Date,Service_End_Date,# days,Current_Charges,Consumption_KWH,KWH_Charges,Consumption_KW,KW_Charges,Other_Charges
0,THROGGS NECK,BLD 11,63.0 - BLD 11,8125318,2011-10-01,2011-09-22,2011-10-24,32.0,1306.02,12880.0,858.84,0.0,0.0,447.18
1,THROGGS NECK,BLD 11,63.0 - BLD 11,8125318,2011-10-01,2011-09-22,2011-10-24,32.0,2693.18,26560.0,1771.02,0.0,0.0,922.16


### 9. save a copy of the original dataframe before further data cleaning with alterations and flag the rows with problems

In [38]:
df_orig = df

In [39]:
df_orig['flag'] = ""

In [40]:
# df = df_orig.iloc[:, 0:15]

#### update the flag in df_orig

In [41]:
mask = (df_orig['Building_ID'] == '63.0 - BLD 11') & (df_orig['Meter_Number'] == '8125318') & (df_orig['Revenue_Month'] == '2011-10-01')
df_orig.loc[mask, 'flag'] = 'rebill'
df_orig = df_orig.iloc[:, 0:15]
df_orig.flag.value_counts()

          258674
rebill         2
Name: flag, dtype: int64

#### Remove the entries with rebilling from the working dataset df

In [42]:
df = df[~mask]

In [43]:
del(a)

### 10. Add a column for Revenue_Year and reorder the columns

In [44]:
df.loc[:, 'Revenue_Year'] = df['Revenue_Month'].dt.year

In [45]:
col_ordered = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
       'Revenue_Month', 'Revenue_Year', 'Service_Start_Date', 'Service_End_Date',
       '# days', 'Consumption_KW', 'KW_Charges', 
       'Consumption_KWH', 'KWH_Charges', 'Other_Charges', 'Current_Charges']

df = df[col_ordered]

## Part II - Data Cleaning with alterations - aggregation, mapping

### 11. Check the zero values in Current_Charges, KWH_Charges and KW_Charges

#### High Percentage of rows have current_charges == 0

In [46]:
print ("{:.2%}".format(df[df['Current_Charges'] == 0].shape[0]/df.shape[0]))

16.65%


#### when current_charges == 0, all kwh_charges == 0 (NaN correlation coefficients with all other variables) and kw_charges seems negatively correlated with other_charges

In [47]:
df[df['Current_Charges'] == 0][['KWH_Charges', 'KW_Charges', 'KWH_Charges', 'Other_Charges']].corr()

Unnamed: 0,KWH_Charges,KW_Charges,KWH_Charges.1,Other_Charges
KWH_Charges,,,,
KW_Charges,,1.0,,-0.694394
KWH_Charges,,,,
Other_Charges,,-0.694394,,1.0


#### when current_charges == 0, 82% of the time kw_charges == - other_charges and kw_charges ==  other_charges otherwise

In [48]:
mask = (df['Other_Charges'] + df['KW_Charges'] == 0) & (df['Current_Charges'] == 0) & (df['KWH_Charges'] == 0)

In [49]:
print("{:.2%}".format(df[mask].shape[0]/df[df['Current_Charges'] == 0].shape[0]))

82.30%


In [50]:
df[(df['Current_Charges'] == 0) & ((df['Other_Charges'] == df['KW_Charges']) \
        | (df['Other_Charges'] + df['KW_Charges'] == 0))].shape[0] / \
df[df['Current_Charges'] == 0].shape[0]

1.0

#### correct the rows where Other_Charges == KW_Charges with Other_Charges = -KW_Charges

In [51]:
mask = (df['Current_Charges'] == 0) & ((df['Other_Charges'] == df['KW_Charges']) & (df['KW_Charges'] != 0))


In [52]:
df.loc[mask, 'KW_Charges'] = df.loc[mask, 'Other_Charges'] * (-1)

In [53]:
df[df['Current_Charges'] == 0][['Current_Charges', 'KW_Charges', 'KWH_Charges', 'Other_Charges']].corr()

Unnamed: 0,Current_Charges,KW_Charges,KWH_Charges,Other_Charges
Current_Charges,,,,
KW_Charges,,1.0,,-1.0
KWH_Charges,,,,
Other_Charges,,-1.0,,1.0


In [54]:
df[df['Current_Charges'] == 0][['Current_Charges', 'KW_Charges', 'KWH_Charges', 'Other_Charges']].corr()

Unnamed: 0,Current_Charges,KW_Charges,KWH_Charges,Other_Charges
Current_Charges,,,,
KW_Charges,,1.0,,-1.0
KWH_Charges,,,,
Other_Charges,,-1.0,,1.0


#### update the flag in df_orig

In [55]:
mask = (df_orig['Current_Charges'] == 0) & ((df_orig['Other_Charges'] == df_orig['KW_Charges']) & (df_orig['KW_Charges'] != 0))
valid = df_orig[mask]['flag']
df_orig.loc[mask, 'flag'] = valid.apply(lambda x: 'Sign of Other_Charges is incorrect' if x == "" else x + '; ' + 'Sign of Other_Charges is incorrect')

df_orig = df_orig.iloc[:, 0:15]

del( valid, mask)
df_orig.flag.value_counts()

                                      251049
Sign of Other_Charges is incorrect      7625
rebill                                     2
Name: flag, dtype: int64

### Calculate Metrics regarding zero-values and meter types - 1st time

In [56]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select Building_ID, Meter_Number \
        , sum(case when KWH_Charges == 0 and KW_Charges > 0 then 1 else 0 end) as count_kw_only \
        , sum(case when KW_Charges == 0 and KWH_Charges > 0 then 1 else 0 end) as count_kwh_only \
        , sum(Current_Charges) as total_current_charges \
        , count(*) as count \
        from df \
        group by df.Building_ID, df.Meter_Number"
df_meter_type = pysql(str1)


df_meter_type['kwh_only'] = ((df_meter_type['count_kwh_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kw_only'] == 0)
df_meter_type['kw_only'] = ((df_meter_type['count_kw_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kwh_only'] == 0)

#### check the meters

print("perc of kw_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kw_only'] == 1) & (df_meter_type['kwh_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 1) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_and_kw meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 0) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))


#### check the building_ids

a = df_meter_type[df_meter_type['kwh_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
b =  df_meter_type[df_meter_type['kw_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
a.columns = ['Building_ID', 'Count']
b.columns = ['Building_ID', 'Count']

print("perc of buildings with both kw_only and kwh_only meters:", \
     "{:.2%}".format(pd.merge(a, b, on = 'Building_ID', how = 'inner').shape[0] \
/ df_meter_type.groupby(['Building_ID']).agg('count').reset_index().shape[0]))


#### Check the statistics of zero-value rows:

print("perc of rows - current charges of zero:", "{:.2%}".format(df[df['Current_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kw charges of zero:", "{:.2%}".format(df[df['KW_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kwh charges of zero:", "{:.2%}".format(df[df['KWH_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - consumption/charge inconsistency:", \
      "{:.2%}".format(df[((df['KWH_Charges'] == 0) ^ (df['Consumption_KWH'] == 0)) \
   | ((df['KW_Charges'] == 0) ^ (df['Consumption_KW'] == 0)) ].shape[0]\
    /df.shape[0]))

print("perc of rows - sum of charges inconsistency:", \
     "{:.2%}".format(1 - df[df['Current_Charges'] == df['KWH_Charges'] + df['KW_Charges'] + df['Other_Charges']].shape[0]\
    /df.shape[0]))

perc of kw_only meters: 16.02%
perc of kwh_only meters: 36.68%
perc of kwh_and_kw meters: 47.30%
perc of buildings with both kw_only and kwh_only meters: 30.34%
perc of rows - current charges of zero: 16.65%
perc of rows - kw charges of zero: 40.98%
perc of rows - kwh charges of zero: 32.87%
perc of rows - consumption/charge inconsistency: 4.46%
perc of rows - sum of charges inconsistency: 26.47%


### 12. Identify accounts that have separated meters for KW and KWH charges and combine the meters

There are many cases where under the same Building_ID, two meter numbers differ only in the first digit and share the same service date ranges. Usually the larger meter number has zero values in all KW_Charges and the smaller one has zero values in all KWH_Charges. It seems reasonable to combined them.
- (Exceptions do exist - some larger meter number have values in both KW and KWH)

- Output:
    - df (with consolidated meter numbers)

In [57]:
temp = df.groupby(['Building_ID', 'Meter_Number']).agg('count').reset_index()[['Building_ID', 'Meter_Number']]

In [58]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select distinct l.Building_ID, l.Meter_Number, r.Meter_Number\
        from temp l join temp r on l.Building_ID = r.Building_ID and l.Meter_Number > r.Meter_Number \
        where substr(l.Meter_Number, 2, length(l.Meter_number)) == substr(r.Meter_Number, 2, length(r.Meter_number))"
df_meter_mapping = pysql(str1)

df_meter_mapping.columns = ['Building_ID', 'Meter_Number_L', 'Meter_Number_S']

#### 26.7% of the meter numbers can be mapped to another

In [59]:
str1 = "select count (distinct Meter_Number_S) as count_redudant_meters\
        from df_meter_mapping"
str2 = "select count (distinct Meter_Number) as count_meters\
        from temp"
pysql(str1)['count_redudant_meters'][0]/pysql(str2)['count_meters'][0]


0.26736884747822565

In [60]:
del(temp)

In [61]:
df_meter_mapping.head()

Unnamed: 0,Building_ID,Meter_Number_L,Meter_Number_S
0,10.0 - BLD 01,7864550,1864550
1,10.0 - BLD 02,7864551,1864551
2,10.0 - BLD 03,8010023,1010023
3,10.0 - BLD 04,7864536,1864536
4,10.0 - BLD 05,8010026,1010026


#### check if the two meters correspond to KWH_Charges and KW_Charges respectively, by comparing to the df_meter_type table obtained above

In [62]:
temp = pd.merge(df_meter_mapping, df_meter_type, left_on = ['Building_ID', 'Meter_Number_S']\
         , right_on = ['Building_ID', 'Meter_Number'], how = 'left')\
        [['Building_ID', 'Meter_Number_S', 'count_kwh_only', 'count_kw_only', 'count', 'kwh_only', 'kw_only', 'Meter_Number_L']]

temp.columns = ['Building_ID', 'Meter_Number_S', 'count_kwh_only_s', 'count_kw_only_s', 'count_s', 'kwh_only_s', 'kw_only_s',
       'Meter_Number_L']

temp = pd.merge(temp, df_meter_type, left_on = ['Building_ID', 'Meter_Number_L']\
         , right_on = ['Building_ID', 'Meter_Number'], how = 'left')\
        [['Building_ID', 'Meter_Number_S', 'count_kwh_only_s', 'count_kw_only_s', 'count_s', 'kwh_only_s', 'kw_only_s', 'Meter_Number_L', 'count_kwh_only', 'count_kw_only', 'count', 'kwh_only', 'kw_only']]

temp.columns = ['Building_ID', 'Meter_Number_S', 'count_kwh_only_s', 'count_kw_only_s', 'count_s', 'kwh_only_s', 'kw_only_s',
       'Meter_Number_L', 'count_kwh_only_l', 'count_kw_only_l', 'count_l', 'kwh_only_l', 'kw_only_l']

In [63]:
temp.head()

Unnamed: 0,Building_ID,Meter_Number_S,count_kwh_only_s,count_kw_only_s,count_s,kwh_only_s,kw_only_s,Meter_Number_L,count_kwh_only_l,count_kw_only_l,count_l,kwh_only_l,kw_only_l
0,10.0 - BLD 01,1864550,0,90,99,False,True,7864550,97,0,97,True,False
1,10.0 - BLD 02,1864551,0,90,99,False,True,7864551,95,0,95,True,False
2,10.0 - BLD 03,1010023,0,90,99,False,True,8010023,97,0,97,True,False
3,10.0 - BLD 04,1864536,0,90,99,False,True,7864536,97,0,97,True,False
4,10.0 - BLD 05,1010026,0,0,1,False,False,8010026,21,0,21,True,False


#### Nearly all the "small" meter_numbers are kw_only meters (they only have non-zero values in kw charges), it seems okay to map them to the "large" corresponding meter_numbers

#### kwh_only_l means the "larger" meter_number only has non-zero values in KWH charges; Better doc needed here

In [64]:
temp[(temp['kwh_only_l'] == False) & (temp['kw_only_l'] == False)].Meter_Number_S.nunique() / temp.Meter_Number_S.nunique()

0.059848484848484845

In [65]:
temp[(temp['kwh_only_s'] == False) & (temp['kw_only_s'] == False)].Meter_Number_S.nunique() / temp.Meter_Number_S.nunique()

0.43636363636363634

In [66]:
temp[(temp['kwh_only_s'] == True) & (temp['kw_only_s'] == False)].Meter_Number_S.nunique() / temp.Meter_Number_S.nunique()

0.0

In [67]:
temp[(temp['kwh_only_s'] == False) & (temp['kw_only_s'] == True)].Meter_Number_S.nunique() / temp.Meter_Number_S.nunique()

0.5636363636363636

#### Combine the meter numbers 

In [68]:
temp = pd.merge(df, df_meter_mapping, left_on = ['Building_ID', 'Meter_Number'], right_on = ['Building_ID','Meter_Number_S'], how = 'left')
temp['Meter_Number_New'] = temp['Meter_Number_L'].combine_first(temp['Meter_Number'])

df = temp

del(temp)

In [69]:
df.drop(['Meter_Number', 'Meter_Number_L', 'Meter_Number_S'], axis = 1, inplace = True)

df.columns = ['Account_Name', 'Location', 'Building_ID', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges', 'Meter_Number']

col_ordered = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges']

df = df[col_ordered]

In [70]:
df_meter_mapping.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_meter_mapping")

### Calculate Metrics regarding zero-values and meter types - 2nd time

In [71]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select Building_ID, Meter_Number \
        , sum(case when KWH_Charges == 0 and KW_Charges > 0 then 1 else 0 end) as count_kw_only \
        , sum(case when KW_Charges == 0 and KWH_Charges > 0 then 1 else 0 end) as count_kwh_only \
        , sum(Current_Charges) as total_current_charges \
        , count(*) as count \
        from df \
        group by df.Building_ID, df.Meter_Number"
df_meter_type = pysql(str1)


df_meter_type['kwh_only'] = ((df_meter_type['count_kwh_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kw_only'] == 0)
df_meter_type['kw_only'] = ((df_meter_type['count_kw_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kwh_only'] == 0)

#### check the meters

print("perc of kw_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kw_only'] == 1) & (df_meter_type['kwh_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 1) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_and_kw meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 0) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))


#### check the building_ids

a = df_meter_type[df_meter_type['kwh_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
b =  df_meter_type[df_meter_type['kw_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
a.columns = ['Building_ID', 'Count']
b.columns = ['Building_ID', 'Count']

print("perc of buildings with both kw_only and kwh_only meters:", \
     "{:.2%}".format(pd.merge(a, b, on = 'Building_ID', how = 'inner').shape[0] \
/ df_meter_type.groupby(['Building_ID']).agg('count').reset_index().shape[0]))


#### Check the statistics of zero-value rows:

print("perc of rows - current charges of zero:", "{:.2%}".format(df[df['Current_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kw charges of zero:", "{:.2%}".format(df[df['KW_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kwh charges of zero:", "{:.2%}".format(df[df['KWH_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - consumption/charge inconsistency:", \
      "{:.2%}".format(df[((df['KWH_Charges'] == 0) ^ (df['Consumption_KWH'] == 0)) \
   | ((df['KW_Charges'] == 0) ^ (df['Consumption_KW'] == 0)) ].shape[0]\
    /df.shape[0]))

print("perc of rows - sum of charges inconsistency:", \
     "{:.2%}".format(1 - df[df['Current_Charges'] == df['KWH_Charges'] + df['KW_Charges'] + df['Other_Charges']].shape[0]\
    /df.shape[0]))

perc of kw_only meters: 1.30%
perc of kwh_only meters: 15.82%
perc of kwh_and_kw meters: 82.89%
perc of buildings with both kw_only and kwh_only meters: 0.49%
perc of rows - current charges of zero: 16.65%
perc of rows - kw charges of zero: 40.98%
perc of rows - kwh charges of zero: 32.87%
perc of rows - consumption/charge inconsistency: 4.46%
perc of rows - sum of charges inconsistency: 26.47%


### 13. Find the accounts with switched meter numbers

There are Building_ID's whose meter number changed over the years, need to find the mapping and consolidate the meter numbers (In some cases it's a many-to-many mapping, I'm excluding those cases for now)

outputs: 
1. df_multiple_meter_switch (building_id's with many-to-many meter mapping, need to investigate later)
2. df (with consolidated meter numbers)

In [72]:
from dateutil.relativedelta import *

In [73]:
a = df.groupby(['Building_ID']).agg({'Meter_Number': 'nunique'}).reset_index()

a = a[a["Meter_Number"]>1]

a.columns = ['Building_ID', 'Counts']

a = pd.merge(a, df, on = 'Building_ID', how = 'inner')[['Building_ID', 'Meter_Number', "Revenue_Month"]]\
.groupby(['Building_ID', 'Meter_Number']).agg({'Revenue_Month': ['max','min']}).reset_index()

a.columns = a.columns.get_level_values(0)

a.columns = ['Building_ID', 'Meter_Number', 'Max_Month', 'Min_Month']

a['Max_Month_Next'] = a['Max_Month'].map(lambda x: x + relativedelta(months=+1))
a['Min_Month_Prior'] = a['Min_Month'].map(lambda x: x - relativedelta(months=+1))
df_switch_meter = a

del(a)

In [74]:
str1 = "select l.Building_ID, l.Meter_Number as Meter_Number_E, r.Meter_Number as Meter_Number_L \
        from df_switch_meter l join df_switch_meter r on l.Building_ID = r.Building_ID and l.Meter_Number != r.Meter_Number \
        where l.Max_Month == r.Min_Month_Prior"
a = pysql(str1)

In [75]:
df_meter_switch = pd.DataFrame(a['Building_ID'].value_counts() > 1).reset_index()
df_meter_switch.columns = ['Building_ID', 'Dummy']

df_single_meter_switch = df_meter_switch[df_meter_switch['Dummy'] == False]
df_multiple_meter_switch = df_meter_switch[df_meter_switch['Dummy'] == True]

In [76]:
df_meter_switch = pd.merge(a, df_single_meter_switch, on = 'Building_ID', how = 'inner')[['Building_ID', 'Meter_Number_E', 'Meter_Number_L']]

In [77]:
del(a)

#### 14% of the meters can be mapped to another meter

In [78]:
df_meter_switch['Meter_Number_E'].count() / df['Meter_Number'].nunique()

0.1404090657822001

#### Combine the meter numbers 

In [79]:
a = pd.merge(df, df_meter_switch, left_on = ['Building_ID', 'Meter_Number'], right_on = ['Building_ID', 'Meter_Number_E'], how = 'left')
a['Meter_Number_New'] = a['Meter_Number_L'].combine_first(a['Meter_Number'])
df = a

df.drop(['Meter_Number', 'Meter_Number_L', 'Meter_Number_E'], axis = 1, inplace = True)
df.columns = ['Account_Name', 'Location', 'Building_ID', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges', 'Meter_Number']
col_ordered = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges']
df = df[col_ordered]

In [80]:
df_multiple_meter_switch.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_multiple_meter_switch")

### 14. Consolidate data to Building-Meter-Service_Date_Range level
After combinging the meter numbers in the 2 steps above, there are cases where 2 rows exist for the same Meter and Service Date ranges (1 row for KW charges, 1 row for KWH charges)

In [81]:
idx = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).agg(['count'])['Account_Name'].reset_index()
idx = idx[idx['count'] > 1]

In [82]:
idx['count'].value_counts()

2    73940
Name: count, dtype: int64

#### see the example below, read starting from the 3rd row

In [83]:
mask = (df['Building_ID'] == '70.0 - BLD 01') & (df['Revenue_Year'] == 2013) & ( (df['Meter_Number'] == '8095177') | (df['Meter_Number'] == '8095173'))
df[mask].sort_values(['Service_Start_Date', 'Meter_Number']).head(10)

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Revenue_Year,Service_Start_Date,Service_End_Date,# days,Consumption_KW,KW_Charges,Consumption_KWH,KWH_Charges,Other_Charges,Current_Charges
78374,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-04-01,2013,2013-03-26,2013-04-24,29.0,0.0,0.0,45360.0,2339.67,4569.3,6908.97
78378,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095177,2013-04-01,2013,2013-03-26,2013-04-24,29.0,0.0,0.0,42720.0,2203.5,4303.35,6506.85
78388,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-05-01,2013,2013-04-24,2013-05-23,29.0,90.53,2155.75,0.0,0.0,-2155.75,0.0
78402,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-05-01,2013,2013-04-24,2013-05-23,29.0,0.0,0.0,65040.0,3354.76,5421.44,8776.2
78392,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095177,2013-05-01,2013,2013-04-24,2013-05-23,29.0,97.06,2311.25,0.0,0.0,-2311.25,0.0
78406,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095177,2013-05-01,2013,2013-04-24,2013-05-23,29.0,0.0,0.0,75840.0,3911.83,6321.71,10233.54
78416,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-06-01,2013,2013-05-23,2013-06-24,32.0,116.16,2163.26,0.0,0.0,-2163.26,0.0
78430,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-06-01,2013,2013-05-23,2013-06-24,32.0,0.0,0.0,90480.0,5100.36,6561.4,11661.76
78420,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095177,2013-06-01,2013,2013-05-23,2013-06-24,32.0,130.94,2438.51,0.0,0.0,-2438.51,0.0
78434,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095177,2013-06-01,2013,2013-05-23,2013-06-24,32.0,0.0,0.0,105360.0,5939.14,7640.52,13579.66


#### remove the multiple rows by aggregating at building, meter, revenue month, service_date_range level

In [84]:
df = df.groupby(['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
       'Revenue_Month', 'Revenue_Year', 'Service_Start_Date',
       'Service_End_Date', '# days']).\
    agg({'Consumption_KW': 'sum', 'KW_Charges': 'sum', 'Consumption_KWH': 'sum', 'KWH_Charges': 'sum', 'Other_Charges': 'sum', 'Current_Charges': 'sum'}).reset_index()

### Calculate Metrics regarding zero-values and meter types - 3rd time

In [85]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select Building_ID, Meter_Number \
        , sum(case when KWH_Charges == 0 and KW_Charges > 0 then 1 else 0 end) as count_kw_only \
        , sum(case when KW_Charges == 0 and KWH_Charges > 0 then 1 else 0 end) as count_kwh_only \
        , sum(Current_Charges) as total_current_charges \
        , count(*) as count \
        from df \
        group by df.Building_ID, df.Meter_Number"
df_meter_type = pysql(str1)


df_meter_type['kwh_only'] = ((df_meter_type['count_kwh_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kw_only'] == 0)
df_meter_type['kw_only'] = ((df_meter_type['count_kw_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kwh_only'] == 0)

#### check the meters

print("perc of kw_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kw_only'] == 1) & (df_meter_type['kwh_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 1) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_and_kw meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 0) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))


#### check the building_ids

a = df_meter_type[df_meter_type['kwh_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
b =  df_meter_type[df_meter_type['kw_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
a.columns = ['Building_ID', 'Count']
b.columns = ['Building_ID', 'Count']

print("perc of buildings with both kw_only and kwh_only meters:", \
     "{:.2%}".format(pd.merge(a, b, on = 'Building_ID', how = 'inner').shape[0] \
/ df_meter_type.groupby(['Building_ID']).agg('count').reset_index().shape[0]))


#### Check the statistics of zero-value rows:

print("perc of rows - current charges of zero:", "{:.2%}".format(df[df['Current_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kw charges of zero:", "{:.2%}".format(df[df['KW_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kwh charges of zero:", "{:.2%}".format(df[df['KWH_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - consumption/charge inconsistency:", \
      "{:.2%}".format(df[((df['KWH_Charges'] == 0) ^ (df['Consumption_KWH'] == 0)) \
   | ((df['KW_Charges'] == 0) ^ (df['Consumption_KW'] == 0)) ].shape[0]\
    /df.shape[0]))

print("perc of rows - sum of charges inconsistency:", \
     "{:.2%}".format(1 - df[df['Current_Charges'] == df['KWH_Charges'] + df['KW_Charges'] + df['Other_Charges']].shape[0]\
    /df.shape[0]))

perc of kw_only meters: 0.78%
perc of kwh_only meters: 16.81%
perc of kwh_and_kw meters: 82.40%
perc of buildings with both kw_only and kwh_only meters: 0.25%
perc of rows - current charges of zero: 2.41%
perc of rows - kw charges of zero: 17.36%
perc of rows - kwh charges of zero: 5.58%
perc of rows - consumption/charge inconsistency: 6.36%
perc of rows - sum of charges inconsistency: 34.27%


### 15. Consolidate data to Building-Meter-Revenue_Month level

##### only need to work on the cases where multiple rows exist for the same builing_id, meter_number and revenue_month, due to different service_date_ranges, which might be concatenated in many cases

In [86]:
temp = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month']).agg('count').reset_index().iloc[:, 0:4]
temp.columns = ['Building_ID', 'Meter_Number', 'Revenue_Month', 'Row_Counts']

In [87]:
df_multiple = pd.merge(df, temp[temp['Row_Counts']  > 1], on = ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner').iloc[:, 0:15]
df_single = pd.merge(df, temp[temp['Row_Counts']  == 1], on = ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner').iloc[:, 0:15]

In [88]:
# sort by building_id, revenue month, meter number
df_multiple = df_multiple.sort_values(by = ['Meter_Number', 'Revenue_Month', 'Service_Start_Date'], ascending=[True, True, True])

def merge_dates(grp):
    # Find contiguous date groups, and get the first/last start/end date for each group.
    dt_groups = (grp['Service_Start_Date'] != grp['Service_End_Date'].shift()).cumsum()
    return grp.groupby(dt_groups).agg({'Service_Start_Date': 'first', 'Service_End_Date': 'last',
        '# days':'sum', 'Consumption_KW':'sum', 'KW_Charges':'sum',
       'Consumption_KWH':'sum', 'KWH_Charges':'sum', 'Other_Charges':'sum', 'Current_Charges':'sum'})

# Perform a groupby and apply the merge_dates function, followed by formatting.
df_multiple_concatenate = df_multiple.groupby(['Account_Name', 'Location', 'Building_ID', 'Meter_Number', 'Revenue_Month', 'Revenue_Year']).apply(merge_dates)
df_multiple_concatenate = df_multiple_concatenate.reset_index().drop('level_6', axis = 1)
df_multiple_concatenate = df_multiple_concatenate.reset_index().iloc[:, 1:16]

In [89]:
idx = df_multiple_concatenate.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month']).count().reset_index().iloc[:, 0:4]

idx.columns = ['Building_ID', 'Meter_Number', 'Revenue_Month', 'Count']

idx[idx['Count'] > 1]

Unnamed: 0,Building_ID,Meter_Number,Revenue_Month,Count
0,101.0 - BLD 02,7834072,2015-06-01,2
75,206.0 - BLD 02,7382886,2015-03-01,2
78,227.0 - BLD 01,8322190,2014-08-01,2
79,267.0 - BLD 01,8322189,2014-08-01,2
80,267.0 - BLD 03,8661312,2015-03-01,2
155,98.0 - BLD 15,99277502,2013-03-01,2


#### Only 6 meters have multiple entries under the same Revenue_Month that can't be concatenated. Again they are caused by the separated logging of KWH and KW charges

In [90]:
pd.merge(df, idx[idx['Count'] > 1], on = ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner')

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Revenue_Year,Service_Start_Date,Service_End_Date,# days,Consumption_KW,KW_Charges,Consumption_KWH,KWH_Charges,Other_Charges,Current_Charges,Count
0,CASSIDY-LAFAYETTE,BLD 02,206.0 - BLD 02,7382886,2015-03-01,2015,2015-02-25,2015-03-26,29.0,0.0,0.0,156800.0,6399.01,0.0,6399.01,2
1,CASSIDY-LAFAYETTE,BLD 02,206.0 - BLD 02,7382886,2015-03-01,2015,2015-03-17,2015-03-26,9.0,249.12,5033.94,0.0,0.0,5033.94,10086.83,2
2,LEHMAN VILLAGE,BLD 02,101.0 - BLD 02,7834072,2015-06-01,2015,2015-05-26,2015-06-24,29.0,0.0,0.0,189200.0,8682.39,0.0,8682.39,2
3,LEHMAN VILLAGE,BLD 02,101.0 - BLD 02,7834072,2015-06-01,2015,2015-06-01,2015-06-24,23.0,460.32,5779.24,0.0,0.0,5779.24,12657.01,2
4,MORRISANIA AIR RIGHTS,BLD 01,267.0 - BLD 01,8322189,2014-08-01,2014,2014-07-24,2014-08-22,29.0,356.64,5592.12,0.0,0.0,7530.79,13122.91,2
5,MORRISANIA AIR RIGHTS,BLD 01,267.0 - BLD 01,8322189,2014-08-01,2014,2014-08-11,2014-08-22,11.0,0.0,0.0,209600.0,11844.5,0.0,11844.5,2
6,MORRISANIA AIR RIGHTS,BLD 03,267.0 - BLD 03,8661312,2015-03-01,2015,2015-02-25,2015-03-26,29.0,0.0,0.0,145600.0,5941.94,0.0,5941.94,2
7,MORRISANIA AIR RIGHTS,BLD 03,267.0 - BLD 03,8661312,2015-03-01,2015,2015-03-24,2015-03-26,2.0,251.04,5223.9,0.0,0.0,5223.9,8692.22,2
8,OCEAN BAY APARTMENTS (BAYSIDE),BLD 15,98.0 - BLD 15,99277502,2013-03-01,2013,2013-02-19,2013-03-18,27.0,838.8,0.0,398400.0,40709.83,21376.77,62086.6,2
9,OCEAN BAY APARTMENTS (BAYSIDE),BLD 15,98.0 - BLD 15,99277502,2013-03-01,2013,2013-03-19,2013-03-26,7.0,1.2,0.0,0.0,0.0,103.84,103.84,2


#### Remove them from the working dataset

In [91]:
temp = pd.merge(df_multiple_concatenate, idx[idx['Count'] > 1], on = ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'left')

temp = temp[temp.Count.isnull()].iloc[:, 0:15]

#### Create the new working dataset df at Building-Meter-Revenue_Month level

In [92]:
df = df_single.append(temp)

In [93]:
df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month']).agg('count')\
.reset_index()['Account_Name'].value_counts()

1    180020
Name: Account_Name, dtype: int64

### 15. Find the gaps between service date ranges

We'd like to know how many account have gaps (> 5 days) in their billing windows

#### concatenate service date ranges for each builing_id and  meter_number, across all years

In [94]:
# sort by building_id, meter number
df = df.sort_values(by = ['Meter_Number', 'Service_Start_Date'], ascending=[True, True])

def merge_dates(grp):
    # Find contiguous date groups, and get the first/last start/end date for each group.
    dt_groups = (grp['Service_Start_Date'] != grp['Service_End_Date'].shift()).cumsum()
    return grp.groupby(dt_groups).agg({'Service_Start_Date': 'first', 'Service_End_Date': 'last'})

# Perform a groupby and apply the merge_dates function, followed by formatting.
df_gap = df.groupby(['Building_ID', 'Meter_Number']).apply(merge_dates)
df_gap = df_gap.reset_index().drop('level_2', axis = 1)
df_gap = df_gap.reset_index()
df_gap.columns = ['rowNum', 'Building_ID', 'Meter_Number', 
       'Service_Start_Date', 'Service_End_Date']

df_gap['nextRowNum'] = df_gap['rowNum'].map(lambda x: x+1)

# Join the dataframe with itself to find the gap between service ranges
df_gap = pd.merge(df_gap, df_gap[['Building_ID', 'Meter_Number', 'nextRowNum', 'Service_End_Date']],\
        left_on = ['Building_ID', 'Meter_Number', 'rowNum'], right_on = ['Building_ID', 'Meter_Number', 'nextRowNum'], how = 'left')

# consecutive days of billing for the same meter number
df_gap['consecutive_days'] = \
df_gap[['Service_End_Date_x', 'Service_Start_Date']].apply(lambda x: (x[0] - x[1]).days, axis = 1)

# number of days elapsed since the previous service range
df_gap['gap_days'] = \
df_gap[['Service_Start_Date', 'Service_End_Date_y']].apply(lambda x: (x[0] - x[1]).days, axis = 1)


# Rename and reorder the columns
df_gap = df_gap[['Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date_x', 'consecutive_days', 'gap_days']]
df_gap.columns = ['Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date', 'consecutive_days', 'gap_days']

df_gap['Building_Meter'] = df_gap['Building_ID'] + df_gap['Meter_Number']

#### How frequent does a meter has gaps longer than 5 days through all the years ? ~83.2%

In [95]:
df_gap[df_gap['gap_days'] >= 5]['Building_Meter'].nunique() / df_gap['Building_Meter'].nunique()

0.8322010869565217

In [96]:
df_gap.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_service_range_gaps")

In [97]:
df_gap_summary = df_gap[df_gap['gap_days'] > 0].groupby('Building_Meter').agg({'consecutive_days':'sum', 'gap_days':'sum'}).reset_index()

df_gap_summary['perc_gap'] = df_gap_summary['gap_days']/(df_gap_summary['consecutive_days'] + df_gap_summary['gap_days'])

In [98]:
df_gap_summary.head()

Unnamed: 0,Building_Meter,consecutive_days,gap_days,perc_gap
0,1.0 - BLD 017836716,1974,612.0,0.236659
1,1.0 - BLD 017838586,2250,760.0,0.252492
2,1.0 - BLD 027694040,2004,582.0,0.225058
3,1.0 - BLD 037177432,1976,610.0,0.235886
4,1.0 - BLD 047381828,2004,582.0,0.225058


#### overlapping service date ranges - 0.71% of the meter accounts

In [99]:
mask = df_gap['gap_days'] < 0
df_gap[mask]

Unnamed: 0,Building_ID,Meter_Number,Service_Start_Date,Service_End_Date,consecutive_days,gap_days,Building_Meter
8624,4.0 - RED HOOK EAST BLD 05,6505127,2010-12-23,2011-03-25,92,-33.0,4.0 - RED HOOK EAST BLD 056505127
8633,4.0 - RED HOOK EAST BLD 06,6867327,2010-12-23,2011-03-25,92,-33.0,4.0 - RED HOOK EAST BLD 066867327
8641,4.0 - RED HOOK EAST BLD 07,5704938,2010-12-23,2011-03-25,92,-33.0,4.0 - RED HOOK EAST BLD 075704938
8649,4.0 - RED HOOK EAST BLD 08,6505249,2010-12-23,2011-03-25,92,-33.0,4.0 - RED HOOK EAST BLD 086505249
8658,4.0 - RED HOOK EAST BLD 09,6311451,2010-12-23,2011-03-25,92,-33.0,4.0 - RED HOOK EAST BLD 096311451
8667,4.0 - RED HOOK EAST BLD 10,6566619,2010-12-23,2011-03-25,92,-33.0,4.0 - RED HOOK EAST BLD 106566619
8676,4.0 - RED HOOK EAST BLD 11,6025513,2010-12-23,2011-03-25,92,-33.0,4.0 - RED HOOK EAST BLD 116025513
8684,4.0 - RED HOOK EAST BLD 12,6025507,2010-12-23,2011-03-25,92,-33.0,4.0 - RED HOOK EAST BLD 126025507
8693,4.0 - RED HOOK EAST BLD 13,6505117,2010-12-23,2011-03-25,92,-33.0,4.0 - RED HOOK EAST BLD 136505117
8701,4.0 - RED HOOK EAST BLD 14,6147237,2010-12-23,2011-03-25,92,-33.0,4.0 - RED HOOK EAST BLD 146147237


In [100]:
print("Perc of meters with overlapping service date ranges:", "{:.2%}".format(df_gap[mask]['Building_Meter'].agg('nunique')/df_gap['Building_Meter'].agg('nunique')))

Perc of meters with overlapping service date ranges: 0.71%


In [101]:
df_gap[mask].gap_days.value_counts()

-33.0    15
-3.0      6
Name: gap_days, dtype: int64

#### Examples

In [102]:
mask = (df['Building_ID'] == '79.0 - RED HOOK WEST BLD 03') \
& ((df['Meter_Number'] == '6477455')|(df['Meter_Number'] == '6477455') ) \
& (df['Revenue_Year'] == 2011)


df[mask].sort_values(['Revenue_Month', 'Service_Start_Date', 'Meter_Number'])

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Revenue_Year,Service_Start_Date,Service_End_Date,# days,Consumption_KW,KW_Charges,Consumption_KWH,KWH_Charges,Other_Charges,Current_Charges
124886,RED HOOK EAST/RED HOOK WEST,RED HOOK WEST BLD 03,79.0 - RED HOOK WEST BLD 03,6477455,2011-01-01,2011,2010-12-23,2011-01-25,33.0,38.0,494.0,22080.0,1266.51,1148.79,2909.3
124887,RED HOOK EAST/RED HOOK WEST,RED HOOK WEST BLD 03,79.0 - RED HOOK WEST BLD 03,6477455,2011-03-01,2011,2010-12-23,2011-03-25,92.0,40.4,530.86,19040.0,1104.13,1111.1,2746.09
124888,RED HOOK EAST/RED HOOK WEST,RED HOOK WEST BLD 03,79.0 - RED HOOK WEST BLD 03,6477455,2011-07-01,2011,2011-06-23,2011-07-25,32.0,61.2,531.22,31640.0,2450.2,2097.54,5078.96


### Calculate Metrics regarding zero-values and meter types - 4th time

In [103]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select Building_ID, Meter_Number \
        , sum(case when KWH_Charges == 0 and KW_Charges > 0 then 1 else 0 end) as count_kw_only \
        , sum(case when KW_Charges == 0 and KWH_Charges > 0 then 1 else 0 end) as count_kwh_only \
        , sum(Current_Charges) as total_current_charges \
        , count(*) as count \
        from df \
        group by df.Building_ID, df.Meter_Number"
df_meter_type = pysql(str1)


df_meter_type['kwh_only'] = ((df_meter_type['count_kwh_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kw_only'] == 0)
df_meter_type['kw_only'] = ((df_meter_type['count_kw_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kwh_only'] == 0)

#### check the meters

print("perc of kw_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kw_only'] == 1) & (df_meter_type['kwh_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 1) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_and_kw meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 0) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))


#### check the building_ids

a = df_meter_type[df_meter_type['kwh_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
b =  df_meter_type[df_meter_type['kw_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
a.columns = ['Building_ID', 'Count']
b.columns = ['Building_ID', 'Count']

print("perc of buildings with both kw_only and kwh_only meters:", \
     "{:.2%}".format(pd.merge(a, b, on = 'Building_ID', how = 'inner').shape[0] \
/ df_meter_type.groupby(['Building_ID']).agg('count').reset_index().shape[0]))


#### Check the statistics of zero-value rows:

print("perc of rows - current charges of zero:", "{:.2%}".format(df[df['Current_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kw charges of zero:", "{:.2%}".format(df[df['KW_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kwh charges of zero:", "{:.2%}".format(df[df['KWH_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - consumption/charge inconsistency:", \
      "{:.2%}".format(df[((df['KWH_Charges'] == 0) ^ (df['Consumption_KWH'] == 0)) \
   | ((df['KW_Charges'] == 0) ^ (df['Consumption_KW'] == 0)) ].shape[0]\
    /df.shape[0]))

print("perc of rows - sum of charges inconsistency:", \
     "{:.2%}".format(1 - df[df['Current_Charges'] == df['KWH_Charges'] + df['KW_Charges'] + df['Other_Charges']].shape[0]\
    /df.shape[0]))

perc of kw_only meters: 0.78%
perc of kwh_only meters: 16.85%
perc of kwh_and_kw meters: 82.37%
perc of buildings with both kw_only and kwh_only meters: 0.25%
perc of rows - current charges of zero: 2.41%
perc of rows - kw charges of zero: 17.35%
perc of rows - kwh charges of zero: 5.57%
perc of rows - consumption/charge inconsistency: 6.33%
perc of rows - sum of charges inconsistency: 34.28%


### 16. Combine rows to the Building-Meter-Month level and Building-Month level; add new aggregation metrics

We need to analyze anamolous values of charges and consumptions at the Building-Meter-Month level and Building-Month level

In [104]:
df_combined_meter = df

df_combined_building = pd.pivot_table(df, values = ['# days', 'Current_Charges','Consumption_KWH', 'KWH_Charges',\
       'Consumption_KW', 'KW_Charges', 'Other_Charges'], index=['Account_Name', 'Location', 'Building_ID',
       'Revenue_Month'], aggfunc = np.sum).reset_index()

In [105]:
df_combined_meter['Total_Charges'] = df_combined_meter['KW_Charges'] + df_combined_meter['KWH_Charges']
df_combined_meter['Total_Energy_Rate'] = df_combined_meter['Total_Charges']/df_combined_meter['Consumption_KWH']

df_combined_meter['Building_Meter'] = df_combined_meter['Building_ID'] + df_combined_meter['Meter_Number']

In [106]:
df_combined_building['Total_Charges'] = df_combined_building['KW_Charges'] + df_combined_building['KWH_Charges']
df_combined_building['Total_Energy_Rate'] = df_combined_building['Total_Charges']/df_combined_building['Consumption_KWH']

### 17. Save the cleaned data to the output folder

In [107]:
# original data after general data cleansing
df_orig.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_original_dataset")

In [108]:
# data at Building_ID, Meter_Number, Revenue_Month level
df.to_pickle("../output/NYCHA_Electricity_2010_to_2018_CleanedDF")

In [109]:
# data at Building_ID, Meter_Number, Revenue_Month level
df_combined_meter.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_meter")

In [110]:
# data at Building_ID, Meter_Number level
df_combined_building.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_building")

## Q&A with Linnea:

1. why would "Consumption_KW" be zero?
    - KW and KWH should be both positive, unless there are some related bills that already covers it
    - Maybe one account was separated into multiple meters?
2. What's the "Other Charges"?
    - negative values to adjust for the payments from previous month
    - taxes, fee for meter-reading, little fees charged by utilities and states (e.g. system benefit charge), credit (state got a better deal after charging the clients)

## To Do:

1. Statistical & Graphical Analysis on the combined datasets
2. Check entris that doesn't make sense
   - Cases where other == kw and kwh == 0, why?
   - Cases where other == current and (kw!=0 or kwh != 0)

#### example

In [111]:
df[(df['Other_Charges'] != 0) & (df['Current_Charges'] == df['Other_Charges']) & (~((df['KWH_Charges'] == 0) & (df['KW_Charges'] == 0) \
  & (df['Consumption_KWH'] == 0) & (df['Consumption_KW'] == 0)))]

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Revenue_Year,Service_Start_Date,Service_End_Date,# days,Consumption_KW,KW_Charges,Consumption_KWH,KWH_Charges,Other_Charges,Current_Charges,Total_Charges,Total_Energy_Rate,Building_Meter
116669,QUEENSBRIDGE NORTH/QUEENSBRIDGE SOUTH,QUEENSBRIDGE NORTH BLD 12,505.0 - QUEENSBRIDGE NORTH BLD 12,1321833,2015-05-01,2015,2015-04-24,2015-05-26,32.0,64.0,1106.56,0.0,0.0,1106.56,1106.56,1106.56,inf,505.0 - QUEENSBRIDGE NORTH BLD 121321833
116670,QUEENSBRIDGE NORTH/QUEENSBRIDGE SOUTH,QUEENSBRIDGE NORTH BLD 12,505.0 - QUEENSBRIDGE NORTH BLD 12,1321833,2015-06-01,2015,2015-05-26,2015-06-24,29.0,122.0,2109.38,0.0,0.0,2109.38,2109.38,2109.38,inf,505.0 - QUEENSBRIDGE NORTH BLD 121321833
116671,QUEENSBRIDGE NORTH/QUEENSBRIDGE SOUTH,QUEENSBRIDGE NORTH BLD 12,505.0 - QUEENSBRIDGE NORTH BLD 12,1321833,2015-07-01,2015,2015-06-24,2015-07-24,30.0,102.0,1763.58,0.0,0.0,1763.58,1763.58,1763.58,inf,505.0 - QUEENSBRIDGE NORTH BLD 121321833
116672,QUEENSBRIDGE NORTH/QUEENSBRIDGE SOUTH,QUEENSBRIDGE NORTH BLD 12,505.0 - QUEENSBRIDGE NORTH BLD 12,1321833,2015-08-01,2015,2015-07-24,2015-08-24,31.0,92.0,1590.68,0.0,0.0,1590.68,1590.68,1590.68,inf,505.0 - QUEENSBRIDGE NORTH BLD 121321833
116673,QUEENSBRIDGE NORTH/QUEENSBRIDGE SOUTH,QUEENSBRIDGE NORTH BLD 12,505.0 - QUEENSBRIDGE NORTH BLD 12,1321833,2015-09-01,2015,2015-08-24,2015-09-23,30.0,106.0,1832.74,0.0,0.0,1832.74,1832.74,1832.74,inf,505.0 - QUEENSBRIDGE NORTH BLD 121321833
119874,RANDOLPH,BLD 26,278.0 - BLD 26,2180293,2012-01-01,2012,2011-12-23,2012-01-25,33.0,0.0,0.00,47.0,0.0,15.05,15.05,0.00,0.000000,278.0 - BLD 262180293
119875,RANDOLPH,BLD 26,278.0 - BLD 26,2180293,2012-02-01,2012,2012-01-25,2012-02-24,30.0,0.0,0.00,15.0,0.0,8.53,8.53,0.00,0.000000,278.0 - BLD 262180293
119876,RANDOLPH,BLD 26,278.0 - BLD 26,2180293,2012-03-01,2012,2012-02-24,2012-03-26,31.0,0.0,0.00,20.0,0.0,7.38,7.38,0.00,0.000000,278.0 - BLD 262180293
119896,RANDOLPH,BLD 26,278.0 - BLD 26,2180293,2013-12-01,2013,2013-11-21,2013-12-24,33.0,0.0,0.00,8.0,0.0,7.25,7.25,0.00,0.000000,278.0 - BLD 262180293
119897,RANDOLPH,BLD 26,278.0 - BLD 26,2180293,2014-01-01,2014,2013-12-24,2014-01-24,31.0,0.0,0.00,7.0,0.0,6.73,6.73,0.00,0.000000,278.0 - BLD 262180293


## To continue the work:

In [112]:
from __future__ import division
import pandas as pd
import numpy as np
import pandasql as pdsql
from datetime import datetime, timedelta

# import matplotlib as mpl
import matplotlib.pyplot as plt
# Setup matplotlib to display in notebook:
%matplotlib inline

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)         # initiate notebook for offline plot


In [113]:
df = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_CleanedDF")
df_combined_meter = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_meter")
df_combined_building = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_building")
df_gap = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_service_range_gaps")
df_orig = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_original_dataset")

In [114]:
df.groupby('Building_ID').agg({'Meter_Number':'nunique'}).reset_index()['Meter_Number'].value_counts()

1     1314
2      554
3       64
4       49
5        6
36       1
21       1
19       1
15       1
7        1
6        1
Name: Meter_Number, dtype: int64

#### Use SQL to explore the data

In [115]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select count(*) \
        from df \
        "
temp = pysql(str1)

#### Summary Statistics 

In [116]:
df[["Consumption_KWH",  "Consumption_KW", "Current_Charges", "KWH_Charges", "KW_Charges", "Other_Charges"]].describe()

Unnamed: 0,Consumption_KWH,Consumption_KW,Current_Charges,KWH_Charges,KW_Charges,Other_Charges
count,180020.0,180020.0,180020.0,180020.0,180020.0,180020.0
mean,46370.48,96.046511,6408.680075,2382.229199,1393.816722,2383.041867
std,58546.57,134.623154,8147.173101,3261.329493,2046.64884,3689.362177
min,0.0,0.0,-243.15,0.0,-20198.18,-59396.43
25%,6030.0,18.9,1162.0675,328.5275,112.63,467.955
50%,32740.0,68.69,4386.445,1596.9,1044.78,1471.835
75%,63040.0,126.2975,8540.8025,3138.3,2052.41,3112.72
max,1779600.0,16135.46,329800.37,195575.86,78782.96,134224.51


#### Check the billing day gaps per meter per month

In [117]:
df['gaps'] = (df['Service_End_Date'] - df['Service_Start_Date']).dt.days

df['gaps'] = df['gaps'].map(lambda x: max(0, 31-x))

df.gaps.value_counts().sort_index()

0     73939
1     46588
2     55509
3      3638
4       100
5        59
6        64
7        44
8        16
9         8
10        2
11        2
12        3
13        3
14        4
15        1
16        3
17        6
19        2
20        5
21        2
23        1
25        2
26       14
27        1
29        2
30        2
Name: gaps, dtype: int64

In [118]:
df['gaps'].describe()

count    180020.000000
mean          0.951044
std           0.976614
min           0.000000
25%           0.000000
50%           1.000000
75%           2.000000
max          30.000000
Name: gaps, dtype: float64

In [119]:
df[df['gaps'] > 5].shape[0]/df.shape[0]

0.0010387734696144873

#### Boxplot for gap days in a month

In [120]:
# x0 = np.random.randn(500)
# x1 = np.random.randn(500)+1

trace1 = go.Box(
#     x=df[(df['KW_Charges'] == 0) & df['Consumption_KW'] != 0]['Consumption_KW'], 
    x = df['gaps'], 
    opacity=0.75
)
# trace2 = go.Histogram(
#     x=x1,
#     opacity=0.75
# )

data = [trace1]
layout = go.Layout(barmode='overlay')
fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='boxplot')

#### Trendline of data completeness by year-month

In [121]:
temp = df_combined_meter.groupby(['Revenue_Month']).agg({'Building_Meter':'nunique'}).reset_index()
temp.columns = ['Revenue_Month', 'meter_counts']
temp['meter_perc'] = round(temp['meter_counts'] / df_combined_meter.Building_Meter.nunique(), 4)
temp = temp.sort_values('Revenue_Month')

In [122]:
# Create a trace
trace = go.Scatter(
    x = temp.Revenue_Month,
    y = temp.meter_perc
)

data = [trace]

layout = go.Layout(
    title='Trend Line of Data Compleness - % of Accounts with data available in that month',
    yaxis=dict(
#         title='% of Accounts with data available',
        tickformat=".1%"
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

#### Average Metrics

##### TO DO - Check where df_combined_meter['Total_Charges'] < 0 or df_combined_meter['Consumption_KWH'] == 0

In [123]:
mask = (df_combined_meter['Consumption_KWH'] > 0) & (df_combined_meter['Total_Charges'] > 0)

mask = df_combined_meter['Consumption_KWH'] > 0
temp = df_combined_meter[mask].groupby(['Revenue_Month']).agg({\
        'Total_Charges':'mean', 'Total_Energy_Rate': 'mean', 'KWH_Charges':'mean', 'KW_Charges':'mean'}).reset_index()

temp.columns = ['Revenue_Month', 'Total_Charges', 'Total_Energy_Rate', 'KWH_Charges', 'KW_Charges']

temp = temp.sort_values('Revenue_Month')

#### Trend Line of Average Energy Charges

In [124]:
# Create a trace

# Create traces
trace1 = go.Scatter(
    x = temp.Revenue_Month,
    y = temp.Total_Charges,
#     mode = 'lines',
    name = 'Avg. Total Charge'
)
trace2 = go.Scatter(
    x = temp.Revenue_Month,
    y = temp.Total_Energy_Rate,
#     mode = 'lines+markers',
    name = 'Avg. Total Charge Rate', 
    yaxis='y2'
)

data = [trace1, trace2]

layout = go.Layout(
    title='Trend Line of Average Energy Charges',
    yaxis=dict(
        title='Avg. Total Charges($)',
        tickformat=","
    ),
    yaxis2=dict(
        title='Avg. Total Charge Rates($/KWH)',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
#         tickformat=".2%",
        overlaying='y',
        side='right'
    ),
    legend=dict(x=-.1, y=1.2)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

#### Trend Line of Average KW and KWH Charges

In [125]:
# Create a trace

# Create traces
trace1 = go.Scatter(
    x = temp.Revenue_Month,
    y = temp.KWH_Charges,
#     mode = 'lines',
    name = 'Avg. KWH Charges'
)
trace2 = go.Scatter(
    x = temp.Revenue_Month,
    y = temp.KW_Charges,
#     mode = 'lines+markers',
    name = 'Avg. KW Charges', 
    yaxis='y2'
)

data = [trace1, trace2]

layout = go.Layout(
    title='Trend Line of Average KW and KWH Charges',
    yaxis=dict(
        title='Avg. KWH Charges($)',
        tickformat=","
    ),
    yaxis2=dict(
        title='Avg. KW Charges($)',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickformat=",",
        overlaying='y',
        side='right'
    ),
    legend=dict(x=-.1, y=1.2)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)