In [1]:
import pandas as pd
import numpy as np
import pandasql as pdsql
from datetime import datetime

### 1. Read in the data 

In [2]:
df = pd.read_csv("../data/NYC Open Data - Electric_Consumption_And_Cost__2010_-__June_2018_.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df.shape

(313147, 27)

#### Check the number of empty values in each column

In [4]:
df.isnull().sum()

Development Name         146
Borough                  146
Account Name             146
Location                9041
Meter AMR                187
Meter Scope           296588
TDS #                   1717
EDP                      146
RC Code                  146
Funding Source           146
AMP #                   1657
Vendor Name              146
UMIS BILL ID             146
Revenue Month            146
Service Start Date       146
Service End Date         146
# days                   146
Meter Number             146
Estimated                146
Current Charges          146
Rate Class               146
Bill Analyzed            146
Consumption (KWH)        146
KWH Charges              146
Consumption (KW)         146
KW Charges               146
Other charges            146
dtype: int64

### 2. Remove empty rows

In [5]:
mask = (pd.isna(df['Account Name']) == True)
df.drop(mask[mask == True].index, axis = 0, inplace = True)

### 3. Remove rows where electricity charges were estimated

In [6]:
df.drop(df.loc[df['Estimated'] == 'Y         '].index, axis = 0, inplace = True)

#### Check data types of columns

In [7]:
df.dtypes

Development Name       object
Borough                object
Account Name           object
Location               object
Meter AMR              object
Meter Scope            object
TDS #                 float64
EDP                   float64
RC Code                object
Funding Source         object
AMP #                  object
Vendor Name            object
UMIS BILL ID          float64
Revenue Month          object
Service Start Date     object
Service End Date       object
# days                float64
Meter Number           object
Estimated              object
Current Charges        object
Rate Class             object
Bill Analyzed          object
Consumption (KWH)     float64
KWH Charges            object
Consumption (KW)       object
KW Charges             object
Other charges          object
dtype: object

#### change column names for easy reference

In [8]:
df.columns = ['Development_Name', 'Borough', 'Account_Name', 'Location', 'Meter_AMR',
       'Meter_Scope', 'TDS #', 'EDP', 'RC_Code', 'Funding_Source', 'AMP #',
       'Vendor_Name', 'UMIS_BILL_ID', 'Revenue_Month', 'Service_Start_Date',
       'Service_End_Date', '# days', 'Meter_Number', 'Estimated',
       'Current_Charges', 'Rate_Class', 'Bill_Analyzed', 'Consumption_KWH',
       'KWH_Charges', 'Consumption_KW', 'KW_Charges', 'Other_Charges']

### 4. Data Type Converstion

1. Change the following fields from string to numerical:
    - "Consumption_KW", "Current_Charges", "KWH_Charges", "KW_Charges", "Other_Charges"

In [9]:
df["Consumption_KW"] = df["Consumption_KW"].apply(lambda x: x.replace(",","") if type(x) == str else str(x))
df["Consumption_KW"] = df["Consumption_KW"].astype(float)

In [10]:
df["Current_Charges"] = df["Current_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["Current_Charges"] = df["Current_Charges"].astype(float)

In [11]:
df["KWH_Charges"] = df["KWH_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["KWH_Charges"] = df["KWH_Charges"].astype(float, inplace = True)

In [12]:
df["KW_Charges"] = df["KW_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["KW_Charges"] = df["KW_Charges"].astype(float, inplace = True)

In [13]:
df["Other_Charges"] = df["Other_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["Other_Charges"] = df["Other_Charges"].astype(float, inplace = True)

#### Summarize the numerical fields

More than 25% of the values for all except "Curent Charges" are 0

In [14]:
df[["Consumption_KWH",  "Consumption_KW", "Current_Charges", "KWH_Charges", "KW_Charges", "Other_Charges"]].describe()

Unnamed: 0,Consumption_KWH,Consumption_KW,Current_Charges,KWH_Charges,KW_Charges,Other_Charges
count,261252.0,261252.0,261252.0,261252.0,261252.0,261252.0
mean,32565.72,68.239881,4510.758176,1673.941416,1084.801607,1672.128898
std,53071.71,122.266314,6630.228947,2921.443192,1807.808563,3626.935491
min,0.0,0.0,-243.15,0.0,0.0,-59396.43
25%,0.0,0.0,384.3425,0.0,0.0,0.0
50%,11360.0,31.51,2536.945,555.02,448.37,896.51
75%,48160.0,98.5,6052.3575,2359.8925,1601.4225,2636.255
max,1779600.0,16135.46,329800.37,195575.86,78782.96,134224.51


2. Unify the format of "Meter_Number" field (some values exists in both numerical and string

In [None]:
# def tryConvertToInt(x):
#     try:
#         return int(x.strip())
#     except (ValueError, TypeError):
#         return x

In [15]:
df['Meter_Number'] = df['Meter_Number'].apply(lambda x: str(x) if type(x) == int else x)

### 5. Convert Revenue_Month and Two dates to datetime type

In [16]:
df["Revenue_Month"] = df["Revenue_Month"].map(lambda x: datetime.strptime(x.split(" ")[0], '%m/%d/%Y'))
df['Service_Start_Date'] = df['Service_Start_Date'].map(lambda x: datetime.strptime(x, '%m/%d/%Y'))
df['Service_End_Date'] = df['Service_End_Date'].map(lambda x: datetime.strptime(x, '%m/%d/%Y'))

#### Check "Meter Scope": Do the row with a range value represent a "Master Meter" (i.e. its value is the sum of other related rows)? - No

df['Meter Scope'].value_counts()

df[(df['TDS #'] == 118) & (df["Revenue_Month"] == '2010-02-01')][["Location", "Meter Scope", "Revenue_Month", "Current_Charges"]]

df[(df['Meter Scope'] == 'Community Center')].groupby('Location').mean()

df[(df['Meter Scope'] == 'BLD 1 - 9')].groupby('Location').mean()

### 6. Create an unique identifier for each building and remove unnecessary fields

In [17]:
# the combination of TDS# and Location uniquely determines a buildling
# Use EDP or RC Code when TDS# is not available
df['Building_ID'] = df['TDS #'].combine_first(df['EDP']).map(str).combine_first(df['RC_Code']) \
                    + " - " + df['Location'].map(lambda x: 'NA' if pd.isna(x) else x)

In [18]:
# Building_ID alone is not the primary key of the data
df.groupby(['Building_ID', 'Revenue_Month']).count().shape[0]/df.shape[0]

0.6323396567299007

In [19]:
# the combination of Building_ID, meter number and revenue month is still not a primary key
df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month']).count().shape[0]/df.shape[0]

0.9987636458285487

In [20]:
# Define a list of columns of interest
cols = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
        'Revenue_Month', 'Service_Start_Date', 'Service_End_Date', '# days', 
       'Current_Charges','Consumption_KWH', 'KWH_Charges',
       'Consumption_KW', 'KW_Charges', 'Other_Charges']
df = df[cols]

In [21]:
# the combination of Building_ID, meter number and revenue month is almost a primary key
df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).count().shape[0]/df.shape[0]

0.9994870852663329

### 7. There are duplicated rows in the dataset - remove duplicates in df

In [22]:
df = df.drop_duplicates()

### 8. Check which combinations of the 5 fields (Building_ID, Meter, Month, StartDate, EndDate) has multiple rows and why

In [23]:
idx = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).count()['Account_Name'].reset_index()
idx.columns = ['Building_ID', 'Meter_Number', 'Revenue_Month','Service_Start_Date', 'Service_End_Date', 'Counts']
idx = idx[idx['Counts'] > 1]

dupRows = idx.sort_values('Counts', ascending = False)

a = pd.merge(dupRows.iloc[:, 0:3], df[cols], on = \
         ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner')[cols]\
        .sort_values(['Building_ID', 'Meter_Number', 'Revenue_Month'])

#### half of these problematic rows has zero values in the numerical fields like "current charges"

In [24]:
a

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Service_Start_Date,Service_End_Date,# days,Current_Charges,Consumption_KWH,KWH_Charges,Consumption_KW,KW_Charges,Other_Charges
0,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2012-12-01,2012-11-21,2012-12-24,33.0,0.00,0.0,0.00,0.00,0.00,0.00
1,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2012-12-01,2012-11-21,2012-12-24,33.0,0.00,0.0,0.00,54.43,1109.09,-1109.09
36,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-01-01,2012-12-24,2013-01-24,31.0,0.00,0.0,0.00,0.00,0.00,0.00
37,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-01-01,2012-12-24,2013-01-24,31.0,0.00,0.0,0.00,52.08,1105.73,-1105.73
40,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-02-01,2013-01-24,2013-02-25,32.0,0.00,0.0,0.00,0.00,0.00,0.00
41,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-02-01,2013-01-24,2013-02-25,32.0,0.00,0.0,0.00,52.94,1166.15,-1166.15
42,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-03-01,2013-02-25,2013-03-26,29.0,0.00,0.0,0.00,0.00,0.00,0.00
43,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-03-01,2013-02-25,2013-03-26,29.0,0.00,0.0,0.00,50.93,1169.81,-1169.81
44,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-04-01,2013-03-26,2013-04-24,29.0,0.00,0.0,0.00,0.00,0.00,0.00
45,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-04-01,2013-03-26,2013-04-24,29.0,0.00,0.0,0.00,51.46,1146.50,-1146.50


#### remove those rows from the dataset

In [25]:
df = df[~((df['Current_Charges'] == 0) & (df['KWH_Charges'] == 0) & (df['KW_Charges'] == 0) \
  & (df['Other_Charges'] == 0) & (df['Consumption_KWH'] == 0) & (df['Consumption_KW'] == 0))]

In [26]:
idx = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).count()['Account_Name'].reset_index()
idx.columns = ['Building_ID', 'Meter_Number', 'Revenue_Month','Service_Start_Date', 'Service_End_Date', 'Counts']
idx = idx[idx['Counts'] > 1]

dupRows = idx.sort_values('Counts', ascending = False)

a = pd.merge(dupRows.iloc[:, 0:3], df[cols], on = \
         ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner')[cols]\
        .sort_values(['Building_ID', 'Meter_Number', 'Revenue_Month'])

#### Only one row left, seems a case of rebilling

In [27]:
a

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Service_Start_Date,Service_End_Date,# days,Current_Charges,Consumption_KWH,KWH_Charges,Consumption_KW,KW_Charges,Other_Charges
0,THROGGS NECK,BLD 11,63.0 - BLD 11,8125318,2011-10-01,2011-09-22,2011-10-24,32.0,1306.02,12880.0,858.84,0.0,0.0,447.18
1,THROGGS NECK,BLD 11,63.0 - BLD 11,8125318,2011-10-01,2011-09-22,2011-10-24,32.0,2693.18,26560.0,1771.02,0.0,0.0,922.16


### 9. Add a column for Revenue_Year and reorder the columns

In [28]:
df['Revenue_Year'] = df['Revenue_Month'].dt.year

In [29]:
col_ordered = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
       'Revenue_Month', 'Revenue_Year', 'Service_Start_Date', 'Service_End_Date',
       '# days', 'Consumption_KW', 'KW_Charges', 
       'Consumption_KWH', 'KWH_Charges', 'Other_Charges', 'Current_Charges']

df = df[col_ordered]

In [30]:
df_orig = df

### 10. Identify accounts that have separated meters for KW and KWH charges and combine the meter numbers

There are many cases where under the same Building_ID, two meter numbers share the same last 6 digits and service date ranges. Usually one meter has zero values in all KW_Charges and one has zero values in all KWH_Charges. It seems reasonable to combined them (Exceptions do exist though. Further investigation needed based on the following codes.)

#### Use sql to explore the dataset

In [33]:
pysql = lambda q: pdsql.sqldf(q, globals())

In [65]:
temp = df.groupby(['Building_ID', 'Meter_Number']).agg('count').reset_index()[['Building_ID', 'Meter_Number']]

In [66]:
str1 = "select distinct l.Building_ID, l.Meter_Number, r.Meter_Number\
        from temp l join temp r on l.Building_ID = r.Building_ID and l.Meter_Number > r.Meter_Number \
        where substr(l.Meter_Number, 2, length(l.Meter_number)) == substr(r.Meter_Number, 2, length(r.Meter_number))"
df_meter_mapping = pysql(str1)

df_meter_mapping.columns = ['Building_ID', 'Meter_Number_L', 'Meter_Number_S']


#### 25.7% of the meter numbers can be mapped to another

In [67]:
str1 = "select count (distinct Meter_Number_S) as count_redudant_meters\
        from df_meter_mapping"
str2 = "select count (distinct Meter_Number) as count_meters\
        from temp"
pysql(str1)['count_redudant_meters'][0]/pysql(str2)['count_meters'][0]


0.2571872571872572

In [68]:
del(temp)

#### check if the two meters correspond to KWH_Charges and KW_Charges respectively

In [None]:
# df_sumChargesByMeter = df.groupby(['Building_ID', 'Meter_Number']).agg({'KW_Charges': 'sum', 'KWH_Charges' : 'sum', 'Other_Charges' : 'sum'}).reset_index()

# df_sumChargesByMeter.columns = ['Building_ID', 'Meter_Number', 'KW_Charges', 'KWH_Charges', 'Other_Charges']

# pd.merge(df_sumChargesByMeter[df_sumChargesByMeter['KW_Charges'] == 0][['Building_ID', 'Meter_Number']],
# df_sumChargesByMeter[df_sumChargesByMeter['KWH_Charges'] == 0][['Building_ID', 'Meter_Number']], on = 'Building_ID'
# , how = 'inner')

# df_sumChargesByMeter[df_sumChargesByMeter['KWH_Charges'] == 0].head()

# str1 = "select l.Building_ID, l.Meter_Number_L, l.Meter_Number_S \
#         , sum(r.KW_Charges) as total_KW_Charges, sum(r.KWH_Charges) as total_KWH_Charges \
#         from df_meter_mapping l join df_sumChargesByMeter r on l.Building_ID = r.Building_ID\
#         and l.Meter_Number_L = r.Meter_Number group by l.Building_ID, l.Meter_Number_L, l.Meter_Number_S"
# pysql(str1)

##### examples:

In [None]:
# df[(df['Building_ID'] == '10.0 - BLD 01') & ((df['Meter_Number'] == 7864550) | (df['Meter_Number'] == 1864550))].sort_values(['Service_Start_Date'])


# df[(df['Building_ID'] == '101.0 - BLD 02') & (df['KW_Charges'] > 0) & ((df["Meter_Number"] == 7834072) )]

In [None]:
# ## This query (using "not exists") does not work for pandas dataframe...
# str1 = "select distinct l.Building_ID, l.Meter_Number, r.Meter_Number \
#         from df_gap l join df_gap r on l.Building_ID = r.Building_ID and l.Meter_Number > r.Meter_Number \
#         where not exists (l.Service_Start_Date > r.Service_Start_Date or l.Service_End_Date < r.Service_End_Date)"
# pysql(str1)

#### Combined the meter numbers 

In [73]:
temp = pd.merge(df, df_meter_mapping, left_on = ['Building_ID', 'Meter_Number'], right_on = ['Building_ID','Meter_Number_S'], how = 'left')
temp['Meter_Number_New'] = temp['Meter_Number_L'].combine_first(temp['Meter_Number'])


In [76]:
df = temp

In [79]:
del(temp)

In [77]:
df.drop(['Meter_Number', 'Meter_Number_L', 'Meter_Number_S'], axis = 1, inplace = True)

df.columns = ['Account_Name', 'Location', 'Building_ID', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges', 'Meter_Number']

col_ordered = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges']

df = df[col_ordered]

### 11. Find the accounts with switched meters

There are Building_ID's whose meter number changed over the years, need to find the mapping and consolidate the meter numbers (some cases it's a many-to-many mapping, I'm excluding those cases for now)

outputs: 
1. df_multiple_meter_switch (building_id's with many-to-many meter mapping, need to investigate later)
2. df (with consolidated meter numbers)

In [81]:
from dateutil.relativedelta import *

In [82]:
a = df.groupby(['Building_ID']).agg({'Meter_Number': 'nunique'}).reset_index()

a = a[a["Meter_Number"]>1]

a.columns = ['Building_ID', 'Counts']

a = pd.merge(a, df, on = 'Building_ID', how = 'inner')[['Building_ID', 'Meter_Number', "Revenue_Month"]]\
.groupby(['Building_ID', 'Meter_Number']).agg({'Revenue_Month': ['max','min']}).reset_index()

a.columns = a.columns.get_level_values(0)

a.columns = ['Building_ID', 'Meter_Number', 'Max_Month', 'Min_Month']

a['Max_Month_Next'] = a['Max_Month'].map(lambda x: x + relativedelta(months=+1))
a['Min_Month_Prior'] = a['Min_Month'].map(lambda x: x - relativedelta(months=+1))
df_switch_meter = a

del(a)

In [84]:
str1 = "select l.Building_ID, l.Meter_Number as Meter_Number_E, r.Meter_Number as Meter_Number_L \
        from df_switch_meter l join df_switch_meter r on l.Building_ID = r.Building_ID and l.Meter_Number != r.Meter_Number \
        where l.Max_Month == r.Min_Month_Prior"
a = pysql(str1)

In [85]:
df_meter_switch = pd.DataFrame(a['Building_ID'].value_counts() > 1).reset_index()
df_meter_switch.columns = ['Building_ID', 'Dummy']

df_single_meter_switch = df_meter_switch[df_meter_switch['Dummy'] == False]
df_multiple_meter_switch = df_meter_switch[df_meter_switch['Dummy'] == True]

In [86]:
df_meter_switch = pd.merge(a, df_single_meter_switch, on = 'Building_ID', how = 'inner')[['Building_ID', 'Meter_Number_E', 'Meter_Number_L']]

In [91]:
del(a)

#### Combined the meter numbers 

In [96]:
a = pd.merge(df, df_meter_switch, left_on = ['Building_ID', 'Meter_Number'], right_on = ['Building_ID', 'Meter_Number_E'], how = 'left')
a['Meter_Number_New'] = a['Meter_Number_L'].combine_first(a['Meter_Number'])
df = a

df.drop(['Meter_Number', 'Meter_Number_L', 'Meter_Number_E'], axis = 1, inplace = True)
df.columns = ['Account_Name', 'Location', 'Building_ID', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges', 'Meter_Number']
col_ordered = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges']
df = df[col_ordered]

In [97]:
# save the df_multiple_meter_switch
df_multiple_meter_switch.to_pickle("../output/df_multiple_meter_switch")

### 12. After combinging the meter numbers, there are cases where multiple rows exist for the same Meter and Service date ranges

In [153]:
idx = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).agg(['count'])['Account_Name'].reset_index()
idx = idx[idx['count'] > 1]

In [174]:
idx['count'].value_counts()

2    68680
Name: count, dtype: int64

In [175]:
mask = (df['Building_ID'] == '70.0 - BLD 01') & (df['Revenue_Year'] == 2013) & ( (df['Meter_Number'] == '8095177') | (df['Meter_Number'] == '8095173'))
df[mask]

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Revenue_Year,Service_Start_Date,Service_End_Date,# days,Consumption_KW,KW_Charges,Consumption_KWH,KWH_Charges,Other_Charges,Current_Charges
78691,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-04-01,2013,2013-03-26,2013-04-24,29.0,0.0,0.0,45360.0,2339.67,4569.3,6908.97
78705,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-05-01,2013,2013-04-24,2013-05-23,29.0,90.53,2155.75,0.0,0.0,-2155.75,0.0
78719,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-05-01,2013,2013-04-24,2013-05-23,29.0,0.0,0.0,65040.0,3354.76,5421.44,8776.2
78733,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-06-01,2013,2013-05-23,2013-06-24,32.0,116.16,2163.26,0.0,0.0,-2163.26,0.0
78747,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-06-01,2013,2013-05-23,2013-06-24,32.0,0.0,0.0,90480.0,5100.36,6561.4,11661.76
78761,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-07-01,2013,2013-06-24,2013-07-24,30.0,161.38,2833.83,0.0,0.0,1516.47,4350.3
78775,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-07-01,2013,2013-06-24,2013-07-24,30.0,0.0,0.0,136320.0,7684.36,4112.17,11796.53
78789,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-08-01,2013,2013-07-24,2013-08-22,29.0,136.03,2388.69,0.0,0.0,1010.35,3399.04
78803,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-08-01,2013,2013-07-24,2013-08-22,29.0,0.0,0.0,104640.0,5898.56,2495.0,8393.56
78817,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-09-01,2013,2013-08-22,2013-09-23,32.0,131.42,2307.74,0.0,0.0,1222.3,3530.04


In [184]:
df = df.groupby(['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
       'Revenue_Month', 'Revenue_Year', 'Service_Start_Date',
       'Service_End_Date', '# days']).\
    agg({'Consumption_KW': 'sum', 'KW_Charges': 'sum', 'Consumption_KWH': 'sum', 'KWH_Charges': 'sum', 'Other_Charges': 'sum', 'Current_Charges': 'sum'}).reset_index()

### 12. Find the gaps between service date ranges

We'd like to know how many account have gaps (> 5 days) in their billing windows

#### concatenate service date ranges for each builing_id, meter number and revenue year

In [198]:
# sort by building_id, revenue year, meter number
df = df.sort_values(by = ['Meter_Number', 'Revenue_Year', 'Service_Start_Date'], ascending=[True, True, True])

def merge_dates(grp):
    # Find contiguous date groups, and get the first/last start/end date for each group.
    dt_groups = (grp['Service_Start_Date'] != grp['Service_End_Date'].shift()).cumsum()
    return grp.groupby(dt_groups).agg({'Service_Start_Date': 'first', 'Service_End_Date': 'last'})

# Perform a groupby and apply the merge_dates function, followed by formatting.
df_gap = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Year']).apply(merge_dates)
df_gap = df_gap.reset_index().drop('level_3', axis = 1)
df_gap = df_gap.reset_index()
df_gap.columns = ['rowNum', 'Building_ID', 'Meter_Number', 'Revenue_Year',
       'Service_Start_Date', 'Service_End_Date']

df_gap['nextRowNum'] = df_gap['rowNum'].map(lambda x: x+1)

# Join the dataframe with itself to find the gap between service ranges
df_gap = pd.merge(df_gap, df_gap[['Building_ID', 'Meter_Number', 'nextRowNum', 'Service_End_Date']],\
        left_on = ['Building_ID', 'Meter_Number', 'rowNum'], right_on = ['Building_ID', 'Meter_Number', 'nextRowNum'], how = 'left')

# consecutive days of billing for the same meter number
df_gap['consecutive_days'] = \
df_gap[['Service_End_Date_x', 'Service_Start_Date']].apply(lambda x: (x[0] - x[1]).days, axis = 1)

# gap days from the previous service range of the same meter number
df_gap['gap_days'] = \
df_gap[['Service_Start_Date', 'Service_End_Date_y']].apply(lambda x: (x[0] - x[1]).days, axis = 1)


# Rename and reorder the columns
df_gap = df_gap[['Building_ID', 'Meter_Number', 'Revenue_Year', 'Service_Start_Date', 'Service_End_Date_x', 'consecutive_days', 'gap_days']]
df_gap.columns = ['Building_ID', 'Meter_Number', 'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', 'consecutive_days', 'gap_days']

In [199]:
df_gap['Building_Meter'] = df_gap['Building_ID'] + df_gap['Meter_Number']

In [200]:
df_gap.head()

Unnamed: 0,Building_ID,Meter_Number,Revenue_Year,Service_Start_Date,Service_End_Date,consecutive_days,gap_days,Building_Meter
0,1.0 - BLD 01,7836716,2010,2009-12-24,2010-12-23,364,,1.0 - BLD 017836716
1,1.0 - BLD 01,7836716,2011,2010-12-23,2011-05-24,152,0.0,1.0 - BLD 017836716
2,1.0 - BLD 01,7836716,2011,2011-06-23,2011-08-23,61,30.0,1.0 - BLD 017836716
3,1.0 - BLD 01,7836716,2011,2011-09-22,2011-12-23,92,30.0,1.0 - BLD 017836716
4,1.0 - BLD 01,7836716,2012,2011-12-23,2012-06-22,182,0.0,1.0 - BLD 017836716


#### How frequent does a meter have gaps longer than 5 days in a year (i.e. service date ranges didn't cover the whole year) ? ~40%

In [None]:
# df_gap[df_gap['gap_days'] > 5].groupby(['Building_ID', 'Meter_Number']).agg(['count'])['Revenue_Year'].reset_index().shape[0] / \
# df_gap.groupby(['Building_ID', 'Meter_Number']).agg(['count'])['Revenue_Year'].reset_index().shape[0]

In [209]:
a = df_gap[df_gap['gap_days'] > 5].groupby(['Revenue_Year']).agg({'Building_Meter':'nunique'}).reset_index()

b = df_gap.groupby(['Revenue_Year']).agg({'Building_Meter':'nunique'}).reset_index()

temp = pd.merge(a, b, on = 'Revenue_Year')

temp.columns = ['Revenue_Year', 'Meter_Count_Gap', 'Meter_Count_Total']

temp['Gap_Perc'] = temp['Meter_Count_Gap'] / temp['Meter_Count_Total']

In [210]:
np.mean(temp['Gap_Perc'])

0.3982947414836333

In [211]:
temp

Unnamed: 0,Revenue_Year,Meter_Count_Gap,Meter_Count_Total,Gap_Perc
0,2010,911,1986,0.458711
1,2011,1027,2133,0.481481
2,2012,1087,2269,0.479066
3,2013,638,2244,0.284314
4,2014,775,2455,0.315682
5,2015,546,2167,0.251961
6,2016,1094,2209,0.495247
7,2017,764,2103,0.363291
8,2018,933,2051,0.4549


In [204]:
del(a, b)

### 11. Combine rows to the Building-Meter-Month level and Building-Month level; add new aggregation metrics

We need to analyze anamolous values of charges and consumptions at the Building-Meter-Month level and Building-Month level

In [215]:
df_combined_meter = pd.pivot_table(df, values = ['Current_Charges','Consumption_KWH', 'KWH_Charges',\
       'Consumption_KW', 'KW_Charges', 'Other_Charges'], index=['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
       'Revenue_Month'], aggfunc = np.sum).reset_index()

df_combined_building = pd.pivot_table(df, values = ['Current_Charges','Consumption_KWH', 'KWH_Charges',\
       'Consumption_KW', 'KW_Charges', 'Other_Charges'], index=['Account_Name', 'Location', 'Building_ID',
       'Revenue_Month'], aggfunc = np.sum).reset_index()

In [216]:
df_combined_meter['Total_Charges'] = df_combined_meter['KW_Charges'] + df_combined_meter['KWH_Charges']
df_combined_meter['Total_Energy_Rate'] = df_combined_meter['Total_Charges']/df_combined_meter['Consumption_KWH']

In [217]:
df_combined_building['Total_Charges'] = df_combined_building['KW_Charges'] + df_combined_building['KWH_Charges']
df_combined_building['Total_Energy_Rate'] = df_combined_building['Total_Charges']/df_combined_building['Consumption_KWH']

### 12. Save the cleaned data to the output folder

In [218]:
df.to_pickle("../output/NYCHA_Electricity_2010_to_2018_CleanedDF")

In [219]:
df_combined_meter.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_meter")

In [220]:
df_combined_building.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_building")

## Q&A with Linnea:

1. why would "Consumption_KW" be zero?
    - KW and KWH should be both positive, unless there are some related bills that already covers it
    - Maybe one account was separated into multiple meters?
2. What's the "Other Charges"?
    - negative values to adjust for the payments from previous month
    - taxes, fee for meter-reading, little fees charged by utilities and states (e.g. system benefit charge), credit (state got a better deal after charging the clients)

## To Do:

1. investigate the zero values in the fields of charges and consumption
    - current charges = 0 and other charges != 0
    - current charges == other charges and current charges != 0
2. investigate why (Current Charge) != (KWH charges) + (KW charges) + (Other Charges)
3. check rebills (can probably detected by variance-checking) - overlapping service date ranges
4. create a flag column in the original dataframe to indicate the problematic rows
    - zero values in consumptions and charges
    - sum of charges != current charges
    - rebilling
    - gap in service ranges
    - not covering the full year
5.  There are Meter_Numbers in irregular format:
    - '102.0 - MORRIS I BLD 05'	'8096662 41-5'
    - '116.0 - WEST BRIGHTON I BLD 06'	'1860113_1600'	'7860113_1600-87.8%'
    - '8096662 41-5'	'8096662-41.5'


### Issue 1. Investigate the zero values in the fields of charges and consumption (Incomplete)

#### Over 27.9% of the records either have KWH charges being zero or KW charges being zero
#### Around 6.1% of the records have inconsistent "charges" value and "consumption" value

#### check how much % of rows have zero values in each numeric field

In [187]:
df[(df['KWH_Charges'] == 0) ].shape[0]\
    /df.shape[0]

0.08473910704679935

In [188]:
df[(df['Consumption_KWH'] == 0)].shape[0]\
    /df.shape[0]

0.08147391070467994

In [189]:
df[(df['KW_Charges'] == 0)].shape[0]\
    /df.shape[0]

0.19898332436793975

In [190]:
df[ (df['Consumption_KW'] == 0)].shape[0]\
    /df.shape[0]

0.14391070467993544

In [191]:
df[ (df['Current_Charges'] == 0)].shape[0]\
    /df.shape[0]

0.041748251748251745

In [214]:
df[(df['Current_Charges'] == 0)].iloc[:, 0:16].head()

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Revenue_Year,Service_Start_Date,Service_End_Date,# days,Consumption_KW,KW_Charges,Consumption_KWH,KWH_Charges,Other_Charges,Current_Charges
830,ALBANY/ALBANY II,ALBANY BLD 05,31.0 - ALBANY BLD 05,19046,2012-03-01,2012,2012-02-24,2012-03-26,31.0,28.75,495.49,0.0,0.0,-495.49,0.0
831,ALBANY/ALBANY II,ALBANY BLD 05,31.0 - ALBANY BLD 05,19046,2012-04-01,2012,2012-03-26,2012-04-24,29.0,30.0,590.15,0.0,0.0,-590.15,0.0
832,ALBANY/ALBANY II,ALBANY BLD 05,31.0 - ALBANY BLD 05,19046,2012-05-01,2012,2012-04-24,2012-05-23,29.0,28.75,568.15,0.0,0.0,-568.15,0.0
837,ALBANY/ALBANY II,ALBANY BLD 05,31.0 - ALBANY BLD 05,19046,2012-10-01,2012,2012-09-21,2012-10-23,32.0,32.5,612.01,0.0,0.0,-612.01,0.0
838,ALBANY/ALBANY II,ALBANY BLD 05,31.0 - ALBANY BLD 05,19046,2012-11-01,2012,2012-10-23,2012-11-21,29.0,31.25,625.35,0.0,0.0,-625.35,0.0


In [205]:
df[ (df['Other_Charges'] == 0)].shape[0]\
    /df.shape[0]

0.03669714900484131

#### check the correlation of numerical fields having zero values

In [194]:
df[(df['KWH_Charges'] == 0) | (df['Consumption_KWH'] == 0) \
   | (df['KW_Charges'] == 0) | (df['Consumption_KW'] == 0) ].shape[0]\
    /df.shape[0]

0.2798063474986552

In [195]:
df[((df['KWH_Charges'] == 0) ^ (df['Consumption_KWH'] == 0)) \
   | ((df['KW_Charges'] == 0) ^ (df['Consumption_KW'] == 0)) ].shape[0]\
    /df.shape[0]

0.061603012372243145

In [196]:
# df[((df['KWH_Charges'] == 0) & (df['Consumption_KWH'] == 0)) ]\
# .groupby(['KWH_Charge_0', 'KWH_0', 'KW_Charge_0', 'KW_0']).agg(['count'])

KeyError: 'KWH_Charge_0'

In [136]:
# df[((df['KWH_Charges'] == 0) ^ (df['Consumption_KWH'] == 0)) ]\
# .groupby(['KWH_Charge_0', 'KWH_0', 'KW_Charge_0', 'KW_0']).agg(['count'])

KeyError: 'KWH_Charge_0'

In [137]:
# df[((df['KW_Charges'] == 0) & (df['Consumption_KW'] == 0))]\
# .groupby(['KW_Charge_0', 'KW_0', 'KWH_Charge_0', 'KWH_0']).agg(['count'])

KeyError: 'KW_Charge_0'

In [None]:
# df[((df['KW_Charges'] == 0) ^ (df['Consumption_KW'] == 0))]\
# .groupby(['KW_Charge_0', 'KW_0', 'KWH_Charge_0', 'KWH_0']).agg(['count'])

### **Issue- 2 - Check for cases where current charge is not equal to the sum of KWH, kW and Other_Charges**

In [221]:
import pandas as pd
clean_df = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_CleanedDF")

In [222]:
# create a new column that aggregates all charge values
clean_df["sum_all_charges"] = clean_df.apply(lambda x: x['KW_Charges'] + x['KWH_Charges'] + x['Other_Charges'], axis=1)

In [223]:
# check if the current charges are equal to the sum of all charges column
clean_df["check_current_charge"] = clean_df.apply(lambda x: round(x['Current_Charges']) == round(x['sum_all_charges']), axis=1)

In [224]:
# the number of rows with mismatching values
clean_df['check_current_charge'].value_counts()

True     162887
False     23013
Name: check_current_charge, dtype: int64

In [225]:
# percentage of rows with mismatching values in the clean data
(len(clean_df[(clean_df.check_current_charge == False)])/len(clean_df['check_current_charge']))*100

12.379236148466918

In [226]:
clean_df[(clean_df.check_current_charge == False)]

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Revenue_Year,Service_Start_Date,Service_End_Date,# days,Consumption_KW,KW_Charges,Consumption_KWH,KWH_Charges,Other_Charges,Current_Charges,sum_all_charges,check_current_charge
101094,MORRIS I/MORRISANIA,MORRIS I BLD 05,130.0 - MORRIS I BLD 05,1096662-58.5,2015-07-01,2015,2015-06-24,2015-07-24,30.0,329.38,5214.06,0.0,0.00,5214.06,8682.04,10428.12,False
101095,MORRIS I/MORRISANIA,MORRIS I BLD 05,130.0 - MORRIS I BLD 05,1096662-58.5,2015-08-01,2015,2015-07-24,2015-08-24,31.0,336.54,5327.41,0.0,0.00,5327.41,8058.25,10654.82,False
101096,MORRIS I/MORRISANIA,MORRIS I BLD 05,130.0 - MORRIS I BLD 05,1096662-58.5,2015-09-01,2015,2015-08-24,2015-09-23,30.0,329.94,5222.95,0.0,0.00,5222.95,7838.21,10445.90,False
101097,MORRIS I/MORRISANIA,MORRIS I BLD 05,130.0 - MORRIS I BLD 05,1096662-58.5,2015-10-01,2015,2015-09-23,2015-10-23,30.0,234.19,3995.56,0.0,0.00,3995.56,6196.67,7991.12,False
101098,MORRIS I/MORRISANIA,MORRIS I BLD 05,130.0 - MORRIS I BLD 05,1096662-58.5,2015-11-01,2015,2015-10-23,2015-11-23,31.0,208.21,3995.56,0.0,0.00,3995.56,5532.18,7991.12,False
101099,MORRIS I/MORRISANIA,MORRIS I BLD 05,130.0 - MORRIS I BLD 05,1096662-58.5,2015-12-01,2015,2015-11-23,2015-12-24,31.0,205.12,2337.35,0.0,0.00,2337.35,0.00,4674.70,False
21174,BROWNSVILLE/VAN DYKE I,BROWNSVILLE BLD 20,16.0 - BROWNSVILLE BLD 20,1125622,2015-01-01,2015,2014-12-24,2015-01-26,33.0,38.40,763.46,0.0,0.00,763.46,0.00,1526.92,False
21175,BROWNSVILLE/VAN DYKE I,BROWNSVILLE BLD 20,16.0 - BROWNSVILLE BLD 20,1125622,2015-02-01,2015,2015-01-26,2015-02-25,30.0,36.96,788.96,0.0,0.00,788.96,0.00,1577.92,False
21176,BROWNSVILLE/VAN DYKE I,BROWNSVILLE BLD 20,16.0 - BROWNSVILLE BLD 20,1125622,2015-03-01,2015,2015-02-25,2015-03-26,29.0,40.80,935.43,0.0,0.00,935.43,0.00,1870.86,False
21177,BROWNSVILLE/VAN DYKE I,BROWNSVILLE BLD 20,16.0 - BROWNSVILLE BLD 20,1125622,2015-04-01,2015,2015-03-26,2015-04-24,29.0,40.70,626.74,0.0,0.00,626.74,1377.47,1253.48,False


**Looking at the data for which the values are not the same, nothing out of the ordinary seems to stand out. It could be due to missing fields, typos while entering the data, or unknown parameters, even a combination of all those factors.**

### Issue - 3, check rebills

#### overlapping service date ranges - 1.3% of the accounts

In [212]:
# mask = (df_gap['Building_ID'] == '70.0 - BLD 01') & ((df_gap['Meter_Number'] == '8095177') | (df_gap['Meter_Number'] == '8095173'))
mask = df_gap['gap_days'] < 0
df_gap[mask].head()

Unnamed: 0,Building_ID,Meter_Number,Revenue_Year,Service_Start_Date,Service_End_Date,consecutive_days,gap_days,Building_Meter
352,101.0 - BLD 02,7834072,2015,2015-06-01,2015-12-24,206,-23.0,101.0 - BLD 027834072
5393,206.0 - BLD 02,7382886,2015,2015-03-17,2015-12-24,282,-9.0,206.0 - BLD 027382886
5560,21.0 - BLD 01,8300966,2011,2010-09-22,2010-10-22,30,-92.0,21.0 - BLD 018300966
5574,21.0 - BLD 02,6443527,2011,2010-09-22,2010-10-22,30,-92.0,21.0 - BLD 026443527
5593,21.0 - BLD 03,6443449,2011,2010-09-22,2010-10-22,30,-92.0,21.0 - BLD 036443449


In [213]:
df_gap[mask]['Building_Meter'].agg('nunique')/df_gap['Building_Meter'].agg('nunique')

0.012686330478908976

### Issue 4. create a flag column in the original dataframe to indicate the problematic rows (Incomplete but easy to do)
    - zero values in consumptions and charges
    - sum of charges != current charges
    - rebilling
    - gap in service ranges
    - not covering the full year

### Issue 5. Correct meter values in irregular format