In [2]:
import pandas as pd

### 1. Read in the data 

In [3]:
df = pd.read_csv("NYC Open Data - Electric_Consumption_And_Cost__2010_-__June_2018_.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df.shape

(313147, 27)

#### Check the number of empty values in each column

In [5]:
df.isnull().sum()

Development Name         146
Borough                  146
Account Name             146
Location                9041
Meter AMR                187
Meter Scope           296588
TDS #                   1717
EDP                      146
RC Code                  146
Funding Source           146
AMP #                   1657
Vendor Name              146
UMIS BILL ID             146
Revenue Month            146
Service Start Date       146
Service End Date         146
# days                   146
Meter Number             146
Estimated                146
Current Charges          146
Rate Class               146
Bill Analyzed            146
Consumption (KWH)        146
KWH Charges              146
Consumption (KW)         146
KW Charges               146
Other charges            146
dtype: int64

### 2. Remove empty rows

In [5]:
mask = (pd.isna(df['Account Name']) == True)
df.drop(mask[mask == True].index, axis = 0, inplace = True)

### 3. Remove rows where electricity charges were estimated

In [6]:
df.drop(df.loc[df['Estimated'] == 'Y         '].index, axis = 0, inplace = True)

#### Check data types of columns

In [7]:
df.dtypes

Development Name       object
Borough                object
Account Name           object
Location               object
Meter AMR              object
Meter Scope            object
TDS #                 float64
EDP                   float64
RC Code                object
Funding Source         object
AMP #                  object
Vendor Name            object
UMIS BILL ID          float64
Revenue Month          object
Service Start Date     object
Service End Date       object
# days                float64
Meter Number           object
Estimated              object
Current Charges        object
Rate Class             object
Bill Analyzed          object
Consumption (KWH)     float64
KWH Charges            object
Consumption (KW)       object
KW Charges             object
Other charges          object
dtype: object

### 4. Change the following fields from string to float

"Consumption (KW)", "Current Charges", "KWH Charges", "KW Charges", "Other charges"

In [8]:
df["Consumption (KW)"] = df["Consumption (KW)"].apply(lambda x: x.replace(",","") if type(x) == str else str(x))
df["Consumption (KW)"] = df["Consumption (KW)"].astype(float)

In [9]:
df["Current Charges"] = df["Current Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["Current Charges"] = df["Current Charges"].astype(float)

In [10]:
df["KWH Charges"] = df["KWH Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["KWH Charges"] = df["KWH Charges"].astype(float, inplace = True)

In [11]:
df["KW Charges"] = df["KW Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["KW Charges"] = df["KW Charges"].astype(float, inplace = True)

In [12]:
df["Other charges"] = df["Other charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["Other charges"] = df["Other charges"].astype(float, inplace = True)

#### Summarize the numerical fields

More than 25% of the values for all except "Curent Charges" are 0

In [13]:
df[["Consumption (KWH)",  "Consumption (KW)", "Current Charges", "KWH Charges", "KW Charges", "Other charges"]].describe()

Unnamed: 0,Consumption (KWH),Consumption (KW),Current Charges,KWH Charges,KW Charges,Other charges
count,261252.0,261252.0,261252.0,261252.0,261252.0,261252.0
mean,32565.72,68.239881,4510.758176,1673.941416,1084.801607,1672.128898
std,53071.71,122.266314,6630.228947,2921.443192,1807.808563,3626.935491
min,0.0,0.0,-243.15,0.0,0.0,-59396.43
25%,0.0,0.0,384.3425,0.0,0.0,0.0
50%,11360.0,31.51,2536.945,555.02,448.37,896.51
75%,48160.0,98.5,6052.3575,2359.8925,1601.4225,2636.255
max,1779600.0,16135.46,329800.37,195575.86,78782.96,134224.51


### 5. Convert Revenue Month and Two dates to datetime type

In [14]:
from datetime import datetime

In [15]:
df["Revenue Month"] = df["Revenue Month"].map(lambda x: datetime.strptime(x.split(" ")[0], '%m/%d/%Y'))
df['Service Start Date'] = df['Service Start Date'].map(lambda x: datetime.strptime(x, '%m/%d/%Y'))
df['Service End Date'] = df['Service End Date'].map(lambda x: datetime.strptime(x, '%m/%d/%Y'))

#### Check "Meter Scope": are the row with a range value represent a "Master Meter" (i.e. its value is the sum of other related rows)? - No

In [16]:
df['Meter Scope'].value_counts()

Community Center                                      3223
BLD 01                                                 621
BLD 02                                                 452
BLD 03                                                 433
BLD 04                                                 402
BLD 06                                                 274
BLD 07                                                 274
APT 01A - Management Office                            273
APT 01B - Management Office                            263
WEST BRIGHTON I ALL                                    200
WEST BRIGHTON II ALL                                   200
APT 02F - Management Office                            195
APT 02E - Management Office                            194
MORRISANIA ALL                                         194
APT 02H - Management Office                            194
APT 02G - Management Office                            193
BLD 05                                                 1

In [17]:
df[(df['TDS #'] == 118) & (df["Revenue Month"] == '2010-02-01')][["Location", "Meter Scope", "Revenue Month", "Current Charges"]]

Unnamed: 0,Location,Meter Scope,Revenue Month,Current Charges
1,BLD 05,BLD 01 to 07,2010-02-01,14556.34
13,BLD 06,BLD 06,2010-02-01,10948.54
25,BLD 07,BLD 07,2010-02-01,10403.82
37,BLD 01,BLD 01,2010-02-01,9422.06
49,BLD 02,BLD 02,2010-02-01,9710.06
61,BLD 03,BLD 03,2010-02-01,9286.81
73,BLD 04,BLD 04,2010-02-01,10133.33


In [18]:
df[(df['Meter Scope'] == 'Community Center')].groupby('Location').mean()

Unnamed: 0_level_0,TDS #,EDP,UMIS BILL ID,# days,Current Charges,Consumption (KWH),KWH Charges,Consumption (KW),KW Charges,Other charges
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BLD 01 - Community Center,219.172222,448.761111,3703092.0,30.455556,968.792556,5736.194444,313.216667,16.535889,216.6245,423.685889
BLD 02 - Community Center,179.483471,435.892562,3678972.0,30.342975,1786.717975,9327.508264,509.086653,33.722045,400.787686,833.356219
BLD 03 - Community Center,227.368078,345.739414,3657077.0,30.358306,3364.966645,22792.403909,1209.733257,55.999088,675.012866,1395.811466
BLD 04 - Community Center,164.564103,458.384615,3592468.0,30.318681,2934.512381,17003.956044,922.177802,50.013919,712.859634,1244.547033
BLD 05 - Community Center,111.0,434.0,3727165.0,30.351064,1359.712021,9234.893617,492.193723,22.808511,278.162553,559.105426
BLD 06 - Community Center,238.207447,457.611702,3646114.0,30.367021,3094.852234,17753.829787,954.731543,51.770213,775.85266,1326.571862
BLD 08,75.0,226.0,3734546.0,30.474747,2617.118889,12876.363636,641.169293,61.707071,574.925859,1249.793737
BLD 09 - Community Center,28.0,523.0,3892687.0,30.421687,4268.201084,23331.084337,1204.569036,76.356627,1077.436506,1889.236867
BLD 10 - Community Center,60.928571,392.952381,3626191.0,30.425926,4550.163545,26973.185185,1412.216614,82.048466,1018.37463,1992.630608
BLD 12 - Community Center,126.0,440.0,4053056.0,30.463415,4852.131341,22640.97561,1194.572927,88.926829,1296.006098,2238.667195


In [19]:
df[(df['Meter Scope'] == 'BLD 1 - 9')].groupby('Location').mean()

Unnamed: 0_level_0,TDS #,EDP,UMIS BILL ID,# days,Current Charges,Consumption (KWH),KWH Charges,Consumption (KW),KW Charges,Other charges
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BLD 02,55.0,525.0,3781606.0,27.008929,44982.836339,290678.571429,15647.757054,475.214286,4529.356607,22893.834732


### 6. Create an unique identifier for each building and remove unnecessary fields

In [20]:
# the combination of TDS# and Location uniquely determines a buildling
# Use EDP or RC Code when TDS# is not available
df['Building ID'] = df['TDS #'].combine_first(df['EDP']).map(str).combine_first(df['RC Code']) \
                    + " - " + df['Location'].map(lambda x: 'NA' if pd.isna(x) else x)

In [21]:
# Building ID alone is not the primary key of the data
df.groupby(['Building ID', 'Revenue Month']).count().shape[0]/df.shape[0]

0.6323396567299007

In [22]:
# the combination of Building ID, meter number and revenue month is still not a primary key
df.groupby(['Building ID', 'Meter Number', 'Revenue Month']).count().shape[0]/df.shape[0]

0.9987636458285487

In [23]:
# Define a list of columns of interest
cols = ['Account Name', 'Location', 'Building ID', 'Meter Number',
        'Revenue Month', 'Service Start Date', 'Service End Date', '# days', 
       'Current Charges','Consumption (KWH)', 'KWH Charges',
       'Consumption (KW)', 'KW Charges', 'Other charges']
df = df[cols]

#### In some cases multiple rows exists for one revenue month, with consecutive service start and end dates. We need to combined these rows for the same meter-month

In [24]:
df[(df['Building ID'] == '98.0 - BLD 15') & (df['Meter Number'] == 99277502) & (df['Revenue Month'] == '2013-03-01')][cols]

Unnamed: 0,Account Name,Location,Building ID,Meter Number,Revenue Month,Service Start Date,Service End Date,# days,Current Charges,Consumption (KWH),KWH Charges,Consumption (KW),KW Charges,Other charges
117025,OCEAN BAY APARTMENTS (BAYSIDE),BLD 15,98.0 - BLD 15,99277502,2013-03-01,2013-02-19,2013-03-18,27.0,62086.6,398400.0,40709.83,838.8,0.0,21376.77
117026,OCEAN BAY APARTMENTS (BAYSIDE),BLD 15,98.0 - BLD 15,99277502,2013-03-01,2013-03-18,2013-03-19,1.0,14.64,0.0,0.0,0.0,0.0,14.64
117027,OCEAN BAY APARTMENTS (BAYSIDE),BLD 15,98.0 - BLD 15,99277502,2013-03-01,2013-03-19,2013-03-26,7.0,103.84,0.0,0.0,1.2,0.0,103.84


In [25]:
df[(df['Building ID'] == '165.0 - BLD 01') & (df['Meter Number'] == '099800299') & (df['Revenue Month'] == '2016-06-01')][cols]

Unnamed: 0,Account Name,Location,Building ID,Meter Number,Revenue Month,Service Start Date,Service End Date,# days,Current Charges,Consumption (KWH),KWH Charges,Consumption (KW),KW Charges,Other charges


In [26]:
# the combination of Building ID, meter number and revenue month is almost a primary key
df.groupby(['Building ID', 'Meter Number', 'Revenue Month', 'Service Start Date', 'Service End Date']).count().shape[0]/df.shape[0]

0.9994870852663329

### 7. There are duplicated rows in the dataset - remove duplicates in df

In [27]:
df = df.drop_duplicates()

### 8. Check which combinations of the 5 fields (Building ID, Meter, Month, StartDate, EndDate) has multiple rows and why

In [28]:
idx = df.groupby(['Building ID', 'Meter Number', 'Revenue Month', 'Service Start Date', 'Service End Date']).count()['Account Name'].reset_index()
idx.columns = ['Building ID', 'Meter Number', 'Revenue Month','Service Start Date', 'Service End Date', 'Counts']
idx = idx[idx['Counts'] > 1]

dupRows = idx.sort_values('Counts', ascending = False)

a = pd.merge(dupRows.iloc[:, 0:3], df[cols], on = \
         ['Building ID', 'Meter Number', 'Revenue Month'], how = 'inner')[cols]\
        .sort_values(['Building ID', 'Meter Number', 'Revenue Month'])

#### half of these problematic rows has zero values in the numerical fields like "current charges"

In [29]:
a

Unnamed: 0,Account Name,Location,Building ID,Meter Number,Revenue Month,Service Start Date,Service End Date,# days,Current Charges,Consumption (KWH),KWH Charges,Consumption (KW),KW Charges,Other charges
0,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-01-01,2012-12-24,2013-01-24,31.0,0.00,0.0,0.00,0.00,0.00,0.00
1,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-01-01,2012-12-24,2013-01-24,31.0,0.00,0.0,0.00,52.08,1105.73,-1105.73
36,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-02-01,2013-01-24,2013-02-25,32.0,0.00,0.0,0.00,0.00,0.00,0.00
37,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-02-01,2013-01-24,2013-02-25,32.0,0.00,0.0,0.00,52.94,1166.15,-1166.15
40,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-03-01,2013-02-25,2013-03-26,29.0,0.00,0.0,0.00,0.00,0.00,0.00
41,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-03-01,2013-02-25,2013-03-26,29.0,0.00,0.0,0.00,50.93,1169.81,-1169.81
42,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-04-01,2013-03-26,2013-04-24,29.0,0.00,0.0,0.00,0.00,0.00,0.00
43,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-04-01,2013-03-26,2013-04-24,29.0,0.00,0.0,0.00,51.46,1146.50,-1146.50
44,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-05-01,2013-04-24,2013-05-23,29.0,0.00,0.0,0.00,0.00,0.00,0.00
45,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-05-01,2013-04-24,2013-05-23,29.0,0.00,0.0,0.00,60.96,1235.82,-1235.82


#### remove those rows from the dataset

In [30]:
df = df[~((df['Current Charges'] == 0) & (df['KWH Charges'] == 0) & (df['KW Charges'] == 0) \
  & (df['Other charges'] == 0) & (df['Consumption (KWH)'] == 0) & (df['Consumption (KW)'] == 0))]

In [31]:
idx = df.groupby(['Building ID', 'Meter Number', 'Revenue Month', 'Service Start Date', 'Service End Date']).count()['Account Name'].reset_index()
idx.columns = ['Building ID', 'Meter Number', 'Revenue Month','Service Start Date', 'Service End Date', 'Counts']
idx = idx[idx['Counts'] > 1]

dupRows = idx.sort_values('Counts', ascending = False)

a = pd.merge(dupRows.iloc[:, 0:3], df[cols], on = \
         ['Building ID', 'Meter Number', 'Revenue Month'], how = 'inner')[cols]\
        .sort_values(['Building ID', 'Meter Number', 'Revenue Month'])

#### Only one row left, seems a case of rebilling

In [32]:
a

Unnamed: 0,Account Name,Location,Building ID,Meter Number,Revenue Month,Service Start Date,Service End Date,# days,Current Charges,Consumption (KWH),KWH Charges,Consumption (KW),KW Charges,Other charges
0,THROGGS NECK,BLD 11,63.0 - BLD 11,8125318,2011-10-01,2011-09-22,2011-10-24,32.0,1306.02,12880.0,858.84,0.0,0.0,447.18
1,THROGGS NECK,BLD 11,63.0 - BLD 11,8125318,2011-10-01,2011-09-22,2011-10-24,32.0,2693.18,26560.0,1771.02,0.0,0.0,922.16


### 9. Combine the rows of same Building, Meter and Month

In [33]:
import numpy as np

In [34]:
dfCharges = pd.pivot_table(df, values = ['# days', 'Current Charges','Consumption (KWH)', 'KWH Charges',\
       'Consumption (KW)', 'KW Charges', 'Other charges'], index=['Account Name', 'Location', 'Building ID', 'Meter Number',
       'Revenue Month'], aggfunc = np.sum).reset_index()

dfStartDate = pd.pivot_table(df, values = ['Service Start Date'], index=['Account Name', 'Location', 'Building ID', 'Meter Number',
       'Revenue Month'], aggfunc = np.min).reset_index()

dfEndDate = pd.pivot_table(df, values = ['Service End Date'], index=['Account Name', 'Location', 'Building ID', 'Meter Number',
       'Revenue Month'], aggfunc = np.max).reset_index()

In [35]:
df_combined = pd.merge(dfStartDate, dfEndDate, on = ['Account Name', 'Location', 'Building ID', 'Meter Number',
       'Revenue Month'], how = 'inner')

df_combined = pd.merge(df_combined, dfCharges, on = ['Account Name', 'Location', 'Building ID', 'Meter Number',
       'Revenue Month'], how = 'inner')

In [36]:
df_combined.head()

Unnamed: 0,Account Name,Location,Building ID,Meter Number,Revenue Month,Service Start Date,Service End Date,# days,Consumption (KW),Consumption (KWH),Current Charges,KW Charges,KWH Charges,Other charges
0,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-01-01,2009-12-24,2010-01-26,33.0,148.0,84000.0,10314.51,1924.0,4818.24,3572.27
1,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-02-01,2010-01-26,2010-02-25,30.0,144.0,75200.0,9422.06,1872.0,4313.47,3236.59
2,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-03-01,2010-02-25,2010-03-26,29.0,136.0,68800.0,8988.12,1768.0,3946.37,3273.75
3,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-04-01,2010-03-26,2010-04-26,31.0,124.0,68400.0,9146.17,1612.0,3923.42,3610.75
4,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-05-01,2010-04-26,2010-05-24,28.0,144.0,64800.0,9137.42,1872.0,3716.93,3548.49


### 10. save a copy of the original dataframe

In [37]:
df_orig = df
df = df_combined

### 11. Add new metrics

In [38]:
df['Total Charges'] = df['KW Charges'] + df['KWH Charges']

In [39]:
df['Total Energy Rate'] = df['Total Charges']/df['Consumption (KWH)']

In [40]:
df['Total Charge Match'] = (df['Current Charges'] == df['KWH Charges'] + df['KW Charges'] + df['Other charges'])

In [41]:
df.head()

Unnamed: 0,Account Name,Location,Building ID,Meter Number,Revenue Month,Service Start Date,Service End Date,# days,Consumption (KW),Consumption (KWH),Current Charges,KW Charges,KWH Charges,Other charges,Total Charges,Total Energy Rate,Total Charge Match
0,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-01-01,2009-12-24,2010-01-26,33.0,148.0,84000.0,10314.51,1924.0,4818.24,3572.27,6742.24,0.080265,True
1,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-02-01,2010-01-26,2010-02-25,30.0,144.0,75200.0,9422.06,1872.0,4313.47,3236.59,6185.47,0.082254,False
2,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-03-01,2010-02-25,2010-03-26,29.0,136.0,68800.0,8988.12,1768.0,3946.37,3273.75,5714.37,0.083058,False
3,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-04-01,2010-03-26,2010-04-26,31.0,124.0,68400.0,9146.17,1612.0,3923.42,3610.75,5535.42,0.080927,True
4,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-05-01,2010-04-26,2010-05-24,28.0,144.0,64800.0,9137.42,1872.0,3716.93,3548.49,5588.93,0.086249,True


### 12. Save the cleaned data to the output folder

In [47]:
df.to_csv('../output/NYCHA_Electricity_2010_to_2018_Cleaned.csv')

### 13. Investigate the zero values in the fields of charges and consumption (Incomplete)

#### check how much % of rows have zero values in each numeric field

In [48]:
df[(df['KWH Charges'] == 0) ].shape[0]\
    /df.shape[0]

0.3268528710691575

In [49]:
df[(df['Consumption (KWH)'] == 0)  ].shape[0]\
    /df.shape[0]

0.3244731819729642

In [50]:
df[(df['KW Charges'] == 0) ].shape[0]\
    /df.shape[0]

0.41077155776586394

In [51]:
df[ (df['Consumption (KW)'] == 0) ].shape[0]\
    /df.shape[0]

0.3702495307934145

In [52]:
df[ (df['Current Charges'] == 0) ].shape[0]\
    /df.shape[0]

0.16495878109236045

In [53]:
df[ (df['Other charges'] == 0) ].shape[0]\
    /df.shape[0]

0.11739667556245396

#### check the correlation of numerical fields having zero values

In [54]:
df['KWH_Charge_0'] = df['KWH Charges'] == 0
df['KWH_0'] = df['Consumption (KWH)'] == 0
df['KW_Charge_0'] = df['KW Charges'] == 0
df['KW_0'] = df['Consumption (KW)'] == 0
df['Other_Charge_0'] = df['Other charges'] == 0

In [55]:
df[(df['KWH Charges'] == 0) | (df['Consumption (KWH)'] == 0) \
   | (df['KW Charges'] == 0) | (df['Consumption (KW)'] == 0) ].shape[0]\
    /df.shape[0]

0.7342825692723141

In [56]:
df[((df['KWH Charges'] == 0) ^ (df['Consumption (KWH)'] == 0)) \
   | ((df['KW Charges'] == 0) ^ (df['Consumption (KW)'] == 0)) ].shape[0]\
    /df.shape[0]

0.045190335532203016

In [57]:
df[((df['KWH Charges'] == 0) & (df['Consumption (KWH)'] == 0)) ]\
.groupby(['KWH_Charge_0', 'KWH_0', 'KW_Charge_0', 'KW_0']).agg(['count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Account Name,Location,Building ID,Meter Number,Revenue Month,Service Start Date,Service End Date,# days,Consumption (KW),Consumption (KWH),Current Charges,KW Charges,KWH Charges,Other charges,Total Charges,Total Energy Rate,Total Charge Match,Other_Charge_0
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count
KWH_Charge_0,KWH_0,KW_Charge_0,KW_0,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
True,True,False,False,81375,81375,81375,81375,81375,81375,81375,81375,81375,81375,81375,81375,81375,81375,81375,81375,81375,81375
True,True,False,True,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12
True,True,True,False,103,103,103,103,103,103,103,103,103,103,103,103,103,103,103,0,103,103
True,True,True,True,447,447,447,447,447,447,447,447,447,447,447,447,447,447,447,0,447,447


In [58]:
df[((df['KWH Charges'] == 0) ^ (df['Consumption (KWH)'] == 0)) ]\
.groupby(['KWH_Charge_0', 'KWH_0', 'KW_Charge_0', 'KW_0']).agg(['count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Account Name,Location,Building ID,Meter Number,Revenue Month,Service Start Date,Service End Date,# days,Consumption (KW),Consumption (KWH),Current Charges,KW Charges,KWH Charges,Other charges,Total Charges,Total Energy Rate,Total Charge Match,Other_Charge_0
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count
KWH_Charge_0,KWH_0,KW_Charge_0,KW_0,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
False,True,False,True,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
False,True,True,True,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
True,False,False,False,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17
True,False,True,False,59,59,59,59,59,59,59,59,59,59,59,59,59,59,59,59,59,59
True,False,True,True,535,535,535,535,535,535,535,535,535,535,535,535,535,535,535,535,535,535


In [59]:
df[((df['KW Charges'] == 0) & (df['Consumption (KW)'] == 0))]\
.groupby(['KW_Charge_0', 'KW_0', 'KWH_Charge_0', 'KWH_0']).agg(['count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Account Name,Location,Building ID,Meter Number,Revenue Month,Service Start Date,Service End Date,# days,Consumption (KW),Consumption (KWH),Current Charges,KW Charges,KWH Charges,Other charges,Total Charges,Total Energy Rate,Total Charge Match,Other_Charge_0
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count
KW_Charge_0,KW_0,KWH_Charge_0,KWH_0,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
True,True,False,False,92211,92211,92211,92211,92211,92211,92211,92211,92211,92211,92211,92211,92211,92211,92211,92211,92211,92211
True,True,False,True,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
True,True,True,False,535,535,535,535,535,535,535,535,535,535,535,535,535,535,535,535,535,535
True,True,True,True,447,447,447,447,447,447,447,447,447,447,447,447,447,447,447,0,447,447


In [60]:
df[((df['KW Charges'] == 0) ^ (df['Consumption (KW)'] == 0))]\
.groupby(['KW_Charge_0', 'KW_0', 'KWH_Charge_0', 'KWH_0']).agg(['count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Account Name,Location,Building ID,Meter Number,Revenue Month,Service Start Date,Service End Date,# days,Consumption (KW),Consumption (KWH),Current Charges,KW Charges,KWH Charges,Other charges,Total Charges,Total Energy Rate,Total Charge Match,Other_Charge_0
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count
KW_Charge_0,KW_0,KWH_Charge_0,KWH_0,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
False,True,False,False,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293,293
False,True,False,True,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
False,True,True,True,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12
True,False,False,False,10384,10384,10384,10384,10384,10384,10384,10384,10384,10384,10384,10384,10384,10384,10384,10384,10384,10384
True,False,True,False,59,59,59,59,59,59,59,59,59,59,59,59,59,59,59,59,59,59
True,False,True,True,103,103,103,103,103,103,103,103,103,103,103,103,103,103,103,0,103,103


## Q&A with Linnea:

1. why would "Consumption (KW)" be zero?
    KW and KWH should be both positive, unless there are some related bills that already covers it
2. What's the "Other Charges"?
    - negative values to adjust for the payments from previous month
    - taxes, fee for meter-reading, little fees charged by utilities and states (e.g. system benefit charge), credit (state got a better deal after charging the clients)

## To Do:

1. investigate the zero values in the fields of charges and consumption
    - current charges = 0 and other charges != 0
    - current charges == other charges and current charges != 0
2. investigate why (Current Charge) != (KWH charges) + (KW charges) + (Other Charges)
3. check if for the same building, meter number varies across different months
4. check rebills (can probably detected by variance-checking)
5. check gaps in service and whether the union of all the billing window is one-year for each meter
    - start, end dates are useful when there is not a complete month reported (1-3 day gap is okay , 5 days is not)
6. anomaly detection of charges & consumptions at the meter level and building level

**Issue- 2 - Checking for cases where current charge is not equal to the sum of KWH, kW and Other charges**

In [3]:
import pandas as pd
clean_df = pd.read_csv("NYCHA_Electricity_2010_to_2018_Cleaned.csv", index_col=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# create a new column that aggregates all charge values
clean_df["sum_all_charges"] = clean_df.apply(lambda x: x['KW Charges'] + x['KWH Charges'] + x['Other charges'], axis=1)

In [5]:
# check if the current charges are equal to the sum of all charges column
clean_df["check_current_charge"] = clean_df.apply(lambda x: round(x['Current Charges']) == round(x['sum_all_charges']), axis=1)

In [6]:
# the number of rows with mismatching values
clean_df['check_current_charge'].value_counts()

True     217503
False     35051
Name: check_current_charge, dtype: int64

In [7]:
# percentage of rows with mismatching values in the clean data
(len(clean_df[(clean_df.check_current_charge == False)])/len(clean_df['check_current_charge']))*100

13.878616058347918

In [8]:
clean_df[(clean_df.check_current_charge == False)]

Unnamed: 0,Account Name,Location,Building ID,Meter Number,Revenue Month,Service Start Date,Service End Date,# days,Consumption (KW),Consumption (KWH),Current Charges,KW Charges,KWH Charges,Other charges,Total Charges,Total Energy Rate,Total Charge Match,sum_all_charges,check_current_charge
55,ADAMS,BLD 01,118.0 - BLD 01,7518352,1/1/15,12/24/14,1/26/15,33,124.00,70000,9150.28,2163.84,3616.90,2163.84,5780.74,0.082582,False,7944.58,False
56,ADAMS,BLD 01,118.0 - BLD 01,7518352,2/1/15,1/26/15,2/25/15,30,124.00,63200,8467.79,2163.84,3265.54,2163.84,5429.38,0.085908,False,7593.22,False
57,ADAMS,BLD 01,118.0 - BLD 01,7518352,3/1/15,2/25/15,3/26/15,29,108.00,58400,8166.43,2184.54,2383.30,2184.54,4567.84,0.078216,False,6752.38,False
58,ADAMS,BLD 01,118.0 - BLD 01,7518352,4/1/15,3/26/15,4/24/15,29,116.00,59600,10567.75,2184.54,2432.28,2184.54,4616.82,0.077463,False,6801.36,False
59,ADAMS,BLD 01,118.0 - BLD 01,7518352,5/1/15,4/24/15,5/26/15,32,136.00,66000,8350.19,2184.54,2693.46,2184.54,4878.00,0.073909,False,7062.54,False
60,ADAMS,BLD 01,118.0 - BLD 01,7518352,6/1/15,5/26/15,6/24/15,29,176.00,76800,9798.17,2786.08,3524.35,2786.08,6310.43,0.082167,False,9096.51,False
61,ADAMS,BLD 01,118.0 - BLD 01,7518352,8/1/15,7/24/15,8/24/15,31,188.00,105200,11710.61,2976.04,4827.63,2976.04,7803.67,0.074179,False,10779.71,False
62,ADAMS,BLD 01,118.0 - BLD 01,7518352,9/1/15,8/24/15,9/23/15,30,196.00,96400,11264.13,3102.68,4423.80,3102.68,7526.48,0.078076,False,10629.16,False
63,ADAMS,BLD 01,118.0 - BLD 01,7518352,10/1/15,9/23/15,10/23/15,30,144.00,68400,8027.30,2327.01,2791.40,2327.01,5118.41,0.074831,False,7445.42,False
64,ADAMS,BLD 01,118.0 - BLD 01,7518352,11/1/15,10/23/15,11/23/15,31,140.00,73200,7601.60,2327.01,2987.29,2327.01,5314.30,0.072600,False,7641.31,False


**Looking at the data for which the values are not the same, nothing out of the ordinary seems to stand out. It could be due to missing fields, typos while entering the data, or unknown parameters, even a combination of all those factors.**

**Issue - 3 - Aggregating meter numbers for each building id, to identify buildings with multiple meter readings**

In [9]:
# list of all buildings with number of meters used to collect data
clean_df.groupby('Building ID').agg({'Meter Number': 'nunique'})

Unnamed: 0_level_0,Meter Number
Building ID,Unnamed: 1_level_1
1.0 - BLD 01,2
1.0 - BLD 02,1
1.0 - BLD 03,1
1.0 - BLD 04,3
1.0 - BLD 05,1
1.0 - BLD 06,1
1.0 - BLD 07,2
10.0 - BLD 01,2
10.0 - BLD 02,3
10.0 - BLD 03,2


In [10]:
# filtering those rows for which count > 1
temp = clean_df.groupby('Building ID').agg({'Meter Number': 'nunique'})
temp = temp.apply(lambda x: x['Meter Number'] > 1, axis = 1)
temp = temp.to_frame()
temp.columns = ['multiple_meters']
temp.query('multiple_meters')

Unnamed: 0_level_0,multiple_meters
Building ID,Unnamed: 1_level_1
1.0 - BLD 01,True
1.0 - BLD 04,True
1.0 - BLD 07,True
10.0 - BLD 01,True
10.0 - BLD 02,True
10.0 - BLD 03,True
10.0 - BLD 04,True
10.0 - BLD 05,True
10.0 - BLD 06,True
10.0 - BLD 07,True


**A large number of buildings (1396 rows) seem to be using multiple meters to collect data. Need to investigate if this causes any overlap in the stored data**

**Issue-5, checking if the service window covers the entire month, for each building id**

In [11]:
clean_df[clean_df['Building ID'] == '118.0 - BLD 01']

Unnamed: 0,Account Name,Location,Building ID,Meter Number,Revenue Month,Service Start Date,Service End Date,# days,Consumption (KW),Consumption (KWH),Current Charges,KW Charges,KWH Charges,Other charges,Total Charges,Total Energy Rate,Total Charge Match,sum_all_charges,check_current_charge
0,ADAMS,BLD 01,118.0 - BLD 01,7518352,1/1/10,12/24/09,1/26/10,33,148.0,84000,10314.51,1924.00,4818.24,3572.27,6742.24,0.080265,True,10314.51,True
1,ADAMS,BLD 01,118.0 - BLD 01,7518352,2/1/10,1/26/10,2/25/10,30,144.0,75200,9422.06,1872.00,4313.47,3236.59,6185.47,0.082254,False,9422.06,True
2,ADAMS,BLD 01,118.0 - BLD 01,7518352,3/1/10,2/25/10,3/26/10,29,136.0,68800,8988.12,1768.00,3946.37,3273.75,5714.37,0.083058,False,8988.12,True
3,ADAMS,BLD 01,118.0 - BLD 01,7518352,4/1/10,3/26/10,4/26/10,31,124.0,68400,9146.17,1612.00,3923.42,3610.75,5535.42,0.080927,True,9146.17,True
4,ADAMS,BLD 01,118.0 - BLD 01,7518352,5/1/10,4/26/10,5/24/10,28,144.0,64800,9137.42,1872.00,3716.93,3548.49,5588.93,0.086249,True,9137.42,True
5,ADAMS,BLD 01,118.0 - BLD 01,7518352,6/1/10,5/24/10,6/23/10,30,168.0,82400,11421.15,2184.00,4726.46,4510.69,6910.46,0.083865,True,11421.15,True
6,ADAMS,BLD 01,118.0 - BLD 01,7518352,7/1/10,6/23/10,7/23/10,30,204.0,106800,14130.71,2652.00,6126.05,5352.66,8778.05,0.082191,True,14130.71,True
7,ADAMS,BLD 01,118.0 - BLD 01,7518352,9/1/10,8/23/10,9/22/10,30,200.0,93600,13140.02,2600.00,5368.90,5171.12,7968.90,0.085138,True,13140.02,True
8,ADAMS,BLD 01,118.0 - BLD 01,7518352,10/1/10,9/22/10,10/22/10,30,152.0,70000,10001.97,1976.00,4015.20,4010.77,5991.20,0.085589,True,10001.97,True
9,ADAMS,BLD 01,118.0 - BLD 01,7518352,11/1/10,10/22/10,11/22/10,31,128.0,68800,9093.04,1664.00,3946.37,3482.67,5610.37,0.081546,True,9093.04,True


In [12]:
# create a new column for revenue year
clean_df['Revenue Month'] = pd.to_datetime(clean_df['Revenue Month'])
clean_df['Service Start Date'] = pd.to_datetime(clean_df['Service Start Date'])
clean_df['Service End Date'] = pd.to_datetime(clean_df['Service End Date'])
clean_df['revenue_year'] = clean_df['Revenue Month'].dt.year


In [16]:
# group by building id, and revenue year, and find sum of # days in each year

clean_df.groupby(['Building ID','revenue_year']).agg({'# days': sum})

Unnamed: 0_level_0,Unnamed: 1_level_0,# days
Building ID,revenue_year,Unnamed: 2_level_1
1.0 - BLD 01,2010,639
1.0 - BLD 01,2011,551
1.0 - BLD 01,2012,670
1.0 - BLD 01,2013,730
1.0 - BLD 01,2014,666
1.0 - BLD 01,2015,608
1.0 - BLD 01,2016,302
1.0 - BLD 01,2017,302
1.0 - BLD 01,2018,364
1.0 - BLD 02,2010,364


**Many buildings seem to have more than 365 days recorded in a calendar year, which is due to multiple meter readings for the same time period. Have to investigate how to deal with these data anomalies **