In [107]:
from __future__ import division
import pandas as pd
import numpy as np
import pandasql as pdsql
from datetime import datetime

### 1. Read in the data 

In [194]:
df = pd.read_csv("../data/NYC Open Data - Electric_Consumption_And_Cost__2010_-__June_2018_.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [195]:
df.shape

(313147, 27)

#### Check the number of empty values in each column

In [109]:
df.isnull().sum()

Development Name         146
Borough                  146
Account Name             146
Location                9041
Meter AMR                187
Meter Scope           296588
TDS #                   1717
EDP                      146
RC Code                  146
Funding Source           146
AMP #                   1657
Vendor Name              146
UMIS BILL ID             146
Revenue Month            146
Service Start Date       146
Service End Date         146
# days                   146
Meter Number             146
Estimated                146
Current Charges          146
Rate Class               146
Bill Analyzed            146
Consumption (KWH)        146
KWH Charges              146
Consumption (KW)         146
KW Charges               146
Other charges            146
dtype: int64

## Part I - General Data Cleaning

### 2. Remove empty rows

In [110]:
mask = (pd.isna(df['Account Name']) == True)
df.drop(mask[mask == True].index, axis = 0, inplace = True)

### 3. Remove rows where electricity charges were estimated

In [111]:
df.drop(df.loc[df['Estimated'] == 'Y         '].index, axis = 0, inplace = True)

#### Check data types of columns

In [112]:
df.dtypes

Development Name       object
Borough                object
Account Name           object
Location               object
Meter AMR              object
Meter Scope            object
TDS #                 float64
EDP                   float64
RC Code                object
Funding Source         object
AMP #                  object
Vendor Name            object
UMIS BILL ID          float64
Revenue Month          object
Service Start Date     object
Service End Date       object
# days                float64
Meter Number           object
Estimated              object
Current Charges        object
Rate Class             object
Bill Analyzed          object
Consumption (KWH)     float64
KWH Charges            object
Consumption (KW)       object
KW Charges             object
Other charges          object
dtype: object

#### change column names for easy reference

In [113]:
df.columns = ['Development_Name', 'Borough', 'Account_Name', 'Location', 'Meter_AMR',
       'Meter_Scope', 'TDS #', 'EDP', 'RC_Code', 'Funding_Source', 'AMP #',
       'Vendor_Name', 'UMIS_BILL_ID', 'Revenue_Month', 'Service_Start_Date',
       'Service_End_Date', '# days', 'Meter_Number', 'Estimated',
       'Current_Charges', 'Rate_Class', 'Bill_Analyzed', 'Consumption_KWH',
       'KWH_Charges', 'Consumption_KW', 'KW_Charges', 'Other_Charges']

### 4. Data Type Converstion

1. Change the following fields from string to numerical:
    - "Consumption_KW", "Current_Charges", "KWH_Charges", "KW_Charges", "Other_Charges"

In [114]:
df["Consumption_KW"] = df["Consumption_KW"].apply(lambda x: x.replace(",","") if type(x) == str else str(x))
df["Consumption_KW"] = df["Consumption_KW"].astype(float)

In [115]:
df["Current_Charges"] = df["Current_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["Current_Charges"] = df["Current_Charges"].astype(float)

In [116]:
df["KWH_Charges"] = df["KWH_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["KWH_Charges"] = df["KWH_Charges"].astype(float, inplace = True)

In [117]:
df["KW_Charges"] = df["KW_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["KW_Charges"] = df["KW_Charges"].astype(float, inplace = True)

In [118]:
df["Other_Charges"] = df["Other_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["Other_Charges"] = df["Other_Charges"].astype(float, inplace = True)

#### Summarize the numerical fields

##### More than 25% of the values for all except "Curent Charges" are 0, which look unusual

In [119]:
df[["Consumption_KWH",  "Consumption_KW", "Current_Charges", "KWH_Charges", "KW_Charges", "Other_Charges"]].describe()

Unnamed: 0,Consumption_KWH,Consumption_KW,Current_Charges,KWH_Charges,KW_Charges,Other_Charges
count,261252.0,261252.0,261252.0,261252.0,261252.0,261252.0
mean,32565.72,68.239881,4510.758176,1673.941416,1084.801607,1672.128898
std,53071.71,122.266314,6630.228947,2921.443192,1807.808563,3626.935491
min,0.0,0.0,-243.15,0.0,0.0,-59396.43
25%,0.0,0.0,384.3425,0.0,0.0,0.0
50%,11360.0,31.51,2536.945,555.02,448.37,896.51
75%,48160.0,98.5,6052.3575,2359.8925,1601.4225,2636.255
max,1779600.0,16135.46,329800.37,195575.86,78782.96,134224.51


2. Unify the format of "Meter_Number" field (some values exists in both numerical and string

In [120]:
df['Meter_Number'] = df['Meter_Number'].apply(lambda x: str(x) if type(x) == int else x)

### 5. Convert Revenue_Month and Two dates to datetime type

In [121]:
df["Revenue_Month"] = df["Revenue_Month"].map(lambda x: datetime.strptime(x.split(" ")[0], '%m/%d/%Y'))
df['Service_Start_Date'] = df['Service_Start_Date'].map(lambda x: datetime.strptime(x, '%m/%d/%Y'))
df['Service_End_Date'] = df['Service_End_Date'].map(lambda x: datetime.strptime(x, '%m/%d/%Y'))

#### Check "Meter Scope": Do the row with a range value represent a "Master Meter" (i.e. its value is the sum of other related rows)? - No

df['Meter Scope'].value_counts()

df[(df['TDS #'] == 118) & (df["Revenue_Month"] == '2010-02-01')][["Location", "Meter Scope", "Revenue_Month", "Current_Charges"]]

df[(df['Meter Scope'] == 'Community Center')].groupby('Location').mean()

df[(df['Meter Scope'] == 'BLD 1 - 9')].groupby('Location').mean()

### 6. Create an unique identifier for each building and remove unnecessary fields

In [122]:
# the combination of TDS# and Location uniquely determines a buildling
# Use EDP or RC Code when TDS# is not available
df['Building_ID'] = df['TDS #'].combine_first(df['EDP']).map(str).combine_first(df['RC_Code']) \
                    + " - " + df['Location'].map(lambda x: 'NA' if pd.isna(x) else x)

In [123]:
# Building_ID alone is not the primary key of the data
df.groupby(['Building_ID', 'Revenue_Month']).count().shape[0]/df.shape[0]

0.6323396567299007

In [124]:
# the combination of Building_ID, meter number and revenue month is still not a primary key
df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month']).count().shape[0]/df.shape[0]

0.9987636458285487

In [125]:
# Define a list of columns of interest
cols = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
        'Revenue_Month', 'Service_Start_Date', 'Service_End_Date', '# days', 
       'Current_Charges','Consumption_KWH', 'KWH_Charges',
       'Consumption_KW', 'KW_Charges', 'Other_Charges']
df = df[cols]

In [126]:
# the combination of Building_ID, meter number and revenue month is almost a primary key
df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).count().shape[0]/df.shape[0]

0.9994870852663329

### 7. There are duplicated rows in the dataset - remove duplicates in df

In [127]:
df = df.drop_duplicates()

### 8. Check which combinations of the 5 fields (Building_ID, Meter, Month, StartDate, EndDate) has multiple rows and why

In [128]:
idx = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).count()['Account_Name'].reset_index()
idx.columns = ['Building_ID', 'Meter_Number', 'Revenue_Month','Service_Start_Date', 'Service_End_Date', 'Counts']
idx = idx[idx['Counts'] > 1]

dupRows = idx.sort_values('Counts', ascending = False)

a = pd.merge(dupRows.iloc[:, 0:3], df[cols], on = \
         ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner')[cols]\
        .sort_values(['Building_ID', 'Meter_Number', 'Revenue_Month'])

#### half of these problematic rows has zero values in the numerical fields like "current charges"

In [129]:
a

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Service_Start_Date,Service_End_Date,# days,Current_Charges,Consumption_KWH,KWH_Charges,Consumption_KW,KW_Charges,Other_Charges
0,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2012-12-01,2012-11-21,2012-12-24,33.0,0.00,0.0,0.00,0.00,0.00,0.00
1,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2012-12-01,2012-11-21,2012-12-24,33.0,0.00,0.0,0.00,54.43,1109.09,-1109.09
36,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-01-01,2012-12-24,2013-01-24,31.0,0.00,0.0,0.00,0.00,0.00,0.00
37,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-01-01,2012-12-24,2013-01-24,31.0,0.00,0.0,0.00,52.08,1105.73,-1105.73
40,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-02-01,2013-01-24,2013-02-25,32.0,0.00,0.0,0.00,0.00,0.00,0.00
41,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-02-01,2013-01-24,2013-02-25,32.0,0.00,0.0,0.00,52.94,1166.15,-1166.15
42,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-03-01,2013-02-25,2013-03-26,29.0,0.00,0.0,0.00,0.00,0.00,0.00
43,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-03-01,2013-02-25,2013-03-26,29.0,0.00,0.0,0.00,50.93,1169.81,-1169.81
44,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-04-01,2013-03-26,2013-04-24,29.0,0.00,0.0,0.00,0.00,0.00,0.00
45,KINGSBOROUGH,BLD 06,10.0 - BLD 06,1864559,2013-04-01,2013-03-26,2013-04-24,29.0,0.00,0.0,0.00,51.46,1146.50,-1146.50


#### remove those rows from the dataset

In [130]:
df = df[~((df['Current_Charges'] == 0) & (df['KWH_Charges'] == 0) & (df['KW_Charges'] == 0) \
  & (df['Other_Charges'] == 0) & (df['Consumption_KWH'] == 0) & (df['Consumption_KW'] == 0))]

In [131]:
idx = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).count()['Account_Name'].reset_index()
idx.columns = ['Building_ID', 'Meter_Number', 'Revenue_Month','Service_Start_Date', 'Service_End_Date', 'Counts']
idx = idx[idx['Counts'] > 1]

dupRows = idx.sort_values('Counts', ascending = False)

a = pd.merge(dupRows.iloc[:, 0:3], df[cols], on = \
         ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner')[cols]\
        .sort_values(['Building_ID', 'Meter_Number', 'Revenue_Month'])

#### Only one row left, seems a case of rebilling

In [132]:
a

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Service_Start_Date,Service_End_Date,# days,Current_Charges,Consumption_KWH,KWH_Charges,Consumption_KW,KW_Charges,Other_Charges
0,THROGGS NECK,BLD 11,63.0 - BLD 11,8125318,2011-10-01,2011-09-22,2011-10-24,32.0,1306.02,12880.0,858.84,0.0,0.0,447.18
1,THROGGS NECK,BLD 11,63.0 - BLD 11,8125318,2011-10-01,2011-09-22,2011-10-24,32.0,2693.18,26560.0,1771.02,0.0,0.0,922.16


In [133]:
a.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_rebilling_cases")

### 9. Add a column for Revenue_Year and reorder the columns

In [134]:
df['Revenue_Year'] = df['Revenue_Month'].dt.year

In [135]:
col_ordered = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
       'Revenue_Month', 'Revenue_Year', 'Service_Start_Date', 'Service_End_Date',
       '# days', 'Consumption_KW', 'KW_Charges', 
       'Consumption_KWH', 'KWH_Charges', 'Other_Charges', 'Current_Charges']

df = df[col_ordered]

### save a copy of the original dataframe before further data cleaning with alterations

In [136]:
df_orig = df

In [137]:
df_orig.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_original_dataset")

## Part II - Data Cleaning with alterations - aggregation, mapping

### 10.1 Check the zero values in Current_Charges, KWH_Charges and KW_Charges

#### High Percentage of rows have current_charges == 0

In [138]:
print ("{:.2%}".format(df[df['Current_Charges'] == 0].shape[0]/df.shape[0]))

16.61%


#### when current_charges == 0, all kwh_charges == 0 (NaN correlation coefficients with all other variables) and kw_charges seems negatively correlated with other_charges

In [139]:
df[df['Current_Charges'] == 0][['KWH_Charges', 'KW_Charges', 'KWH_Charges', 'Other_Charges']].corr()

Unnamed: 0,KWH_Charges,KW_Charges,KWH_Charges.1,Other_Charges
KWH_Charges,,,,
KW_Charges,,1.0,,-0.694394
KWH_Charges,,,,
Other_Charges,,-0.694394,,1.0


#### when current_charges == 0, many cases kw_charges == - other_charges

In [140]:
mask = (df['Other_Charges'] + df['KW_Charges'] == 0) & (df['Current_Charges'] == 0) & (df['KWH_Charges'] == 0)

In [141]:
print("{:.2%}".format(df[mask].shape[0]/df[df['Current_Charges'] == 0].shape[0]))

82.30%


#### found that actually kw_charges should equal to (-1) * other_charges whenever current_charges = 0

In [142]:
df[(df['Current_Charges'] == 0) & ((df['Other_Charges'] == df['KW_Charges']) \
        | (df['Other_Charges'] + df['KW_Charges'] == 0))].shape[0] / \
df[df['Current_Charges'] == 0].shape[0]

1.0

#### correct the rows where Other_Charges == KW_Charges with Other_Charges = -KW_Charges

In [143]:
mask = (df['Current_Charges'] == 0) & ((df['Other_Charges'] == df['KW_Charges']))

new_column = pd.Series(df[mask]['KW_Charges'].values * (-1),name = 'Other_Charges', index = mask[mask == True].index)

df.update(new_column)

In [144]:
df[df['Current_Charges'] == 0][['Current_Charges', 'KW_Charges', 'KWH_Charges', 'Other_Charges']].corr()

Unnamed: 0,Current_Charges,KW_Charges,KWH_Charges,Other_Charges
Current_Charges,,,,
KW_Charges,,1.0,,-1.0
KWH_Charges,,,,
Other_Charges,,-1.0,,1.0


In [145]:
df[df['Current_Charges'] == 0][['Current_Charges', 'KW_Charges', 'KWH_Charges', 'Other_Charges']].corr()

Unnamed: 0,Current_Charges,KW_Charges,KWH_Charges,Other_Charges
Current_Charges,,,,
KW_Charges,,1.0,,-1.0
KWH_Charges,,,,
Other_Charges,,-1.0,,1.0


### Calculate Metrics regarding zero-values and meter types - 1st time

In [146]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select Building_ID, Meter_Number \
        , sum(case when KWH_Charges == 0 and KW_Charges > 0 then 1 else 0 end) as count_kw_only \
        , sum(case when KW_Charges == 0 and KWH_Charges > 0 then 1 else 0 end) as count_kwh_only \
        , sum(Current_Charges) as total_current_charges \
        , count(*) as count \
        from df \
        group by df.Building_ID, df.Meter_Number"
df_temp = pysql(str1)


df_temp['kwh_only'] = df_temp['count_kwh_only'] == df_temp['count']
df_temp['kw_only'] = df_temp['count_kw_only'] == df_temp['count']

#### check the meters


print("perc of kw_only meters:", "{:.2%}".format(df_temp[(df_temp['kw_only'] == 1) & (df_temp['kwh_only'] == 0)].shape[0] / df_temp.shape[0]))

print("perc of kwh_only meters:", "{:.2%}".format(df_temp[(df_temp['kwh_only'] == 1) & (df_temp['kw_only'] == 0)].shape[0] / df_temp.shape[0]))

print("perc of kwh_and_kw meters:", "{:.2%}".format(df_temp[(df_temp['kwh_only'] == 0) & (df_temp['kw_only'] == 0)].shape[0] / df_temp.shape[0]))

#### check the building_ids

a = df_temp[df_temp['kwh_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
b =  df_temp[df_temp['kw_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
a.columns = ['Building_ID', 'Count']
b.columns = ['Building_ID', 'Count']

print("perc of buildings with both kw_only and kwh_only meters:", \
     "{:.2%}".format(pd.merge(a, b, on = 'Building_ID', how = 'inner').shape[0] \
/ df_temp.groupby(['Building_ID']).agg('count').reset_index().shape[0]))

del(df_temp)

#### Check the statistics of zero-value rows:

print("perc of rows - current charges of zero:", "{:.2%}".format(df[df['Current_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kw charges of zero:", "{:.2%}".format(df[df['KW_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kwh charges of zero:", "{:.2%}".format(df[df['KWH_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - consumption/charge inconsistency:", \
      "{:.2%}".format(df[((df['KWH_Charges'] == 0) ^ (df['Consumption_KWH'] == 0)) \
   | ((df['KW_Charges'] == 0) ^ (df['Consumption_KW'] == 0)) ].shape[0]\
    /df.shape[0]))

print("perc of rows - sum of charges inconsistency:", \
     "{:.2%}".format(1 - df[df['Current_Charges'] == df['KWH_Charges'] + df['KW_Charges'] + df['Other_Charges']].shape[0]\
    /df.shape[0]))

perc of kw_only meters: 26.99%
perc of kwh_only meters: 31.93%
perc of kwh_and_kw meters: 41.09%
perc of buildings with both kw_only and kwh_only meters: 40.99%
perc of rows - current charges of zero: 16.61%
perc of rows - kw charges of zero: 41.13%
perc of rows - kwh charges of zero: 33.03%
perc of rows - consumption/charge inconsistency: 4.45%
perc of rows - sum of charges inconsistency: 26.40%


### 10.2 Identify accounts that have separated meters for KW and KWH charges and combine the meter numbers

There are many cases where under the same Building_ID, two meter numbers share the same last 6 digits and service date ranges. Usually one meter has zero values in all KW_Charges and one has zero values in all KWH_Charges. It seems reasonable to combined them.
- (Exceptions do exist though. Further investigation needed based on the following commented-out codes.)

- Output:
    - df (with consolidated meter numbers)

#### Use sql to explore the dataset

In [147]:
temp = df.groupby(['Building_ID', 'Meter_Number']).agg('count').reset_index()[['Building_ID', 'Meter_Number']]

In [148]:
str1 = "select distinct l.Building_ID, l.Meter_Number, r.Meter_Number\
        from temp l join temp r on l.Building_ID = r.Building_ID and l.Meter_Number > r.Meter_Number \
        where substr(l.Meter_Number, 2, length(l.Meter_number)) == substr(r.Meter_Number, 2, length(r.Meter_number))"
df_meter_mapping = pysql(str1)

df_meter_mapping.columns = ['Building_ID', 'Meter_Number_L', 'Meter_Number_S']

#### 25.7% of the meter numbers can be mapped to another

In [149]:
str1 = "select count (distinct Meter_Number_S) as count_redudant_meters\
        from df_meter_mapping"
str2 = "select count (distinct Meter_Number) as count_meters\
        from temp"
pysql(str1)['count_redudant_meters'][0]/pysql(str2)['count_meters'][0]


0.2571872571872572

In [150]:
del(temp)

#### check if the two meters correspond to KWH_Charges and KW_Charges respectively

In [151]:
# df_sumChargesByMeter = df.groupby(['Building_ID', 'Meter_Number']).agg({'KW_Charges': 'sum', 'KWH_Charges' : 'sum', 'Other_Charges' : 'sum'}).reset_index()

# df_sumChargesByMeter.columns = ['Building_ID', 'Meter_Number', 'KW_Charges', 'KWH_Charges', 'Other_Charges']

# pd.merge(df_sumChargesByMeter[df_sumChargesByMeter['KW_Charges'] == 0][['Building_ID', 'Meter_Number']],
# df_sumChargesByMeter[df_sumChargesByMeter['KWH_Charges'] == 0][['Building_ID', 'Meter_Number']], on = 'Building_ID'
# , how = 'inner')

# df_sumChargesByMeter[df_sumChargesByMeter['KWH_Charges'] == 0].head()

# str1 = "select l.Building_ID, l.Meter_Number_L, l.Meter_Number_S \
#         , sum(r.KW_Charges) as total_KW_Charges, sum(r.KWH_Charges) as total_KWH_Charges \
#         from df_meter_mapping l join df_sumChargesByMeter r on l.Building_ID = r.Building_ID\
#         and l.Meter_Number_L = r.Meter_Number group by l.Building_ID, l.Meter_Number_L, l.Meter_Number_S"
# pysql(str1)

##### examples:

In [152]:
# df[(df['Building_ID'] == '10.0 - BLD 01') & ((df['Meter_Number'] == 7864550) | (df['Meter_Number'] == 1864550))].sort_values(['Service_Start_Date'])


# df[(df['Building_ID'] == '101.0 - BLD 02') & (df['KW_Charges'] > 0) & ((df["Meter_Number"] == 7834072) )]

In [153]:
# ## This query (using "not exists") does not work for pandas dataframe...
# str1 = "select distinct l.Building_ID, l.Meter_Number, r.Meter_Number \
#         from df_gap l join df_gap r on l.Building_ID = r.Building_ID and l.Meter_Number > r.Meter_Number \
#         where not exists (l.Service_Start_Date > r.Service_Start_Date or l.Service_End_Date < r.Service_End_Date)"
# pysql(str1)

#### Combine the meter numbers 

In [154]:
temp = pd.merge(df, df_meter_mapping, left_on = ['Building_ID', 'Meter_Number'], right_on = ['Building_ID','Meter_Number_S'], how = 'left')
temp['Meter_Number_New'] = temp['Meter_Number_L'].combine_first(temp['Meter_Number'])

df = temp

del(temp)

In [155]:
df.drop(['Meter_Number', 'Meter_Number_L', 'Meter_Number_S'], axis = 1, inplace = True)

df.columns = ['Account_Name', 'Location', 'Building_ID', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges', 'Meter_Number']

col_ordered = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges']

df = df[col_ordered]

### Calculate Metrics regarding zero-values and meter types - 2nd time

In [156]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select Building_ID, Meter_Number \
        , sum(case when KWH_Charges == 0 and KW_Charges > 0 then 1 else 0 end) as count_kw_only \
        , sum(case when KW_Charges == 0 and KWH_Charges > 0 then 1 else 0 end) as count_kwh_only \
        , sum(Current_Charges) as total_current_charges \
        , count(*) as count \
        from df \
        group by df.Building_ID, df.Meter_Number"
df_temp = pysql(str1)


df_temp['kwh_only'] = df_temp['count_kwh_only'] == df_temp['count']
df_temp['kw_only'] = df_temp['count_kw_only'] == df_temp['count']

#### check the meters


print("perc of kw_only meters:", "{:.2%}".format(df_temp[(df_temp['kw_only'] == 1) & (df_temp['kwh_only'] == 0)].shape[0] / df_temp.shape[0]))

print("perc of kwh_only meters:", "{:.2%}".format(df_temp[(df_temp['kwh_only'] == 1) & (df_temp['kw_only'] == 0)].shape[0] / df_temp.shape[0]))

print("perc of kwh_and_kw meters:", "{:.2%}".format(df_temp[(df_temp['kwh_only'] == 0) & (df_temp['kw_only'] == 0)].shape[0] / df_temp.shape[0]))

#### check the building_ids

a = df_temp[df_temp['kwh_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
b =  df_temp[df_temp['kw_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
a.columns = ['Building_ID', 'Count']
b.columns = ['Building_ID', 'Count']

print("perc of buildings with both kw_only and kwh_only meters:", \
     "{:.2%}".format(pd.merge(a, b, on = 'Building_ID', how = 'inner').shape[0] \
/ df_temp.groupby(['Building_ID']).agg('count').reset_index().shape[0]))

del(df_temp)

#### Check the statistics of zero-value rows:

print("perc of rows - current charges of zero:", "{:.2%}".format(df[df['Current_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kw charges of zero:", "{:.2%}".format(df[df['KW_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kwh charges of zero:", "{:.2%}".format(df[df['KWH_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - consumption/charge inconsistency:", \
      "{:.2%}".format(df[((df['KWH_Charges'] == 0) ^ (df['Consumption_KWH'] == 0)) \
   | ((df['KW_Charges'] == 0) ^ (df['Consumption_KW'] == 0)) ].shape[0]\
    /df.shape[0]))

print("perc of rows - sum of charges inconsistency:", \
     "{:.2%}".format(1 - df[df['Current_Charges'] == df['KWH_Charges'] + df['KW_Charges'] + df['Other_Charges']].shape[0]\
    /df.shape[0]))

perc of kw_only meters: 2.71%
perc of kwh_only meters: 17.54%
perc of kwh_and_kw meters: 79.74%
perc of buildings with both kw_only and kwh_only meters: 0.73%
perc of rows - current charges of zero: 16.61%
perc of rows - kw charges of zero: 41.13%
perc of rows - kwh charges of zero: 33.03%
perc of rows - consumption/charge inconsistency: 4.45%
perc of rows - sum of charges inconsistency: 26.40%


### 11. Find the accounts with switched meters

There are Building_ID's whose meter number changed over the years, need to find the mapping and consolidate the meter numbers (In some cases it's a many-to-many mapping, I'm excluding those cases for now)

outputs: 
1. df_multiple_meter_switch (building_id's with many-to-many meter mapping, need to investigate later)
2. df (with consolidated meter numbers)

In [157]:
from dateutil.relativedelta import *

In [158]:
a = df.groupby(['Building_ID']).agg({'Meter_Number': 'nunique'}).reset_index()

a = a[a["Meter_Number"]>1]

a.columns = ['Building_ID', 'Counts']

a = pd.merge(a, df, on = 'Building_ID', how = 'inner')[['Building_ID', 'Meter_Number', "Revenue_Month"]]\
.groupby(['Building_ID', 'Meter_Number']).agg({'Revenue_Month': ['max','min']}).reset_index()

a.columns = a.columns.get_level_values(0)

a.columns = ['Building_ID', 'Meter_Number', 'Max_Month', 'Min_Month']

a['Max_Month_Next'] = a['Max_Month'].map(lambda x: x + relativedelta(months=+1))
a['Min_Month_Prior'] = a['Min_Month'].map(lambda x: x - relativedelta(months=+1))
df_switch_meter = a

del(a)

In [159]:
str1 = "select l.Building_ID, l.Meter_Number as Meter_Number_E, r.Meter_Number as Meter_Number_L \
        from df_switch_meter l join df_switch_meter r on l.Building_ID = r.Building_ID and l.Meter_Number != r.Meter_Number \
        where l.Max_Month == r.Min_Month_Prior"
a = pysql(str1)

In [160]:
df_meter_switch = pd.DataFrame(a['Building_ID'].value_counts() > 1).reset_index()
df_meter_switch.columns = ['Building_ID', 'Dummy']

df_single_meter_switch = df_meter_switch[df_meter_switch['Dummy'] == False]
df_multiple_meter_switch = df_meter_switch[df_meter_switch['Dummy'] == True]

In [161]:
df_meter_switch = pd.merge(a, df_single_meter_switch, on = 'Building_ID', how = 'inner')[['Building_ID', 'Meter_Number_E', 'Meter_Number_L']]

In [162]:
del(a)

#### 13% of the meters can be mapped to another meter

In [163]:
df_meter_switch['Meter_Number_E'].count() / df['Meter_Number'].nunique()

0.13045751633986927

#### Combined the meter numbers 

In [164]:
a = pd.merge(df, df_meter_switch, left_on = ['Building_ID', 'Meter_Number'], right_on = ['Building_ID', 'Meter_Number_E'], how = 'left')
a['Meter_Number_New'] = a['Meter_Number_L'].combine_first(a['Meter_Number'])
df = a

df.drop(['Meter_Number', 'Meter_Number_L', 'Meter_Number_E'], axis = 1, inplace = True)
df.columns = ['Account_Name', 'Location', 'Building_ID', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges', 'Meter_Number']
col_ordered = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges']
df = df[col_ordered]

In [165]:
# save the df_multiple_meter_switch
df_multiple_meter_switch.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_multiple_meter_switch")

### 12. After combinging the meter numbers in the 2 steps above, there are cases where multiple rows exist for the same Meter and Service date ranges

In [166]:
idx = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).agg(['count'])['Account_Name'].reset_index()
idx = idx[idx['count'] > 1]

In [167]:
idx['count'].value_counts()

2    68680
Name: count, dtype: int64

#### see the example below, read starting from the 3rd row

In [168]:
mask = (df['Building_ID'] == '70.0 - BLD 01') & (df['Revenue_Year'] == 2013) & ( (df['Meter_Number'] == '8095177') | (df['Meter_Number'] == '8095173'))
df[mask].sort_values(['Service_Start_Date', 'Meter_Number'])

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Revenue_Year,Service_Start_Date,Service_End_Date,# days,Consumption_KW,KW_Charges,Consumption_KWH,KWH_Charges,Other_Charges,Current_Charges
78691,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-04-01,2013,2013-03-26,2013-04-24,29.0,0.0,0.0,45360.0,2339.67,4569.3,6908.97
78695,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095177,2013-04-01,2013,2013-03-26,2013-04-24,29.0,0.0,0.0,42720.0,2203.5,4303.35,6506.85
78705,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-05-01,2013,2013-04-24,2013-05-23,29.0,90.53,2155.75,0.0,0.0,-2155.75,0.0
78719,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-05-01,2013,2013-04-24,2013-05-23,29.0,0.0,0.0,65040.0,3354.76,5421.44,8776.2
78709,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095177,2013-05-01,2013,2013-04-24,2013-05-23,29.0,97.06,2311.25,0.0,0.0,-2311.25,0.0
78723,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095177,2013-05-01,2013,2013-04-24,2013-05-23,29.0,0.0,0.0,75840.0,3911.83,6321.71,10233.54
78733,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-06-01,2013,2013-05-23,2013-06-24,32.0,116.16,2163.26,0.0,0.0,-2163.26,0.0
78747,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095173,2013-06-01,2013,2013-05-23,2013-06-24,32.0,0.0,0.0,90480.0,5100.36,6561.4,11661.76
78737,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095177,2013-06-01,2013,2013-05-23,2013-06-24,32.0,130.94,2438.51,0.0,0.0,-2438.51,0.0
78751,CYPRESS HILLS,BLD 01,70.0 - BLD 01,8095177,2013-06-01,2013,2013-05-23,2013-06-24,32.0,0.0,0.0,105360.0,5939.14,7640.52,13579.66


#### remove the multiple rows by aggregating at building, meter, revenue month, service date range level

In [169]:
df = df.groupby(['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
       'Revenue_Month', 'Revenue_Year', 'Service_Start_Date',
       'Service_End_Date', '# days']).\
    agg({'Consumption_KW': 'sum', 'KW_Charges': 'sum', 'Consumption_KWH': 'sum', 'KWH_Charges': 'sum', 'Other_Charges': 'sum', 'Current_Charges': 'sum'}).reset_index()

### Calculate Metrics regarding zero-values and meter types - 3rd time

In [170]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select Building_ID, Meter_Number \
        , sum(case when KWH_Charges == 0 and KW_Charges > 0 then 1 else 0 end) as count_kw_only \
        , sum(case when KW_Charges == 0 and KWH_Charges > 0 then 1 else 0 end) as count_kwh_only \
        , sum(Current_Charges) as total_current_charges \
        , count(*) as count \
        from df \
        group by df.Building_ID, df.Meter_Number"
df_temp = pysql(str1)


df_temp['kwh_only'] = df_temp['count_kwh_only'] == df_temp['count']
df_temp['kw_only'] = df_temp['count_kw_only'] == df_temp['count']

#### check the meters


print("perc of kw_only meters:", "{:.2%}".format(df_temp[(df_temp['kw_only'] == 1) & (df_temp['kwh_only'] == 0)].shape[0] / df_temp.shape[0]))

print("perc of kwh_only meters:", "{:.2%}".format(df_temp[(df_temp['kwh_only'] == 1) & (df_temp['kw_only'] == 0)].shape[0] / df_temp.shape[0]))

print("perc of kwh_and_kw meters:", "{:.2%}".format(df_temp[(df_temp['kwh_only'] == 0) & (df_temp['kw_only'] == 0)].shape[0] / df_temp.shape[0]))

#### check the building_ids

a = df_temp[df_temp['kwh_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
b =  df_temp[df_temp['kw_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
a.columns = ['Building_ID', 'Count']
b.columns = ['Building_ID', 'Count']

print("perc of buildings with both kw_only and kwh_only meters:", \
     "{:.2%}".format(pd.merge(a, b, on = 'Building_ID', how = 'inner').shape[0] \
/ df_temp.groupby(['Building_ID']).agg('count').reset_index().shape[0]))

del(df_temp)

#### Check the statistics of zero-value rows:

print("perc of rows - current charges of zero:", "{:.2%}".format(df[df['Current_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kw charges of zero:", "{:.2%}".format(df[df['KW_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kwh charges of zero:", "{:.2%}".format(df[df['KWH_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - consumption/charge inconsistency:", \
      "{:.2%}".format(df[((df['KWH_Charges'] == 0) ^ (df['Consumption_KWH'] == 0)) \
   | ((df['KW_Charges'] == 0) ^ (df['Consumption_KW'] == 0)) ].shape[0]\
    /df.shape[0]))

print("perc of rows - sum of charges inconsistency:", \
     "{:.2%}".format(1 - df[df['Current_Charges'] == df['KWH_Charges'] + df['KW_Charges'] + df['Other_Charges']].shape[0]\
    /df.shape[0]))

perc of kw_only meters: 2.12%
perc of kwh_only meters: 19.00%
perc of kwh_and_kw meters: 78.88%
perc of buildings with both kw_only and kwh_only meters: 0.50%
perc of rows - current charges of zero: 4.17%
perc of rows - kw charges of zero: 19.90%
perc of rows - kwh charges of zero: 8.47%
perc of rows - consumption/charge inconsistency: 6.16%
perc of rows - sum of charges inconsistency: 33.11%


### 13. Find the gaps between service date ranges

We'd like to know how many account have gaps (> 5 days) in their billing windows

#### concatenate service date ranges for each builing_id, meter number and revenue year

In [171]:
# sort by building_id, revenue year, meter number
df = df.sort_values(by = ['Meter_Number', 'Revenue_Year', 'Service_Start_Date'], ascending=[True, True, True])

def merge_dates(grp):
    # Find contiguous date groups, and get the first/last start/end date for each group.
    dt_groups = (grp['Service_Start_Date'] != grp['Service_End_Date'].shift()).cumsum()
    return grp.groupby(dt_groups).agg({'Service_Start_Date': 'first', 'Service_End_Date': 'last'})

# Perform a groupby and apply the merge_dates function, followed by formatting.
df_gap = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Year']).apply(merge_dates)
df_gap = df_gap.reset_index().drop('level_3', axis = 1)
df_gap = df_gap.reset_index()
df_gap.columns = ['rowNum', 'Building_ID', 'Meter_Number', 'Revenue_Year',
       'Service_Start_Date', 'Service_End_Date']

df_gap['nextRowNum'] = df_gap['rowNum'].map(lambda x: x+1)

# Join the dataframe with itself to find the gap between service ranges
df_gap = pd.merge(df_gap, df_gap[['Building_ID', 'Meter_Number', 'nextRowNum', 'Service_End_Date']],\
        left_on = ['Building_ID', 'Meter_Number', 'rowNum'], right_on = ['Building_ID', 'Meter_Number', 'nextRowNum'], how = 'left')

# consecutive days of billing for the same meter number
df_gap['consecutive_days'] = \
df_gap[['Service_End_Date_x', 'Service_Start_Date']].apply(lambda x: (x[0] - x[1]).days, axis = 1)

# gap days from the previous service range of the same meter number
df_gap['gap_days'] = \
df_gap[['Service_Start_Date', 'Service_End_Date_y']].apply(lambda x: (x[0] - x[1]).days, axis = 1)


# Rename and reorder the columns
df_gap = df_gap[['Building_ID', 'Meter_Number', 'Revenue_Year', 'Service_Start_Date', 'Service_End_Date_x', 'consecutive_days', 'gap_days']]
df_gap.columns = ['Building_ID', 'Meter_Number', 'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', 'consecutive_days', 'gap_days']

In [172]:
df_gap['Building_Meter'] = df_gap['Building_ID'] + df_gap['Meter_Number']

In [173]:
df_gap.head()

Unnamed: 0,Building_ID,Meter_Number,Revenue_Year,Service_Start_Date,Service_End_Date,consecutive_days,gap_days,Building_Meter
0,1.0 - BLD 01,7836716,2010,2009-12-24,2010-12-23,364,,1.0 - BLD 017836716
1,1.0 - BLD 01,7836716,2011,2010-12-23,2011-05-24,152,0.0,1.0 - BLD 017836716
2,1.0 - BLD 01,7836716,2011,2011-06-23,2011-08-23,61,30.0,1.0 - BLD 017836716
3,1.0 - BLD 01,7836716,2011,2011-09-22,2011-12-23,92,30.0,1.0 - BLD 017836716
4,1.0 - BLD 01,7836716,2012,2011-12-23,2012-06-22,182,0.0,1.0 - BLD 017836716


In [189]:
df_gap.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_service_range_gaps")

#### How frequent does a meter has gaps longer than 5 days in a year (i.e. service date ranges didn't cover the whole year) ? ~40%

In [175]:
a = df_gap[df_gap['gap_days'] > 5].groupby(['Revenue_Year']).agg({'Building_Meter':'nunique'}).reset_index()

b = df_gap.groupby(['Revenue_Year']).agg({'Building_Meter':'nunique'}).reset_index()

temp = pd.merge(a, b, on = 'Revenue_Year')

temp.columns = ['Revenue_Year', 'Meter_Count_Gap', 'Meter_Count_Total']

temp['Gap_Perc'] = temp['Meter_Count_Gap'] / temp['Meter_Count_Total']

In [176]:
np.mean(temp['Gap_Perc'])

0.3982947414836333

In [177]:
temp

Unnamed: 0,Revenue_Year,Meter_Count_Gap,Meter_Count_Total,Gap_Perc
0,2010,911,1986,0.458711
1,2011,1027,2133,0.481481
2,2012,1087,2269,0.479066
3,2013,638,2244,0.284314
4,2014,775,2455,0.315682
5,2015,546,2167,0.251961
6,2016,1094,2209,0.495247
7,2017,764,2103,0.363291
8,2018,933,2051,0.4549


In [178]:
del(a, b, temp)

#### overlapping service date ranges - 1.27% of the meter accounts

In [179]:
mask = df_gap['gap_days'] < 0
df_gap[mask].head()

Unnamed: 0,Building_ID,Meter_Number,Revenue_Year,Service_Start_Date,Service_End_Date,consecutive_days,gap_days,Building_Meter
352,101.0 - BLD 02,7834072,2015,2015-06-01,2015-12-24,206,-23.0,101.0 - BLD 027834072
5393,206.0 - BLD 02,7382886,2015,2015-03-17,2015-12-24,282,-9.0,206.0 - BLD 027382886
5560,21.0 - BLD 01,8300966,2011,2010-09-22,2010-10-22,30,-92.0,21.0 - BLD 018300966
5574,21.0 - BLD 02,6443527,2011,2010-09-22,2010-10-22,30,-92.0,21.0 - BLD 026443527
5593,21.0 - BLD 03,6443449,2011,2010-09-22,2010-10-22,30,-92.0,21.0 - BLD 036443449


In [180]:
print("Perc of meters with overlapping service date ranges:", "{:.2%}".format(df_gap[mask]['Building_Meter'].agg('nunique')/df_gap['Building_Meter'].agg('nunique')))

Perc of meters with overlapping service date ranges: 1.27%


### 14. Combine rows to the Building-Meter-Month level and Building-Month level; add new aggregation metrics

We need to analyze anamolous values of charges and consumptions at the Building-Meter-Month level and Building-Month level

In [200]:
df_combined_meter = pd.pivot_table(df, values = ['# days', 'Current_Charges','Consumption_KWH', 'KWH_Charges',\
       'Consumption_KW', 'KW_Charges', 'Other_Charges'], index=['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
       'Revenue_Month'], aggfunc = np.sum).reset_index()

df_combined_building = pd.pivot_table(df, values = ['# days', 'Current_Charges','Consumption_KWH', 'KWH_Charges',\
       'Consumption_KW', 'KW_Charges', 'Other_Charges'], index=['Account_Name', 'Location', 'Building_ID',
       'Revenue_Month'], aggfunc = np.sum).reset_index()

In [201]:
df_combined_meter['Total_Charges'] = df_combined_meter['KW_Charges'] + df_combined_meter['KWH_Charges']
df_combined_meter['Total_Energy_Rate'] = df_combined_meter['Total_Charges']/df_combined_meter['Consumption_KWH']

In [202]:
df_combined_building['Total_Charges'] = df_combined_building['KW_Charges'] + df_combined_building['KWH_Charges']
df_combined_building['Total_Energy_Rate'] = df_combined_building['Total_Charges']/df_combined_building['Consumption_KWH']

### 15. Save the cleaned data to the output folder

In [184]:
df.to_pickle("../output/NYCHA_Electricity_2010_to_2018_CleanedDF")

In [203]:
df_combined_meter.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_meter")

In [204]:
df_combined_building.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_building")

In [205]:
df_combined_meter.head()

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,# days,Consumption_KW,Consumption_KWH,Current_Charges,KWH_Charges,KW_Charges,Other_Charges,Total_Charges,Total_Energy_Rate
0,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-01-01,33.0,148.0,84000.0,10314.51,4818.24,1924.0,3572.27,6742.24,0.080265
1,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-02-01,30.0,144.0,75200.0,9422.06,4313.47,1872.0,3236.59,6185.47,0.082254
2,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-03-01,29.0,136.0,68800.0,8988.12,3946.37,1768.0,3273.75,5714.37,0.083058
3,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-04-01,31.0,124.0,68400.0,9146.17,3923.42,1612.0,3610.75,5535.42,0.080927
4,ADAMS,BLD 01,118.0 - BLD 01,7518352,2010-05-01,28.0,144.0,64800.0,9137.42,3716.93,1872.0,3548.49,5588.93,0.086249


## Q&A with Linnea:

1. why would "Consumption_KW" be zero?
    - KW and KWH should be both positive, unless there are some related bills that already covers it
    - Maybe one account was separated into multiple meters?
2. What's the "Other Charges"?
    - negative values to adjust for the payments from previous month
    - taxes, fee for meter-reading, little fees charged by utilities and states (e.g. system benefit charge), credit (state got a better deal after charging the clients)

## To Do:

1.  Correct the data where Meter_Numbers are in irregular format, such as:
    - '102.0 - MORRIS I BLD 05'	'8096662 41-5'
    - '116.0 - WEST BRIGHTON I BLD 06'	'1860113_1600'	'7860113_1600-87.8%'
    - '8096662 41-5'	'8096662-41.5'
2. Statistical & Graphical Analysis on the combined datasets

## Continue the work:

In [206]:
df = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_CleanedDF")

df_combined_meter = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_meter")

df_combined_building = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_building")

In [207]:
df_gap = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_service_range_gaps")
df_orig = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_original_dataset")

In [208]:
df.shape

(185900, 15)

In [209]:
df[["Consumption_KWH",  "Consumption_KW", "Current_Charges", "KWH_Charges", "KW_Charges", "Other_Charges"]].describe()

Unnamed: 0,Consumption_KWH,Consumption_KW,Current_Charges,KWH_Charges,KW_Charges,Other_Charges
count,185900.0,185900.0,185900.0,185900.0,185900.0,185900.0
mean,44911.34,93.022783,6207.166327,2307.370183,1478.292498,2179.809372
std,58015.15,132.137336,8038.91886,3225.485781,1934.143193,3743.660313
min,0.0,0.0,-243.15,0.0,0.0,-59396.43
25%,4266.0,12.6,998.7375,253.76,148.5,301.34
50%,31520.0,66.09,4206.955,1524.0,1076.05,1356.2
75%,61800.0,124.0,8290.4775,3065.335,2064.78,3058.5675
max,1779600.0,16135.46,329800.37,195575.86,78782.96,134224.51
