In [None]:
from __future__ import division
import pandas as pd
import numpy as np
import pandasql as pdsql
from datetime import datetime
from dateutil.relativedelta import *

# import matplotlib as mpl
import matplotlib.pyplot as plt
# Setup matplotlib to display in notebook:
%matplotlib inline

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)         # initiate notebook for offline plot


### 1. Read in the data 

In [None]:
df = pd.read_csv("../data/NYC Open Data - Electric_Consumption_And_Cost__2010_-__June_2018_.csv", low_memory=False)

In [None]:
df.shape

#### Check the number of empty values in each column

In [None]:
df.isnull().sum()

## Part I - General Data Cleaning

### 2. Remove empty rows

In [None]:
mask = (pd.isna(df['Account Name']) == True)
df.drop(mask[mask == True].index, axis = 0, inplace = True)

### 3. Remove rows where electricity charges were estimated

In [None]:
df.drop(df.loc[df['Estimated'] == 'Y         '].index, axis = 0, inplace = True)

#### Check data types of columns

In [None]:
df.dtypes

#### Change column names for easy reference

In [None]:
df.columns = ['Development_Name', 'Borough', 'Account_Name', 'Location', 'Meter_AMR',
       'Meter_Scope', 'TDS #', 'EDP', 'RC_Code', 'Funding_Source', 'AMP #',
       'Vendor_Name', 'UMIS_BILL_ID', 'Revenue_Month', 'Service_Start_Date',
       'Service_End_Date', '# days', 'Meter_Number', 'Estimated',
       'Current_Charges', 'Rate_Class', 'Bill_Analyzed', 'Consumption_KWH',
       'KWH_Charges', 'Consumption_KW', 'KW_Charges', 'Other_Charges']

### 4. Data Type Converstion

1. Change the following fields from string to numerical:
    - "Consumption_KW", "Current_Charges", "KWH_Charges", "KW_Charges", "Other_Charges"

In [None]:
df["Consumption_KW"] = df["Consumption_KW"].apply(lambda x: x.replace(",","") if type(x) == str else str(x))
df["Consumption_KW"] = df["Consumption_KW"].astype(float)

In [None]:
df["Current_Charges"] = df["Current_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["Current_Charges"] = df["Current_Charges"].astype(float)

In [None]:
df["KWH_Charges"] = df["KWH_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["KWH_Charges"] = df["KWH_Charges"].astype(float, inplace = True)

In [None]:
df["KW_Charges"] = df["KW_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["KW_Charges"] = df["KW_Charges"].astype(float, inplace = True)

In [None]:
df["Other_Charges"] = df["Other_Charges"].apply(lambda x: x.replace("$","").replace(",","").replace("(","-").replace(")","") if type(x) == str else str(x))
df["Other_Charges"] = df["Other_Charges"].astype(float, inplace = True)

##### More than 25% of the values for all except "Curent Charges" are 0, which seem unusual

In [None]:
df[["Consumption_KWH",  "Consumption_KW", "Current_Charges", "KWH_Charges", "KW_Charges", "Other_Charges"]].describe()

2. Unify the format of "Meter_Number" field (some values exists in both numerical and string

In [None]:
df['Meter_Number'] = df['Meter_Number'].apply(lambda x: str(x) if type(x) == int else x)

### 5.1 Convert Revenue_Month and Two dates to datetime type

In [None]:
df["Revenue_Month"] = df["Revenue_Month"].map(lambda x: datetime.strptime(x.split(" ")[0], '%m/%d/%Y'))
df['Service_Start_Date'] = df['Service_Start_Date'].map(lambda x: datetime.strptime(x, '%m/%d/%Y'))
df['Service_End_Date'] = df['Service_End_Date'].map(lambda x: datetime.strptime(x, '%m/%d/%Y'))

#### In some cases the Revenue_Month is not in the same revenue_year as the Service Start and End dates when those two are

In [None]:
df['start_date_year'] = df['Service_Start_Date'].apply(lambda x: datetime(x.year, 1, 1))

df['end_date_year'] = df['Service_End_Date'].apply(lambda x: datetime(x.year, 1, 1))

df['revenue_month_year'] = df['Revenue_Month'].apply(lambda x: datetime(x.year, 1, 1))

mask = ((df['end_date_year'] == df['start_date_year']) & (df['revenue_month_year'] != df['end_date_year']))

In [None]:
mask.value_counts()

In [None]:
df[mask][['Revenue_Month', 'Service_Start_Date', 'Service_End_Date', 'Meter_Number']].sort_values(['Revenue_Month', 'Service_Start_Date', 'Meter_Number'])

#### Correct the cases where Revenue_Month is in the wrong year

In [None]:
df.loc[mask, "Revenue_Month"] = datetime.strptime('10/01/2010', '%m/%d/%Y')

### 5.2 Clean up the Meter_Number field
- remove the leadng zeros 
- remove white spaces
- standardize the format for meter_numbers of the similar pattern

In [None]:
df['Meter_Number'] = df['Meter_Number'].apply(lambda x: x.lstrip("0").strip(" "))

In [None]:
df['Meter_Length'] = df['Meter_Number'].apply(lambda x: len(x))

In [None]:
df['Meter_Length'].value_counts()

In [None]:
df[df['Meter_Length'] == 12]['Meter_Number'].value_counts()

In [None]:
df.loc[df['Meter_Number'] == '1096662 41-5', 'Meter_Number'] = '1096662-41.5'

df.loc[df['Meter_Number'] == '1096662 58-5', 'Meter_Number'] = '1096662-58.5'

df.loc[df['Meter_Number'] == '8096662 41-5', 'Meter_Number'] = '8096662-41.5'

df.loc[df['Meter_Number'] == '8096662 58-5', 'Meter_Number'] = '8096662-58.5'

#### Check "Meter Scope": Do the row with a range value represent a "Master Meter" (i.e. its value is the sum of other related rows)? - No

df['Meter Scope'].value_counts()

df[(df['TDS #'] == 118) & (df["Revenue_Month"] == '2010-02-01')][["Location", "Meter Scope", "Revenue_Month", "Current_Charges"]]

df[(df['Meter Scope'] == 'Community Center')].groupby('Location').mean()

df[(df['Meter Scope'] == 'BLD 1 - 9')].groupby('Location').mean()

### 6. Create an unique identifier for each building and remove unnecessary fields

In [None]:
# the combination of TDS# and Location uniquely determines a buildling
# Use EDP or RC Code when TDS# is not available
df['Building_ID'] = df['TDS #'].combine_first(df['EDP']).map(str).combine_first(df['RC_Code']) \
                    + " - " + df['Location'].map(lambda x: 'NA' if pd.isna(x) else x)

In [None]:
# Building_ID alone is not the primary key of the data
df.groupby(['Building_ID', 'Revenue_Month']).count().shape[0]/df.shape[0]

In [None]:
# the combination of Building_ID, meter number and revenue month is still not a primary key
df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month']).count().shape[0]/df.shape[0]

In [None]:
# Define a list of columns of interest
cols = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
        'Revenue_Month', 'Service_Start_Date', 'Service_End_Date', '# days', 
       'Current_Charges','Consumption_KWH', 'KWH_Charges',
       'Consumption_KW', 'KW_Charges', 'Other_Charges']
df = df[cols]

In [None]:
# the combination of Building_ID, meter number and revenue month is almost a primary key
df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).count().shape[0]/df.shape[0]

### 7. Drop Duplicated rows and clean up to format of Meter_Number field

In [None]:
df = df.drop_duplicates()

### 8. Check which combinations of the 5 fields (Building_ID, Meter, Month, StartDate, EndDate) has multiple rows and why

In [None]:
idx = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).count()['Account_Name'].reset_index()
idx.columns = ['Building_ID', 'Meter_Number', 'Revenue_Month','Service_Start_Date', 'Service_End_Date', 'Counts']
idx = idx[idx['Counts'] > 1]

dupRows = idx.sort_values('Counts', ascending = False)

a = pd.merge(dupRows.iloc[:, 0:3], df[cols], on = \
         ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner')[cols]\
        .sort_values(['Building_ID', 'Meter_Number', 'Revenue_Month'])

#### half of these problematic rows has zero values in the numerical fields of charges and consumptions

In [None]:
a

#### remove those rows from the dataset

In [None]:
df = df[~((df['Current_Charges'] == 0) & (df['KWH_Charges'] == 0) & (df['KW_Charges'] == 0) \
  & (df['Other_Charges'] == 0) & (df['Consumption_KWH'] == 0) & (df['Consumption_KW'] == 0))]

#### we also don't care about entries that only has other_charges not equal to zero

In [None]:
df = df[~((df['Other_Charges'] != 0) & (df['KWH_Charges'] == 0) & (df['KW_Charges'] == 0) \
  & (df['Consumption_KWH'] == 0) & (df['Consumption_KW'] == 0))]

In [None]:
idx = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).count()['Account_Name'].reset_index()
idx.columns = ['Building_ID', 'Meter_Number', 'Revenue_Month','Service_Start_Date', 'Service_End_Date', 'Counts']
idx = idx[idx['Counts'] > 1]

dupRows = idx.sort_values('Counts', ascending = False)

a = pd.merge(dupRows.iloc[:, 0:3], df[cols], on = \
         ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner')[cols]\
        .sort_values(['Building_ID', 'Meter_Number', 'Revenue_Month'])

#### Only 2 rows left, seems a case of rebilling

In [None]:
a

### 9. save a copy of the original dataframe before further data cleaning with alterations and flag the rows with problems

In [None]:
df_orig = df

In [None]:
df_orig['flag'] = ""

In [None]:
# df = df_orig.iloc[:, 0:15]

#### update the flag in df_orig

In [None]:
mask = (df_orig['Building_ID'] == '63.0 - BLD 11') & (df_orig['Meter_Number'] == '8125318') & (df_orig['Revenue_Month'] == '2011-10-01')
df_orig.loc[mask, 'flag'] = 'rebill'
df_orig = df_orig.iloc[:, 0:15]
df_orig.flag.value_counts()

#### Remove the entries with rebilling from the working dataset df

In [None]:
df = df[~mask]

In [None]:
del(a)

### 10. Add a column for Revenue_Year and reorder the columns

In [None]:
df.loc[:, 'Revenue_Year'] = df['Revenue_Month'].dt.year

In [None]:
col_ordered = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
       'Revenue_Month', 'Revenue_Year', 'Service_Start_Date', 'Service_End_Date',
       '# days', 'Consumption_KW', 'KW_Charges', 
       'Consumption_KWH', 'KWH_Charges', 'Other_Charges', 'Current_Charges']

df = df[col_ordered]

## Part II - Data Cleaning with alterations - aggregation, mapping

### 11. Check the zero values in Current_Charges, KWH_Charges and KW_Charges

#### High Percentage of rows have current_charges == 0

In [None]:
print ("{:.2%}".format(df[df['Current_Charges'] == 0].shape[0]/df.shape[0]))

#### when current_charges == 0, all kwh_charges == 0 (NaN correlation coefficients with all other variables) and kw_charges seems negatively correlated with other_charges

In [None]:
df[df['Current_Charges'] == 0][['KWH_Charges', 'KW_Charges', 'KWH_Charges', 'Other_Charges']].corr()

#### when current_charges == 0, 82% of the time kw_charges == - other_charges and kw_charges ==  other_charges otherwise

In [None]:
mask = (df['Other_Charges'] + df['KW_Charges'] == 0) & (df['Current_Charges'] == 0) & (df['KWH_Charges'] == 0)

In [None]:
print("{:.2%}".format(df[mask].shape[0]/df[df['Current_Charges'] == 0].shape[0]))

In [None]:
df[(df['Current_Charges'] == 0) & ((df['Other_Charges'] == df['KW_Charges']) \
        | (df['Other_Charges'] + df['KW_Charges'] == 0))].shape[0] / \
df[df['Current_Charges'] == 0].shape[0]

#### correct the rows where Other_Charges == KW_Charges with Other_Charges = -KW_Charges

In [None]:
mask = (df['Current_Charges'] == 0) & ((df['Other_Charges'] == df['KW_Charges']) & (df['KW_Charges'] != 0))


In [None]:
df.loc[mask, 'KW_Charges'] = df.loc[mask, 'Other_Charges'] * (-1)

In [None]:
df[df['Current_Charges'] == 0][['Current_Charges', 'KW_Charges', 'KWH_Charges', 'Other_Charges']].corr()

In [None]:
df[df['Current_Charges'] == 0][['Current_Charges', 'KW_Charges', 'KWH_Charges', 'Other_Charges']].corr()

#### update the flag in df_orig

In [None]:
mask = (df_orig['Current_Charges'] == 0) & ((df_orig['Other_Charges'] == df_orig['KW_Charges']) & (df_orig['KW_Charges'] != 0))
valid = df_orig[mask]['flag']
df_orig.loc[mask, 'flag'] = valid.apply(lambda x: 'Sign of Other_Charges is incorrect' if x == "" else x + '; ' + 'Sign of Other_Charges is incorrect')

df_orig = df_orig.iloc[:, 0:15]

del( valid, mask)
df_orig.flag.value_counts()

### Calculate Metrics regarding zero-values and meter types - 1st time

In [None]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select Building_ID, Meter_Number \
        , sum(case when KWH_Charges == 0 and KW_Charges > 0 then 1 else 0 end) as count_kw_only \
        , sum(case when KW_Charges == 0 and KWH_Charges > 0 then 1 else 0 end) as count_kwh_only \
        , sum(Current_Charges) as total_current_charges \
        , count(*) as count \
        from df \
        group by df.Building_ID, df.Meter_Number"
df_meter_type = pysql(str1)


df_meter_type['kwh_only'] = ((df_meter_type['count_kwh_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kw_only'] == 0)
df_meter_type['kw_only'] = ((df_meter_type['count_kw_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kwh_only'] == 0)

#### check the meters

print("perc of kw_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kw_only'] == 1) & (df_meter_type['kwh_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 1) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_and_kw meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 0) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))


#### check the building_ids

a = df_meter_type[df_meter_type['kwh_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
b =  df_meter_type[df_meter_type['kw_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
a.columns = ['Building_ID', 'Count']
b.columns = ['Building_ID', 'Count']

print("perc of buildings with both kw_only and kwh_only meters:", \
     "{:.2%}".format(pd.merge(a, b, on = 'Building_ID', how = 'inner').shape[0] \
/ df_meter_type.groupby(['Building_ID']).agg('count').reset_index().shape[0]))


#### Check the statistics of zero-value rows:

print("perc of rows - current charges of zero:", "{:.2%}".format(df[df['Current_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kw charges of zero:", "{:.2%}".format(df[df['KW_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kwh charges of zero:", "{:.2%}".format(df[df['KWH_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - consumption/charge inconsistency:", \
      "{:.2%}".format(df[((df['KWH_Charges'] == 0) ^ (df['Consumption_KWH'] == 0)) \
   | ((df['KW_Charges'] == 0) ^ (df['Consumption_KW'] == 0)) ].shape[0]\
    /df.shape[0]))

print("perc of rows - sum of charges inconsistency:", \
     "{:.2%}".format(1 - df[df['Current_Charges'] == df['KWH_Charges'] + df['KW_Charges'] + df['Other_Charges']].shape[0]\
    /df.shape[0]))

### 12. Identify accounts that have separated meters for KW and KWH charges and combine the meters

There are many cases where under the same Building_ID, two meter numbers differ only in the first digit and share the same service date ranges. Usually the larger meter number has zero values in all KW_Charges and the smaller one has zero values in all KWH_Charges. It seems reasonable to combined them.
- (Exceptions do exist - some larger meter number have values in both KW and KWH)

- Output:
    - df (with consolidated meter numbers)

In [None]:
temp = df.groupby(['Building_ID', 'Meter_Number']).agg('count').reset_index()[['Building_ID', 'Meter_Number']]

In [None]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select distinct l.Building_ID, l.Meter_Number, r.Meter_Number\
        from temp l join temp r on l.Building_ID = r.Building_ID and l.Meter_Number > r.Meter_Number \
        where substr(l.Meter_Number, 2, length(l.Meter_number)) == substr(r.Meter_Number, 2, length(r.Meter_number))"
df_meter_mapping = pysql(str1)

df_meter_mapping.columns = ['Building_ID', 'Meter_Number_L', 'Meter_Number_S']

#### 26.7% of the meter numbers can be mapped to another

In [None]:
str1 = "select count (distinct Meter_Number_S) as count_redudant_meters\
        from df_meter_mapping"
str2 = "select count (distinct Meter_Number) as count_meters\
        from temp"
pysql(str1)['count_redudant_meters'][0]/pysql(str2)['count_meters'][0]


In [None]:
del(temp)

In [None]:
df_meter_mapping.head()

#### check if the two meters correspond to KWH_Charges and KW_Charges respectively, by comparing to the df_meter_type table obtained above

In [None]:
temp = pd.merge(df_meter_mapping, df_meter_type, left_on = ['Building_ID', 'Meter_Number_S']\
         , right_on = ['Building_ID', 'Meter_Number'], how = 'left')\
        [['Building_ID', 'Meter_Number_S', 'count_kwh_only', 'count_kw_only', 'count', 'kwh_only', 'kw_only', 'Meter_Number_L']]

temp.columns = ['Building_ID', 'Meter_Number_S', 'count_kwh_only_s', 'count_kw_only_s', 'count_s', 'kwh_only_s', 'kw_only_s',
       'Meter_Number_L']

temp = pd.merge(temp, df_meter_type, left_on = ['Building_ID', 'Meter_Number_L']\
         , right_on = ['Building_ID', 'Meter_Number'], how = 'left')\
        [['Building_ID', 'Meter_Number_S', 'count_kwh_only_s', 'count_kw_only_s', 'count_s', 'kwh_only_s', 'kw_only_s', 'Meter_Number_L', 'count_kwh_only', 'count_kw_only', 'count', 'kwh_only', 'kw_only']]

temp.columns = ['Building_ID', 'Meter_Number_S', 'count_kwh_only_s', 'count_kw_only_s', 'count_s', 'kwh_only_s', 'kw_only_s',
       'Meter_Number_L', 'count_kwh_only_l', 'count_kw_only_l', 'count_l', 'kwh_only_l', 'kw_only_l']

In [None]:
temp.head()

#### Nearly all the "small" meter_numbers are kw_only meters (they only have non-zero values in kw charges), it seems okay to map them to the "large" corresponding meter_numbers

#### kwh_only_l means the "larger" meter_number only has non-zero values in KWH charges; Better doc needed here

In [None]:
temp[(temp['kwh_only_l'] == False) & (temp['kw_only_l'] == False)].Meter_Number_S.nunique() / temp.Meter_Number_S.nunique()

In [None]:
temp[(temp['kwh_only_s'] == False) & (temp['kw_only_s'] == False)].Meter_Number_S.nunique() / temp.Meter_Number_S.nunique()

In [None]:
temp[(temp['kwh_only_s'] == True) & (temp['kw_only_s'] == False)].Meter_Number_S.nunique() / temp.Meter_Number_S.nunique()

In [None]:
temp[(temp['kwh_only_s'] == False) & (temp['kw_only_s'] == True)].Meter_Number_S.nunique() / temp.Meter_Number_S.nunique()

#### Combine the meter numbers 

In [None]:
temp = pd.merge(df, df_meter_mapping, left_on = ['Building_ID', 'Meter_Number'], right_on = ['Building_ID','Meter_Number_S'], how = 'left')
temp['Meter_Number_New'] = temp['Meter_Number_L'].combine_first(temp['Meter_Number'])

df = temp

del(temp)

In [None]:
df.drop(['Meter_Number', 'Meter_Number_L', 'Meter_Number_S'], axis = 1, inplace = True)

df.columns = ['Account_Name', 'Location', 'Building_ID', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges', 'Meter_Number']

col_ordered = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges']

df = df[col_ordered]

In [None]:
df_meter_mapping.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_meter_mapping")

### Calculate Metrics regarding zero-values and meter types - 2nd time

In [None]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select Building_ID, Meter_Number \
        , sum(case when KWH_Charges == 0 and KW_Charges > 0 then 1 else 0 end) as count_kw_only \
        , sum(case when KW_Charges == 0 and KWH_Charges > 0 then 1 else 0 end) as count_kwh_only \
        , sum(Current_Charges) as total_current_charges \
        , count(*) as count \
        from df \
        group by df.Building_ID, df.Meter_Number"
df_meter_type = pysql(str1)


df_meter_type['kwh_only'] = ((df_meter_type['count_kwh_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kw_only'] == 0)
df_meter_type['kw_only'] = ((df_meter_type['count_kw_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kwh_only'] == 0)

#### check the meters

print("perc of kw_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kw_only'] == 1) & (df_meter_type['kwh_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 1) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_and_kw meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 0) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))


#### check the building_ids

a = df_meter_type[df_meter_type['kwh_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
b =  df_meter_type[df_meter_type['kw_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
a.columns = ['Building_ID', 'Count']
b.columns = ['Building_ID', 'Count']

print("perc of buildings with both kw_only and kwh_only meters:", \
     "{:.2%}".format(pd.merge(a, b, on = 'Building_ID', how = 'inner').shape[0] \
/ df_meter_type.groupby(['Building_ID']).agg('count').reset_index().shape[0]))


#### Check the statistics of zero-value rows:

print("perc of rows - current charges of zero:", "{:.2%}".format(df[df['Current_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kw charges of zero:", "{:.2%}".format(df[df['KW_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kwh charges of zero:", "{:.2%}".format(df[df['KWH_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - consumption/charge inconsistency:", \
      "{:.2%}".format(df[((df['KWH_Charges'] == 0) ^ (df['Consumption_KWH'] == 0)) \
   | ((df['KW_Charges'] == 0) ^ (df['Consumption_KW'] == 0)) ].shape[0]\
    /df.shape[0]))

print("perc of rows - sum of charges inconsistency:", \
     "{:.2%}".format(1 - df[df['Current_Charges'] == df['KWH_Charges'] + df['KW_Charges'] + df['Other_Charges']].shape[0]\
    /df.shape[0]))

### 13. Find the accounts with switched meter numbers

There are Building_ID's whose meter number changed over the years, need to find the mapping and consolidate the meter numbers (In some cases it's a many-to-many mapping, I'm excluding those cases for now)

outputs: 
1. df_multiple_meter_switch (building_id's with many-to-many meter mapping, need to investigate later)
2. df (with consolidated meter numbers)

In [None]:
a = df.groupby(['Building_ID']).agg({'Meter_Number': 'nunique'}).reset_index()

a = a[a["Meter_Number"]>1]

a.columns = ['Building_ID', 'Counts']

a = pd.merge(a, df, on = 'Building_ID', how = 'inner')[['Building_ID', 'Meter_Number', "Revenue_Month"]]\
.groupby(['Building_ID', 'Meter_Number']).agg({'Revenue_Month': ['max','min']}).reset_index()

a.columns = a.columns.get_level_values(0)

a.columns = ['Building_ID', 'Meter_Number', 'Max_Month', 'Min_Month']

a['Max_Month_Next'] = a['Max_Month'].map(lambda x: x + relativedelta(months=+1))
a['Min_Month_Prior'] = a['Min_Month'].map(lambda x: x - relativedelta(months=+1))
df_switch_meter = a

del(a)

In [None]:
str1 = "select l.Building_ID, l.Meter_Number as Meter_Number_E, r.Meter_Number as Meter_Number_L \
        from df_switch_meter l join df_switch_meter r on l.Building_ID = r.Building_ID and l.Meter_Number != r.Meter_Number \
        where l.Max_Month == r.Min_Month_Prior"
a = pysql(str1)

In [None]:
df_meter_switch = pd.DataFrame(a['Building_ID'].value_counts() > 1).reset_index()
df_meter_switch.columns = ['Building_ID', 'Dummy']

df_single_meter_switch = df_meter_switch[df_meter_switch['Dummy'] == False]
df_multiple_meter_switch = df_meter_switch[df_meter_switch['Dummy'] == True]

In [None]:
df_meter_switch = pd.merge(a, df_single_meter_switch, on = 'Building_ID', how = 'inner')[['Building_ID', 'Meter_Number_E', 'Meter_Number_L']]

In [None]:
del(a)

#### 14% of the meters can be mapped to another meter

In [None]:
df_meter_switch['Meter_Number_E'].count() / df['Meter_Number'].nunique()

#### Combine the meter numbers 

In [None]:
a = pd.merge(df, df_meter_switch, left_on = ['Building_ID', 'Meter_Number'], right_on = ['Building_ID', 'Meter_Number_E'], how = 'left')
a['Meter_Number_New'] = a['Meter_Number_L'].combine_first(a['Meter_Number'])
df = a

df.drop(['Meter_Number', 'Meter_Number_L', 'Meter_Number_E'], axis = 1, inplace = True)
df.columns = ['Account_Name', 'Location', 'Building_ID', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges', 'Meter_Number']
col_ordered = ['Account_Name', 'Location', 'Building_ID', 'Meter_Number', 'Revenue_Month',
       'Revenue_Year', 'Service_Start_Date', 'Service_End_Date', '# days',
       'Consumption_KW', 'KW_Charges', 'Consumption_KWH', 'KWH_Charges',
       'Other_Charges', 'Current_Charges']
df = df[col_ordered]

In [None]:
df_multiple_meter_switch.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_multiple_meter_switch")

### 14. Consolidate data to Building-Meter-Service_Date_Range level
After combinging the meter numbers in the 2 steps above, there are cases where 2 rows exist for the same Meter and Service Date ranges (1 row for KW charges, 1 row for KWH charges)

In [None]:
idx = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month', 'Service_Start_Date', 'Service_End_Date']).agg(['count'])['Account_Name'].reset_index()
idx = idx[idx['count'] > 1]

In [None]:
idx['count'].value_counts()

#### see the example below, read starting from the 3rd row

In [None]:
mask = (df['Building_ID'] == '70.0 - BLD 01') & (df['Revenue_Year'] == 2013) & ( (df['Meter_Number'] == '8095177') | (df['Meter_Number'] == '8095173'))
df[mask].sort_values(['Service_Start_Date', 'Meter_Number']).head(10)

#### remove the multiple rows by aggregating at building, meter, revenue month, service_date_range level

In [None]:
df = df.groupby(['Account_Name', 'Location', 'Building_ID', 'Meter_Number',
       'Revenue_Month', 'Revenue_Year', 'Service_Start_Date',
       'Service_End_Date', '# days']).\
    agg({'Consumption_KW': 'sum', 'KW_Charges': 'sum', 'Consumption_KWH': 'sum', 'KWH_Charges': 'sum', 'Other_Charges': 'sum', 'Current_Charges': 'sum'}).reset_index()

### Calculate Metrics regarding zero-values and meter types - 3rd time

In [None]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select Building_ID, Meter_Number \
        , sum(case when KWH_Charges == 0 and KW_Charges > 0 then 1 else 0 end) as count_kw_only \
        , sum(case when KW_Charges == 0 and KWH_Charges > 0 then 1 else 0 end) as count_kwh_only \
        , sum(Current_Charges) as total_current_charges \
        , count(*) as count \
        from df \
        group by df.Building_ID, df.Meter_Number"
df_meter_type = pysql(str1)


df_meter_type['kwh_only'] = ((df_meter_type['count_kwh_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kw_only'] == 0)
df_meter_type['kw_only'] = ((df_meter_type['count_kw_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kwh_only'] == 0)

#### check the meters

print("perc of kw_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kw_only'] == 1) & (df_meter_type['kwh_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 1) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_and_kw meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 0) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))


#### check the building_ids

a = df_meter_type[df_meter_type['kwh_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
b =  df_meter_type[df_meter_type['kw_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
a.columns = ['Building_ID', 'Count']
b.columns = ['Building_ID', 'Count']

print("perc of buildings with both kw_only and kwh_only meters:", \
     "{:.2%}".format(pd.merge(a, b, on = 'Building_ID', how = 'inner').shape[0] \
/ df_meter_type.groupby(['Building_ID']).agg('count').reset_index().shape[0]))


#### Check the statistics of zero-value rows:

print("perc of rows - current charges of zero:", "{:.2%}".format(df[df['Current_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kw charges of zero:", "{:.2%}".format(df[df['KW_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kwh charges of zero:", "{:.2%}".format(df[df['KWH_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - consumption/charge inconsistency:", \
      "{:.2%}".format(df[((df['KWH_Charges'] == 0) ^ (df['Consumption_KWH'] == 0)) \
   | ((df['KW_Charges'] == 0) ^ (df['Consumption_KW'] == 0)) ].shape[0]\
    /df.shape[0]))

print("perc of rows - sum of charges inconsistency:", \
     "{:.2%}".format(1 - df[df['Current_Charges'] == df['KWH_Charges'] + df['KW_Charges'] + df['Other_Charges']].shape[0]\
    /df.shape[0]))

### 15. Consolidate data to Building-Meter-Revenue_Month level

##### only need to work on the cases where multiple rows exist for the same builing_id, meter_number and revenue_month, due to different service_date_ranges, which might be concatenated in many cases

In [None]:
temp = df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month']).agg('count').reset_index().iloc[:, 0:4]
temp.columns = ['Building_ID', 'Meter_Number', 'Revenue_Month', 'Row_Counts']

In [None]:
df_multiple = pd.merge(df, temp[temp['Row_Counts']  > 1], on = ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner').iloc[:, 0:15]
df_single = pd.merge(df, temp[temp['Row_Counts']  == 1], on = ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner').iloc[:, 0:15]

In [None]:
# sort by building_id, revenue month, meter number
df_multiple = df_multiple.sort_values(by = ['Meter_Number', 'Revenue_Month', 'Service_Start_Date'], ascending=[True, True, True])

def merge_dates(grp):
    # Find contiguous date groups, and get the first/last start/end date for each group.
    dt_groups = (grp['Service_Start_Date'] != grp['Service_End_Date'].shift()).cumsum()
    return grp.groupby(dt_groups).agg({'Service_Start_Date': 'first', 'Service_End_Date': 'last',
        '# days':'sum', 'Consumption_KW':'sum', 'KW_Charges':'sum',
       'Consumption_KWH':'sum', 'KWH_Charges':'sum', 'Other_Charges':'sum', 'Current_Charges':'sum'})

# Perform a groupby and apply the merge_dates function, followed by formatting.
df_multiple_concatenate = df_multiple.groupby(['Account_Name', 'Location', 'Building_ID', 'Meter_Number', 'Revenue_Month', 'Revenue_Year']).apply(merge_dates)
df_multiple_concatenate = df_multiple_concatenate.reset_index().drop('level_6', axis = 1)
df_multiple_concatenate = df_multiple_concatenate.reset_index().iloc[:, 1:16]

In [None]:
idx = df_multiple_concatenate.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month']).count().reset_index().iloc[:, 0:4]

idx.columns = ['Building_ID', 'Meter_Number', 'Revenue_Month', 'Count']

idx[idx['Count'] > 1]

#### Only 6 meters have multiple entries under the same Revenue_Month that can't be concatenated. Again they are caused by the separated logging of KWH and KW charges

In [None]:
pd.merge(df, idx[idx['Count'] > 1], on = ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'inner')

#### Remove them from the working dataset

In [None]:
temp = pd.merge(df_multiple_concatenate, idx[idx['Count'] > 1], on = ['Building_ID', 'Meter_Number', 'Revenue_Month'], how = 'left')

temp = temp[temp.Count.isnull()].iloc[:, 0:15]

#### Create the new working dataset df at Building-Meter-Revenue_Month level

In [None]:
df = df_single.append(temp)

In [None]:
df.groupby(['Building_ID', 'Meter_Number', 'Revenue_Month']).agg('count')\
.reset_index()['Account_Name'].value_counts()

### Calculate Metrics regarding zero-values and meter types - 4th time

In [175]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select Building_ID, Meter_Number \
        , sum(case when KWH_Charges == 0 and KW_Charges > 0 then 1 else 0 end) as count_kw_only \
        , sum(case when KW_Charges == 0 and KWH_Charges > 0 then 1 else 0 end) as count_kwh_only \
        , sum(Current_Charges) as total_current_charges \
        , count(*) as count \
        from df \
        group by df.Building_ID, df.Meter_Number"
df_meter_type = pysql(str1)


df_meter_type['kwh_only'] = ((df_meter_type['count_kwh_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kw_only'] == 0)
df_meter_type['kw_only'] = ((df_meter_type['count_kw_only']/df_meter_type['count']) > 0.9) & (df_meter_type['count_kwh_only'] == 0)

#### check the meters

print("perc of kw_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kw_only'] == 1) & (df_meter_type['kwh_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_only meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 1) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))

print("perc of kwh_and_kw meters:", "{:.2%}".format(df_meter_type[(df_meter_type['kwh_only'] == 0) & (df_meter_type['kw_only'] == 0)].shape[0] / df_meter_type.shape[0]))


#### check the building_ids

a = df_meter_type[df_meter_type['kwh_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
b =  df_meter_type[df_meter_type['kw_only'] == 1].groupby(['Building_ID']).agg('count').reset_index().iloc[:, 0:2]
a.columns = ['Building_ID', 'Count']
b.columns = ['Building_ID', 'Count']

print("perc of buildings with both kw_only and kwh_only meters:", \
     "{:.2%}".format(pd.merge(a, b, on = 'Building_ID', how = 'inner').shape[0] \
/ df_meter_type.groupby(['Building_ID']).agg('count').reset_index().shape[0]))


#### Check the statistics of zero-value rows:

print("perc of rows - current charges of zero:", "{:.2%}".format(df[df['Current_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kw charges of zero:", "{:.2%}".format(df[df['KW_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - kwh charges of zero:", "{:.2%}".format(df[df['KWH_Charges'] == 0].shape[0] / df.shape[0]))

print("perc of rows - consumption/charge inconsistency:", \
      "{:.2%}".format(df[((df['KWH_Charges'] == 0) ^ (df['Consumption_KWH'] == 0)) \
   | ((df['KW_Charges'] == 0) ^ (df['Consumption_KW'] == 0)) ].shape[0]\
    /df.shape[0]))

print("perc of rows - sum of charges inconsistency:", \
     "{:.2%}".format(1 - df[df['Current_Charges'] == df['KWH_Charges'] + df['KW_Charges'] + df['Other_Charges']].shape[0]\
    /df.shape[0]))

perc of kw_only meters: 0.78%
perc of kwh_only meters: 16.85%
perc of kwh_and_kw meters: 82.37%
perc of buildings with both kw_only and kwh_only meters: 0.25%
perc of rows - current charges of zero: 2.41%
perc of rows - kw charges of zero: 17.35%
perc of rows - kwh charges of zero: 5.57%
perc of rows - consumption/charge inconsistency: 6.33%
perc of rows - sum of charges inconsistency: 34.28%


In [176]:
print("perc of rows - consumption/charge inconsistency:", \
      "{:.2%}".format(df[((df['KWH_Charges'] == 0) & (df['Consumption_KWH'] != 0))].shape[0]\
    /df.shape[0]))

print("perc of rows - consumption/charge inconsistency:", \
      "{:.2%}".format(df[((df['KW_Charges'] == 0) & (df['Consumption_KW'] != 0))].shape[0]\
    /df.shape[0]))

print("perc of rows - KWH Charges negative:", \
     "{:.2%}".format(df[df['KWH_Charges'] < 0].shape[0]\
    /df.shape[0]))

print("perc of rows - KW Charges negative:", \
     "{:.2%}".format(df[df['KW_Charges'] < 0].shape[0]\
    /df.shape[0]))

perc of rows - consumption/charge inconsistency: 0.34%
perc of rows - consumption/charge inconsistency: 5.86%
perc of rows - KWH Charges negative: 0.00%
perc of rows - KW Charges negative: 4.10%


### 16. Find the gaps between service date ranges

We'd like to know how many account have gaps (> 5 days) in their billing windows

#### concatenate service date ranges for each builing_id and  meter_number, across all years

In [None]:
# sort by building_id, meter number
df = df.sort_values(by = ['Building_ID', 'Meter_Number', 'Service_Start_Date'], ascending=[True, True, True])

def merge_dates(grp):
    # Find contiguous date groups, and get the first/last start/end date for each group.
    dt_groups = (grp['Service_Start_Date'] != grp['Service_End_Date'].shift()).cumsum()
    return grp.groupby(dt_groups).agg({'Service_Start_Date': 'first', 'Service_End_Date': 'last'})

# Perform a groupby and apply the merge_dates function, followed by formatting.
df_gap = df.groupby(['Building_ID', 'Meter_Number']).apply(merge_dates)
df_gap = df_gap.reset_index().drop('level_2', axis = 1)
df_gap = df_gap.reset_index()
df_gap.columns = ['rowNum', 'Building_ID', 'Meter_Number', 
       'Service_Start_Date', 'Service_End_Date']

df_gap['nextRowNum'] = df_gap['rowNum'].map(lambda x: x+1)

# Join the dataframe with itself to find the gap between service ranges
df_gap = pd.merge(df_gap, df_gap[['Building_ID', 'Meter_Number', 'nextRowNum', 'Service_End_Date']],\
        left_on = ['Building_ID', 'Meter_Number', 'rowNum'], right_on = ['Building_ID', 'Meter_Number', 'nextRowNum'], how = 'left')

# consecutive days of billing for the same meter number
df_gap['consecutive_days'] = \
df_gap[['Service_End_Date_x', 'Service_Start_Date']].apply(lambda x: (x[0] - x[1]).days, axis = 1)

# number of days elapsed since the previous service range
df_gap['gap_days'] = \
df_gap[['Service_Start_Date', 'Service_End_Date_y']].apply(lambda x: (x[0] - x[1]).days, axis = 1)


# Rename and reorder the columns
df_gap = df_gap[['Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date_x', 'consecutive_days', 'gap_days']]
df_gap.columns = ['Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date', 'consecutive_days', 'gap_days']

df_gap['Building_Meter'] = df_gap['Building_ID'] + df_gap['Meter_Number']

#### How frequent does a meter has gaps of at least 3 days through all the years ? ~83.2%

In [133]:
df_gap[df_gap['gap_days'] >= 3]['Building_Meter'].nunique() / df_gap['Building_Meter'].nunique()

0.8322010869565217

#### Overlapping service date ranges - 0.71% of the meter accounts

In [None]:
mask = df_gap['gap_days'] < 0
df_gap[mask]

In [None]:
print("Perc of meters with overlapping service date ranges:", "{:.2%}".format(df_gap[mask]['Building_Meter'].agg('nunique')/df_gap['Building_Meter'].agg('nunique')))

In [None]:
df_gap[mask].gap_days.value_counts()

#### Examples

In [None]:
mask = (df['Building_ID'] == '79.0 - RED HOOK WEST BLD 03') \
& ((df['Meter_Number'] == '6477455')|(df['Meter_Number'] == '6477455') ) \
& (df['Revenue_Year'] == 2011)

df[mask].sort_values(['Revenue_Month', 'Service_Start_Date', 'Meter_Number'])

### Summarize gaps by days

In [None]:
df_gap_summary = df_gap[df_gap['gap_days'] > 0].groupby('Building_Meter').agg({'consecutive_days':'sum', 'gap_days':'sum'}).reset_index()

df_gap_summary['perc_gap'] = df_gap_summary['gap_days']/(df_gap_summary['consecutive_days'] + df_gap_summary['gap_days'])

#### Only 29.3% of the meters have % of missing days less than 10%

In [None]:
df_gap_summary[df_gap_summary['perc_gap'] < 0.1].shape[0]/ df_gap_summary.shape[0]

#### For those who doesn't have gaps longer than 5 days, most of them just have one revenue_month reported 

In [None]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select a.m1 as Building_Meter from \
        (select distinct Building_Meter as m1\
        from df_gap) a \
        left join \
        (select distinct Building_Meter as m2, 1 as ind\
        from df_gap where gap_days >= 5) b \
        on a.m1 == b.m2 where b.ind is null \
        "
a = pysql(str1)

#### Only two metes have almost no gap in all 8 years

In [None]:
pd.merge(a, df_gap, on = 'Building_Meter', how = 'inner').gap_days.value_counts()

#### Save the data for later use

In [None]:
df_gap.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_service_range_gaps")
df_gap_summary.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_service_range_gaps_summary")

### 17. Summarize gaps by months (since we found that most of the cases, service date ranges either missed the entire month, or covers the whole month)

In [None]:
df_gap_month.columns

In [None]:
# sort by building_id, meter number and revenue month
df = df.sort_values(by = ['Building_ID', 'Meter_Number', 'Revenue_Month'], ascending=[True, True, True])
a = df[['Building_ID', 'Meter_Number', 'Revenue_Month']]
a.loc[:, 'Next_Revenue_Month'] = a['Revenue_Month'].map(lambda x: x + relativedelta(months=+1))

def merge_months(grp):
    # Find contiguous date groups, and get the first/last start/end date for each group.
    dt_groups = (grp['Revenue_Month'] != grp['Next_Revenue_Month'].shift()).cumsum()
    return grp.groupby(dt_groups).agg({'Revenue_Month': 'first', 'Next_Revenue_Month': 'last'})

# Perform a groupby and apply the merge_dates function, followed by formatting.
df_gap_month = a.groupby(['Building_ID', 'Meter_Number']).apply(merge_months)
df_gap_month = df_gap_month.reset_index().drop('level_2', axis =1)

df_gap_month.columns = ['Building_ID', 'Meter_Number', 
       'Revenue_Month_Start', 'Revenue_Month_End']

df_gap_month.loc[:, 'Consecutive_Months'] = \
(df_gap_month['Revenue_Month_End'].dt.year - df_gap_month['Revenue_Month_Start'].dt.year) * 12 + \
(df_gap_month['Revenue_Month_End'].dt.month - df_gap_month['Revenue_Month_Start'].dt.month)

df_gap_month['Building_Meter'] = df_gap_month['Building_ID'] + df_gap_month['Meter_Number']

In [None]:
a = pd.merge(df.groupby(['Building_ID', 'Meter_Number']).agg({'Revenue_Month':'max'}).reset_index() \
, df.groupby(['Building_ID', 'Meter_Number']).agg({'Revenue_Month':'min'}).reset_index() \
, on = ['Building_ID', 'Meter_Number'], how = 'inner' \
)


a.columns = ['Building_ID', 'Meter_Number', 'Revenue_Month_max', 'Revenue_Month_min']

a.loc[:, 'Span_Months'] = \
(a['Revenue_Month_max'].dt.year - a['Revenue_Month_min'].dt.year) * 12 + \
(a['Revenue_Month_max'].dt.month - a['Revenue_Month_min'].dt.month) + 1

df_gap_month_summary = \
pd.merge(df_gap_month.groupby(['Building_ID', 'Meter_Number']).agg({'Consecutive_Months':'sum'}).reset_index()\
, a, on = ['Building_ID', 'Meter_Number'], how = 'inner')

del(a)

In [None]:
cols = ['Building_ID', 'Meter_Number', 'Consecutive_Months', 'Span_Months']
df_gap_month_summary = df_gap_month_summary[cols]

df_gap_month_summary.loc[:, 'Consecutive_Months_Perc'] = \
df_gap_month_summary['Consecutive_Months'] / df_gap_month_summary['Span_Months']

#### Save the data for later use

In [None]:
df_gap_month.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_revenue_month_gaps")
df_gap_month_summary.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_revenue_month_gaps_summary")

### 17. Combine rows to the Building-Meter-Month level and Building-Month level; add new aggregation metrics

We need to analyze anamolous values of charges and consumptions at the Building-Meter-Month level and Building-Month level

In [None]:
df_combined_meter = df

df_combined_building = pd.pivot_table(df, values = ['Current_Charges','Consumption_KWH', 'KWH_Charges',\
       'Consumption_KW', 'KW_Charges', 'Other_Charges'], index=['Account_Name', 'Location', 'Building_ID',
       'Revenue_Month'], aggfunc = np.sum).reset_index()

In [None]:
df_combined_meter['Total_Charges'] = df_combined_meter['KW_Charges'] + df_combined_meter['KWH_Charges']
df_combined_meter['Total_Energy_Rate'] = df_combined_meter['Total_Charges']/df_combined_meter['Consumption_KWH']

df_combined_meter['Building_Meter'] = df_combined_meter['Building_ID'] + df_combined_meter['Meter_Number']

In [None]:
df_combined_building['Total_Charges'] = df_combined_building['KW_Charges'] + df_combined_building['KWH_Charges']
df_combined_building['Total_Energy_Rate'] = df_combined_building['Total_Charges']/df_combined_building['Consumption_KWH']

### 18. Save the cleaned data to the output folder

In [None]:
# original data after general data cleansing
df_orig.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_original_dataset")

In [None]:
# data at Building_ID, Meter_Number, Revenue_Month level
df.to_pickle("../output/NYCHA_Electricity_2010_to_2018_CleanedDF")

In [None]:
# data at Building_ID, Meter_Number, Revenue_Month level
df_combined_meter.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_meter")

In [None]:
# data at Building_ID, Meter_Number level
df_combined_building.to_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_building")

## To continue the work:

In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import pandasql as pdsql
from datetime import datetime
from dateutil.relativedelta import *

# import matplotlib as mpl
import matplotlib.pyplot as plt
# Setup matplotlib to display in notebook:
%matplotlib inline

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)         # initiate notebook for offline plot


In [2]:
df_orig = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_original_dataset")

df = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_CleanedDF")

df_combined_meter = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_meter")
df_combined_building = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_combined_building")

df_gap = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_service_range_gaps")
df_gap_summary = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_service_range_gaps_summary")

df_gap_month = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_revenue_month_gaps")
df_gap_month_summary = pd.read_pickle("../output/NYCHA_Electricity_2010_to_2018_df_revenue_month_gaps_summary")

In [3]:
df.head()

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Revenue_Year,Service_Start_Date,Service_End_Date,# days,Consumption_KW,KW_Charges,Consumption_KWH,KWH_Charges,Other_Charges,Current_Charges,Total_Charges,Total_Energy_Rate,Building_Meter
54186,FIRST HOUSES,BLD 01,1.0 - BLD 01,7836716,2010-01-01,2010,2009-12-24,2010-01-26,33.0,3.84,41.28,876.0,52.64,60.36,154.28,93.92,0.107215,1.0 - BLD 017836716
54187,FIRST HOUSES,BLD 01,1.0 - BLD 01,7836716,2010-02-01,2010,2010-01-26,2010-02-25,30.0,1.68,18.06,708.0,42.54,36.64,97.24,60.6,0.085593,1.0 - BLD 017836716
54188,FIRST HOUSES,BLD 01,1.0 - BLD 01,7836716,2010-03-01,2010,2010-02-25,2010-03-26,29.0,2.88,30.96,828.0,49.75,57.31,138.02,80.71,0.097476,1.0 - BLD 017836716
54189,FIRST HOUSES,BLD 01,1.0 - BLD 01,7836716,2010-04-01,2010,2010-03-26,2010-04-26,31.0,3.0,32.25,720.0,43.26,64.33,139.84,75.51,0.104875,1.0 - BLD 017836716
54190,FIRST HOUSES,BLD 01,1.0 - BLD 01,7836716,2010-05-01,2010,2010-04-26,2010-05-24,28.0,2.16,23.22,756.0,45.43,45.14,113.79,68.65,0.090807,1.0 - BLD 017836716


#### Use SQL to explore the data

In [None]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select count(*) \
        from df \
        "
temp = pysql(str1)

#### How many meters per building?

In [103]:
df.groupby('Building_ID').agg({'Meter_Number':'nunique'}).reset_index()['Meter_Number'].value_counts()

1     1314
2      554
3       64
4       49
5        6
36       1
21       1
19       1
15       1
7        1
6        1
Name: Meter_Number, dtype: int64

#### Summary Statistics 

In [104]:
df[["Consumption_KWH",  "Consumption_KW", "Current_Charges", "KWH_Charges", "KW_Charges", "Other_Charges"]].describe()

Unnamed: 0,Consumption_KWH,Consumption_KW,Current_Charges,KWH_Charges,KW_Charges,Other_Charges
count,180020.0,180020.0,180020.0,180020.0,180020.0,180020.0
mean,46370.48,96.046511,6408.680075,2382.229199,1393.816722,2383.041867
std,58546.57,134.623154,8147.173101,3261.329493,2046.64884,3689.362177
min,0.0,0.0,-243.15,0.0,-20198.18,-59396.43
25%,6030.0,18.9,1162.0675,328.5275,112.63,467.955
50%,32740.0,68.69,4386.445,1596.9,1044.78,1471.835
75%,63040.0,126.2975,8540.8025,3138.3,2052.41,3112.72
max,1779600.0,16135.46,329800.37,195575.86,78782.96,134224.51


#### Perc of accounts with no missing data for all months

In [222]:
a = df_gap.groupby('Building_Meter').agg({'gap_days':'sum'}).reset_index()

In [223]:
a[a['gap_days'] == 0].shape[0]/a.shape[0]

0.1671195652173913

In [353]:
del(a)

#### Trendline of % of accounts with missing data by revenue month

In [8]:
# list of unique meters
meters = df.Building_Meter.value_counts().index.values

end = df['Revenue_Month'].max()
start = df['Revenue_Month'].min()
diff = (end.year - start.year) * 12 + end.month - start.month
# list of unique months
months = [start + relativedelta(months=x) for x in range(0, diff + 1)]

# create a matrix to record the months that should have billing data for each meter
matrix_month_meter = np.zeros((len(months), len(meters)))

for j in range(len(meters)):
    mask = (df['Building_Meter'] == meters[j])
    start = df[mask]['Revenue_Month'].min()
    end = df[mask]['Revenue_Month'].max()
    start_index = months.index(start)
    end_index = months.index(end)
    # update the matrix value for months that meter j should have data available
    matrix_month_meter[start_index:end_index + 1, j] = 1

# for each month, calculate the number of meters that should have billing data
df_month_meter = pd.DataFrame({'Revenue_Month':months, 'meters_count':list(matrix_month_meter.sum(axis = 1))})

In [314]:
temp = df_combined_meter.groupby(['Revenue_Month']).agg({'Building_Meter':'nunique'}).reset_index()
temp.columns = ['Revenue_Month', 'meters_with_data_count']
temp = pd.merge(temp, df_month_meter, on = 'Revenue_Month', how = 'inner')

temp['meter_with_data_perc'] = round(temp['meters_with_data_count'] / temp['meters_count'], 4)
temp = temp.sort_values('Revenue_Month')
temp['meters_missing_data_count'] = temp['meters_count']  - temp['meters_with_data_count']

df_data_completeness_by_month = temp

In [315]:
# Create a trace
trace1 = go.Bar(
    x = df_data_completeness_by_month.Revenue_Month,
    y = df_data_completeness_by_month.meters_count,
    name = '# of Meters that should have data in the month', 
    marker=dict(
        color='rgba(204,204,204,1)'
    ),
    yaxis= 'y'
)

trace2 = go.Scatter(
    x = df_data_completeness_by_month.Revenue_Month,
    y = 1 - df_data_completeness_by_month.meter_with_data_perc,
    name = '% of Meters with missing data',
    yaxis = 'y2'
)

data = [trace1, trace2]

layout = go.Layout(
    title='Trend Line of Data Compleness',
    yaxis=dict(
        title='# of Meters that should have data in the month',
        tickformat=",",
    ),
    yaxis2=dict(
        title='% of Meters with missing data',
        tickformat=".1%",
        side='right',
        overlaying='y',
    ), 
    legend=dict(x = -0.05, y=1.5)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

#### Trendline of % of accounts with billing gaps (no data or 3+ days of gap) by revenue month

In [317]:
df['gaps'] = (df['Service_End_Date'] - df['Service_Start_Date']).dt.days

df['gaps'] = df['gaps'].map(lambda x: max(0, 31-x))

df.gaps.value_counts().sort_index().head()

0    73939
1    46588
2    55509
3     3638
4      100
Name: gaps, dtype: int64

In [318]:
meters_missing_3_days = [df[(df['gaps'] > 3) & (df['Revenue_Month'] ==  month)].Building_Meter.nunique() for month in months]

df_gap_3days_by_month = pd.DataFrame({'Revenue_Month':months, 'meters_3days_count':meters_missing_3_days})

df_data_completeness_by_month = pd.merge(df_data_completeness_by_month, df_gap_3days_by_month)

df_data_completeness_by_month['meter_gaps_days_perc'] = (df_data_completeness_by_month['meters_3days_count'] \
                                                        + df_data_completeness_by_month['meters_missing_data_count']) \
                                                        /df_data_completeness_by_month['meters_count']

In [352]:
# Create a trace
trace1 = go.Bar(
    x = df_data_completeness_by_month.Revenue_Month,
    y = df_data_completeness_by_month.meters_count,
    name = '# of Accounts that should have data in the month', 
    marker=dict(
        color='rgba(204,204,204,1)'
    ),
    yaxis= 'y'
)

trace2 = go.Scatter(
    x = df_data_completeness_by_month.Revenue_Month,
    y = 1 - df_data_completeness_by_month.meter_with_data_perc,
    name = '% of Accounts with no data',
    yaxis = 'y2'
)

trace3 = go.Scatter(
    x = df_data_completeness_by_month.Revenue_Month,
    y = df_data_completeness_by_month.meter_gaps_days_perc,
    name = '% of Accounts with no data or 3+ days of gap', 
    yaxis= 'y2'
)

data = [trace1, trace2, trace3]

layout = go.Layout(
    title='Trend Line of Data Incompleness',
    yaxis=dict(
        title='# of Accounts that should have data in the month',
        tickformat=",",
    ),
    yaxis2=dict(
        title='% of Accounts',
        tickformat=".1%",
        side='right',
        overlaying='y',
    ), 
    legend=dict(x = -0.05, y= -0.4)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

##### average % of accounts that have no data or 3+ days of gap

In [351]:
np.mean(df_data_completeness_by_month.meter_gaps_days_perc)

0.12709727189291173

#### Trend Line of Average Energy Charges

In [135]:
# Create a trace

# Create traces
trace1 = go.Scatter(
    x = temp.Revenue_Month,
    y = temp.Total_Charges,
#     mode = 'lines',
    name = 'Avg. Total Charge'
)
trace2 = go.Scatter(
    x = temp.Revenue_Month,
    y = temp.Total_Energy_Rate,
#     mode = 'lines+markers',
    name = 'Avg. Total Charge Rate', 
    yaxis='y2'
)

data = [trace1, trace2]

layout = go.Layout(
    title='Trend Line of Average Energy Charges',
    yaxis=dict(
        title='Avg. Total Charges($)',
        tickformat=","
    ),
    yaxis2=dict(
        title='Avg. Total Charge Rates($/KWH)',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
#         tickformat=".2%",
        overlaying='y',
        side='right'
    ),
    legend=dict(x=-.1, y=1.2)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

#### Trend Line of Average KW and KWH Charges

In [136]:
# Create a trace

# Create traces
trace1 = go.Scatter(
    x = temp.Revenue_Month,
    y = temp.KWH_Charges,
#     mode = 'lines',
    name = 'Avg. KWH Charges'
)
trace2 = go.Scatter(
    x = temp.Revenue_Month,
    y = temp.KW_Charges,
#     mode = 'lines+markers',
    name = 'Avg. KW Charges', 
    yaxis='y2'
)

data = [trace1, trace2]

layout = go.Layout(
    title='Trend Line of Average KW and KWH Charges',
    yaxis=dict(
        title='Avg. KWH Charges($)',
        tickformat=","
    ),
    yaxis2=dict(
        title='Avg. KW Charges($)',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickformat=",",
        overlaying='y',
        side='right'
    ),
    legend=dict(x=-.1, y=1.2)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

## Q&A with Linnea:

1. why would "Consumption_KW" be zero?
    - KW and KWH should be both positive, unless there are some related bills that already covers it
    - Maybe one account was separated into multiple meters?
2. What's the "Other Charges"?
    - negative values to adjust for the payments from previous month
    - taxes, fee for meter-reading, little fees charged by utilities and states (e.g. system benefit charge), credit (state got a better deal after charging the clients)

## To Do:

1. Statistical & Graphical Analysis on the combined datasets
2. Summarize all types of entries that doesn't make sense; flag and ignore them
   - Cases where other == kw and kwh == 0, why?
   - Cases where other == current and (kw!=0 or kwh != 0)
   - Negative values in KWH, KW
   - Inconsistency between consumption & charges
   - KW charge is offset by negative "other charge" (16.7%)
   - Meter accounts that only have non-zero values in either KW (0.8%) or KWH (16.9%) charges
3. Calendarize the bills (calculate avg. daily cost and consumption and multiple by # of days) All analyses on missing data and gaps should be based on calendarized bills
4. Starting from 2015, does data quality get better? less meters are missing data? (Government required companies to submit utility data since that time)
5. January are more likely to miss data. Why? Check if that's true. 
6. Check the distribution of % of accounts with gaps days == 3, May 2010 and May 2010 have really high %...
6. Check the relationship between Building_ID and Account_Name. Are they 1-on-1 mapping?
6. Check anomalies in the following order
    - KWH (consumption) .. only compare where there are months of data (ignore the gap month), or we can also use usage per day and then exclude the days with no consumption(instead of using the pro-rated value)
    - KWH_Charges
    - KW (capacity) consumption and charges (difference in daytime vs. nighttime, summer vs. winter, whole summer is at capacity, we will have really high charges for summer capacity use) (Later Metrics defined below)

####  Metrics to consider later

1) total capacity (kW) for all the meters for the month (building level aggregate)

2) Max kW value for the month (both building level and account level)

3) Max kW for each meter for the previous 12 months

4) Sum of the Max kW for each individual meter

5) The variance of Total Charge (sum of KWH_charge and KW_charge) at both account level and building level

#### Edge case examples

##### 1. Check where df_combined_meter['Total_Charges'] < 0 or df_combined_meter['Consumption_KWH'] == 0

In [134]:
mask = (df_combined_meter['Consumption_KWH'] > 0) & (df_combined_meter['Total_Charges'] > 0)

mask = df_combined_meter['Consumption_KWH'] > 0
temp = df_combined_meter[mask].groupby(['Revenue_Month']).agg({\
        'Total_Charges':'mean', 'Total_Energy_Rate': 'mean', 'KWH_Charges':'mean', 'KW_Charges':'mean'}).reset_index()

temp.columns = ['Revenue_Month', 'Total_Charges', 'Total_Energy_Rate', 'KWH_Charges', 'KW_Charges']

temp = temp.sort_values('Revenue_Month')

#### 2. Check where other_charges is not zero, but all other metrics are zero

In [137]:
df[(df['Other_Charges'] != 0) & (df['Current_Charges'] == df['Other_Charges']) & (~((df['KWH_Charges'] == 0) & (df['KW_Charges'] == 0) \
  & (df['Consumption_KWH'] == 0) & (df['Consumption_KW'] == 0)))]

Unnamed: 0,Account_Name,Location,Building_ID,Meter_Number,Revenue_Month,Revenue_Year,Service_Start_Date,Service_End_Date,# days,Consumption_KW,KW_Charges,Consumption_KWH,KWH_Charges,Other_Charges,Current_Charges,Total_Charges,Total_Energy_Rate,Building_Meter,gaps
54599,FIRST HOUSES,BLD 04,1.0 - BLD 04,8638820,2017-01-01,2017,2016-12-23,2017-01-25,33.0,0.0,0.00,45.0,0.0,13.39,13.39,0.00,0.000000,1.0 - BLD 048638820,0
54600,FIRST HOUSES,BLD 04,1.0 - BLD 04,8638820,2017-02-01,2017,2017-01-25,2017-02-24,30.0,0.0,0.00,41.0,0.0,12.37,12.37,0.00,0.000000,1.0 - BLD 048638820,1
54602,FIRST HOUSES,BLD 04,1.0 - BLD 04,8638820,2017-04-01,2017,2017-03-27,2017-04-25,29.0,0.0,0.00,34.0,0.0,10.04,10.04,0.00,0.000000,1.0 - BLD 048638820,2
54603,FIRST HOUSES,BLD 04,1.0 - BLD 04,8638820,2017-05-01,2017,2017-04-25,2017-05-23,28.0,0.0,0.00,21.0,0.0,7.74,7.74,0.00,0.000000,1.0 - BLD 048638820,3
54604,FIRST HOUSES,BLD 04,1.0 - BLD 04,8638820,2017-06-01,2017,2017-05-23,2017-06-22,30.0,0.0,0.00,24.0,0.0,8.64,8.64,0.00,0.000000,1.0 - BLD 048638820,1
177395,WILLIAMSBURG,BLD 9,2.0 - BLD 9,6372360,2012-02-01,2012,2012-01-25,2012-02-24,30.0,0.0,0.00,1.0,0.0,4.72,4.72,0.00,0.000000,2.0 - BLD 96372360,1
177410,WILLIAMSBURG,BLD 9,2.0 - BLD 9,6372360,2014-08-01,2014,2014-08-22,2014-09-02,11.0,0.0,0.00,48.0,0.0,13.90,13.90,0.00,0.000000,2.0 - BLD 96372360,20
177420,WILLIAMSBURG,BLD 9,2.0 - BLD 9,6546181,2012-02-01,2012,2012-01-25,2012-02-24,30.0,0.0,0.00,1.0,0.0,4.07,4.07,0.00,0.000000,2.0 - BLD 96546181,1
177422,WILLIAMSBURG,BLD 9,2.0 - BLD 9,6546181,2012-10-01,2012,2012-09-21,2012-10-23,32.0,0.0,0.00,25.0,0.0,8.57,8.57,0.00,0.000000,2.0 - BLD 96546181,0
177424,WILLIAMSBURG,BLD 9,2.0 - BLD 9,6546181,2014-03-01,2014,2014-02-25,2014-03-26,29.0,0.0,0.00,1.0,0.0,4.09,4.09,0.00,0.000000,2.0 - BLD 96546181,2
