# Step 1: Load packages and Data

In [1]:
import pandas as pd
import numpy as np
import pandasql as pdsql
import math

from datetime import timedelta, datetime
from dateutil.relativedelta import *

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
# initiate notebook for offline plot
init_notebook_mode(connected=True)         

In [4]:
file_name = "/Users/feiwang/Desktop/capstone/anomalyDetectors/data/Client 2 - Data for UW team.xlsx"
xl_file = pd.ExcelFile(file_name)

dfs = {sheet_name: xl_file.parse(sheet_name) 
          for sheet_name in xl_file.sheet_names}

df_orig = dfs['Sheet1']

In [5]:
df_orig.head()

Unnamed: 0,Building ID,Fuel,Meter_ID,Account_Name,Start Date,End Date,Consumption,Units,Cost,Currency,Demand,Units.1,Vendor,Invoice Number
0,47068,Electricity,MN10000,AN100,2015-12-08,2016-01-06,69828.0,kWh,4124.6,USD,180.0,Kw,V1,IN1
1,47068,Electricity,MN10000,AN100,2016-01-07,2016-02-04,69395.0,kWh,4501.79,USD,164.0,Kw,V1,IN2
2,47068,Electricity,MN10000,AN100,2016-02-05,2016-03-07,79178.0,kWh,4979.07,USD,176.0,Kw,V1,IN3
3,47068,Electricity,MN10000,AN100,2016-03-08,2016-04-05,74855.0,kWh,5291.78,USD,185.0,Kw,V1,IN4
4,47068,Electricity,MN10000,AN100,2016-04-06,2016-05-05,78745.0,kWh,5950.15,USD,205.0,Kw,V1,IN5


### Add a row_id

In [6]:
df_orig = df_orig.reset_index()

df_orig.rename(columns = {'index':'row_id'}, inplace = True)

In [7]:
df_orig.columns

Index(['row_id', 'Building ID', 'Fuel', 'Meter_ID', 'Account_Name',
       'Start Date', 'End Date', 'Consumption', 'Units', 'Cost', 'Currency',
       'Demand', 'Units.1', 'Vendor', 'Invoice Number'],
      dtype='object')

### Create a working dataframe

In [8]:
df = df_orig.copy()

In [9]:
df.rename(columns = {'Building ID':'Building_ID', 'Meter_ID':'Meter_Number',\
                    'Start Date':'Service_Start_Date', 'End Date':'Service_End_Date', 'Units.1':'Units'},\
          inplace = True)

### Create a data frame to log the rows with data quality issues.

In [10]:
df_flags = pd.DataFrame(columns = ['row_id', 'flag'])

# Step 2: Data Cleaning

#### Remove the two columns on demand and its units

In [29]:
df = pd.concat([df.iloc[:, 0:11], df.iloc[:, 13:15]], axis = 1)

In [30]:
df.head()

Unnamed: 0,row_id,Building_ID,Fuel,Meter_Number,Account_Name,Service_Start_Date,Service_End_Date,Consumption,Units,Cost,Currency,Vendor,Invoice Number
0,0,47068,Electricity,MN10000,AN100,2015-12-08,2016-01-06,69828.0,kWh,4124.6,USD,V1,IN1
1,1,47068,Electricity,MN10000,AN100,2016-01-07,2016-02-04,69395.0,kWh,4501.79,USD,V1,IN2
2,2,47068,Electricity,MN10000,AN100,2016-02-05,2016-03-07,79178.0,kWh,4979.07,USD,V1,IN3
3,3,47068,Electricity,MN10000,AN100,2016-03-08,2016-04-05,74855.0,kWh,5291.78,USD,V1,IN4
4,4,47068,Electricity,MN10000,AN100,2016-04-06,2016-05-05,78745.0,kWh,5950.15,USD,V1,IN5


### clean up the Unit column

In [32]:
df.Units.value_counts()

kWh       2745
therms    1234
Dth        250
kBtu       159
MMBtu       46
ccf         46
m3          39
kwh         35
gal         31
each        28
Name: Units, dtype: int64

In [33]:
mask = df.Units == 'kWh'
df.loc[mask, 'Units'] = 'kwh'

### check the missing values

In [36]:
df.isna().sum()

row_id                  0
Building_ID             0
Fuel                    0
Meter_Number          139
Account_Name          848
Service_Start_Date      0
Service_End_Date        0
Consumption             0
Units                   0
Cost                    0
Currency                0
Vendor                  0
Invoice Number        848
dtype: int64

In [38]:
mask = df.Meter_Number.isnull()
df = df[~mask]
df = df.reset_index(drop=True)

In [39]:
df.isna().sum()

row_id                  0
Building_ID             0
Fuel                    0
Meter_Number            0
Account_Name          709
Service_Start_Date      0
Service_End_Date        0
Consumption             0
Units                   0
Cost                    0
Currency                0
Vendor                  0
Invoice Number        709
dtype: int64

### Remove the rows that have 0 consumption value and no consumption unit

In [95]:
mask = df_orig['Consumption'] == '0.00\xa0'
mask.sum()

83

In [96]:
df_flags = pd.concat([df_flags, pd.DataFrame({'row_id':df.loc[mask, ].row_id.values, 'flag':'Consumption value is zero, Rows removed'})])

In [97]:
df = df[~mask]
df = df.reset_index(drop=True)

In [98]:
for i in range(len(df['Consumption'])):
    if len(str.split(df['Consumption'][i])) != 2:
        print (df['Consumption'][i], ", ", len(str.split(df['Consumption'][i])), ", ", df.index.values[i])

384.00 kWh  28.00 Therm ,  4 ,  4951
632.00 kWh  3.00 Therm ,  4 ,  4952
328.00 kWh  8.00 Therm ,  4 ,  4954
363.00 kWh  58.00 Therm ,  4 ,  4957
340.00 kWh  21.00 Therm ,  4 ,  4958
288.00 kWh  40.00 Therm ,  4 ,  4962
433.00 kWh  75.00 Therm ,  4 ,  4963
501.00 kWh  144.00 Therm ,  4 ,  4964
417.00 kWh  49.00 Therm ,  4 ,  4965
676.00 kWh  2.00 Therm ,  4 ,  4966
371.00 kWh  82.00 Therm ,  4 ,  4969
402.00 kWh  127.00 Therm ,  4 ,  4970
222.00 kWh  5.00 Therm ,  4 ,  4971
376.00 kWh  84.00 Therm ,  4 ,  4973
415.00 kWh  131.00 Therm ,  4 ,  4974
155.00 kWh  151.00 Therm ,  4 ,  4975
679.00 kWh  157.00 Therm ,  4 ,  4976
395.00 kWh  28.00 Therm ,  4 ,  4977
232.00 kWh  83.00 Therm ,  4 ,  4978
484.00 kWh  182.00 Therm ,  4 ,  4979
395.00 kWh  77.00 Therm ,  4 ,  4983
411.00 kWh  113.00 Therm ,  4 ,  4984
368.00 kWh  79.00 Therm ,  4 ,  4985


In [99]:
df.iloc[4951:4985, ].Meter_Number.unique()

array(['Account_ID-1128'], dtype=object)

### All the rows that have multipe types of energy consumption values are under the same account 'Account_ID-1128'. Let's remove it from the dataset for now.

In [100]:
mask = df['Meter_Number'] == 'Account_ID-1128'
df_flags = pd.concat([df_flags, pd.DataFrame({'row_id':df.loc[mask, ].row_id.values, 'flag':'The Account_ID contains rows with multiple types of consumptions, rows removed'})])
df = df[~mask]
df = df.reset_index(drop=True)

### Check again the 'Consumption' field

In [101]:
for i in range(len(df['Consumption'])):
    if len(str.split(df['Consumption'][i])) != 2:
        print (df['Consumption'][i], ", ", len(str.split(df['Consumption'][i])), ", ", df.index.values[i])

In [102]:
df.loc[:, 'Unit'] = df["Consumption"].map(lambda x: str.split(x.replace(",",""))[1])

df["Consumption"] = df["Consumption"].map(lambda x: str.split(x.replace(",",""))[0])
df["Consumption"] = df["Consumption"].astype(float, inplace = True)

### Drop the "Currency" column

In [103]:
df.Currency.unique()

array(['USD'], dtype=object)

In [104]:
df.drop('Currency', axis = 1, inplace = True)

# Step 3: Focus on the kwh consumption data only

In [105]:
mask = (df.Unit == 'kWh')
df = df[mask].copy()

### Clean up Meter_Number field

In [106]:
df.loc[:, 'Meter_Length'] = df['Meter_Number'].map(lambda x: len(x))

In [107]:
df.loc[:, 'Meter_Length'].value_counts()

15    8706
Name: Meter_Length, dtype: int64

In [108]:
df.drop('Meter_Length', axis = 1, inplace = True)

## Find & removes rows/accounts with problems

### Check consumption and current_charges columns

In [112]:
mask = (df.Consumption == 0) & (df.Current_Charges == 0)
df_flags = pd.concat([df_flags, pd.DataFrame({'row_id':df.loc[mask, ].row_id.values, 'flag':'Both consumption & charge values are zero, rows removed'})])
df = df[~mask]

### Check the overlapping of service periods

In [113]:
# order by dataframe and assign a row number
df = df.sort_values(by = ['Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date'], ascending=[True, True, True, True]).copy()
df = df.reset_index(drop=True)
df = df.reset_index()
df.rename(columns = {'index':'row_number'}, inplace = True)

prev = df.loc[:, ['row_number', 'Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date', 'Consumption', 'Current_Charges']]
prev.loc[:, 'row_number'] = prev.row_number.map(lambda x: x + 1)

tmp = df[['row_number', 'row_id', 'Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date', 'Consumption', 'Current_Charges']]\
.merge(prev, on = ['row_number', 'Building_ID', 'Meter_Number'], how = 'left')

tmp.columns = ['row_number', 'row_id', 'Building_ID', 'Meter_Number', 'Service_Start_Date',
       'Service_End_Date', 'Consumption', 'Current_Charges', 'Service_Start_Date_Prev', 'Service_End_Date_Prev', 'Consumption_Prev', 'Current_Charges_Prev']

tmp.loc[:, 'Diff_Prev'] = tmp.apply(lambda x: (x['Service_Start_Date'] - x['Service_End_Date_Prev']).days, axis = 1)

In [117]:
rows = list(tmp[tmp.Diff_Prev < -2].row_number.values)

### Go through each row that overlaps with its previous row

In [118]:
tmp.loc[tmp['row_number'].isin(rows)].sort_values('Diff_Prev')

Unnamed: 0,row_number,row_id,Building_ID,Meter_Number,Service_Start_Date,Service_End_Date,Consumption,Current_Charges,Service_Start_Date_Prev,Service_End_Date_Prev,Consumption_Prev,Current_Charges_Prev,Diff_Prev
6127,6127,4319,Building_Code-B52,Account_ID-1130,2016-01-11,2016-02-09,152812.0,18601.49,2016-01-09,2017-02-08,127386.0,15969.26,-394.0
1892,1892,10263,Building_Code-B145,Account_ID-1327,2015-01-13,2015-02-11,14400.0,1333.79,2015-01-11,2016-01-13,16370.0,1437.18,-365.0
7826,7826,880,Building_Code-B9,Account_ID-1024,2018-04-25,2018-05-24,394400.0,66345.95,2018-03-27,2018-07-25,0.0,76118.54,-91.0
4275,4275,14449,Building_Code-B173,Account_ID-1434,2018-05-25,2018-06-25,0.0,313181.17,2018-05-24,2018-06-25,4881585.0,453164.71,-31.0
2280,2280,10689,Building_Code-B145,Account_ID-1350,2015-05-06,2015-06-04,5027.0,607.6,2015-05-06,2015-06-04,5027.0,607.6,-29.0
2874,2874,1707,Building_Code-B16,Account_ID-1051,2016-01-24,2016-02-21,827488.0,64787.79,2016-01-22,2016-02-22,988070.0,76423.42,-29.0
7825,7825,849,Building_Code-B9,Account_ID-1024,2018-03-27,2018-07-25,0.0,76118.54,2018-03-27,2018-04-25,404000.0,59512.01,-29.0
234,234,7761,Building_Code-B105,Account_ID-1246,2018-05-01,2018-05-31,1684800.0,102452.63,2018-04-13,2018-05-09,1033344.0,62514.43,-8.0
3797,3797,13791,Building_Code-B168,Account_ID-1420,2015-09-09,2015-10-19,35743.0,4583.79,2015-08-18,2015-09-17,30825.0,3710.1,-8.0
6237,6237,4465,Building_Code-B53,Account_ID-1135,2016-11-17,2016-12-18,1371700.8,72753.32,2016-10-19,2016-11-23,1233372.16,66634.59,-6.0


### Manually remove the rows that have overlapping billing periods with other rows

In [132]:
i = 7825
df[df.row_number.isin(range(i - 2, i + 3))]

Unnamed: 0,row_number,row_id,Building_ID,Meter_Number,Service_Start_Date,Service_End_Date,Consumption,Current_Charges,Unit,Service_Duration
7823,7823,923,Building_Code-B9,Account_ID-1024,2018-02-26,2018-03-27,408000.0,61029.68,kWh,29
7824,7824,895,Building_Code-B9,Account_ID-1024,2018-03-27,2018-04-25,404000.0,59512.01,kWh,29
7826,7826,880,Building_Code-B9,Account_ID-1024,2018-04-25,2018-05-24,394400.0,66345.95,kWh,29
7827,7827,843,Building_Code-B9,Account_ID-1024,2018-05-24,2018-06-25,463200.0,76449.1,kWh,32


In [131]:
mask = df.row_number == 2874
df_flags = pd.concat([df_flags, pd.DataFrame({'row_id':df.loc[mask, ].row_id.values, 'flag':'Bill range overlap with others, rows removed'})])
df = df[~mask]

### Drop the row_number field as it's no longer continuous

In [292]:
df.drop('row_number', axis = 1, inplace = True)

### Check if it make sense to union the billing windows of different Meter_Numbers under the same Building_ID  - NOTHING

In [146]:
tmp = df.groupby(['Building_ID', 'Meter_Number']).agg({'Service_Start_Date':'min', 'Service_End_Date':'max'}).reset_index()

tmp.rename(columns = {'Service_Start_Date':'Min_Start_Date', 'Service_End_Date':'Max_End_Date'}, inplace = True)

tmp.loc[:, 'Service_Duration'] = tmp.apply(lambda x: (x['Max_End_Date'] - x['Min_Start_Date']).days, axis = 1)

df_duration = tmp.copy()
del(tmp)

In [147]:
mask = (df_duration.Building_ID == 'Building_Code-B1') & (df_duration.Meter_Number == 'Account_ID-1000')
df_duration[mask]

Unnamed: 0,Building_ID,Meter_Number,Min_Start_Date,Max_End_Date,Service_Duration
0,Building_Code-B1,Account_ID-1000,2014-12-16,2017-12-31,1111


In [148]:
pysql = lambda q: pdsql.sqldf(q, globals())
str1 = "select distinct l.Building_ID, l.Meter_Number, r.Meter_Number\
        from df_duration l join df_duration r on l.Building_ID = r.Building_ID and l.Meter_Number != r.Meter_Number \
        and l.Max_End_Date = r.Min_Start_Date"
df_meter_mapping = pysql(str1)

In [149]:
df_meter_mapping

Unnamed: 0,Building_ID,Meter_Number,Meter_Number.1
0,Building_Code-B167,Account_ID-1407,Account_ID-1417
1,Building_Code-B167,Account_ID-1407,Account_ID-1418


### Seems one account was splitted into 2 after 2017-01-04

In [150]:
mask = df_duration.Meter_Number.isin(['Account_ID-1407', 'Account_ID-1417', 'Account_ID-1418'])
df_duration[mask]

Unnamed: 0,Building_ID,Meter_Number,Min_Start_Date,Max_End_Date,Service_Duration
121,Building_Code-B167,Account_ID-1407,2015-01-05,2017-01-04,730
126,Building_Code-B167,Account_ID-1417,2017-01-04,2019-02-01,758
127,Building_Code-B167,Account_ID-1418,2017-01-04,2019-01-01,727


### 3. Find the % of days missing for each Meter_Number, check the ones with high %, check if there should be merging of Meter_Numbers 

In [None]:
# # sort by building_id, meter number
# df = df.sort_values(by = ['Building_ID', 'Meter_Number', 'Service_Start_Date'], ascending=[True, True, True])

# def merge_dates(grp):
#     # Find contiguous date groups, and get the first/last start/end date for each group.
#     dt_groups = (grp['Service_Start_Date'] != grp['Service_End_Date'].shift()).cumsum()
#     return grp.groupby(dt_groups).agg({'Service_Start_Date': 'first', 'Service_End_Date': ['last', 'count']})

# # Perform a groupby and apply the merge_dates function, followed by formatting.
# df_gap = df.groupby(['Building_ID', 'Meter_Number']).apply(merge_dates)

# df_gap = df_gap.reset_index().drop('level_2', axis = 1)
# df_gap = df_gap.reset_index()

# df_gap.columns = ['rowNum', 'Building_ID', 'Meter_Number', 
#        'Service_Start_Date', 'Service_End_Date']

# df_gap['nextRowNum'] = df_gap['rowNum'].map(lambda x: x+1)

# # Join the dataframe with itself to find the gap between service ranges
# df_gap = pd.merge(df_gap, df_gap[['Building_ID', 'Meter_Number', 'nextRowNum', 'Service_End_Date']],\
#         left_on = ['Building_ID', 'Meter_Number', 'rowNum'], right_on = ['Building_ID', 'Meter_Number', 'nextRowNum'], how = 'left')

# # consecutive days of billing for the same meter number
# df_gap['consecutive_days'] = \
# df_gap[['Service_End_Date_x', 'Service_Start_Date']].apply(lambda x: (x[0] - x[1]).days, axis = 1)

# # number of days elapsed since the previous service range
# df_gap['gap_days'] = \
# df_gap[['Service_Start_Date', 'Service_End_Date_y']].apply(lambda x: (x[0] - x[1]).days, axis = 1)


# # Rename and reorder the columns
# df_gap = df_gap[['Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date_x', 'consecutive_days', 'gap_days']]
# df_gap.columns = ['Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date', 'consecutive_days', 'gap_days']

# # The first consecutive period of billing days has no gap days and should be filled with 0
# df_gap = df_gap.fillna(0)

#### For some cases the service_end_date and start_date overlap with each other or has a 1 day gap, treat as no gap 

In [None]:
# mask = (df_gap.gap_days == 1)|(df_gap.gap_days == -1)
# df_gap.loc[mask, 'gap_days'] = 0

# df_gap[mask].shape

# mask = df_gap.gap_days < 0
# df_gap[mask].sort_values('gap_days').sort_values(['Building_ID', 'Meter_Number'])

# mask = (df_gap.Building_ID == 'Building_Code-B52') & (df_gap.Meter_Number == 'Account_ID-1130')
# df_gap[mask]

# mask = (df.Building_ID == 'Building_Code-B52') & (df.Meter_Number == 'Account_ID-1130')
# df[mask]

### Assign Account_ID

In [164]:
df.loc[:, 'Account'] = df.apply(lambda x: x['Building_ID'] + ' - ' + x['Meter_Number'], axis = 1)

In [165]:
df.head()

Unnamed: 0,row_number,row_id,Building_ID,Meter_Number,Service_Start_Date,Service_End_Date,Consumption,Current_Charges,Unit,Service_Duration,Account
0,0,30,Building_Code-B1,Account_ID-1000,2014-12-16,2015-01-16,2473.47,237.45,kWh,31,Building_Code-B1 - Account_ID-1000
1,1,19,Building_Code-B1,Account_ID-1000,2015-01-16,2015-02-17,3010.8,406.46,kWh,32,Building_Code-B1 - Account_ID-1000
2,2,0,Building_Code-B1,Account_ID-1000,2015-02-17,2015-03-17,2815.17,450.43,kWh,28,Building_Code-B1 - Account_ID-1000
3,3,31,Building_Code-B1,Account_ID-1000,2015-03-17,2015-04-17,3446.34,251.58,kWh,31,Building_Code-B1 - Account_ID-1000
4,4,20,Building_Code-B1,Account_ID-1000,2015-04-17,2015-05-15,3217.62,167.32,kWh,28,Building_Code-B1 - Account_ID-1000


# Step 4: Prorate the bills to calendar months

### Save a copy of the cleaned version of the data frame containing all KWH related data

In [293]:
df_kwh_clean = df.copy()

In [294]:
df = df_kwh_clean.copy()

In [295]:
df.head()

Unnamed: 0,row_id,Building_ID,Meter_Number,Service_Start_Date,Service_End_Date,Consumption,Current_Charges,Unit,Service_Duration,Account
0,30,Building_Code-B1,Account_ID-1000,2014-12-16,2015-01-16,2473.47,237.45,kWh,31,Building_Code-B1 - Account_ID-1000
1,19,Building_Code-B1,Account_ID-1000,2015-01-16,2015-02-17,3010.8,406.46,kWh,32,Building_Code-B1 - Account_ID-1000
2,0,Building_Code-B1,Account_ID-1000,2015-02-17,2015-03-17,2815.17,450.43,kWh,28,Building_Code-B1 - Account_ID-1000
3,31,Building_Code-B1,Account_ID-1000,2015-03-17,2015-04-17,3446.34,251.58,kWh,31,Building_Code-B1 - Account_ID-1000
4,20,Building_Code-B1,Account_ID-1000,2015-04-17,2015-05-15,3217.62,167.32,kWh,28,Building_Code-B1 - Account_ID-1000


### Check if the service_end_date is inclusive for each bill

In [296]:
# order by dataframe and assign a row number
df = df.sort_values(by = ['Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date'], ascending=[True, True, True, True]).copy()
df = df.reset_index(drop=True)
df = df.reset_index()
df.rename(columns = {'index':'row_number'}, inplace = True)

In [311]:
df_next = df.loc[:, ['row_number', 'Account', 'Service_Start_Date', 'Service_End_Date']]
df_next.loc[:, 'row_number'] = df_next.row_number.map(lambda x: x - 1)

tmp = df.merge(df_next, on = ['row_number', 'Account'], how = 'left')



In [315]:
tmp.columns = ['row_number', 'row_id', 'Building_ID', 'Meter_Number',
       'Service_Start_Date', 'Service_End_Date', 'Consumption',
       'Current_Charges', 'Unit', 'Service_Duration', 'Account',
       'Service_Start_Date_Next', 'Service_End_Date_Next']

tmp.loc[:, 'Diff_Next'] = tmp.apply(lambda x: (x['Service_Start_Date_Next'] - x['Service_End_Date']).days, axis = 1)

In [320]:
df = tmp.copy()
del(tmp)

### Most cases, the start_date of the next bill is the same as the end_date of the previous bill (end_date is non-inclusive)
- But there are ~ 2,000 rows where next_start = previous_end + 1 day, meaning the previous_end_date is inclusive
- Solution - for any bill, if end_date = next_start_date - 1, then its bill window includes its end_date; otherwise, not inclusive

In [321]:
df.Diff_Next.value_counts().sort_index()

-8.0         2
-6.0         2
-3.0         1
-2.0        10
-1.0        11
 0.0      5835
 1.0      1978
 2.0        13
 3.0         5
 4.0         2
 6.0         1
 9.0         1
 10.0        2
 27.0        3
 28.0        9
 29.0        8
 30.0        5
 31.0        6
 32.0        4
 34.0        1
 60.0        1
 61.0        5
 62.0        2
 90.0        1
 92.0        1
 120.0       1
 182.0       1
 184.0       1
 213.0       1
 334.0       1
Name: Diff_Next, dtype: int64

### Add the month associated with service_start_date and service_end_date

In [322]:
df['Start_Date_Month'] = df['Service_Start_Date'].apply(\
  lambda x: pd.to_datetime('-'.join([str(x.year), str(x.month)])))

df['End_Date_Month'] = df.apply(\
  lambda x: pd.to_datetime('-'.join([str(x['Service_End_Date'].year), str(x['Service_End_Date'].month)]))\
            if x['Diff_Next'] == 1 else pd.to_datetime('-'.join([str((x['Service_End_Date'] + relativedelta(days=-1)).year), str((x['Service_End_Date'] + relativedelta(days=-1)).month)]))\
            ,axis = 1)

### Create a dataframe of the relevant columns to work on the mapping between row_id to the calendar month.

In [325]:
cols = ['row_id', 'Start_Date_Month', 'End_Date_Month']
temp = df[cols]

Create a new data frame to store the mapping. The dataframe will have 3 columns: 'row_id' (identifier of the bill), 'Start_Date_Month' and 'End_Date_Month'. We'll collapse the last 2 columns into 1 in order to get the associated calendar month for each bill.

First, save all the row-month mapping between row_id and its start_date_month and end_date_month to the new dataframe

In [326]:
df_month_row_mapping = temp.copy()

Second, there are cases where the billing window is longer than one calendar month. 

- So for each bill, check if the billing window is longer than one month;
If so, save the Start_Date_Month in df_month_row_mapping and then replace it with its subsequent month until the billing window is less than one month.

In [327]:
while (temp.shape[0] > 0):
    temp.loc[:, 'Start_Date_Month_Next'] = \
    temp['Start_Date_Month'].map(lambda x: x + relativedelta(months=+1))

    temp.loc[:, 'Ind'] = \
    temp.apply(lambda x: 1 if x['Start_Date_Month_Next'] < x['End_Date_Month'] else 0, axis = 1)


    mask = temp['Ind'] == 1
    temp = temp.loc[mask,['row_id', 'Start_Date_Month_Next', 'End_Date_Month']].copy()
    temp.columns = ['row_id', 'Start_Date_Month', 'End_Date_Month']

    df_month_row_mapping = pd.concat([df_month_row_mapping, temp])

Collapse the  'Start_Date_Month' and 'End_Date_Month' columns into one column that contains all corresponding calendar months of a given bill.

In [328]:
temp = pd.melt(df_month_row_mapping, id_vars = df_month_row_mapping.columns[0:-2].values, value_vars = df_month_row_mapping[cols].columns[-2:])
temp.drop('variable', axis = 1, inplace = True)

temp = temp.drop_duplicates()
temp.columns  = ['row_id', 'Month']

In [330]:
temp = pd.merge(temp, df, on = 'row_id', how = 'left')

In [332]:
temp.loc[:, 'Prorated_Days'] = \
temp.apply(lambda x: \
       (min(x['Month'] + relativedelta(months = 1), (x['Service_End_Date'] + relativedelta(days=1))) \
        - max(x['Service_Start_Date'], x['Month'])).days if x['Diff_Next'] == 1\
       else (min(x['Month'] + relativedelta(months = 1), x['Service_End_Date']) \
        - max(x['Service_Start_Date'], x['Month'])).days\
       , axis = 1) 

In [341]:
temp.loc[:, 'Bill_Duration'] = \
temp.apply(lambda x: (x['Service_End_Date'] + relativedelta(days=1) - x['Service_Start_Date']).days\
       if x['Diff_Next'] == 1\
       else (x['Service_End_Date'] - x['Service_Start_Date']).days\
       , axis = 1) 


Calculate the prorated kwh consumption values based on the prorated days.

In [345]:
temp.loc[:, 'Prorated_KWH'] = \
temp.apply(lambda x: (x['Consumption'] / x['Bill_Duration'] )* x['Prorated_Days'], axis = 1)

In [348]:
temp.loc[:, 'Prorated_Charges'] = \
temp.apply(lambda x: (x['Current_Charges'] / x['Bill_Duration'] )* x['Prorated_Days'], axis = 1)

Save a dataframe that contains the full dataset as well as the corresponding calendar month for each bill. This dataframe will be useful to calculate the gaps days per account per month later. Note this dataframe has more rows than the original dataset since one bill may correspond to multiple calendar months.

In [349]:
df_with_calendar_month = temp.copy()

In [350]:
df_with_calendar_month.head()

Unnamed: 0,row_id,Month,row_number,Building_ID,Meter_Number,Service_Start_Date,Service_End_Date,Consumption,Current_Charges,Unit,Account,Service_Start_Date_Next,Service_End_Date_Next,Diff_Next,Start_Date_Month,End_Date_Month,Prorated_Days,Bill_Duration,Prorated_KWH,Prorated_Charges
0,30,2014-12-01,0,Building_Code-B1,Account_ID-1000,2014-12-16,2015-01-16,2473.47,237.45,kWh,Building_Code-B1 - Account_ID-1000,2015-01-16,2015-02-17,0.0,2014-12-01,2015-01-01,16,31,1276.629677,122.554839
1,19,2015-01-01,1,Building_Code-B1,Account_ID-1000,2015-01-16,2015-02-17,3010.8,406.46,kWh,Building_Code-B1 - Account_ID-1000,2015-02-17,2015-03-17,0.0,2015-01-01,2015-02-01,16,32,1505.4,203.23
2,0,2015-02-01,2,Building_Code-B1,Account_ID-1000,2015-02-17,2015-03-17,2815.17,450.43,kWh,Building_Code-B1 - Account_ID-1000,2015-03-17,2015-04-17,0.0,2015-02-01,2015-03-01,12,28,1206.501429,193.041429
3,31,2015-03-01,3,Building_Code-B1,Account_ID-1000,2015-03-17,2015-04-17,3446.34,251.58,kWh,Building_Code-B1 - Account_ID-1000,2015-04-17,2015-05-15,0.0,2015-03-01,2015-04-01,15,31,1667.583871,121.732258
4,20,2015-04-01,4,Building_Code-B1,Account_ID-1000,2015-04-17,2015-05-15,3217.62,167.32,kWh,Building_Code-B1 - Account_ID-1000,2015-05-15,2015-06-16,0.0,2015-04-01,2015-05-01,14,28,1608.81,83.66


Aggregate the data to Account-Month level by summing up the prorated kwh consupmtion values per calendar_month.

In [351]:
df_prorated = \
df_with_calendar_month.groupby(['Account','Month']).\
    agg({'Prorated_KWH':'sum', 'Prorated_Charges':'sum', 'Prorated_Days':'sum'}).reset_index()

So far for each account we've only been working on the calendar months that the accounts has billing records. We also need to map the account id to the calendar months where it should have data but were not logged or reported.

Create a dataframe that maps the account (Building_Meter) with all the calendar months that it should have bills.

Find all unique accounts (Building_Meter) and months in the dataset.

In [357]:
accounts = df_with_calendar_month.Account.value_counts().index.values

end = df_with_calendar_month['Month'].max()
start = df_with_calendar_month['Month'].min()
diff = (end.year - start.year) * 12 + end.month - start.month
# list of unique months
months = [start + relativedelta(months=x) for x in range(0, diff + 1)]

Create a reference table with all the calendar months and the corresponding # of days in the month. 

In [360]:
month_days = [(x + relativedelta(months = 1) - x).days for x in months]
df_month_days = pd.DataFrame({'Month':months,  'Month_#_Days':month_days})

Now we can map the account (Building_Meter) to all the calendar months that it should have billing data (Here we assumed the account should have data in all months between the first and last calendar month that it has billing data of).

In [362]:
df_account_month = pd.DataFrame()

for j in range(len(meters)):
    mask = (df_with_calendar_month['Account'] == accounts[j])
    start = df_with_calendar_month[mask]['Month'].min()
    end = df_with_calendar_month[mask]['Month'].max()
    start_index = months.index(start)
    end_index = months.index(end)
    
    temp_df = pd.DataFrame({'Account':accounts[j], 'Month':months[start_index:end_index + 1]})
    temp_df.loc[:, 'Month_Type'] = 'Month_In_The_Middle'
    temp_df.loc[0, 'Month_Type'] = 'First_Month'
    temp_df.loc[temp_df.shape[0]-1, 'Month_Type'] = 'Last_Month'
    df_account_month = pd.concat([df_account_month, temp_df])

In [367]:
df_account_month = pd.merge(df_account_month, df_month_days, on = ['Month'], how = 'left')

Left join account_meter mapping table to get all months for each account.

In [None]:
df_prorated = pd.merge(df_account_month, df_prorated, on = ['Building_Meter', 'Month'], how = 'left')

For months that the account didn't have data, fill in with zeros.

In [None]:
mask = df_prorated['Prorated_Days'].isnull()
df_prorated.loc[mask, 'Prorated_KWH'] = 0
df_prorated.loc[mask, 'Prorated_Days'] = 0