# Objectives

Clean up the electricity billing data from Client 2 and output the prorated consumption and charge values by account and calendar month.

- Input: raw data
- Outupt: 
    - df_cleaned
    - df_prorated
    - df_mapping
    - prorated_ts
    - prorated_ts.csv

# Step 1: Load packages and Data

In [27]:
import pandas as pd
import numpy as np
import pandasql as pdsql
import math

from datetime import timedelta, datetime
from dateutil.relativedelta import *

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
# initiate notebook for offline plot
init_notebook_mode(connected=True)         

### Specify the output directory of the resulting datasets

In [28]:
output_directory = '../output/client2/electricity/'

In [29]:
file_name = "/Users/feiwang/Desktop/capstone/anomalyDetectors/data/Client 2 - Data for UW team.xlsx"
xl_file = pd.ExcelFile(file_name)

dfs = {sheet_name: xl_file.parse(sheet_name) 
          for sheet_name in xl_file.sheet_names}

df_orig = dfs['Sheet1']

### Add a row_id in the original dataset for debugging

In [30]:
df_orig = df_orig.reset_index()
df_orig.rename(columns = {'index':'row_id'}, inplace = True)

### Create a data frame to log the rows with data quality issues.

In [31]:
df_flags = pd.DataFrame(columns = ['row_id', 'flag'])

### Create a working dataframe and only focus on electricity bills

In [32]:
df = df_orig.copy()

In [33]:
df.rename(columns = {'Building ID':'Building_ID', 'Meter_ID':'Meter_Number', 'Cost':'Charge', \
                    'Start Date':'Service_Start_Date', 'End Date':'Service_End_Date', 'Units.1':'Units'},\
          inplace = True)

In [34]:
mask = df.Fuel == 'Electricity'
df = df[mask]

# Step 2: Data Cleaning & Exploration

## Part I: Cleaning (necessary data cleaning work before prorating of consumption and charge values to calendar months)

### 1. Remove the two columns on demand and its units since we only care about consumption & charges

In [35]:
df.head()

Unnamed: 0,row_id,Building_ID,Fuel,Meter_Number,Account_Name,Service_Start_Date,Service_End_Date,Consumption,Units,Charge,Currency,Demand,Units.1,Vendor,Invoice Number
0,0,47068,Electricity,MN10000,AN100,2015-12-08,2016-01-06,69828.0,kWh,4124.6,USD,180.0,Kw,V1,IN1
1,1,47068,Electricity,MN10000,AN100,2016-01-07,2016-02-04,69395.0,kWh,4501.79,USD,164.0,Kw,V1,IN2
2,2,47068,Electricity,MN10000,AN100,2016-02-05,2016-03-07,79178.0,kWh,4979.07,USD,176.0,Kw,V1,IN3
3,3,47068,Electricity,MN10000,AN100,2016-03-08,2016-04-05,74855.0,kWh,5291.78,USD,185.0,Kw,V1,IN4
4,4,47068,Electricity,MN10000,AN100,2016-04-06,2016-05-05,78745.0,kWh,5950.15,USD,205.0,Kw,V1,IN5


In [36]:
df = pd.concat([df.iloc[:, 0:11], df.iloc[:, 13:15]], axis = 1)

### 2. Remove rows with missing values in Building_ID, Meter_Number, Service_Start_Date, Service_End_Date, Consumption or Charge (we need these columns to do value proration)

In [37]:
df.isna().sum()

row_id                  0
Building_ID             0
Fuel                    0
Meter_Number           58
Account_Name          450
Service_Start_Date      0
Service_End_Date        0
Consumption             0
Units                   0
Charge                  0
Currency                0
Vendor                  0
Invoice Number        450
dtype: int64

#### Remove rows that have no meter_number

In [38]:
mask = df.Meter_Number.isnull()

df_flags = pd.concat([df_flags, pd.DataFrame({'row_id':df.loc[mask, ].row_id.values, 'flag':'Missing Meter_Number value, rows removed'})])

df = df[~mask]
df = df.reset_index(drop=True)

### 3. Ensure correct data types for the six columns

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2520 entries, 0 to 2519
Data columns (total 13 columns):
row_id                2520 non-null int64
Building_ID           2520 non-null int64
Fuel                  2520 non-null object
Meter_Number          2520 non-null object
Account_Name          2128 non-null object
Service_Start_Date    2520 non-null datetime64[ns]
Service_End_Date      2520 non-null datetime64[ns]
Consumption           2520 non-null float64
Units                 2520 non-null object
Charge                2520 non-null float64
Currency              2520 non-null object
Vendor                2520 non-null object
Invoice Number        2128 non-null object
dtypes: datetime64[ns](2), float64(2), int64(2), object(7)
memory usage: 256.0+ KB


### 4. Ensure each Meter_number is mapped to only one Building_ID

In [40]:
tmp = df.groupby(['Meter_Number']).agg({'Building_ID':'nunique'}).reset_index()

In [41]:
tmp[tmp['Building_ID'] != 1]

Unnamed: 0,Meter_Number,Building_ID
16,MN10042,8


In [42]:
mask = df.Meter_Number.isin(tmp[tmp['Building_ID'] != 1].Meter_Number.values)

df_flags = pd.concat([df_flags, pd.DataFrame({'row_id':df.loc[mask, ].row_id.values, 'flag':'Meter_Number corresponds to multipel building_id, rows removed'})])

df = df[~mask]
df = df.reset_index(drop=True)

### 5. Check if there needs Meter_Number merging under the same Building_ID 

In [43]:
tmp = df.groupby(['Building_ID', 'Meter_Number']).agg({'Service_Start_Date':'min', 'Service_End_Date':'max'}).reset_index()

tmp.rename(columns = {'Service_Start_Date':'Min_Start_Date', 'Service_End_Date':'Max_End_Date'}, inplace = True)

tmp.loc[:, 'Service_Duration'] = tmp.apply(lambda x: (x['Max_End_Date'] - x['Min_Start_Date']).days, axis = 1)

df_duration = tmp.copy()
del(tmp)

In [44]:
tmp = pd.merge(df_duration, df_duration, on = 'Building_ID', how = 'inner')\
    [['Building_ID', 'Meter_Number_x', 'Meter_Number_y', 'Min_Start_Date_x', 'Max_End_Date_x', 'Min_Start_Date_y', 'Max_End_Date_y']]

mask = tmp['Meter_Number_x'] != tmp['Meter_Number_y']
tmp = tmp[mask]

In [45]:
tmp.loc[:, 'date_diff'] = tmp.apply(lambda x: (x['Min_Start_Date_x'] - x['Max_End_Date_y']).days, axis = 1)

In [46]:
mask = (tmp.date_diff >= -1) & (tmp.date_diff <= 1)
df_meter_mapping = tmp[mask]
df_meter_mapping

Unnamed: 0,Building_ID,Meter_Number_x,Meter_Number_y,Min_Start_Date_x,Max_End_Date_x,Min_Start_Date_y,Max_End_Date_y,date_diff
177,47094,MN10103,MN10101,2017-01-25,2018-12-23,2015-12-25,2017-01-25,0
178,47094,MN10103,MN10102,2017-01-25,2018-12-23,2015-12-24,2017-01-24,1
182,47094,MN10104,MN10101,2017-01-26,2018-12-23,2015-12-25,2017-01-25,1
220,47123,MN10140,MN10139,2018-06-01,2018-11-30,2016-01-01,2018-05-31,1
268,277107,MN10166,MN10165,2016-10-13,2017-01-12,2016-08-14,2016-10-12,1


#### Manually map the meter_numbers

In [47]:
mask = (df.Building_ID == 47094) & (df.Meter_Number == 'MN10103')
df.loc[mask, 'Meter_Number'] = 'MN10102'

In [48]:
mask = (df.Building_ID == 47094) & (df.Meter_Number == 'MN10104')
df.loc[mask, 'Meter_Number'] = 'MN10101'

In [49]:
mask = (df.Building_ID == 47123) & (df.Meter_Number == 'MN10140')
df.loc[mask, 'Meter_Number'] = 'MN10139'

In [50]:
mask = (df.Building_ID == 277107) & (df.Meter_Number == 'MN10166')
df.loc[mask, 'Meter_Number'] = 'MN10165'

### 6. Ensure the service dates columns are correct

In [51]:
mask = df['Service_Start_Date'] >= df['Service_End_Date']
df_flags = pd.concat([df_flags, pd.DataFrame({'row_id':df.loc[mask, ].row_id.values, 'flag':'incorrect service dates, rows removed'})])
df = df[~mask]
df = df.reset_index(drop=True)

### 7. Assign an Account ID for each row

In [52]:
df.loc[:, 'Account'] = df.apply(lambda x: str(x['Building_ID']) + '-' + str(x['Meter_Number']), axis = 1)

## Part II: Exploration (potential data issues that do not prohibit data proration, and which we hope can be detected automatically later via our anomalty detection rules)

### 1. Check the currency column

#### There is no currency value with typo or caption variation

In [53]:
df.Currency.value_counts()

USD    2125
Name: Currency, dtype: int64

### 2. Check the Unit column

#### There is no unit value with typo or caption variation

In [54]:
df.Units.value_counts()

kWh     2119
each       6
Name: Units, dtype: int64

### 3. Check rows that have 0 in either consumption or cost column

- 26% of the rows have either 0 value in consumption or cost column
- Shall we remove them? No
    - sometimes there is legitimate reasons (the consumption is below a certain threshold and thus there was no charge)
    - hope our anomaly detection rules can find them without our manual effort of data cleaning

In [55]:
mask1 = df.Consumption == 0
mask2 = df.Charge == 0

In [56]:
df[mask1 & mask2].shape[0]/df.shape[0]

0.00611764705882353

In [57]:
df[mask1 & ~mask2].shape[0]/df.shape[0]

0.25694117647058823

In [58]:
df[~mask1 & mask2].shape[0]/df.shape[0]

0.0

In [59]:
df[~mask1 & ~mask2].shape[0]/df.shape[0]

0.7369411764705882

### Even though there are multiple typs of unit, they seem to belong to distinct accounts (except for "each" and "ccf"), therefore we don't need to do unit conversion since we only compare values within each account

In [60]:
pd.merge(df[df['Units'] == 'kWh'], df[df['Units'] == 'each'], on = ['Account'], how = 'inner').shape

(285, 27)

In [61]:
pd.merge(df[df['Units'] == 'kWh'], df[df['Units'] == 'each'], on = ['Account'], how = 'inner').head()

Unnamed: 0,row_id_x,Building_ID_x,Fuel_x,Meter_Number_x,Account_Name_x,Service_Start_Date_x,Service_End_Date_x,Consumption_x,Units_x,Charge_x,...,Meter_Number_y,Account_Name_y,Service_Start_Date_y,Service_End_Date_y,Consumption_y,Units_y,Charge_y,Currency_y,Vendor_y,Invoice Number_y
0,501,47071,Electricity,MN10022,AN122,2015-12-08,2016-01-06,61919.0,kWh,3310.04,...,MN10022,AN122,2018-08-07,2018-09-05,0.0,each,0.0,USD,V1,IN442
1,502,47071,Electricity,MN10022,AN122,2016-01-07,2016-02-04,62896.0,kWh,4368.76,...,MN10022,AN122,2018-08-07,2018-09-05,0.0,each,0.0,USD,V1,IN442
2,503,47071,Electricity,MN10022,AN122,2016-02-05,2016-03-07,71934.0,kWh,4907.62,...,MN10022,AN122,2018-08-07,2018-09-05,0.0,each,0.0,USD,V1,IN442
3,504,47071,Electricity,MN10022,AN122,2016-03-08,2016-04-05,67226.0,kWh,5185.98,...,MN10022,AN122,2018-08-07,2018-09-05,0.0,each,0.0,USD,V1,IN442
4,505,47071,Electricity,MN10022,AN122,2016-04-06,2016-05-05,70542.0,kWh,5469.73,...,MN10022,AN122,2018-08-07,2018-09-05,0.0,each,0.0,USD,V1,IN442


only the last month in the data has the "each" unit

In [62]:
mask = (df['Building_ID'] == 47071) & (df['Meter_Number'] == 'MN10022')
df[mask]

Unnamed: 0,row_id,Building_ID,Fuel,Meter_Number,Account_Name,Service_Start_Date,Service_End_Date,Consumption,Units,Charge,Currency,Vendor,Invoice Number,Account
342,501,47071,Electricity,MN10022,AN122,2015-12-08,2016-01-06,61919.0,kWh,3310.040000,USD,V1,IN407,47071-MN10022
343,502,47071,Electricity,MN10022,AN122,2016-01-07,2016-02-04,62896.0,kWh,4368.760000,USD,V1,IN408,47071-MN10022
344,503,47071,Electricity,MN10022,AN122,2016-02-05,2016-03-07,71934.0,kWh,4907.620000,USD,V1,IN409,47071-MN10022
345,504,47071,Electricity,MN10022,AN122,2016-03-08,2016-04-05,67226.0,kWh,5185.980000,USD,V1,IN410,47071-MN10022
346,505,47071,Electricity,MN10022,AN122,2016-04-06,2016-05-05,70542.0,kWh,5469.730000,USD,V1,IN411,47071-MN10022
347,506,47071,Electricity,MN10022,AN122,2016-05-06,2016-06-06,71300.0,kWh,6594.970000,USD,V1,IN412,47071-MN10022
348,507,47071,Electricity,MN10022,AN122,2016-06-07,2016-07-06,69585.0,kWh,6730.580000,USD,V1,IN413,47071-MN10022
349,508,47071,Electricity,MN10022,AN122,2016-07-07,2016-08-07,71831.0,kWh,6674.700000,USD,V1,IN414,47071-MN10022
350,509,47071,Electricity,MN10022,AN122,2016-08-08,2016-09-06,62121.0,kWh,5927.540000,USD,V1,IN415,47071-MN10022
351,510,47071,Electricity,MN10022,AN122,2016-09-07,2016-10-05,59579.0,kWh,5844.350000,USD,V1,IN416,47071-MN10022


### Check the overlapping of service periods

In [63]:
# order by dataframe and assign a row number
df = df.sort_values(by = ['Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date'], ascending=[True, True, True, True]).copy()
df = df.reset_index(drop=True)
df = df.reset_index()
df.rename(columns = {'index':'row_number'}, inplace = True)

prev = df.loc[:, ['row_number', 'Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date', 'Consumption', 'Charge']]

prev.loc[:, 'row_number'] = prev['row_number'].map(lambda x: x + 1)


tmp = df[['row_number', 'row_id', 'Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date', 'Consumption', 'Charge']]\
.merge(prev, on = ['row_number', 'Building_ID', 'Meter_Number'], how = 'left')

tmp.columns = ['row_number', 'row_id', 'Building_ID', 'Meter_Number', 'Service_Start_Date',
       'Service_End_Date', 'Consumption', 'Current_Charges', 'Service_Start_Date_Prev', 'Service_End_Date_Prev', 'Consumption_Prev', 'Current_Charges_Prev']

tmp.loc[:, 'Diff_Prev'] = tmp.apply(lambda x: (x['Service_Start_Date'] - x['Service_End_Date_Prev']).days, axis = 1)

In [64]:
rows = list(tmp[tmp.Diff_Prev < -2].row_number.values)

### We have lots of bills with overlapping billing periods (17%)

In [107]:
len(rows)/df.shape[0]

0.1708235294117647

### Manually check the rows that have overlapping billing periods with other rows

In [66]:
# i = 7825
# df[df.row_number.isin(range(i - 2, i + 3))]

In [67]:
# mask = df.row_number == 2874
# df_flags = pd.concat([df_flags, pd.DataFrame({'row_id':df.loc[mask, ].row_id.values, 'flag':'Bill range overlap with others, rows removed'})])
# df = df[~mask]

### Drop the row_number field after we finish matching the adjacent rows

In [68]:
df.drop('row_number', axis = 1, inplace = True)

### Save a copy of the cleaned version of the dataset

In [69]:
df_cleaned = df.copy()

# Step 4: Prorate the bills to calendar months

### Check if the service_end_date is inclusive for each bill

In [70]:
# order by dataframe and assign a row number
df = df.sort_values(by = ['Building_ID', 'Meter_Number', 'Service_Start_Date', 'Service_End_Date'], ascending=[True, True, True, True]).copy()
df = df.reset_index(drop=True)
df = df.reset_index()
df.rename(columns = {'index':'row_number'}, inplace = True)

In [71]:
cols = ['row_number', 'row_id', 'Account', \
        'Service_Start_Date', 'Service_End_Date', 'Consumption', 'Charge']
df = df[cols]

In [72]:
df_next = df.loc[:, ['row_number', 'Account', 'Service_Start_Date', 'Service_End_Date']]
df_next.loc[:, 'row_number'] = df_next.row_number.map(lambda x: x - 1)

tmp = df.merge(df_next, on = ['row_number', 'Account'], how = 'left')

In [74]:
tmp.columns = ['row_number', 'row_id', 'Account',
       'Service_Start_Date', 'Service_End_Date', 'Consumption', 'Charge',
       'Service_Start_Date_Next', 'Service_End_Date_Next']

tmp.loc[:, 'Diff_Next'] = tmp.apply(lambda x: (x['Service_Start_Date_Next'] - x['Service_End_Date']).days, axis = 1)

In [75]:
df = tmp.copy()
del(tmp)

### Most cases, the start_date of the next bill is 1 day after the end_date of the previous bill (end_date is inclusive)

In [76]:
df.Diff_Next.value_counts().sort_index()

-364.0       1
-91.0        1
-59.0        3
-32.0        1
-31.0       30
-30.0       69
-29.0      115
-28.0       93
-27.0       43
-26.0        2
-13.0        3
-7.0         1
-6.0         1
 0.0       200
 1.0      1490
 2.0         1
 30.0        1
Name: Diff_Next, dtype: int64

### Add the month associated with service_start_date and service_end_date

In [77]:
df['Start_Date_Month'] = df['Service_Start_Date'].apply(\
  lambda x: pd.to_datetime('-'.join([str(x.year), str(x.month)])))

df['End_Date_Month'] = df.apply(\
  lambda x: pd.to_datetime('-'.join([str(x['Service_End_Date'].year), str(x['Service_End_Date'].month)]))\
            if x['Diff_Next'] == 1 else pd.to_datetime('-'.join([str((x['Service_End_Date'] + relativedelta(days=-1)).year), str((x['Service_End_Date'] + relativedelta(days=-1)).month)]))\
            ,axis = 1)

### Create a dataframe of the relevant columns to work on the mapping between row_id to the calendar month

In [78]:
cols = ['row_id', 'Start_Date_Month', 'End_Date_Month']
temp = df[cols]

Create a new data frame to store the mapping. The dataframe will have 3 columns: 'row_id' (identifier of the bill), 'Start_Date_Month' and 'End_Date_Month'. We'll collapse the last 2 columns into 1 in order to get the associated calendar month for each bill.

First, save all the row-month mapping between row_id and its start_date_month and end_date_month to the new dataframe

In [79]:
df_month_row_mapping = temp.copy()

Second, there are cases where the billing window is longer than one calendar month. 

- So for each bill, check if the billing window is longer than one month;
If so, save the Start_Date_Month in df_month_row_mapping and then replace it with its subsequent month until the billing window is less than one month.

In [80]:
while (temp.shape[0] > 0):
    temp.loc[:, 'Start_Date_Month_Next'] = \
    temp['Start_Date_Month'].map(lambda x: x + relativedelta(months=+1))

    temp.loc[:, 'Ind'] = \
    temp.apply(lambda x: 1 if x['Start_Date_Month_Next'] < x['End_Date_Month'] else 0, axis = 1)


    mask = temp['Ind'] == 1
    temp = temp.loc[mask,['row_id', 'Start_Date_Month_Next', 'End_Date_Month']].copy()
    temp.columns = ['row_id', 'Start_Date_Month', 'End_Date_Month']

    df_month_row_mapping = pd.concat([df_month_row_mapping, temp])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Collapse the  'Start_Date_Month' and 'End_Date_Month' columns into one column that contains all corresponding calendar months of a given bill.

In [81]:
temp = pd.melt(df_month_row_mapping, id_vars = df_month_row_mapping.columns[0:-2].values, value_vars = df_month_row_mapping[cols].columns[-2:])
temp.drop('variable', axis = 1, inplace = True)

temp = temp.drop_duplicates()
temp.columns  = ['row_id', 'Month']

Add calendar month to each bill.

In [82]:
temp = pd.merge(temp, df, on = 'row_id', how = 'left')

For each bill, calculate the number of days prorated into the corresponding calendar month.

In [83]:
temp.loc[:, 'Prorated_Days'] = \
temp.apply(lambda x: \
       (min(x['Month'] + relativedelta(months = 1), (x['Service_End_Date'] + relativedelta(days=1))) \
        - max(x['Service_Start_Date'], x['Month'])).days if x['Diff_Next'] == 1\
       else (min(x['Month'] + relativedelta(months = 1), x['Service_End_Date']) \
        - max(x['Service_Start_Date'], x['Month'])).days\
       , axis = 1) 

For each bill, calculate the number of days covered by the bill.

In [84]:
temp.loc[:, 'Bill_Duration'] = \
temp.apply(lambda x: (x['Service_End_Date'] + relativedelta(days=1) - x['Service_Start_Date']).days\
       if x['Diff_Next'] == 1\
       else (x['Service_End_Date'] - x['Service_Start_Date']).days\
       , axis = 1) 

Calculate the prorated consumption and charge values based on the prorated days and bill duration.

In [87]:
temp.loc[:, 'Prorated_Consumption'] = \
temp.apply(lambda x: (x['Consumption'] / x['Bill_Duration'] )* x['Prorated_Days'], axis = 1)

In [88]:
temp.loc[:, 'Prorated_Charge'] = \
temp.apply(lambda x: (x['Charge'] / x['Bill_Duration'] )* x['Prorated_Days'], axis = 1)

Save a dataframe that contains the full dataset as well as the corresponding calendar month for each bill. This dataframe will be useful to calculate the prorated consumption and charge values per account-month. Note this dataframe has more rows than the original dataset since one bill may correspond to multiple calendar months.

In [89]:
df_with_calendar_month = temp.copy()

Aggregate the data to Account-Month level by summing up the prorated kwh consupmtion values per calendar_month.

In [90]:
df_prorated = \
df_with_calendar_month.groupby(['Account','Month']).\
    agg({'Prorated_Consumption':'sum', 'Prorated_Charge':'sum', 'Prorated_Days':'sum'}).reset_index()

### Create a mapping table between the individual bill and account_month instance

In [91]:
df_prorated = df_prorated.reset_index()

# add row_id to the df_prorated dataset
df_prorated.rename(columns = {'index':'row_id'}, inplace = True)

df_mapping = pd.merge(df_with_calendar_month, df_prorated, on = ['Account', 'Month'], how = 'inner')[['row_id_x', 'row_id_y']]
df_mapping.rename(columns = {'row_id_x':'row_id_account_month', 'row_id_y':'row_id_bill'}, inplace = True)

So far for each account we've only been working on the calendar months that the accounts has billing records. We also need to map the account id to the calendar months where it should have data but were not logged or reported.

### Create a dataframe that maps the account with all the calendar months that it should have bills

Find all unique accounts (Building_Meter) and months in the dataset.

In [92]:
accounts = df_with_calendar_month.Account.value_counts().index.values

end = df_with_calendar_month['Month'].max()
start = df_with_calendar_month['Month'].min()
diff = (end.year - start.year) * 12 + end.month - start.month
# list of unique months
months = [start + relativedelta(months=x) for x in range(0, diff + 1)]

Create a reference table with all the calendar months and the corresponding # of days in the month. 

In [93]:
month_days = [(x + relativedelta(months = 1) - x).days for x in months]
df_month_days = pd.DataFrame({'Month':months,  'Month_#_Days':month_days})

Now we can map the account (Building_Meter) to all the calendar months that it should have billing data (Here we assumed the account should have data in all months between the first and last calendar month that it has billing data of).

In [94]:
df_account_month = pd.DataFrame()

for j in range(len(accounts)):
    mask = (df_with_calendar_month['Account'] == accounts[j])
    start = df_with_calendar_month[mask]['Month'].min()
    end = df_with_calendar_month[mask]['Month'].max()
    start_index = months.index(start)
    end_index = months.index(end)
    
    temp_df = pd.DataFrame({'Account':accounts[j], 'Month':months[start_index:end_index + 1]})
    temp_df.loc[:, 'Month_Type'] = 'Month_In_The_Middle'
    temp_df.loc[0, 'Month_Type'] = 'First_Month'
    temp_df.loc[temp_df.shape[0]-1, 'Month_Type'] = 'Last_Month'
    df_account_month = pd.concat([df_account_month, temp_df])

In [95]:
df_account_month = pd.merge(df_account_month, df_month_days, on = ['Month'], how = 'left')

Left join account_meter mapping table to get all months for each account.

In [96]:
df_prorated = pd.merge(df_account_month, df_prorated, on = ['Account', 'Month'], how = 'left')

For months that the account didn't have data, fill in with NA's.

In [97]:
mask = df_prorated['Prorated_Days'].isnull()
df_prorated.loc[mask, 'Prorated_Consumption'] = np.nan
df_prorated.loc[mask, 'Prorated_Charge'] = np.nan
df_prorated.loc[mask, 'NA_Ind'] = True

## Step 5: Save the datasets

In [98]:
df_mapping.to_pickle(output_directory + 'df_mapping')
df_prorated.to_pickle(output_directory + 'df_prorated')
df_cleaned.to_pickle(output_directory + 'df_cleaned')
df_orig.to_pickle(output_directory + 'df_orig')

#### Find the accounts that have no month with missing data for anomaly analysis.

In [104]:
tmp = pd.merge(
    df_prorated.groupby(['Account']).agg({'Month':'count'}).reset_index()\
    , df_prorated.groupby(['Account']).agg({'NA_Ind':'sum'}).reset_index()\
    , on = 'Account', how = 'inner')

tmp.rename(columns = {'Month':'Count', 'NA_Ind':'NA_Count'}, inplace = True)
mask = (tmp['NA_Count'] == 0) & (tmp['Count'] >= 24)

In [105]:
mask = df_prorated.Account.isin(tmp[mask].Account.values)
df_prorated[mask][['Account', 'Month', 'Prorated_Consumption', 'Prorated_Charge']].to_pickle(output_directory + 'electricity_prorated_ts')
df_prorated[mask][['Account', 'Month', 'Prorated_Consumption', 'Prorated_Charge']].to_csv(output_directory + 'electricity_prorated_ts.csv')