## Import Statements

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from scipy import stats

# Show up to 25 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 25)

## Data FY 2013 (July 2012 – June 2013)

In [2]:
# Load the FY2013 bills workbook (July 2012 - June 2013, per the file name).
data_2013 = pd.read_excel(
    'houston-houston-electricity-bills/coh-fy2013-ee-bills-july2012-june2013.xlsx'
)
# Row count of the raw file; used at the end to report how many rows were removed.
orig_shape_2013 = len(data_2013)

data_2013.shape

(66776, 24)

In [3]:
# Preview the first five rows (head() defaults to n=5).
data_2013.head()

Unnamed: 0,Reliant Contract No,Service Address,Meter No,ESID,Business Area,Cost Center,Fund,Bill Type,Bill Date,Read Date,Due Date,Meter Read,Base Cost ($),T&D Discretionary ($),T&D Charges ($),Current Due ($),Index Charge ($),Total Due ($),Franchise Fee ($),Voucher Date,Billed Demand (KVA),kWh Usage,Nodal Cu Charge ($),Adder Charge ($)
0,2059605,10518 BELLAIRE,303261,1008901000140050014100,2000,2000040005,8300,T,2013-06-27,2013-06-23,2013-07-28,57061.0,57621.95,1319.11,10606.19,69785.2,,69785.2,-1016.9,2013-07-01,1462.5,876113.0,237.95,0.0
1,2059605,10518 BELLAIRE,303261,1008901000140050014100,2000,2000040005,8300,T,2013-05-29,2013-05-23,2013-06-28,56331.0,57981.59,1316.35,10676.66,70177.01,,70177.01,-1041.76,2013-05-30,1496.907217,879842.0,202.41,0.0
2,2059605,10518 BELLAIRE,303261,1008901000140050014100,2000,2000040005,8300,T,2013-04-29,2013-04-23,2013-05-29,55598.0,67005.8,1357.92,10853.62,79309.4,,79309.4,-1036.92,2013-04-30,1502.083333,997407.0,92.06,0.0
3,2059605,10518 BELLAIRE,303261,1008901000140050014100,2000,2000040005,8300,T,2013-03-28,2013-03-21,2013-04-27,54767.0,57008.44,1300.83,10473.16,68778.3,,68778.3,-995.5,2013-03-29,1432.989691,849351.0,-4.13,0.0
4,2059605,10518 BELLAIRE,303261,1008901000140050014100,2000,2000040005,8300,T,2013-02-26,2013-02-21,2013-03-28,54059.0,61287.57,1313.49,10534.33,73135.18,,73135.18,-1000.33,2013-02-27,1452.57732,911746.0,-0.21,0.0


### Checking Nulls

In [4]:
# Count missing values per column to see which fields are incomplete.
data_2013.isna().sum()

Reliant Contract No          0
Service Address              0
Meter No                 15228
ESID                         0
Business Area                0
Cost Center                  0
Fund                         0
Bill Type                    0
Bill Date                    0
Read Date                    0
Due Date                     0
Meter Read                  12
Base Cost ($)                1
T&D Discretionary ($)        0
T&D Charges ($)              1
Current Due ($)              0
Index Charge ($)         65183
Total Due ($)                0
Franchise Fee ($)            0
Voucher Date                 0
Billed Demand (KVA)         12
kWh Usage                    1
Nodal Cu Charge ($)          5
Adder Charge ($)            14
dtype: int64

### Checking Index Charge ($) column - This was previously Adjustment

In [5]:
# Tally every distinct value; dropna=False makes NaN show up as its own bucket.
data_2013['Index Charge ($)'].value_counts(dropna=False)

 NaN     65183
 0.00     1592
-0.54        1
Name: Index Charge ($), dtype: int64

The column does not have any relevant information based on the above reported values. Electing to drop the column.

In [6]:
# Remove the near-empty 'Index Charge ($)' column.
# Reassigning (rather than inplace=True) together with errors='ignore' makes
# this cell safe to re-run: a second execution is a no-op instead of raising
# KeyError because the column is already gone.
data_2013 = data_2013.drop(columns=['Index Charge ($)'], errors='ignore')

### Checking Unique Number of Customers

Several columns in the dataset could each identify a unique person, house, or business. Checking the number of unique values in each of these columns.

In [7]:
# Candidate identifier columns (note the trailing space in 'Service Address ').
check_unique_columns = [
    'Reliant Contract No',
    'Service Address ',
    'Meter No',
    'ESID',
    'Business Area',
    'Cost Center',
]

# Compute all distinct-value counts in one pass, then report them one per line.
unique_counts = data_2013[check_unique_columns].nunique()
for col in check_unique_columns:
    print(f'Number of Unique Values in {col}: {unique_counts[col]}')

Number of Unique Values in Reliant Contract No: 5900
Number of Unique Values in Service Address : 5840
Number of Unique Values in Meter No: 4035
Number of Unique Values in ESID: 5898
Number of Unique Values in Business Area: 9
Number of Unique Values in Cost Center: 39


Based on the above reported values and further research online:

ESID is a unique ID assigned to each customer subscribed to the electricity board. It would be best to use the ESID and Service Address columns going forward, as these give the number of unique customers and the areas (streets) where electricity usage is higher.

Business Area signifies a grouping of a number of buildings that covers a certain area. This would be useful for analysing usage patterns grouped by zones in the city.

### Checking Bill Type

In [8]:
# Frequency of each bill type; dropna=False would also list NaN if any were present.
data_2013['Bill Type'].value_counts(dropna=False)

T    66222
P      552
C        2
Name: Bill Type, dtype: int64

Bill Type could signify the type of connection provided. Since commercial, residential and government spaces would have different types of pricing and needs, this column could be capturing that information.

In [9]:
# Distinct-value counts for the three candidate identifier columns, in the
# order (Service Address, Meter No, ESID).
tuple(data_2013[col].nunique() for col in ('Service Address ', 'Meter No', 'ESID'))

(5840, 4035, 5898)

The next 3 columns are: Bill Date, Read Date and Due Date. Of these it would be best to choose the Bill date across all the data files to keep the data consistent. 

### Electricity Usage Statistics

In [10]:
# Summary statistics for the three columns that measure electricity quantity.
data_2013.loc[:, ['Meter Read', 'Billed Demand (KVA)', 'kWh Usage']].describe()

Unnamed: 0,Meter Read,Billed Demand (KVA),kWh Usage
count,66764.0,66764.0,66775.0
mean,9869.779829,44.208272,18804.21
std,17911.694906,380.343991,202458.7
min,0.0,0.0,0.0
25%,0.0,0.0,1.0
50%,3123.0,0.0,231.0
75%,9007.25,8.0,1680.0
max,239800.0,16775.903614,9689658.0


There are 3 columns that denote the amount of electricity: Meter Read, Billed Demand, kWh Usage.

Using kWh Usage as a standard unit of measurement.

In [11]:
# Summary statistics for every dollar-amount column still in the frame.
data_2013.loc[:, [
    'Base Cost ($)',
    'T&D Discretionary ($)',
    'T&D Charges ($)',
    'Current Due ($)',
    'Total Due ($)',
    'Franchise Fee ($)',
    'Nodal Cu Charge ($)',
    'Adder Charge ($)',
]].describe()

Unnamed: 0,Base Cost ($),T&D Discretionary ($),T&D Charges ($),Current Due ($),Total Due ($),Franchise Fee ($),Nodal Cu Charge ($),Adder Charge ($)
count,66775.0,66776.0,66775.0,66776.0,66776.0,66776.0,66771.0,66762.0
mean,1249.628836,367.439382,278.533215,1901.861997,1902.580866,-33.921297,6.00523,0.0
std,13443.314342,11796.148872,1997.001709,21320.228167,21320.83691,237.409585,132.671939,0.0
min,0.0,-7091.41,-37666.73,-44264.86,0.0,-7017.8,-323.08,0.0
25%,0.07,3.12,6.02,11.82,11.82,-5.41,0.0,0.0
50%,15.36,6.2,10.94,32.17,32.04,-0.46,0.0,0.0
75%,111.69,20.73,77.865,234.01,230.7375,0.0,0.15,0.0
max,650951.22,756478.12,69826.36,907001.56,907001.56,84.91,20461.93,0.0


Adder Charge ($) does not contain any useful information. Electing to drop that column. Previously this column was Reliability Unit Charge.

The cost columns other than Current Due and Total Due are components that add up to the values in those two columns.
Based on the above statistics, Current Due and Total Due carry nearly the same values (their summary statistics differ only slightly, e.g. in the mean and the minimum). Going forward, choosing the column Total Due ($).

Based on the above analysis of the dataset choosing the following columns:

1. ESID
2. Business Area
3. Service Address 
4. Bill Type
5. Bill Date
6. Total Due ($)
7. kWh Usage

### Selecting and Filtering Columns

In [12]:
# Keep only the columns chosen in the analysis above, in this order.
data_2013 = data_2013.loc[:, [
    'ESID',
    'Business Area',
    'Service Address ',
    'Bill Type',
    'Bill Date',
    'Total Due ($)',
    'kWh Usage',
]]

In [13]:
# Map the raw spreadsheet headers to snake_case names
# (the 'Service Address ' key keeps the trailing space found in the source column).
rename_cols = dict([
    ('ESID', 'esid'),
    ('Business Area', 'business_area'),
    ('Service Address ', 'service_address'),
    ('Bill Type', 'bill_type'),
    ('Bill Date', 'bill_date'),
    ('Total Due ($)', 'total_due'),
    ('kWh Usage', 'kwh_usage'),
])

# rename() returns a new frame; data_2013 keeps its original headers.
data_2013_main = data_2013.rename(rename_cols, axis='columns')

Checking for Nulls again and dtypes

In [14]:
# Missing values per column in the trimmed, renamed frame.
data_2013_main.isna().sum()

esid               0
business_area      0
service_address    0
bill_type          0
bill_date          0
total_due          0
kwh_usage          1
dtype: int64

In [15]:
# Drop rows with a missing kWh reading (one row, per the null count above).
# Reassigning instead of using inplace=True avoids mutating in place a frame
# that was derived from a column selection; the resulting rows are the same.
data_2013_main = data_2013_main.dropna(subset=['kwh_usage'])

In [16]:
# Re-check missing values to confirm the dropna step left no nulls behind.
data_2013_main.isna().sum()

esid               0
business_area      0
service_address    0
bill_type          0
bill_date          0
total_due          0
kwh_usage          0
dtype: int64

In [17]:
# Inspect the dtype pandas inferred for each retained column.
data_2013_main.dtypes

esid                       object
business_area               int64
service_address            object
bill_type                  object
bill_date          datetime64[ns]
total_due                 float64
kwh_usage                 float64
dtype: object

In [18]:
# (rows, columns) of the cleaned frame after dropping the null kWh row.
data_2013_main.shape

(66775, 7)

In [19]:
# Standardise the two numeric columns column-wise. axis=0 and ddof=0 are the
# scipy defaults, written out here so the computation is explicit.
zscore_2013 = stats.zscore(
    data_2013_main[['total_due', 'kwh_usage']], axis=0, ddof=0
)

zscore_2013

Unnamed: 0,total_due,kwh_usage
0,3.183862,4.234519
1,3.202239,4.252938
2,3.630570,4.833629
3,3.136636,4.102333
4,3.340984,4.410522
...,...,...
66771,-0.057672,-0.080314
66772,-0.057229,-0.080314
66773,-0.057217,-0.080314
66774,-0.057477,-0.080314


Each z-score signifies how many standard deviations an individual value is from the mean. This is a good indicator for finding outliers in the dataframe.

Usually a z-score of 3 is used as the cut-off value. Therefore, any z-score greater than +3 or less than -3 is considered an outlier, which is very similar to the standard deviation method.

In [20]:
# The z-score filter below (keep rows where both |z| values are < 3) is
# commented out, so no rows are removed in this cell and the shape shown
# afterwards is the same as before.
# data_2013_main = data_2013_main[(np.abs(zscore_2013) < 3).all(axis=1)]

data_2013_main.shape

(66775, 7)

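Note: the outlier filter in the cell above is commented out, so the number of rows is still 66,775, as the output shows. When the filter was applied, the number of rows decreased from 66,775 to 66,360, i.e. 415 rows were outliers based on the data.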

In [21]:
# Preview the first five rows of the cleaned frame (head() defaults to n=5).
data_2013_main.head()

Unnamed: 0,esid,business_area,service_address,bill_type,bill_date,total_due,kwh_usage
0,1008901000140050014100,2000,10518 BELLAIRE,T,2013-06-27,69785.2,876113.0
1,1008901000140050014100,2000,10518 BELLAIRE,T,2013-05-29,70177.01,879842.0
2,1008901000140050014100,2000,10518 BELLAIRE,T,2013-04-29,79309.4,997407.0
3,1008901000140050014100,2000,10518 BELLAIRE,T,2013-03-28,68778.3,849351.0
4,1008901000140050014100,2000,10518 BELLAIRE,T,2013-02-26,73135.18,911746.0


In [22]:
# Number of rows removed between the raw load and the cleaned frame.
orig_shape_2013 - len(data_2013_main)

1

In [23]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
data_2013_main.to_csv('electricity_usage_data_2013.csv', index=False)