## Import Statements

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from scipy import stats

# Show up to 25 columns when rendering wide DataFrames.
pd.set_option('display.max_columns', 25)

## Data FY 2013-2

In [2]:
# Load the electricity-bills workbook (file name indicates May 2012 - Apr 2013).
data_2013_2 = pd.read_excel('houston-houston-electricity-bills/coh-ee-bills-may2012-apr2013.xlsx')
# Remember the raw row count so the number of rows lost during cleaning can be reported later.
orig_shape_2013_2 = len(data_2013_2)

data_2013_2.shape

(65806, 24)

In [3]:
# Preview the first five rows (head() defaults to n=5).
data_2013_2.head()

Unnamed: 0,Reliant Contract No,Service Address,Meter No,ESID,Business Area,Cost Center,Fund,Bill Type,Bill Date,Read Date,Due Date,Meter Read,Base Cost ($),T&D Discretionary ($),T&D Charges ($),Current Due ($),Adjustment ($),Total Due ($),Franchise Fee ($),Voucher Date,Billed Demand (KVA),kWh Usage,Nodal Cu Charge ($),Reliability Unit Charge ($)
0,2059605,10518 BELLAIRE,303261,1008901000140050014100,2000,2000040005,8300,T,2013-04-29,2013-04-23,2013-05-29,55598.0,67005.8,1357.92,10853.62,79309.4,,79309.4,-1036.92,2013-04-30,1502.083333,997407.0,92.06,0.0
1,2059605,10518 BELLAIRE,303261,1008901000140050014100,2000,2000040005,8300,T,2013-03-28,2013-03-21,2013-04-27,54767.0,57008.44,1300.83,10473.16,68778.3,,68778.3,-995.5,2013-03-29,1432.989691,849351.0,-4.13,0.0
2,2059605,10518 BELLAIRE,303261,1008901000140050014100,2000,2000040005,8300,T,2013-02-26,2013-02-21,2013-03-28,54059.0,61287.57,1313.49,10534.33,73135.18,,73135.18,-1000.33,2013-02-27,1452.57732,911746.0,-0.21,0.0
3,2059605,10518 BELLAIRE,303261,1008901000140050014100,2000,2000040005,8300,T,2013-01-28,2013-01-22,2013-02-27,53299.0,64657.23,1369.2,10878.65,77043.9,,77043.9,-1048.66,2013-01-29,1498.969072,969810.0,138.82,0.0
4,2059605,10518 BELLAIRE,303261,1008901000140050014100,2000,2000040005,8300,T,2012-12-28,2012-12-20,2013-01-27,52491.0,60872.54,1612.61,10928.6,73740.34,,73740.34,-1088.01,2012-12-31,1572.916667,927935.0,326.59,0.0


### Checking Nulls

In [4]:
# Count missing values per column.
data_2013_2.isnull().sum()

Reliant Contract No                0
Service Address                    0
Meter No                       14049
ESID                               0
Business Area                      0
Cost Center                        0
Fund                               0
Bill Type                          0
Bill Date                          0
Read Date                          0
Due Date                           0
Meter Read                        16
Base Cost ($)                      0
T&D Discretionary ($)              0
T&D Charges ($)                    0
Current Due ($)                    0
Adjustment ($)                 64229
Total Due ($)                      0
Franchise Fee ($)                  0
Voucher Date                       0
Billed Demand (KVA)               16
kWh Usage                          1
Nodal Cu Charge ($)                4
Reliability Unit Charge ($)       16
dtype: int64

### Checking Adjustment ($) column 

This column was named Index Charge in the other FY 2013 electricity usage data file

In [5]:
# Frequency of each distinct value, with NaN counted as its own bucket.
data_2013_2.loc[:, 'Adjustment ($)'].value_counts(dropna=False)

NaN       64229
0.0        1576
9425.9        1
Name: Adjustment ($), dtype: int64

The column does not have any relevant information based on the above reported values. Electing to drop the column.

In [6]:
# Drop the near-empty 'Adjustment ($)' column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# column has already been removed.
data_2013_2 = data_2013_2.drop(columns=['Adjustment ($)'], errors='ignore')

### Checking Unique Number of Customers

Several columns in the dataset appear to identify a unique person, house, or business. Checking the number of unique values in each of these columns.

In [7]:
# Candidate identifier columns (the raw 'Service Address ' header carries a trailing space).
check_unique_columns = [
    'Reliant Contract No',
    'Service Address ',
    'Meter No',
    'ESID',
    'Business Area',
    'Cost Center',
]

# DataFrame.nunique() counts distinct non-null values for every selected column at once.
for col, n_unique in data_2013_2[check_unique_columns].nunique().items():
    print(f'Number of Unique Values in {col}: {n_unique}')

Number of Unique Values in Reliant Contract No: 5786
Number of Unique Values in Service Address : 5725
Number of Unique Values in Meter No: 4035
Number of Unique Values in ESID: 5783
Number of Unique Values in Business Area: 9
Number of Unique Values in Cost Center: 39


Based on the above reported values and further research online:

ESID is a unique ID assigned to each customer subscribed to the electricity board. It would be best to use the ESID and Service Address columns going forward, as these provide the number of unique customers and the areas (streets) where higher electricity usage occurs.

Business Area signifies a grouping of buildings that covers a certain area. This would be useful for analyzing usage patterns grouped by zones in the city.

### Checking Bill Type

In [8]:
# Frequency of each bill type, with NaN counted as its own bucket.
data_2013_2.loc[:, 'Bill Type'].value_counts(dropna=False)

T    65252
P      552
C        2
Name: Bill Type, dtype: int64

Bill Type could signify the type of connection provided. Since commercial, residential and government spaces would have different types of pricing and needs, this column could be capturing that information.

In [9]:
# Distinct-value counts for the address, meter and ESID columns, returned as a tuple.
tuple(
    data_2013_2[name].nunique()
    for name in ('Service Address ', 'Meter No', 'ESID')
)

(5725, 4035, 5783)

The next 3 columns are: Bill Date, Read Date and Due Date. Of these it would be best to choose the Bill date across all the data files to keep the data consistent. 

### Electricity Usage Statistics

In [10]:
# Summary statistics for the three quantity-related columns.
data_2013_2.loc[:, ['Meter Read', 'Billed Demand (KVA)', 'kWh Usage']].describe()

Unnamed: 0,Meter Read,Billed Demand (KVA),kWh Usage
count,65790.0,65790.0,65805.0
mean,9743.299217,45.011893,19261.32
std,17901.894291,382.63421,204739.2
min,0.0,0.0,0.0
25%,0.0,0.0,1.0
50%,3004.5,0.0,241.0
75%,8669.0,9.0,1789.0
max,239800.0,16775.903614,9689658.0


There are 3 columns that denote the amount of electricity: Meter Read, Billed Demand, kWh Usage.

Using kWh Usage as a standard unit of measurement.

In [11]:
# Summary statistics for every dollar-denominated column still in the frame.
data_2013_2.loc[
    :,
    [
        'Base Cost ($)',
        'T&D Discretionary ($)',
        'T&D Charges ($)',
        'Current Due ($)',
        'Total Due ($)',
        'Franchise Fee ($)',
        'Nodal Cu Charge ($)',
        'Reliability Unit Charge ($)',
    ]
].describe()

Unnamed: 0,Base Cost ($),T&D Discretionary ($),T&D Charges ($),Current Due ($),Total Due ($),Franchise Fee ($),Nodal Cu Charge ($),Reliability Unit Charge ($)
count,65806.0,65806.0,65806.0,65806.0,65806.0,65806.0,65802.0,65790.0
mean,1286.967789,374.537243,283.605427,1950.938646,1951.890851,-34.636836,5.695807,0.0
std,13665.877497,11888.489071,1995.337024,21571.815431,21572.522026,241.0143,132.109391,0.0
min,0.0,-7091.41,-37666.73,-44264.86,0.0,-7017.8,-323.08,0.0
25%,0.07,3.1,6.02,10.83,11.02,-5.52,0.0,0.0
50%,16.07,5.8,11.14,32.77,32.65,-0.46,0.0,0.0
75%,119.2575,21.5275,81.7925,250.2975,247.5925,0.0,0.14,0.0
max,650951.22,756478.12,69826.36,907001.56,907001.56,84.91,20461.93,0.0


Reliability Unit Charge ($) does not contain any useful information. Electing to drop that column.

The charge columns other than Current Due and Total Due appear to be components that add up to the values in those two columns. 
Based on the above statistics, Current Due and Total Due represent nearly the same value, so the Total Due ($) column is used going forward. 

Based on the above analysis of the dataset choosing the following columns:

1. ESID
2. Business Area
3. Service Address 
4. Bill Type
5. Bill Date
6. Total Due ($)
7. kWh Usage

### Selecting and Filtering Columns

In [12]:
# Keep only the columns chosen in the analysis above.
data_2013_2 = data_2013_2.loc[
    :,
    ['ESID', 'Business Area', 'Service Address ', 'Bill Type',
     'Bill Date', 'Total Due ($)', 'kWh Usage']
]

In [13]:
# Raw header -> snake_case name (the raw 'Service Address ' header has a trailing space).
rename_cols = dict([
    ('ESID', 'esid'),
    ('Business Area', 'business_area'),
    ('Service Address ', 'service_address'),
    ('Bill Type', 'bill_type'),
    ('Bill Date', 'bill_date'),
    ('Total Due ($)', 'total_due'),
    ('kWh Usage', 'kwh_usage'),
])

# rename() returns a new frame; data_2013_2 itself keeps the raw headers.
data_2013_2_main = data_2013_2.rename(rename_cols, axis='columns')

Checking for Nulls again and dtypes

In [14]:
# Count missing values per column in the trimmed frame.
data_2013_2_main.isnull().sum()

esid               0
business_area      0
service_address    0
bill_type          0
bill_date          0
total_due          0
kwh_usage          1
dtype: int64

In [15]:
# Discard rows that have no recorded kWh usage.
data_2013_2_main = data_2013_2_main.dropna(subset=['kwh_usage'])

In [16]:
# Re-check that no missing values remain.
data_2013_2_main.isnull().sum()

esid               0
business_area      0
service_address    0
bill_type          0
bill_date          0
total_due          0
kwh_usage          0
dtype: int64

In [17]:
# Inspect the dtype of each remaining column.
data_2013_2_main.dtypes

esid                       object
business_area               int64
service_address            object
bill_type                  object
bill_date          datetime64[ns]
total_due                 float64
kwh_usage                 float64
dtype: object

In [18]:
# (rows, columns) of the cleaned frame after dropping rows with null kwh_usage.
data_2013_2_main.shape

(65805, 7)

In [19]:
# Column-wise z-scores (axis=0 is scipy's default) for the two numeric measures.
zscore_2013_2 = stats.zscore(
    data_2013_2_main[['total_due', 'kwh_usage']],
    axis=0,
)

zscore_2013_2

Unnamed: 0,total_due,kwh_usage
0,3.585927,4.777556
1,3.097755,4.054406
2,3.299719,4.359162
3,3.480909,4.642764
4,3.327772,4.438234
...,...,...
65801,-0.058835,-0.081652
65802,-0.059092,-0.081652
65803,-0.058462,-0.081652
65804,-0.058694,-0.081652


Each z-score signifies how many standard deviations an individual value is from the mean. This makes it a good indicator for finding outliers in the dataframe.

A z-score of 3 is commonly used as the cut-off value: any value with a z-score greater than +3 or less than -3 is considered an outlier, which is equivalent to the three-standard-deviation rule.

In [20]:
# NOTE: the z-score outlier filter below is disabled (commented out), so no rows
# are removed here and the shape displayed by this cell is unchanged.
# data_2013_2_main = data_2013_2_main[(np.abs(zscore_2013_2) < 3).all(axis=1)]

data_2013_2_main.shape

(65805, 7)

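Note: the z-score filter in the cell above is commented out, so no rows are removed here and the shape stays at (65,805, 7). With the filter enabled, the number of rows decreased from 65,805 to 65,388, so 417 rows were outliers based on the data.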

In [21]:
# Preview the first five rows of the cleaned frame (head() defaults to n=5).
data_2013_2_main.head()

Unnamed: 0,esid,business_area,service_address,bill_type,bill_date,total_due,kwh_usage
0,1008901000140050014100,2000,10518 BELLAIRE,T,2013-04-29,79309.4,997407.0
1,1008901000140050014100,2000,10518 BELLAIRE,T,2013-03-28,68778.3,849351.0
2,1008901000140050014100,2000,10518 BELLAIRE,T,2013-02-26,73135.18,911746.0
3,1008901000140050014100,2000,10518 BELLAIRE,T,2013-01-28,77043.9,969810.0
4,1008901000140050014100,2000,10518 BELLAIRE,T,2012-12-28,73740.34,927935.0


In [22]:
# Number of rows removed between the raw load and the cleaned frame.
orig_shape_2013_2 - len(data_2013_2_main)

1

In [23]:
# Write the cleaned frame to CSV, omitting the pandas index column.
data_2013_2_main.to_csv(path_or_buf='electricity_usage_data_2013_2.csv', index=False)