In [1]:
import requests
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import cm as cm

import seaborn as sns 
import datetime
from datetime import datetime

# 1 Data Gathering - SolarView

https://github.com/social-energy-atlas/solarview-data

## 1.1 Installations

## 1.8 Utility Rates

In [2]:
# Load the utility-rate table from the local data folder, then preview its first rows.
util_rates = pd.read_csv(
    'data/sea-utility-rates.csv',
    encoding='utf-8',
)
util_rates.head(n=5)

Unnamed: 0,sea-rate-id,year,zip,utility-id,service-type,comm-rate,ind-rate,res-rate
0,sea-util-1-rate-31702,2015,31702,sea-util-1,Bundled,0.105766,0.0,0.108669
1,sea-util-1-rate-31703,2015,31703,sea-util-1,Bundled,0.105766,0.0,0.108669
2,sea-util-1-rate-31706,2015,31706,sea-util-1,Bundled,0.105766,0.0,0.108669
3,sea-util-1-rate-31705,2015,31705,sea-util-1,Bundled,0.105766,0.0,0.108669
4,sea-util-1-rate-31701,2015,31701,sea-util-1,Bundled,0.105766,0.0,0.108669


# 2 Data Assessment
## 2.1 Installations

## 2.8 Utility Rates

In [3]:
# Print the frame summary: row count, per-column non-null counts and dtypes, memory usage.
util_rates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
sea-rate-id     2000 non-null object
year            2000 non-null int64
zip             2000 non-null int64
utility-id      2000 non-null object
service-type    2000 non-null object
comm-rate       2000 non-null float64
ind-rate        2000 non-null float64
res-rate        2000 non-null float64
dtypes: float64(3), int64(2), object(3)
memory usage: 125.1+ KB


In [4]:
# Preview the first five rows of the raw table.
util_rates.head(n=5)

Unnamed: 0,sea-rate-id,year,zip,utility-id,service-type,comm-rate,ind-rate,res-rate
0,sea-util-1-rate-31702,2015,31702,sea-util-1,Bundled,0.105766,0.0,0.108669
1,sea-util-1-rate-31703,2015,31703,sea-util-1,Bundled,0.105766,0.0,0.108669
2,sea-util-1-rate-31706,2015,31706,sea-util-1,Bundled,0.105766,0.0,0.108669
3,sea-util-1-rate-31705,2015,31705,sea-util-1,Bundled,0.105766,0.0,0.108669
4,sea-util-1-rate-31701,2015,31701,sea-util-1,Bundled,0.105766,0.0,0.108669


In [5]:
# Count missing values in each column.
util_rates.isna().sum()

sea-rate-id     0
year            0
zip             0
utility-id      0
service-type    0
comm-rate       0
ind-rate        0
res-rate        0
dtype: int64

In [6]:
# Count rows that repeat an earlier row across all columns.
util_rates.duplicated(subset=None, keep='first').sum()

0

- `zip` is stored as an integer, not a string
- Hyphenated column names need to be renamed: sea-rate-id, utility-id, service-type, comm-rate, ind-rate, and res-rate

# 3 Data Cleaning

## 3.1 Copy All Dataframes

### 3.1.1 Define
Copy all dataframes for cleaning purposes.

#### 3.1.1.2 Code

In [7]:
# Work on an independent copy so the raw frame stays untouched during cleaning.
util_rates_clean = util_rates.copy(deep=True)

#### 3.1.1.3 Test

In [8]:
# Print the summary of the working copy: row count, per-column non-null counts and dtypes.
util_rates_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
sea-rate-id     2000 non-null object
year            2000 non-null int64
zip             2000 non-null int64
utility-id      2000 non-null object
service-type    2000 non-null object
comm-rate       2000 non-null float64
ind-rate        2000 non-null float64
res-rate        2000 non-null float64
dtypes: float64(3), int64(2), object(3)
memory usage: 125.1+ KB


## 3.2 Column Naming
### 3.2.1 Installations
#### 3.2.1.1 Define
Replace hyphens with underscores in all column names so that later code can reference the columns reliably, and correct any spelling errors in the names.

#### 3.2.1.2 Code

In [9]:
# Columns whose names contain hyphens; each is renamed to its underscore form.
hyphenated_columns = [
    "sea-rate-id",
    "utility-id",
    "service-type",
    "comm-rate",
    "ind-rate",
    "res-rate",
]
column_name_map = {name: name.replace("-", "_") for name in hyphenated_columns}
util_rates_clean = util_rates_clean.rename(columns=column_name_map)

#### 3.2.1.3 Test

In [10]:
# Preview the first five rows to confirm the new column names.
util_rates_clean.head(n=5)

Unnamed: 0,sea_rate_id,year,zip,utility_id,service_type,comm_rate,ind_rate,res_rate
0,sea-util-1-rate-31702,2015,31702,sea-util-1,Bundled,0.105766,0.0,0.108669
1,sea-util-1-rate-31703,2015,31703,sea-util-1,Bundled,0.105766,0.0,0.108669
2,sea-util-1-rate-31706,2015,31706,sea-util-1,Bundled,0.105766,0.0,0.108669
3,sea-util-1-rate-31705,2015,31705,sea-util-1,Bundled,0.105766,0.0,0.108669
4,sea-util-1-rate-31701,2015,31701,sea-util-1,Bundled,0.105766,0.0,0.108669


### 3.2.2 Zip to string
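#### 3.2.2.1 Define
Convert the `zip` column from integer to string, since ZIP codes are identifiers rather than quantities.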

#### 3.2.2.2 Code

In [11]:
# ZIP codes are identifiers, not quantities, so store them as text.
# Parsing them as integers drops leading zeros (e.g. 02134 -> 2134); pad back to 5 digits.
# Assumes 5-digit US ZIP codes, as in the previewed rows; confirm for the full file.
# Bracket assignment is used because attribute-style column assignment is fragile.
util_rates_clean['zip'] = util_rates_clean['zip'].astype(str).str.zfill(5)

#### Test

In [12]:
# Re-check the dtype summary; `zip` should now be listed as object (string) instead of int64.
util_rates_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
sea_rate_id     2000 non-null object
year            2000 non-null int64
zip             2000 non-null object
utility_id      2000 non-null object
service_type    2000 non-null object
comm_rate       2000 non-null float64
ind_rate        2000 non-null float64
res_rate        2000 non-null float64
dtypes: float64(3), int64(1), object(4)
memory usage: 125.1+ KB


#### Date Time for Year

In [13]:
# Convert each row's integer year (e.g. 2015) into a timestamp at the start of that year.
# The value is derived from the row's own year rather than assigning a single constant
# timestamp to the whole column, so the original year information is preserved.
# The guard keeps the cell safe to re-run: once the column is datetime it is left alone.
if not pd.api.types.is_datetime64_any_dtype(util_rates_clean['year']):
    util_rates_clean['year'] = pd.to_datetime(
        util_rates_clean['year'].astype(str), format='%Y'
    )

#### 3.2.2.3 Test

In [14]:
# Re-check the dtype summary; `year` should now be listed as datetime64[ns].
util_rates_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
sea_rate_id     2000 non-null object
year            2000 non-null datetime64[ns]
zip             2000 non-null object
utility_id      2000 non-null object
service_type    2000 non-null object
comm_rate       2000 non-null float64
ind_rate        2000 non-null float64
res_rate        2000 non-null float64
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 125.1+ KB


## Store Data

In [15]:
# Make sure the output folder exists so the export also works on a fresh checkout.
os.makedirs('clean_data', exist_ok=True)
# Write the cleaned table without the positional index column.
util_rates_clean.to_csv('clean_data/util_rates_clean.csv', encoding='utf-8', index=False)