In [1]:
import requests
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import cm as cm

import seaborn as sns 
import datetime

# 1 Data Gathering - SolarView

https://github.com/social-energy-atlas/solarview-data

## 1.1 Installations

In [2]:
installs = pd.read_csv('data/sea-solar-installations.csv', encoding='utf-8')
installs.head(5)

Unnamed: 0,sea-install-id,oiriginal-db,zip,fips,town,state,system-type,sector,install-date,utility,federal-cong-dist,state-senate-dist,state-house-dist,lat,long
0,southface_S53,Southface,31408.0,13051,Garden City,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,1.0,2.0,162.0,32.11929,-81.151748
1,southface_S55,Southface,30601.0,13059,Athens,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,10.0,46.0,118.0,33.976445,-83.368683
2,southface_S59,Southface,30030.0,13089,Decatur,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,5.0,42.0,83.0,33.767515,-84.308954
3,southface_S52,Southface,30005.0,13121,Alpharetta,Georgia,Solar Hot Water,Residential,2008-01-01,Sawnee EMC,6.0,48.0,25.0,34.070288,-84.202196
4,southface_S49,Southface,30281.0,13151,Stockbridge,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,13.0,10.0,111.0,33.501071,-84.257491


In [3]:
installs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2147 entries, 0 to 2146
Data columns (total 15 columns):
sea-install-id       2147 non-null object
oiriginal-db         2147 non-null object
zip                  2146 non-null float64
fips                 2147 non-null int64
town                 2107 non-null object
state                2147 non-null object
system-type          2147 non-null object
sector               2147 non-null object
install-date         2147 non-null object
utility              603 non-null object
federal-cong-dist    767 non-null float64
state-senate-dist    767 non-null float64
state-house-dist     767 non-null float64
lat                  2125 non-null float64
long                 2125 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 251.7+ KB


## 1.2 County-level Census Data

In [4]:
census = pd.read_csv('data/sea-county-census.csv', encoding='utf-8')
census.head(5)

Unnamed: 0,fips,med-income,owner-occ,pop-tot,dem-fem-pct,dem-male-pct,dem-white-pct,dem-baa-pct,dem-aian-pct,dem-a-pct,dem-nhpi-pct,dem-two-pct,dem-hl-pct,dem-vet,dem-hh
0,13001,"$37,135",71.10%,18428,50.10%,49.90%,69.50%,19.40%,0.60%,0.90%,0.20%,1.20%,9.60%,1053,8381
1,13003,"$30,933",72.00%,8273,49.90%,50.10%,56.20%,17.50%,1.40%,0.90%,1.10%,2.10%,25.20%,385,3429
2,13005,"$37,162",68.60%,11372,50.70%,49.30%,73.10%,16.60%,0.30%,0.60%,0.20%,1.60%,8.70%,559,4715
3,13007,"$44,297",73.60%,3150,51.40%,48.60%,47.00%,45.90%,0.40%,1.10%,0.10%,1.10%,5.60%,178,1620
4,13009,"$32,460",53.80%,45144,49.40%,50.60%,52.60%,42.40%,0.30%,1.80%,0.10%,1.20%,2.20%,2911,20277


## 1.3 County Name-Fips Map

In [5]:
fips = pd.read_csv('data/sea-county-name.csv', encoding='utf-8')
fips.head(5)

Unnamed: 0,fips,county
0,13001,Appling
1,13003,Atkinson
2,13005,Bacon
3,13007,Baker
4,13009,Baldwin


## 1.4 Zillow Valuation Information

In [6]:
zillow = pd.read_csv('data/sea-county-zillow.csv', encoding='utf-8')
zillow.head(5)

Unnamed: 0,fips,med-zhvi,med-zrvi
0,13001,,
1,13003,,
2,13005,,
3,13007,,
4,13009,$66.08,$0.62


## 1.5 DSIRE Incentive Counts

In [7]:
dsire = pd.read_csv('data/sea-dsire-incentives.csv', encoding='utf-8')
dsire.head(5)

Unnamed: 0,fips,fed-total,fed-fin-incent-total,fed-fin-incent-corp-deprec,fed-fin-incent-corp-tax-credit,fed-fin-incent-corp-tax-deduction,fed-fin-incent-corp-tax-exemption,fed-fin-incent-grant-prog,fed-fin-incent-loan-prog,fed-fin-incent-pers-tax-credit,...,state-reg-policy,state-reg-policy-build-energy-code,state-reg-policy-energy-stand-build,state-reg-policy-interconn,state-reg-policy-net-metering,state-reg-policy-solar-wind-access,state-tech-res,state-tech-res-energy-analysis,state-tech-res-other,state-tech-res-training-info
0,13001,41,24,1,2,1,2,7,8,2,...,4,1,,1,1,1,5,4,0,1
1,13003,41,24,1,2,1,2,7,8,2,...,4,1,,1,1,1,5,4,0,1
2,13005,41,24,1,2,1,2,7,8,2,...,4,1,,1,1,1,5,4,0,1
3,13007,41,24,1,2,1,2,7,8,2,...,4,1,,1,1,1,5,4,0,1
4,13009,41,24,1,2,1,2,7,8,2,...,4,1,,1,1,1,5,4,0,1


## 1.6 Solar Suitability

In [8]:
suit = pd.read_csv('data/sea-solar-suitability.csv', encoding='utf-8')
suit.head(5)

Unnamed: 0,zip,locale,nbld,pct-suitable
0,30002,Suburb Large,1607.0,0.634723
1,30004,Suburb Large,13273.0,0.820359
2,30005,Suburb Large,8411.0,0.841822
3,30008,Suburb Large,5849.0,0.819148
4,30009,Suburb Large,3358.0,0.824698


## 1.7 Utility List and Ownership Type

In [9]:
util = pd.read_csv('data/sea-utility-id.csv', encoding='utf-8')
util.head(5)

Unnamed: 0,utility-id,name,ownership
0,sea-util-1,Albany Water Gas & Light Comm,Municipal
1,sea-util-2,Altamaha Electric Member Corp,Cooperative
2,sea-util-3,Amicalola Electric Member Corp,Cooperative
3,sea-util-4,Blue Ridge Mountain EMC - (GA),Cooperative
4,sea-util-5,Canoochee Electric Member Corp,Cooperative


## 1.8 Utility Rates

In [10]:
util_rates = pd.read_csv('data/sea-utility-rates.csv', encoding='utf-8')
util_rates.head(5)

Unnamed: 0,sea-rate-id,year,zip,utility-id,service-type,comm-rate,ind-rate,res-rate
0,sea-util-1-rate-31702,2015,31702,sea-util-1,Bundled,0.105766,0.0,0.108669
1,sea-util-1-rate-31703,2015,31703,sea-util-1,Bundled,0.105766,0.0,0.108669
2,sea-util-1-rate-31706,2015,31706,sea-util-1,Bundled,0.105766,0.0,0.108669
3,sea-util-1-rate-31705,2015,31705,sea-util-1,Bundled,0.105766,0.0,0.108669
4,sea-util-1-rate-31701,2015,31701,sea-util-1,Bundled,0.105766,0.0,0.108669


## 1.9 Zipcode Table

In [11]:
zipcode = pd.read_csv('data/sea-zipcode-county.csv', encoding='utf-8')
zipcode.head(5)

Unnamed: 0,sea-zip-id,zip,fips,year
0,zip-2018-1,31557,13001,2018
1,zip-2018-2,31513,13001,2018
2,zip-2018-3,31518,13001,2018
3,zip-2018-4,31539,13001,2018
4,zip-2018-5,31560,13001,2018


# 2 Data Assessment
## 2.1 Installations

In [12]:
installs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2147 entries, 0 to 2146
Data columns (total 15 columns):
sea-install-id       2147 non-null object
oiriginal-db         2147 non-null object
zip                  2146 non-null float64
fips                 2147 non-null int64
town                 2107 non-null object
state                2147 non-null object
system-type          2147 non-null object
sector               2147 non-null object
install-date         2147 non-null object
utility              603 non-null object
federal-cong-dist    767 non-null float64
state-senate-dist    767 non-null float64
state-house-dist     767 non-null float64
lat                  2125 non-null float64
long                 2125 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 251.7+ KB


- install-date is not in datetime format
- lat and long are not strings
- fips is not a string
- error in spelling of original database column

In [13]:
installs.sample(5)

Unnamed: 0,sea-install-id,oiriginal-db,zip,fips,town,state,system-type,sector,install-date,utility,federal-cong-dist,state-senate-dist,state-house-dist,lat,long
846,southface_M119,Southface,30677.0,13133,Watkinsville,Georgia,Solar Hot Water,Residential,2011-06-02,Walton EMC,10.0,25.0,120.0,33.711257,-83.340185
162,southface_S124,Southface,31328.0,13051,Tybee Island,Georgia,Solar Electric,Residential,2009-02-13,,,,,32.006704,-80.84231
1831,southface_GPC-24237,Southface,31642.0,13003,Pearson,Georgia,Solar Electric,Utility,2016-05-30,Georgia Power Company,8.0,7.0,176.0,31.188063,-82.794125
1335,southface_M993,Southface,31804.0,13145,Cataula,Georgia,Solar Hot Water,Residential,2013-01-08,Unknown,3.0,29.0,133.0,32.683231,-84.962355
246,southface_S179,Southface,30350.0,13121,Atlanta,Georgia,Solar Hot Water,Residential,2009-08-10,Unknown,6.0,40.0,51.0,33.973443,-84.305448


In [14]:
installs.shape

(2147, 15)

In [15]:
installs.isnull().sum()

sea-install-id          0
oiriginal-db            0
zip                     1
fips                    0
town                   40
state                   0
system-type             0
sector                  0
install-date            0
utility              1544
federal-cong-dist    1380
state-senate-dist    1380
state-house-dist     1380
lat                    22
long                   22
dtype: int64

- NaN present in zip, town, lat, utility, federal-cong-dist, state-senate-dist, state-house-dist, lat, and long.
- Can't check for duplicated id because of column name.
- zip is an integer and not a string

## 2.2 County-level Census Data

In [16]:
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 15 columns):
fips             159 non-null int64
 med-income      159 non-null object
owner-occ        159 non-null object
pop-tot          159 non-null object
dem-fem-pct      159 non-null object
dem-male-pct     159 non-null object
dem-white-pct    159 non-null object
dem-baa-pct      159 non-null object
dem-aian-pct     159 non-null object
dem-a-pct        159 non-null object
dem-nhpi-pct     159 non-null object
dem-two-pct      159 non-null object
dem-hl-pct       159 non-null object
dem-vet          159 non-null object
dem-hh           159 non-null object
dtypes: int64(1), object(14)
memory usage: 18.7+ KB


- owner-occ,, dem-vet, and all pct columns are objects/strings and not floats
- median income, pop-tot, dem-hh, and dem-vet are objects/strings and not integers.

In [17]:
census.sample(5)

Unnamed: 0,fips,med-income,owner-occ,pop-tot,dem-fem-pct,dem-male-pct,dem-white-pct,dem-baa-pct,dem-aian-pct,dem-a-pct,dem-nhpi-pct,dem-two-pct,dem-hl-pct,dem-vet,dem-hh
70,13143,"$42,281",69.70%,29042,50.10%,49.90%,77.30%,16.90%,0.40%,1.00%,0.10%,1.70%,3.30%,3490,11699
131,13267,"$35,578",66.90%,25092,52.00%,48.00%,57.50%,38.80%,0.20%,0.80%,0,1.00%,2.50%,478,3368
2,13005,"$37,162",68.60%,11372,50.70%,49.30%,73.10%,16.60%,0.30%,0.60%,0.20%,1.60%,8.70%,559,4715
24,13051,"$47,218",54.60%,289082,51.80%,48.20%,49.00%,40.50%,0.30%,2.90%,0.10%,2.20%,6.30%,24548,104912
149,13303,"$37,417",69.80%,20457,48.50%,51.50%,71.60%,20.00%,0.60%,0.60%,0.10%,2.00%,6.30%,2324,10016


In [18]:
census.isnull().sum()

fips             0
 med-income      0
owner-occ        0
pop-tot          0
dem-fem-pct      0
dem-male-pct     0
dem-white-pct    0
dem-baa-pct      0
dem-aian-pct     0
dem-a-pct        0
dem-nhpi-pct     0
dem-two-pct      0
dem-hl-pct       0
dem-vet          0
dem-hh           0
dtype: int64

In [19]:
census.duplicated().sum()

0

## 2.3 County Name-FIPS Map

In [20]:
fips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 2 columns):
fips      159 non-null int64
county    159 non-null object
dtypes: int64(1), object(1)
memory usage: 2.6+ KB


- fips should be a string and not an integer

In [21]:
fips.head(5)

Unnamed: 0,fips,county
0,13001,Appling
1,13003,Atkinson
2,13005,Bacon
3,13007,Baker
4,13009,Baldwin


In [22]:
fips.isnull().sum()

fips      0
county    0
dtype: int64

In [23]:
fips.duplicated().sum()

0

## 2.4 Zillow Valuation Information

In [24]:
zillow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 3 columns):
fips        159 non-null int64
med-zhvi    84 non-null object
med-zrvi    117 non-null object
dtypes: int64(1), object(2)
memory usage: 3.8+ KB


In [25]:
zillow.head(5)

Unnamed: 0,fips,med-zhvi,med-zrvi
0,13001,,
1,13003,,
2,13005,,
3,13007,,
4,13009,$66.08,$0.62


- med-zhvi and med-zrvi should both be floats
- Multiple NaN values exist for med-zhvi and med-zrvi

In [26]:
zillow.isnull().sum()

fips         0
med-zhvi    75
med-zrvi    42
dtype: int64

In [27]:
zillow.duplicated().sum()

0

## 2.5 DSIRE Incentive Counts

In [28]:
dsire.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 38 columns):
fips                                   159 non-null int64
fed-total                              159 non-null int64
fed-fin-incent-total                   159 non-null int64
fed-fin-incent-corp-deprec             159 non-null int64
fed-fin-incent-corp-tax-credit         159 non-null int64
fed-fin-incent-corp-tax-deduction      159 non-null int64
fed-fin-incent-corp-tax-exemption      159 non-null int64
fed-fin-incent-grant-prog              159 non-null int64
fed-fin-incent-loan-prog               159 non-null int64
fed-fin-incent-pers-tax-credit         159 non-null int64
fed-fin-incent-pers-tax-exemption      159 non-null int64
fed-reg-policy                         159 non-null int64
fed-reg-policy-appeq-eff-stand         159 non-null int64
fed-reg-policy-energy-stand-build      159 non-null int64
fed-reg-policy-gpp                     159 non-null int64
fed-reg-policy-interconn 

- Naming issues in columns must be taken care of (- should be replaced with _)
- After naming, investigate float for state-reg-policy-energy-stand-build
- fips should be string and not integer
- state-reg-policy-energy-stand-build looks to be an incomplete column for most observations.

In [30]:
dsire.head(5)

Unnamed: 0,fips,fed-total,fed-fin-incent-total,fed-fin-incent-corp-deprec,fed-fin-incent-corp-tax-credit,fed-fin-incent-corp-tax-deduction,fed-fin-incent-corp-tax-exemption,fed-fin-incent-grant-prog,fed-fin-incent-loan-prog,fed-fin-incent-pers-tax-credit,...,state-reg-policy,state-reg-policy-build-energy-code,state-reg-policy-energy-stand-build,state-reg-policy-interconn,state-reg-policy-net-metering,state-reg-policy-solar-wind-access,state-tech-res,state-tech-res-energy-analysis,state-tech-res-other,state-tech-res-training-info
0,13001,41,24,1,2,1,2,7,8,2,...,4,1,,1,1,1,5,4,0,1
1,13003,41,24,1,2,1,2,7,8,2,...,4,1,,1,1,1,5,4,0,1
2,13005,41,24,1,2,1,2,7,8,2,...,4,1,,1,1,1,5,4,0,1
3,13007,41,24,1,2,1,2,7,8,2,...,4,1,,1,1,1,5,4,0,1
4,13009,41,24,1,2,1,2,7,8,2,...,4,1,,1,1,1,5,4,0,1


## 2.6 Solar Suitability

In [31]:
suit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 4 columns):
zip             712 non-null int64
locale          712 non-null object
nbld            702 non-null float64
pct-suitable    712 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 22.3+ KB


In [32]:
suit.head(5)

Unnamed: 0,zip,locale,nbld,pct-suitable
0,30002,Suburb Large,1607.0,0.634723
1,30004,Suburb Large,13273.0,0.820359
2,30005,Suburb Large,8411.0,0.841822
3,30008,Suburb Large,5849.0,0.819148
4,30009,Suburb Large,3358.0,0.824698


In [33]:
suit.isnull().sum()

zip              0
locale           0
nbld            10
pct-suitable     0
dtype: int64

In [34]:
suit.duplicated().sum()

0

## 2.7 Utility List and Ownership Type

In [35]:
util.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 3 columns):
utility-id    74 non-null object
name          74 non-null object
ownership     74 non-null object
dtypes: object(3)
memory usage: 1.8+ KB


In [36]:
util.head(5)

Unnamed: 0,utility-id,name,ownership
0,sea-util-1,Albany Water Gas & Light Comm,Municipal
1,sea-util-2,Altamaha Electric Member Corp,Cooperative
2,sea-util-3,Amicalola Electric Member Corp,Cooperative
3,sea-util-4,Blue Ridge Mountain EMC - (GA),Cooperative
4,sea-util-5,Canoochee Electric Member Corp,Cooperative


- Need to modify utility-id column heading

## 2.8 Utility Rates

In [37]:
util_rates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
sea-rate-id     2000 non-null object
year            2000 non-null int64
zip             2000 non-null int64
utility-id      2000 non-null object
service-type    2000 non-null object
comm-rate       2000 non-null float64
ind-rate        2000 non-null float64
res-rate        2000 non-null float64
dtypes: float64(3), int64(2), object(3)
memory usage: 125.1+ KB


In [38]:
util_rates.head(5)

Unnamed: 0,sea-rate-id,year,zip,utility-id,service-type,comm-rate,ind-rate,res-rate
0,sea-util-1-rate-31702,2015,31702,sea-util-1,Bundled,0.105766,0.0,0.108669
1,sea-util-1-rate-31703,2015,31703,sea-util-1,Bundled,0.105766,0.0,0.108669
2,sea-util-1-rate-31706,2015,31706,sea-util-1,Bundled,0.105766,0.0,0.108669
3,sea-util-1-rate-31705,2015,31705,sea-util-1,Bundled,0.105766,0.0,0.108669
4,sea-util-1-rate-31701,2015,31701,sea-util-1,Bundled,0.105766,0.0,0.108669


- zip is integer and not string
- Need to modify column name for sea-rate-id, utlity-id, service-type, comm-rate, ind-rate, and res-rate

## 2.9 Zipcode Table

In [39]:
zipcode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13051 entries, 0 to 13050
Data columns (total 4 columns):
sea-zip-id    13051 non-null object
zip           13051 non-null int64
fips          13051 non-null int64
year          13051 non-null int64
dtypes: int64(3), object(1)
memory usage: 407.9+ KB


In [40]:
zipcode.head(5)

Unnamed: 0,sea-zip-id,zip,fips,year
0,zip-2018-1,31557,13001,2018
1,zip-2018-2,31513,13001,2018
2,zip-2018-3,31518,13001,2018
3,zip-2018-4,31539,13001,2018
4,zip-2018-5,31560,13001,2018


- zip and fips are integers and not strings
- sea-zip-id needs to be renamed

# 3 Data Cleaning

## 3.1 Copy All Dataframes

### 3.1.1 Define
Copy all dataframes for cleaning purposes.

#### 3.1.1.2 Code

In [41]:
installs_clean = installs.copy()
census_clean = census.copy()
fips_clean = fips.copy()
zillow_clean = zillow.copy()
dsire_clean = dsire.copy()
suit_clean = suit.copy()
util_clean = util.copy()
util_rates_clean = util_rates.copy()
zipcode_clean = zipcode.copy()

#### 3.1.1.3 Test

In [42]:
installs_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2147 entries, 0 to 2146
Data columns (total 15 columns):
sea-install-id       2147 non-null object
oiriginal-db         2147 non-null object
zip                  2146 non-null float64
fips                 2147 non-null int64
town                 2107 non-null object
state                2147 non-null object
system-type          2147 non-null object
sector               2147 non-null object
install-date         2147 non-null object
utility              603 non-null object
federal-cong-dist    767 non-null float64
state-senate-dist    767 non-null float64
state-house-dist     767 non-null float64
lat                  2125 non-null float64
long                 2125 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 251.7+ KB


In [43]:
census_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 15 columns):
fips             159 non-null int64
 med-income      159 non-null object
owner-occ        159 non-null object
pop-tot          159 non-null object
dem-fem-pct      159 non-null object
dem-male-pct     159 non-null object
dem-white-pct    159 non-null object
dem-baa-pct      159 non-null object
dem-aian-pct     159 non-null object
dem-a-pct        159 non-null object
dem-nhpi-pct     159 non-null object
dem-two-pct      159 non-null object
dem-hl-pct       159 non-null object
dem-vet          159 non-null object
dem-hh           159 non-null object
dtypes: int64(1), object(14)
memory usage: 18.7+ KB


In [44]:
fips_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 2 columns):
fips      159 non-null int64
county    159 non-null object
dtypes: int64(1), object(1)
memory usage: 2.6+ KB


In [45]:
zillow_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 3 columns):
fips        159 non-null int64
med-zhvi    84 non-null object
med-zrvi    117 non-null object
dtypes: int64(1), object(2)
memory usage: 3.8+ KB


In [46]:
dsire_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 38 columns):
fips                                   159 non-null int64
fed-total                              159 non-null int64
fed-fin-incent-total                   159 non-null int64
fed-fin-incent-corp-deprec             159 non-null int64
fed-fin-incent-corp-tax-credit         159 non-null int64
fed-fin-incent-corp-tax-deduction      159 non-null int64
fed-fin-incent-corp-tax-exemption      159 non-null int64
fed-fin-incent-grant-prog              159 non-null int64
fed-fin-incent-loan-prog               159 non-null int64
fed-fin-incent-pers-tax-credit         159 non-null int64
fed-fin-incent-pers-tax-exemption      159 non-null int64
fed-reg-policy                         159 non-null int64
fed-reg-policy-appeq-eff-stand         159 non-null int64
fed-reg-policy-energy-stand-build      159 non-null int64
fed-reg-policy-gpp                     159 non-null int64
fed-reg-policy-interconn 

In [47]:
suit_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Data columns (total 4 columns):
zip             712 non-null int64
locale          712 non-null object
nbld            702 non-null float64
pct-suitable    712 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 22.3+ KB


In [48]:
util_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 3 columns):
utility-id    74 non-null object
name          74 non-null object
ownership     74 non-null object
dtypes: object(3)
memory usage: 1.8+ KB


In [49]:
util_rates_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
sea-rate-id     2000 non-null object
year            2000 non-null int64
zip             2000 non-null int64
utility-id      2000 non-null object
service-type    2000 non-null object
comm-rate       2000 non-null float64
ind-rate        2000 non-null float64
res-rate        2000 non-null float64
dtypes: float64(3), int64(2), object(3)
memory usage: 125.1+ KB


In [50]:
zipcode_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13051 entries, 0 to 13050
Data columns (total 4 columns):
sea-zip-id    13051 non-null object
zip           13051 non-null int64
fips          13051 non-null int64
year          13051 non-null int64
dtypes: int64(3), object(1)
memory usage: 407.9+ KB


## 3.2 Installations

Installations

- install-date is not in datetime format
- lat and long are not strings
- fips is not a string
- error in spelling of original database column
- NaN present in zip, town, lat, utility, federal-cong-dist, state-senate-dist, state-house-dist, lat, and long.
- Can't check for duplicated id because of column name.
- zip is an integer and not a string
- Remove duplicates of sea_install_id

### 3.2.1 Rename Columns
#### 3.2.1.1 Define
Rename all columns so that hyphens are replaced with underscores so that future codes can be run successfully.

#### 3.2.1.2 Code

In [51]:
installs_clean = installs_clean.rename(columns={"sea-install-id": "sea_install_id", "oiriginal-db": "original_db", "system-type": "system_type", "install-date": "install_date", "federal-cong-dist": "fed_congressional_district", "state-senate-dist": "state_senate_district", "state-house-dist": "state_house_district"})

In [52]:
installs_clean

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,fed_congressional_district,state_senate_district,state_house_district,lat,long
1204,southface_RS21,Southface,30368.0,13121,Palmetto,Georgia,Solar Electric,Residential,2012-07-01,,,,,33.51672,-84.734571


### 3.2.2 Remove Duplicated Observations
#### 3.2.2.1 Define
There are six duplicated observations with the same sea_install_id that need to be removed.

#### 3.2.2.2 Code

In [53]:
installs_clean.drop([1220], inplace=True)

#### 3.2.2.3 Test

In [54]:
installs_clean[installs_clean.duplicated()]

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,fed_congressional_district,state_senate_district,state_house_district,lat,long
1204,southface_RS21,Southface,30368.0,13121,Palmetto,Georgia,Solar Electric,Residential,2012-07-01,,,,,33.51672,-84.734571


### 3.2.3 Convert install_date to datetime format
#### 3.2.3.1 Define
The install_date column should be in datetime format.

#### 3.2.3.2 Code

In [None]:
installs_clean.info()

In [None]:
installs_clean['install_year'] = installs_clean['install_date'].str.extract('(....)', expand=True)

In [None]:
installs_clean['install_year'].value_counts()

In [None]:
installs_clean['install_year'] = installs_clean['install_year'].str.replace('Unkn','Unknown')

In [None]:
installs_clean['install_year'].value_counts()

In [None]:
installs_clean.drop(['install_date'], axis=1,inplace= True)

In [None]:
installs_clean.info()

In [None]:
installs_clean['install_year'] = pd.to_numeric(installs_clean.install_year)