In [1]:
import requests
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import cm as cm

import seaborn as sns 
import datetime
from datetime import datetime

# 1 Data Gathering - SolarView

https://github.com/social-energy-atlas/solarview-data

## 1.1 Installations

In [2]:
# Read the SolarView installation records from the local CSV export and
# preview the first five rows.
installs = pd.read_csv(
    filepath_or_buffer='data/sea-solar-installations.csv',
    encoding='utf-8',
)
installs.head()

Unnamed: 0,sea-install-id,oiriginal-db,zip,fips,town,state,system-type,sector,install-date,utility,federal-cong-dist,state-senate-dist,state-house-dist,lat,long
0,southface_S53,Southface,31408.0,13051,Garden City,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,1.0,2.0,162.0,32.11929,-81.151748
1,southface_S55,Southface,30601.0,13059,Athens,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,10.0,46.0,118.0,33.976445,-83.368683
2,southface_S59,Southface,30030.0,13089,Decatur,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,5.0,42.0,83.0,33.767515,-84.308954
3,southface_S52,Southface,30005.0,13121,Alpharetta,Georgia,Solar Hot Water,Residential,2008-01-01,Sawnee EMC,6.0,48.0,25.0,34.070288,-84.202196
4,southface_S49,Southface,30281.0,13151,Stockbridge,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,13.0,10.0,111.0,33.501071,-84.257491


In [3]:
# Column dtypes and non-null counts for the raw installations frame.
installs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2147 entries, 0 to 2146
Data columns (total 15 columns):
sea-install-id       2147 non-null object
oiriginal-db         2147 non-null object
zip                  2146 non-null float64
fips                 2147 non-null int64
town                 2107 non-null object
state                2147 non-null object
system-type          2147 non-null object
sector               2147 non-null object
install-date         2147 non-null object
utility              603 non-null object
federal-cong-dist    767 non-null float64
state-senate-dist    767 non-null float64
state-house-dist     767 non-null float64
lat                  2125 non-null float64
long                 2125 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 251.7+ KB


# 2 Data Assessment
## 2.1 Installations

In [12]:
# Re-check dtypes and non-null counts while assessing data quality.
installs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2147 entries, 0 to 2146
Data columns (total 15 columns):
sea-install-id       2147 non-null object
oiriginal-db         2147 non-null object
zip                  2146 non-null float64
fips                 2147 non-null int64
town                 2107 non-null object
state                2147 non-null object
system-type          2147 non-null object
sector               2147 non-null object
install-date         2147 non-null object
utility              603 non-null object
federal-cong-dist    767 non-null float64
state-senate-dist    767 non-null float64
state-house-dist     767 non-null float64
lat                  2125 non-null float64
long                 2125 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 251.7+ KB


- install-date is not in datetime format
- fips is not a string
- the original-database column name is misspelled (`oiriginal-db`)

In [13]:
# Eyeball five random rows; no random_state is passed, so the rows shown
# differ from run to run.
installs.sample(5)

Unnamed: 0,sea-install-id,oiriginal-db,zip,fips,town,state,system-type,sector,install-date,utility,federal-cong-dist,state-senate-dist,state-house-dist,lat,long
1904,southface_AES061,Southface,30606.0,13059,Athens,Georgia,Solar Electric,Residential,2016-08-30,,,,,33.926032,-83.421936
1483,southface_SF124,Southface,31409.0,13051,,Georgia,Solar Electric,Non-Residential,2014-03-25,,,,,32.028773,-81.126069
1673,southface_AES006,Southface,30329.0,13089,Atlanta,Georgia,Solar Electric,Residential,2015-06-08,,,,,33.812353,-84.334551
1011,southface_M415,Southface,30153.0,13233,Rockmart,Georgia,Solar Electric,Residential,2011-12-17,,,,,33.999967,-85.05772
277,southface_S198,Southface,30349.0,13121,Atlanta,Georgia,Solar Hot Water,Residential,2009-09-12,Unknown,13.0,39.0,64.0,33.629506,-84.522003


In [14]:
# (rows, columns) of the raw installations frame.
installs.shape

(2147, 15)

In [15]:
# Count the missing values in every column of the raw frame.
installs.isna().sum(axis=0)

sea-install-id          0
oiriginal-db            0
zip                     1
fips                    0
town                   40
state                   0
system-type             0
sector                  0
install-date            0
utility              1544
federal-cong-dist    1380
state-senate-dist    1380
state-house-dist     1380
lat                    22
long                   22
dtype: int64

In [16]:
# Number of rows that are exact repeats of an earlier row (all columns equal).
installs.duplicated(keep='first').sum()

1

- NaN present in zip, town, utility, federal-cong-dist, state-senate-dist, state-house-dist, lat, and long.
- Can't check for duplicated ids with attribute access because the column name (`sea-install-id`) contains hyphens.
- zip is stored as a float and not as a string

# 3 Data Cleaning

## 3.1 Copy All Dataframes

### 3.1.1 Define
Copy all dataframes for cleaning purposes.

#### 3.1.1.2 Code

In [4]:
# Work on an independent (deep) copy so the raw frame stays untouched.
installs_clean = installs.copy(deep=True)

#### 3.1.1.3 Test

In [5]:
# Check that the working copy reports the same columns, dtypes and non-null
# counts as the raw frame.
installs_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2147 entries, 0 to 2146
Data columns (total 15 columns):
sea-install-id       2147 non-null object
oiriginal-db         2147 non-null object
zip                  2146 non-null float64
fips                 2147 non-null int64
town                 2107 non-null object
state                2147 non-null object
system-type          2147 non-null object
sector               2147 non-null object
install-date         2147 non-null object
utility              603 non-null object
federal-cong-dist    767 non-null float64
state-senate-dist    767 non-null float64
state-house-dist     767 non-null float64
lat                  2125 non-null float64
long                 2125 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 251.7+ KB


## 3.2 Column Naming
### 3.2.1 Installations
#### 3.2.1.1 Define
Rename all columns so that hyphens are replaced with underscores, so that later code can run successfully, and correct any spelling errors in the column names.

#### 3.2.1.2 Code

In [None]:
# Raw column name -> cleaned snake_case name. Also fixes the misspelled
# "oiriginal-db" header and spells out the district column names.
installs_column_names = {
    "sea-install-id": "sea_install_id",
    "oiriginal-db": "original_db",
    "system-type": "system_type",
    "install-date": "install_date",
    "federal-cong-dist": "fed_congressional_district",
    "state-senate-dist": "state_senate_district",
    "state-house-dist": "state_house_district",
}
installs_clean = installs_clean.rename(columns=installs_column_names)

#### 3.2.1.3 Test

In [None]:
# Show the first rows to check the renamed column headers.
installs_clean.head()

### 3.2.2 Census
#### 3.2.2.1 Define
Rename all columns so that hyphens are replaced with underscores, so that later code can run successfully.

#### 3.2.2.2 Code

In [None]:
# Inspect the census column names and dtypes before renaming.
# NOTE(review): census_clean is not created in any cell shown above this one;
# confirm it is loaded and copied before this cell runs on a fresh kernel.
census_clean.info()

In [None]:
# Census columns whose hyphenated names should become snake_case. Every new
# name is the old one with "-" swapped for "_".
hyphenated_census_columns = [
    "med-income", "owner-occ", "pop-tot",
    "dem-fem-pct", "dem-male-pct",
    "dem-white-pct", "dem-baa-pct", "dem-aian-pct", "dem-a-pct",
    "dem-nhpi-pct", "dem-two-pct", "dem-hl-pct",
    "dem-vet", "dem-hh",
]
census_column_names = {
    old_name: old_name.replace("-", "_") for old_name in hyphenated_census_columns
}
census_clean = census_clean.rename(columns=census_column_names)

#### 3.2.2.3 Test

In [None]:
# Show the first rows to check the renamed census column headers.
census_clean.head()

### 3.2.2 Convert install_date to datetime format
#### 3.2.2.1 Define
The install_date column should be in datetime format. Drop any unknown dates and convert the remaining values to datetime.

#### 3.2.2.2 Code

In [None]:
# Check the current dtype of install_date before converting it.
installs_clean.info()

In [None]:
# Parse each row's own install_date string into a timestamp. Assigning a
# single parsed constant here would overwrite every real date with that value.
# errors='coerce' turns unknown/unparseable dates into NaT instead of raising;
# rows are not dropped here because later cells address rows by index label.
installs_clean['install_date'] = pd.to_datetime(
    installs_clean['install_date'], errors='coerce'
)

# Number of dates that could not be parsed (now NaT).
installs_clean['install_date'].isnull().sum()

#### 3.2.2.3 Test

In [None]:
# install_date should now be listed with a datetime64 dtype.
installs_clean.info()

### 3.2.3 Convert fips, zip, and congressional district columns to strings
#### 3.2.3.1 Define
Convert the fips, zip, fed_congressional_district, state_senate_district, and state_house_district columns to string format.

#### 3.2.3.2 Code

In [None]:
# Rows with no ZIP code. Left as the cell's last expression so the notebook
# renders the frame as a table instead of plain printed text.
installs_clean[installs_clean['zip'].isnull()]

In [None]:
# Drop rows without a ZIP by condition rather than by a hard-coded row label,
# so the cell keeps working if the source file changes. .copy() avoids
# SettingWithCopyWarning on the assignments below.
installs_clean = installs_clean.dropna(subset=['zip']).copy()

# ZIP and FIPS are identifiers, not quantities: store them as zero-padded
# 5-character strings. pd.to_numeric makes the cell safe to re-run after the
# columns have already been converted to strings.
installs_clean['zip'] = (
    pd.to_numeric(installs_clean['zip']).round(0).astype(int).astype(str).str.zfill(5)
)
installs_clean['fips'] = (
    pd.to_numeric(installs_clean['fips']).astype(int).astype(str).str.zfill(5)
)

# District numbers are labels too. They contain NaN, so convert only the
# present values ("1.0" -> "1") and leave missing entries as NaN.
for district_col in ['fed_congressional_district',
                     'state_senate_district',
                     'state_house_district']:
    installs_clean[district_col] = installs_clean[district_col].map(
        lambda value: str(int(float(value))), na_action='ignore'
    )

#### 3.2.3.3 Test

In [None]:
# Check that zip has no missing values left and inspect the column dtypes.
installs_clean.info()

In [None]:
# Spot-check the first five rows after the conversion.
installs_clean.head(5)

### 3.2.4 Investigate town NaN values
#### 3.2.4.1 Define
Query the dataset and investigate NaN values in the town column to see whether the missing towns can be recovered (for example, from the ZIP code). Otherwise, remove those observations from the dataset.

#### 3.2.4.2 Code

In [None]:
# Rows whose town is missing.
installs_clean.loc[installs_clean['town'].isnull()]

In [None]:
# Town for each row whose `town` was missing, keyed by row label.
# (Presumably looked up by hand from each row's ZIP code - confirm the source.)
TOWN_BY_ROW = {
    36: 'Atlanta',
    43: 'Atlanta',
    684: 'Woodstock',
    885: 'Columbus',
    1349: 'Oxford',
    1353: 'Cumming',
    1370: 'Jasper',
    1382: 'Savannah',
    1415: 'Davisboro',
    1451: 'Peachtree City',
    1472: 'Metter',
    1473: 'Butler',
    1483: 'Savannah',
    1490: 'Hogansville',
    1493: 'Madison',
    1497: 'Lawrenceville',
    1506: 'Newborn',
    1507: 'Cartersville',
    1508: 'Greenville',
    1509: 'Dahlonega',
    1517: 'Augusta',
    1518: 'Lavonia',
    1529: 'Buckhead',
    1551: 'Decatur',
    1552: 'Claxton',
    1558: 'Decatur',
    1561: 'Decatur',
    1562: 'Brunswick',
    1565: 'Jefferson',
    1577: 'Cave Spring',
    1581: 'Augusta',
    1583: 'Donalsonville',
    1584: 'Acorn Pond',
    1585: 'Chester',
    1587: 'Decatur',
    1606: 'Unadilla',
    1607: 'Rome',
    1611: 'Sandy Springs',
    1991: 'Butler',
}

# fillna aligns the replacement Series on the row index, so only missing towns
# are filled, labels absent from the frame are ignored, and the write goes to
# the frame itself (chained `frame.town[i] = ...` assignment may write to a
# temporary copy and raises SettingWithCopyWarning).
installs_clean['town'] = installs_clean['town'].fillna(pd.Series(TOWN_BY_ROW))

# Show ZIP next to the filled-in town for every affected row so the lookup can
# be audited.
installs_clean.loc[
    installs_clean.index.intersection(list(TOWN_BY_ROW)), ['zip', 'town']
]

#### 3.2.4.3 Test

In [None]:
# Any rows listed here still have no town. Left as the cell's last expression
# so the notebook renders the frame as a table instead of plain printed text.
installs_clean[installs_clean['town'].isnull()]

### 3.2.5 Remove all observations with no latitude or longitude
#### 3.2.5.1 Define
Query the dataset and remove all NaN lat/long observations.

#### 3.2.5.2 Code

In [None]:
# Rows with no latitude recorded.
installs_clean.loc[installs_clean['lat'].isnull()]

In [None]:
# (rows, columns) of the subset that has no latitude.
installs_clean.loc[installs_clean['lat'].isnull()].shape

In [None]:
# Remove every observation that lacks a latitude or a longitude. Selecting by
# condition instead of a hand-copied list of row labels keeps the cell correct
# if the data changes, and makes it safe to re-run (a second run drops nothing
# instead of raising KeyError on labels that are already gone).
installs_clean = installs_clean.dropna(subset=['lat', 'long']).copy()

#### 3.2.5.3 Test

In [None]:
# Any rows listed here still have no latitude.
installs_clean.loc[installs_clean['lat'].isnull()]

In [None]:
# Rows with no utility recorded.
installs_clean.loc[installs_clean['utility'].isnull()]

In [None]:
# Inspect the utility-rate column dtypes and non-null counts.
# NOTE(review): util_rates_clean is not created in any cell shown above this
# one; confirm it is loaded and copied before this cell runs on a fresh kernel.
util_rates_clean.info()

In [None]:
# Store the utility-rate ZIP as a string of whole digits in one step.
# pd.to_numeric lets the cell be re-run after the column is already a string
# (calling .round() directly on a string column would raise).
util_rates_clean['zip'] = (
    pd.to_numeric(util_rates_clean['zip'])
    .round(0)
    .astype(int)
    .astype(str)
)

In [None]:
# utility_id values of every utility-rate row whose ZIP is 30318.
util_rates_clean.loc[util_rates_clean['zip'] == '30318', 'utility_id']

## 3.2 Installations

Installations

- install-date is not in datetime format
- fips is not a string
- zip is stored as a float and not as a string
- congressional districts (federal and state) are not strings
- NaN present in zip, town, utility, federal-cong-dist, state-senate-dist, state-house-dist, lat, and long.
- Can't check for duplicated ids with attribute access because the column name (`sea-install-id`) contains hyphens.


### 3.2.1 Rename Columns
#### 3.2.1.1 Define
