In [97]:
import numpy as np
import pandas as pd
import os

import datetime
from datetime import datetime

# 1 Data Gathering - SolarView

https://github.com/social-energy-atlas/solarview-data

## 1.1 Installations

In [98]:
installs = pd.read_csv('data/sea-solar-installations.csv', encoding='utf-8')
installs.head(5)

Unnamed: 0,sea-install-id,oiriginal-db,zip,fips,town,state,system-type,sector,install-date,utility,federal-cong-dist,state-senate-dist,state-house-dist,lat,long
0,southface_S53,Southface,31408.0,13051,Garden City,Georgia,Solar Hot Water,Residential,01/01/2008,Georgia Power Company,1.0,2.0,162.0,32.11929,-81.151748
1,southface_S55,Southface,30601.0,13059,Athens,Georgia,Solar Hot Water,Residential,01/01/2008,Georgia Power Company,10.0,46.0,118.0,33.976445,-83.368683
2,southface_S59,Southface,30030.0,13089,Decatur,Georgia,Solar Hot Water,Residential,01/01/2008,Georgia Power Company,5.0,42.0,83.0,33.767515,-84.308954
3,southface_S52,Southface,30005.0,13121,Alpharetta,Georgia,Solar Hot Water,Residential,01/01/2008,Sawnee EMC,6.0,48.0,25.0,34.070288,-84.202196
4,southface_S49,Southface,30281.0,13151,Stockbridge,Georgia,Solar Hot Water,Residential,01/01/2008,Georgia Power Company,13.0,10.0,111.0,33.501071,-84.257491


In [99]:
installs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 15 columns):
sea-install-id       2139 non-null object
oiriginal-db         2139 non-null object
zip                  2138 non-null float64
fips                 2139 non-null int64
town                 2099 non-null object
state                2139 non-null object
system-type          2139 non-null object
sector               2139 non-null object
install-date         2139 non-null object
utility              601 non-null object
federal-cong-dist    765 non-null float64
state-senate-dist    765 non-null float64
state-house-dist     765 non-null float64
lat                  2117 non-null float64
long                 2117 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 250.8+ KB


# 2 Data Assessment
## 2.1 Installations

In [100]:
installs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 15 columns):
sea-install-id       2139 non-null object
oiriginal-db         2139 non-null object
zip                  2138 non-null float64
fips                 2139 non-null int64
town                 2099 non-null object
state                2139 non-null object
system-type          2139 non-null object
sector               2139 non-null object
install-date         2139 non-null object
utility              601 non-null object
federal-cong-dist    765 non-null float64
state-senate-dist    765 non-null float64
state-house-dist     765 non-null float64
lat                  2117 non-null float64
long                 2117 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 250.8+ KB


- install-date is not in datetime format
- fips is not a string
- error in spelling of original database column

In [101]:
installs.sample(5)

Unnamed: 0,sea-install-id,oiriginal-db,zip,fips,town,state,system-type,sector,install-date,utility,federal-cong-dist,state-senate-dist,state-house-dist,lat,long
1557,southface_UR10,Southface,30477.0,13163,Wadley,Georgia,Solar Electric,Utility,12/16/2014,Georgia Power Company,10.0,23.0,128.0,32.884425,-82.40895
2135,southface_CS113,Southface,30306.0,13121,Atlanta,Georgia,Solar Electric,Residential,08/30/2017,,,,,33.778986,-84.354015
1080,southface_M450,Southface,30067.0,13067,Marietta,Georgia,Solar Electric,Non-Residential,02/27/2012,,6.0,33.0,42.0,33.949305,-84.499942
1704,southface_CS15,Southface,31709.0,13261,Americus,Georgia,Solar Electric,Non-Residential,08/01/2015,,,,,32.072874,-84.227576
1885,southface_AES056,Southface,30606.0,13059,Athens,Georgia,Solar Electric,Residential,08/12/2016,,,,,33.932947,-83.387305


In [102]:
installs.shape

(2139, 15)

In [103]:
installs.isnull().sum()

sea-install-id          0
oiriginal-db            0
zip                     1
fips                    0
town                   40
state                   0
system-type             0
sector                  0
install-date            0
utility              1538
federal-cong-dist    1374
state-senate-dist    1374
state-house-dist     1374
lat                    22
long                   22
dtype: int64

In [104]:
installs.duplicated().sum()

1

- NaN present in zip, town, lat, utility, federal-cong-dist, state-senate-dist, state-house-dist, lat, and long.
- Can't check for duplicated id because of column name.
- zip is an integer and not a string

# 3 Data Cleaning

## 3.1 Copy All Dataframes

### 3.1.1 Define
Copy all dataframes for cleaning purposes.

#### 3.1.1.2 Code

In [105]:
installs_clean = installs.copy()

#### 3.1.1.3 Test

In [106]:
installs_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 15 columns):
sea-install-id       2139 non-null object
oiriginal-db         2139 non-null object
zip                  2138 non-null float64
fips                 2139 non-null int64
town                 2099 non-null object
state                2139 non-null object
system-type          2139 non-null object
sector               2139 non-null object
install-date         2139 non-null object
utility              601 non-null object
federal-cong-dist    765 non-null float64
state-senate-dist    765 non-null float64
state-house-dist     765 non-null float64
lat                  2117 non-null float64
long                 2117 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 250.8+ KB


## 3.2 Installations

Installations

- install-date is not in datetime format - DONE
- fips is not a string
- zip is an integer and not a string
- Drop congressional districts, as not useful.
- NaN present in zip, town, lat, utility, federal-cong-dist, state-senate-dist, state-house-dist, lat, and long.
- Can't check for duplicated id because of column name.


### 3.2.1 Rename Columns
#### 3.2.1.1 Define


## 3.2 Column Naming
### 3.2.1 Installations
#### 3.2.1.1 Define
Rename all columns so that hyphens are replaced with underscores so that future codes can be run successfully, as well as correcting any spelling errors.

#### 3.2.1.2 Code

In [107]:
installs_clean = installs_clean.rename(columns={"sea-install-id": "sea_install_id", 
                                                "oiriginal-db": "original_db", 
                                                "system-type": "system_type", 
                                                "install-date": "install_date", 
                                                "federal-cong-dist": "fed_congressional_district", 
                                                "state-senate-dist": "state_senate_district", 
                                                "state-house-dist": "state_house_district"})

#### 3.2.1.3 Test

In [108]:
installs_clean.head()

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,fed_congressional_district,state_senate_district,state_house_district,lat,long
0,southface_S53,Southface,31408.0,13051,Garden City,Georgia,Solar Hot Water,Residential,01/01/2008,Georgia Power Company,1.0,2.0,162.0,32.11929,-81.151748
1,southface_S55,Southface,30601.0,13059,Athens,Georgia,Solar Hot Water,Residential,01/01/2008,Georgia Power Company,10.0,46.0,118.0,33.976445,-83.368683
2,southface_S59,Southface,30030.0,13089,Decatur,Georgia,Solar Hot Water,Residential,01/01/2008,Georgia Power Company,5.0,42.0,83.0,33.767515,-84.308954
3,southface_S52,Southface,30005.0,13121,Alpharetta,Georgia,Solar Hot Water,Residential,01/01/2008,Sawnee EMC,6.0,48.0,25.0,34.070288,-84.202196
4,southface_S49,Southface,30281.0,13151,Stockbridge,Georgia,Solar Hot Water,Residential,01/01/2008,Georgia Power Company,13.0,10.0,111.0,33.501071,-84.257491


### 3.2.2 Convert install_date to datetime format
#### 3.2.2.1 Define
The install_date column should be in datetime format. Drop any unknown dates and convert those values to datetime.

#### 3.2.2.2 Code

In [109]:
installs_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 15 columns):
sea_install_id                2139 non-null object
original_db                   2139 non-null object
zip                           2138 non-null float64
fips                          2139 non-null int64
town                          2099 non-null object
state                         2139 non-null object
system_type                   2139 non-null object
sector                        2139 non-null object
install_date                  2139 non-null object
utility                       601 non-null object
fed_congressional_district    765 non-null float64
state_senate_district         765 non-null float64
state_house_district          765 non-null float64
lat                           2117 non-null float64
long                          2117 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 250.8+ KB


In [111]:
installs_clean['install_date'] = pd.to_datetime(installs_clean['install_date'])

#### 3.2.2.3 Test

In [115]:
installs_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 15 columns):
sea_install_id                2139 non-null object
original_db                   2139 non-null object
zip                           2138 non-null float64
fips                          2139 non-null int64
town                          2099 non-null object
state                         2139 non-null object
system_type                   2139 non-null object
sector                        2139 non-null object
install_date                  2139 non-null datetime64[ns]
utility                       601 non-null object
fed_congressional_district    765 non-null float64
state_senate_district         765 non-null float64
state_house_district          765 non-null float64
lat                           2117 non-null float64
long                          2117 non-null float64
dtypes: datetime64[ns](1), float64(6), int64(1), object(7)
memory usage: 250.8+ KB


### 3.2.3 Convert fips, zip, and congressional district columns to strings
#### 3.2.3.1 Define
Convert the fips, zip, fed_congressional_district, state_senate_district, and state_house_district columns to string format all columns.

#### 3.2.3.2 Code

In [116]:
print(installs_clean[installs_clean['zip'].isnull()])

       sea_install_id original_db  zip   fips town    state     system_type  \
1833  southface_SF137   Southface  NaN  13053  NaN  Georgia  Solar Electric   

       sector install_date utility  fed_congressional_district  \
1833  Utility   2016-06-01     NaN                         NaN   

      state_senate_district  state_house_district        lat       long  
1833                    NaN                   NaN  32.318057 -84.972077  


In [117]:
installs_clean.drop([1833], inplace=True)
installs_clean['zip'] = installs_clean['zip'].round(0).astype(int)
installs_clean['zip'] = installs_clean['zip'].astype(str)
installs_clean['fips'] = installs_clean['fips'].astype(str)

#### 3.2.3.3 Test

In [118]:
installs_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2138 entries, 0 to 2138
Data columns (total 15 columns):
sea_install_id                2138 non-null object
original_db                   2138 non-null object
zip                           2138 non-null object
fips                          2138 non-null object
town                          2099 non-null object
state                         2138 non-null object
system_type                   2138 non-null object
sector                        2138 non-null object
install_date                  2138 non-null datetime64[ns]
utility                       601 non-null object
fed_congressional_district    765 non-null float64
state_senate_district         765 non-null float64
state_house_district          765 non-null float64
lat                           2116 non-null float64
long                          2116 non-null float64
dtypes: datetime64[ns](1), float64(5), object(9)
memory usage: 267.2+ KB


In [119]:
installs_clean.head(5)

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,fed_congressional_district,state_senate_district,state_house_district,lat,long
0,southface_S53,Southface,31408,13051,Garden City,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,1.0,2.0,162.0,32.11929,-81.151748
1,southface_S55,Southface,30601,13059,Athens,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,10.0,46.0,118.0,33.976445,-83.368683
2,southface_S59,Southface,30030,13089,Decatur,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,5.0,42.0,83.0,33.767515,-84.308954
3,southface_S52,Southface,30005,13121,Alpharetta,Georgia,Solar Hot Water,Residential,2008-01-01,Sawnee EMC,6.0,48.0,25.0,34.070288,-84.202196
4,southface_S49,Southface,30281,13151,Stockbridge,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,13.0,10.0,111.0,33.501071,-84.257491


### 3.2.4 Investigate town NaN values
#### 3.2.4.1 Define
Query the dataset and investigate NaN values for the town column to see if data can be retrieved to complete. Otherwise, remove observations from the dataset.

#### 3.2.4.2 Code

In [120]:
installs_clean[installs_clean['town'].isnull()]

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,fed_congressional_district,state_senate_district,state_house_district,lat,long
36,southface_SF131a,Southface,30305,13121,,Georgia,Solar Electric,Residential,2005-12-31,,,,,33.823333,-84.399232
43,southface_SF131b,Southface,30305,13121,,Georgia,Solar Electric,Residential,2006-12-31,,,,,33.823333,-84.399232
684,southface_SF126,Southface,30188,13067,,Georgia,Solar Electric,Non-Residential,2010-12-14,,11.0,37.0,34.0,33.983053,-84.577952
885,southface_UR01,Southface,31906,40101,,Georgia,Solar Electric,Non-Residential,2011-08-01,,,,,32.47531,-84.944636
1349,southface_M1053,Southface,30054,13297,,Georgia,Solar Electric,Residential,2013-02-25,,,,,33.689491,-83.819327
1353,southface_M1055,Southface,30040,13117,,Georgia,Solar Electric,Residential,2013-02-28,,,,,34.26531,-84.094711
1370,southface_M1051,Southface,30143,13227,,Georgia,Solar Electric,Residential,2013-05-04,,,,,34.469552,-84.428989
1382,southface_M1056,Southface,31406,13051,,Georgia,Solar Electric,Residential,2013-07-12,,,,,31.975155,-81.057734
1415,southface_SF125,Southface,31018,13303,,Georgia,Solar Electric,Utility,2013-12-01,Cobb EMC,10.0,26.0,128.0,33.006887,-82.558476
1451,southface_M1054,Southface,30269,13113,,Georgia,Solar Electric,Non-Residential,2013-12-16,,3.0,16.0,72.0,33.3635,-84.567805


In [121]:
print(installs_clean['zip'][36])
print(installs_clean['zip'][43])
print(installs_clean['zip'][684])
print(installs_clean['zip'][885])
print(installs_clean['zip'][1349])

30305
30305
30188
31906
30054


In [124]:
installs_clean.at[36, 'town'] = 'Atlanta'
installs_clean.at[43, 'town'] = 'Atlanta'
installs_clean.at[684, 'town'] = 'Woodstock'
installs_clean.at[885, 'town'] = 'Columbus'
installs_clean.at[1349, 'town'] = 'Oxford'

In [125]:
print(installs_clean['zip'][1353])
print(installs_clean['zip'][1370])
print(installs_clean['zip'][1382])
print(installs_clean['zip'][1415])
print(installs_clean['zip'][1451])

30040
30143
31406
31018
30269


In [126]:
installs_clean.at[1382, 'town'] = 'Savannah'
installs_clean.at[1451, 'town'] = 'Peachtree City'
installs_clean.at[1353, 'town'] = 'Cumming'
installs_clean.at[1370, 'town'] = 'Jasper'
installs_clean.at[1415, 'town'] = 'Davisboro'

In [127]:
print(installs_clean['zip'][1472])
print(installs_clean['zip'][1473])
print(installs_clean['zip'][1483])
print(installs_clean['zip'][1490])
print(installs_clean['zip'][1493])

30439
31006
31409
30230
30650


In [128]:
installs_clean.at[1472, 'town'] = 'Metter'
installs_clean.at[1473, 'town'] = 'Butler'
installs_clean.at[1483, 'town'] = 'Savannah'
installs_clean.at[1490, 'town'] = 'Hogansville'
installs_clean.at[1493, 'town'] = 'Madison'

In [129]:
print(installs_clean['zip'][1497])
print(installs_clean['zip'][1506])
print(installs_clean['zip'][1507])
print(installs_clean['zip'][1508])
print(installs_clean['zip'][1509])

30044
30056
30120
30222
30533


In [130]:
installs_clean.at[1497, 'town'] = 'Lawrenceville'
installs_clean.at[1506, 'town'] = 'Newborn'
installs_clean.at[1507, 'town'] = 'Cartersville'
installs_clean.at[1508, 'town'] = 'Greenville'
installs_clean.at[1509, 'town'] = 'Dahlonega'

In [131]:
print(installs_clean['zip'][1517])
print(installs_clean['zip'][1518])
print(installs_clean['zip'][1529])
print(installs_clean['zip'][1551])
print(installs_clean['zip'][1552])

30805
30553
30625
30030
30417


In [132]:
installs_clean.at[1517, 'town'] = 'Augusta'
installs_clean.at[1518, 'town'] = 'Lavonia'
installs_clean.at[1529, 'town'] = 'Buckhead'
installs_clean.at[1551, 'town'] = 'Decatur'
installs_clean.at[1552, 'town'] = 'Claxton'

In [133]:
print(installs_clean['zip'][1558])
print(installs_clean['zip'][1561])
print(installs_clean['zip'][1562])
print(installs_clean['zip'][1565])
print(installs_clean['zip'][1577])

30030
30030
31525
30549
30124


In [134]:
installs_clean.at[1558, 'town'] = 'Decatur'
installs_clean.at[1561, 'town'] = 'Decatur'
installs_clean.at[1562, 'town'] = 'Brunswick'
installs_clean.at[1565, 'town'] = 'Jefferson'
installs_clean.at[1577, 'town'] = 'Cave Spring'

In [135]:
print(installs_clean['zip'][1581])
print(installs_clean['zip'][1583])
print(installs_clean['zip'][1584])
print(installs_clean['zip'][1585])
print(installs_clean['zip'][1587])

30909
39845
31796
31012
30030


In [136]:
installs_clean.at[1581, 'town'] = 'Augusta'
installs_clean.at[1583, 'town'] = 'Donalsonville'
installs_clean.at[1584, 'town'] = 'Acorn Pond'
installs_clean.at[1585, 'town'] = 'Chester'
installs_clean.at[1587, 'town'] = 'Decatur'

In [137]:
print(installs_clean['zip'][1606])
print(installs_clean['zip'][1607])
print(installs_clean['zip'][1611])
print(installs_clean['zip'][1991])

31091
30165
30328
31006


In [138]:
installs_clean.at[1606, 'town'] = 'Unadilla'
installs_clean.at[1607, 'town'] = 'Rome'
installs_clean.at[1611, 'town'] = 'Sandy Springs'
installs_clean.at[1991, 'town'] = 'Butler'

#### 3.2.4.3 Test

In [139]:
print(installs_clean[installs_clean['town'].isnull()])

Empty DataFrame
Columns: [sea_install_id, original_db, zip, fips, town, state, system_type, sector, install_date, utility, fed_congressional_district, state_senate_district, state_house_district, lat, long]
Index: []


### 3.2.5 Remove all observations with no latitude or longitude
#### 3.2.5.1 Define
Query the dataset and remove all NaN lat/long observations.

#### 3.2.5.2 Code

In [140]:
installs_clean[installs_clean['lat'].isnull()]

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,fed_congressional_district,state_senate_district,state_house_district,lat,long
29,nrel_NREL1,NREL,31405,13051,Savannah,Georgia,Solar Electric,Residential,1997-10-13,,,,,,
31,nrel_NREL2,NREL,30308,13121,Atlanta,Georgia,Solar Electric,Non-Residential,2002-01-01,,,,,,
90,nrel_NREL3,NREL,30022,13121,Alpharetta,Georgia,Solar Electric,Residential,2008-09-02,,,,,,
217,nrel_NREL4,NREL,30305,13121,Atlanta,Georgia,Solar Electric,Residential,2009-07-03,,,,,,
284,nrel_NREL5,NREL,31321,13029,Pembroke,Georgia,Solar Electric,Non-Residential,2009-09-25,,,,,,
290,nrel_NREL6,NREL,30541,13111,Epworth,Georgia,Solar Electric,Unknown,2009-09-30,,,,,,
339,nrel_NREL7,NREL,30513,13111,Blue Ridge,Georgia,Solar Electric,Unknown,2009-12-08,,,,,,
341,nrel_NREL8,NREL,30269,13113,Peachtree City,Georgia,Solar Electric,Non-Residential,2009-12-15,,,,,,
358,nrel_NREL9,NREL,30055,13159,Mansfield,Georgia,Solar Electric,Residential,2010-01-03,,,,,,
388,nrel_NREL10,NREL,30721,13313,Dalton,Georgia,Solar Electric,Unknown,2010-01-08,,,,,,


In [141]:
installs_clean[installs_clean['lat'].isnull()].shape

(22, 15)

In [142]:
installs_clean.drop([29,31,90,217,284,290,339,341,358,388,409,455,549,577,583,762,763,1001,1007,1260,1337,1644], inplace=True)

#### 3.2.5.2 Test

In [143]:
installs_clean[installs_clean['lat'].isnull()]

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,fed_congressional_district,state_senate_district,state_house_district,lat,long


In [144]:
installs_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2116 entries, 0 to 2138
Data columns (total 15 columns):
sea_install_id                2116 non-null object
original_db                   2116 non-null object
zip                           2116 non-null object
fips                          2116 non-null object
town                          2116 non-null object
state                         2116 non-null object
system_type                   2116 non-null object
sector                        2116 non-null object
install_date                  2116 non-null datetime64[ns]
utility                       601 non-null object
fed_congressional_district    765 non-null float64
state_senate_district         765 non-null float64
state_house_district          765 non-null float64
lat                           2116 non-null float64
long                          2116 non-null float64
dtypes: datetime64[ns](1), float64(5), object(9)
memory usage: 264.5+ KB


In [145]:
installs_clean.drop(["fed_congressional_district", "state_senate_district", "state_house_district"], axis = 1, inplace = True) 

In [146]:
installs_clean[installs_clean['utility'].isnull()]

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,lat,long
27,southface_SF24,Southface,30318,13121,Atlanta,Georgia,Solar Electric,Non-Residential,1996-01-01,,33.775772,-84.402532
28,southface_SF25,Southface,30318,13121,Atlanta,Georgia,Solar Hot Water,Non-Residential,1996-01-01,,33.775772,-84.402532
30,southface_SF33,Southface,30240,13285,LaGrange,Georgia,Solar Electric,Non-Residential,1999-02-01,,33.000525,-85.056206
32,southface_SF20,Southface,30064,13067,Marietta,Georgia,Solar Electric,Residential,2003-01-01,,33.934810,-84.640828
33,southface_SF96,Southface,31774,13155,Ocilla,Georgia,Solar Electric,Non-Residential,2005-08-18,,31.581931,-83.240312
...,...,...,...,...,...,...,...,...,...,...,...,...
2134,southface_RS17-1-0003,Southface,30339,13067,Atlanta,Georgia,Solar Electric,Non-Residential,2017-08-30,,33.880656,-84.470292
2135,southface_CS113,Southface,30306,13121,Atlanta,Georgia,Solar Electric,Residential,2017-08-30,,33.778986,-84.354015
2136,southface_AS10,Southface,30268,13077,Palmetto,Georgia,Solar Electric,Residential,2017-09-01,,33.498070,-84.718732
2137,southface_SF161,Southface,30810,13125,Gibson,Georgia,Solar Electric,Utility,2017-09-13,,33.210600,-82.602739


In [147]:
utilities_needed = installs_clean.copy()

In [148]:
utilities_needed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2116 entries, 0 to 2138
Data columns (total 12 columns):
sea_install_id    2116 non-null object
original_db       2116 non-null object
zip               2116 non-null object
fips              2116 non-null object
town              2116 non-null object
state             2116 non-null object
system_type       2116 non-null object
sector            2116 non-null object
install_date      2116 non-null datetime64[ns]
utility           601 non-null object
lat               2116 non-null float64
long              2116 non-null float64
dtypes: datetime64[ns](1), float64(2), object(9)
memory usage: 214.9+ KB


In [149]:
utilities_needed.drop(["sea_install_id", "original_db","fips","town","state","system_type","sector","install_date","lat","long"], axis = 1, inplace = True) 

In [150]:
utilities_needed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2116 entries, 0 to 2138
Data columns (total 2 columns):
zip        2116 non-null object
utility    601 non-null object
dtypes: object(2)
memory usage: 49.6+ KB


In [151]:
utilities_needed = utilities_needed[utilities_needed['utility'].isnull()]

In [152]:
utilities_needed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1515 entries, 27 to 2138
Data columns (total 2 columns):
zip        1515 non-null object
utility    0 non-null object
dtypes: object(2)
memory usage: 35.5+ KB


In [153]:
utilities_needed.to_csv('utilities_needed.csv', encoding='utf-8', index=False)

In [154]:
utils_identified = pd.read_csv('utils_identified.csv', encoding='utf-8')
utils_identified.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 2 columns):
zip        360 non-null int64
utility    360 non-null object
dtypes: int64(1), object(1)
memory usage: 5.8+ KB


In [155]:
utils_identified["zip"] = utils_identified['zip'].astype(str)
utils_identified.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 2 columns):
zip        360 non-null object
utility    360 non-null object
dtypes: object(2)
memory usage: 5.8+ KB


In [156]:
installs_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2116 entries, 0 to 2138
Data columns (total 12 columns):
sea_install_id    2116 non-null object
original_db       2116 non-null object
zip               2116 non-null object
fips              2116 non-null object
town              2116 non-null object
state             2116 non-null object
system_type       2116 non-null object
sector            2116 non-null object
install_date      2116 non-null datetime64[ns]
utility           601 non-null object
lat               2116 non-null float64
long              2116 non-null float64
dtypes: datetime64[ns](1), float64(2), object(9)
memory usage: 214.9+ KB


In [157]:
utils_identified_merge = pd.merge(installs_clean, utils_identified, on = 'zip', how = 'left')

In [158]:
utils_identified_merge.sample(5)

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility_x,lat,long,utility_y
1164,southface_OG18,Southface,30071,13135,Norcross,Georgia,Solar Electric,Non-Residential,2011-12-28,Georgia Power Company,33.936297,-84.234006,
866,southface_M280,Southface,30269,13113,Peachtree City,Georgia,Solar Electric,Non-Residential,2011-02-14,,33.367106,-84.575252,
63,southface_S56,Southface,31405,13051,Savannah,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,32.049878,-81.101236,
342,southface_S104,Southface,30606,13059,Athens,Georgia,Solar Electric,Non-Residential,2009-10-05,,33.963472,-83.414115,
1565,southface_M1036,Southface,30607,13059,Athens,Georgia,Solar Electric,Residential,2013-06-17,,33.992773,-83.486875,


In [159]:
utils_identified_merge[utils_identified_merge['utility_x'].isnull()]

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility_x,lat,long,utility_y
31,southface_SF24,Southface,30318,13121,Atlanta,Georgia,Solar Electric,Non-Residential,1996-01-01,,33.775772,-84.402532,
32,southface_SF25,Southface,30318,13121,Atlanta,Georgia,Solar Hot Water,Non-Residential,1996-01-01,,33.775772,-84.402532,
33,southface_SF33,Southface,30240,13285,LaGrange,Georgia,Solar Electric,Non-Residential,1999-02-01,,33.000525,-85.056206,
34,southface_SF20,Southface,30064,13067,Marietta,Georgia,Solar Electric,Residential,2003-01-01,,33.934810,-84.640828,
35,southface_SF96,Southface,31774,13155,Ocilla,Georgia,Solar Electric,Non-Residential,2005-08-18,,31.581931,-83.240312,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2470,southface_SF161,Southface,30810,13125,Gibson,Georgia,Solar Electric,Utility,2017-09-13,,33.210600,-82.602739,Washington Elec Member Corp
2471,southface_AES118,Southface,30017,13135,Grayson,Georgia,Solar Electric,Residential,2017-02-28,,33.891688,-83.964515,City of Lawrenceville - (GA)
2472,southface_AES118,Southface,30017,13135,Grayson,Georgia,Solar Electric,Residential,2017-02-28,,33.891688,-83.964515,Georgia Power Co
2473,southface_AES118,Southface,30017,13135,Grayson,Georgia,Solar Electric,Residential,2017-02-28,,33.891688,-83.964515,Jackson Electric Member Corp - (GA)


In [160]:
utils_identified_merge.utility_x.fillna(utils_identified_merge.utility_y, inplace=True)
del utils_identified_merge['utility_y']
utils_identified_merge.head(4)

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility_x,lat,long
0,southface_S53,Southface,31408,13051,Garden City,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,32.11929,-81.151748
1,southface_S55,Southface,30601,13059,Athens,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,33.976445,-83.368683
2,southface_S59,Southface,30030,13089,Decatur,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,33.767515,-84.308954
3,southface_S52,Southface,30005,13121,Alpharetta,Georgia,Solar Hot Water,Residential,2008-01-01,Sawnee EMC,34.070288,-84.202196


In [161]:
utils_identified_merge["utility_x"].fillna("Unknown", inplace = True)

In [162]:
utils_identified_merge.head(25)

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility_x,lat,long
0,southface_S53,Southface,31408,13051,Garden City,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,32.11929,-81.151748
1,southface_S55,Southface,30601,13059,Athens,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,33.976445,-83.368683
2,southface_S59,Southface,30030,13089,Decatur,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,33.767515,-84.308954
3,southface_S52,Southface,30005,13121,Alpharetta,Georgia,Solar Hot Water,Residential,2008-01-01,Sawnee EMC,34.070288,-84.202196
4,southface_S49,Southface,30281,13151,Stockbridge,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,33.501071,-84.257491
5,southface_S54,Southface,30549,13157,Jefferson,Georgia,Solar Hot Water,Residential,2008-01-01,Jackson EMC,34.072211,-83.638933
6,southface_S54,Southface,30549,13157,Jefferson,Georgia,Solar Hot Water,Residential,2008-01-01,Jackson EMC,34.072211,-83.638933
7,southface_S64,Southface,30549,13157,Jefferson,Georgia,Solar Hot Water,Residential,2008-01-01,Jackson EMC,34.084172,-83.65703
8,southface_S64,Southface,30549,13157,Jefferson,Georgia,Solar Hot Water,Residential,2008-01-01,Jackson EMC,34.084172,-83.65703
9,southface_S58,Southface,30650,13211,Madison,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,33.595032,-83.460743


In [163]:
utils_identified_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2475 entries, 0 to 2474
Data columns (total 12 columns):
sea_install_id    2475 non-null object
original_db       2475 non-null object
zip               2475 non-null object
fips              2475 non-null object
town              2475 non-null object
state             2475 non-null object
system_type       2475 non-null object
sector            2475 non-null object
install_date      2475 non-null datetime64[ns]
utility_x         2475 non-null object
lat               2475 non-null float64
long              2475 non-null float64
dtypes: datetime64[ns](1), float64(2), object(9)
memory usage: 251.4+ KB


In [164]:
utils_identified_merge['sea_install_id'].duplicated().sum()

365

In [165]:
utils_identified_merge['sea_install_id'].drop_duplicates(keep=False,inplace=True) 

In [166]:
utils_identified_merge['sea_install_id'].duplicated().sum()

0

In [167]:
utils_identified_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2475 entries, 0 to 2474
Data columns (total 12 columns):
sea_install_id    2475 non-null object
original_db       2475 non-null object
zip               2475 non-null object
fips              2475 non-null object
town              2475 non-null object
state             2475 non-null object
system_type       2475 non-null object
sector            2475 non-null object
install_date      2475 non-null datetime64[ns]
utility_x         2475 non-null object
lat               2475 non-null float64
long              2475 non-null float64
dtypes: datetime64[ns](1), float64(2), object(9)
memory usage: 246.9+ KB


In [168]:
utils_identified_merge.sample(45)

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility_x,lat,long
1645,southface_HS48,Southface,31068,13193,Oglethorpe,Georgia,Solar Electric,Utility,2013-12-10,Georgia Power Company,32.281278,-84.063277
1996,southface_IS10,Southface,31815,13259,Richland,Georgia,Solar Electric,Utility,2015-09-30,Sumter Electric Member Corp,32.067511,-84.714017
153,southface_S13,Southface,30281,13151,Stockbridge,Georgia,Solar Electric,Non-Residential,2008-12-18,Unknown,33.542441,-84.261681
65,southface_S15,Southface,30607,13059,Athens,Georgia,Solar Electric,Residential,2008-01-01,Unknown,33.991432,-83.493784
1845,southface_GPC-10779,Southface,30909,13245,Augusta,Georgia,Solar Electric,Non-Residential,2015-01-12,Unknown,33.468797,-82.080681
1690,southface_HS33,Southface,39826,13273,Bronwood,Georgia,Solar Electric,Non-Residential,2013-12-31,Unknown,31.821586,-84.382792
2070,southface_CS28,Southface,30721,13313,Dalton,Georgia,Solar Electric,Non-Residential,2016-02-01,Unknown,34.74152,-84.961889
225,southface_SF8,Southface,30601,13059,Athens,Georgia,Solar Electric,Non-Residential,2009-06-01,Unknown,33.958566,-83.369814
1704,southface_SF130,Southface,30477,13163,Wadley,Georgia,Solar Electric,Utility,2014-02-12,Jefferson Electric Member Corp,32.883144,-82.408748
892,southface_M290,Southface,31778,13275,Pavo,Georgia,Solar Electric,Residential,2011-03-14,Colquitt Electric Membership Corp,30.981116,-83.756554


In [169]:
utils_identified_merge['utility'] = utils_identified_merge['utility_x']

In [170]:
utils_identified_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2475 entries, 0 to 2474
Data columns (total 13 columns):
sea_install_id    2475 non-null object
original_db       2475 non-null object
zip               2475 non-null object
fips              2475 non-null object
town              2475 non-null object
state             2475 non-null object
system_type       2475 non-null object
sector            2475 non-null object
install_date      2475 non-null datetime64[ns]
utility_x         2475 non-null object
lat               2475 non-null float64
long              2475 non-null float64
utility           2475 non-null object
dtypes: datetime64[ns](1), float64(2), object(10)
memory usage: 270.7+ KB


In [171]:
utils_identified_merge.drop(["utility_x"], axis = 1, inplace = True) 

In [172]:
installs_clean.to_csv('.csv', encoding='utf-8', index=False)