In [1]:
import requests
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import cm as cm

import seaborn as sns 
import datetime
from datetime import datetime

# 1 Data Gathering - SolarView

https://github.com/social-energy-atlas/solarview-data

## 1.1 Installations

In [2]:
installs = pd.read_csv('data/sea-solar-installations.csv', encoding='utf-8')
installs.head(5)

Unnamed: 0,sea-install-id,oiriginal-db,zip,fips,town,state,system-type,sector,install-date,utility,federal-cong-dist,state-senate-dist,state-house-dist,lat,long
0,southface_S53,Southface,31408.0,13051,Garden City,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,1.0,2.0,162.0,32.11929,-81.151748
1,southface_S55,Southface,30601.0,13059,Athens,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,10.0,46.0,118.0,33.976445,-83.368683
2,southface_S59,Southface,30030.0,13089,Decatur,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,5.0,42.0,83.0,33.767515,-84.308954
3,southface_S52,Southface,30005.0,13121,Alpharetta,Georgia,Solar Hot Water,Residential,2008-01-01,Sawnee EMC,6.0,48.0,25.0,34.070288,-84.202196
4,southface_S49,Southface,30281.0,13151,Stockbridge,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,13.0,10.0,111.0,33.501071,-84.257491


In [3]:
installs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2147 entries, 0 to 2146
Data columns (total 15 columns):
sea-install-id       2147 non-null object
oiriginal-db         2147 non-null object
zip                  2146 non-null float64
fips                 2147 non-null int64
town                 2107 non-null object
state                2147 non-null object
system-type          2147 non-null object
sector               2147 non-null object
install-date         2147 non-null object
utility              603 non-null object
federal-cong-dist    767 non-null float64
state-senate-dist    767 non-null float64
state-house-dist     767 non-null float64
lat                  2125 non-null float64
long                 2125 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 251.7+ KB


# 2 Data Assessment
## 2.1 Installations

In [4]:
installs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2147 entries, 0 to 2146
Data columns (total 15 columns):
sea-install-id       2147 non-null object
oiriginal-db         2147 non-null object
zip                  2146 non-null float64
fips                 2147 non-null int64
town                 2107 non-null object
state                2147 non-null object
system-type          2147 non-null object
sector               2147 non-null object
install-date         2147 non-null object
utility              603 non-null object
federal-cong-dist    767 non-null float64
state-senate-dist    767 non-null float64
state-house-dist     767 non-null float64
lat                  2125 non-null float64
long                 2125 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 251.7+ KB


- install-date is not in datetime format
- fips is not a string
- error in spelling of original database column

In [5]:
installs.sample(5)

Unnamed: 0,sea-install-id,oiriginal-db,zip,fips,town,state,system-type,sector,install-date,utility,federal-cong-dist,state-senate-dist,state-house-dist,lat,long
2132,southface_CS125,Southface,30030.0,13089,Decatur,Georgia,Solar Electric,Residential,2017-08-15,,,,,33.78301,-84.287334
231,southface_S120,Southface,30214.0,13113,Fayetteville,Georgia,Solar Electric,Residential,2009-07-17,,,,,33.51756,-84.494319
1182,southface_M929,Southface,30809.0,13073,Evans,Georgia,Solar Electric,Residential,2012-06-14,,12.0,24.0,122.0,33.570021,-82.09729
424,southface_S243,Southface,30076.0,13121,Roswell,Georgia,Solar Electric,Non-Residential,2010-02-02,,,,,34.027356,-84.320103
1083,southface_SF32,Southface,30030.0,13089,Decatur,Georgia,Solar Electric,Non-Residential,2012-03-01,,,,,33.772304,-84.299335


In [6]:
installs.shape

(2147, 15)

In [7]:
installs.isnull().sum()

sea-install-id          0
oiriginal-db            0
zip                     1
fips                    0
town                   40
state                   0
system-type             0
sector                  0
install-date            0
utility              1544
federal-cong-dist    1380
state-senate-dist    1380
state-house-dist     1380
lat                    22
long                   22
dtype: int64

In [8]:
installs.duplicated().sum()

1

- NaN present in zip, town, lat, utility, federal-cong-dist, state-senate-dist, state-house-dist, lat, and long.
- Can't check for duplicated id because of column name.
- zip is an integer and not a string

# 3 Data Cleaning

## 3.1 Copy All Dataframes

### 3.1.1 Define
Copy all dataframes for cleaning purposes.

#### 3.1.1.2 Code

In [9]:
installs_clean = installs.copy()

#### 3.1.1.3 Test

In [10]:
installs_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2147 entries, 0 to 2146
Data columns (total 15 columns):
sea-install-id       2147 non-null object
oiriginal-db         2147 non-null object
zip                  2146 non-null float64
fips                 2147 non-null int64
town                 2107 non-null object
state                2147 non-null object
system-type          2147 non-null object
sector               2147 non-null object
install-date         2147 non-null object
utility              603 non-null object
federal-cong-dist    767 non-null float64
state-senate-dist    767 non-null float64
state-house-dist     767 non-null float64
lat                  2125 non-null float64
long                 2125 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 251.7+ KB


## 3.2 Installations

Installations

- install-date is not in datetime format - DONE
- fips is not a string
- zip is an integer and not a string
- Drop congressional districts, as not useful.
- NaN present in zip, town, lat, utility, federal-cong-dist, state-senate-dist, state-house-dist, lat, and long.
- Can't check for duplicated id because of column name.


### 3.2.1 Rename Columns
#### 3.2.1.1 Define


## 3.2 Column Naming
### 3.2.1 Installations
#### 3.2.1.1 Define
Rename all columns so that hyphens are replaced with underscores so that future codes can be run successfully, as well as correcting any spelling errors.

#### 3.2.1.2 Code

In [11]:
installs_clean = installs_clean.rename(columns={"sea-install-id": "sea_install_id", 
                                                "oiriginal-db": "original_db", 
                                                "system-type": "system_type", 
                                                "install-date": "install_date", 
                                                "federal-cong-dist": "fed_congressional_district", 
                                                "state-senate-dist": "state_senate_district", 
                                                "state-house-dist": "state_house_district"})

#### 3.2.1.3 Test

In [12]:
installs_clean.head()

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,fed_congressional_district,state_senate_district,state_house_district,lat,long
0,southface_S53,Southface,31408.0,13051,Garden City,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,1.0,2.0,162.0,32.11929,-81.151748
1,southface_S55,Southface,30601.0,13059,Athens,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,10.0,46.0,118.0,33.976445,-83.368683
2,southface_S59,Southface,30030.0,13089,Decatur,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,5.0,42.0,83.0,33.767515,-84.308954
3,southface_S52,Southface,30005.0,13121,Alpharetta,Georgia,Solar Hot Water,Residential,2008-01-01,Sawnee EMC,6.0,48.0,25.0,34.070288,-84.202196
4,southface_S49,Southface,30281.0,13151,Stockbridge,Georgia,Solar Hot Water,Residential,2008-01-01,Georgia Power Company,13.0,10.0,111.0,33.501071,-84.257491


### 3.2.2 Convert install_date to datetime format
#### 3.2.2.1 Define
The install_date column should be in datetime format. Drop any unknown dates and convert those values to datetime.

#### 3.2.2.2 Code

In [13]:
installs_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2147 entries, 0 to 2146
Data columns (total 15 columns):
sea_install_id                2147 non-null object
original_db                   2147 non-null object
zip                           2146 non-null float64
fips                          2147 non-null int64
town                          2107 non-null object
state                         2147 non-null object
system_type                   2147 non-null object
sector                        2147 non-null object
install_date                  2147 non-null object
utility                       603 non-null object
fed_congressional_district    767 non-null float64
state_senate_district         767 non-null float64
state_house_district          767 non-null float64
lat                           2125 non-null float64
long                          2125 non-null float64
dtypes: float64(6), int64(1), object(8)
memory usage: 251.7+ KB


In [14]:
installs_clean['install_date'] = datetime.strptime('Jun 1 2005  1:33PM', '%b %d %Y %I:%M%p')

#### 3.2.2.3 Test

In [15]:
installs_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2147 entries, 0 to 2146
Data columns (total 15 columns):
sea_install_id                2147 non-null object
original_db                   2147 non-null object
zip                           2146 non-null float64
fips                          2147 non-null int64
town                          2107 non-null object
state                         2147 non-null object
system_type                   2147 non-null object
sector                        2147 non-null object
install_date                  2147 non-null datetime64[ns]
utility                       603 non-null object
fed_congressional_district    767 non-null float64
state_senate_district         767 non-null float64
state_house_district          767 non-null float64
lat                           2125 non-null float64
long                          2125 non-null float64
dtypes: datetime64[ns](1), float64(6), int64(1), object(7)
memory usage: 251.7+ KB


### 3.2.3 Convert fips, zip, and congressional district columns to strings
#### 3.2.3.1 Define
Convert the fips, zip, fed_congressional_district, state_senate_district, and state_house_district columns to string format all columns.

#### 3.2.3.2 Code

In [16]:
print(installs_clean[installs_clean['zip'].isnull()])

       sea_install_id original_db  zip   fips town    state     system_type  \
1833  southface_SF137   Southface  NaN  13053  NaN  Georgia  Solar Electric   

       sector        install_date utility  fed_congressional_district  \
1833  Utility 2005-06-01 13:33:00     NaN                         NaN   

      state_senate_district  state_house_district        lat       long  
1833                    NaN                   NaN  32.318057 -84.972077  


In [17]:
installs_clean.drop([1833], inplace=True)
installs_clean['zip'] = installs_clean['zip'].round(0).astype(int)
installs_clean['zip'] = installs_clean['zip'].astype(str)
installs_clean['fips'] = installs_clean['fips'].astype(str)

#### 3.2.3.3 Test

In [18]:
installs_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2146 entries, 0 to 2146
Data columns (total 15 columns):
sea_install_id                2146 non-null object
original_db                   2146 non-null object
zip                           2146 non-null object
fips                          2146 non-null object
town                          2107 non-null object
state                         2146 non-null object
system_type                   2146 non-null object
sector                        2146 non-null object
install_date                  2146 non-null datetime64[ns]
utility                       603 non-null object
fed_congressional_district    767 non-null float64
state_senate_district         767 non-null float64
state_house_district          767 non-null float64
lat                           2124 non-null float64
long                          2124 non-null float64
dtypes: datetime64[ns](1), float64(5), object(9)
memory usage: 268.2+ KB


In [19]:
installs_clean.head(5)

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,fed_congressional_district,state_senate_district,state_house_district,lat,long
0,southface_S53,Southface,31408,13051,Garden City,Georgia,Solar Hot Water,Residential,2005-06-01 13:33:00,Georgia Power Company,1.0,2.0,162.0,32.11929,-81.151748
1,southface_S55,Southface,30601,13059,Athens,Georgia,Solar Hot Water,Residential,2005-06-01 13:33:00,Georgia Power Company,10.0,46.0,118.0,33.976445,-83.368683
2,southface_S59,Southface,30030,13089,Decatur,Georgia,Solar Hot Water,Residential,2005-06-01 13:33:00,Georgia Power Company,5.0,42.0,83.0,33.767515,-84.308954
3,southface_S52,Southface,30005,13121,Alpharetta,Georgia,Solar Hot Water,Residential,2005-06-01 13:33:00,Sawnee EMC,6.0,48.0,25.0,34.070288,-84.202196
4,southface_S49,Southface,30281,13151,Stockbridge,Georgia,Solar Hot Water,Residential,2005-06-01 13:33:00,Georgia Power Company,13.0,10.0,111.0,33.501071,-84.257491


### 3.2.4 Investigate town NaN values
#### 3.2.4.1 Define
Query the dataset and investigate NaN values for the town column to see if data can be retrieved to complete. Otherwise, remove observations from the dataset.

#### 3.2.4.2 Code

In [20]:
installs_clean[installs_clean['town'].isnull()]

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,fed_congressional_district,state_senate_district,state_house_district,lat,long
36,southface_SF131a,Southface,30305,13121,,Georgia,Solar Electric,Residential,2005-06-01 13:33:00,,,,,33.823333,-84.399232
43,southface_SF131b,Southface,30305,13121,,Georgia,Solar Electric,Residential,2005-06-01 13:33:00,,,,,33.823333,-84.399232
684,southface_SF126,Southface,30188,13067,,Georgia,Solar Electric,Non-Residential,2005-06-01 13:33:00,,11.0,37.0,34.0,33.983053,-84.577952
885,southface_UR01,Southface,31906,40101,,Georgia,Solar Electric,Non-Residential,2005-06-01 13:33:00,,,,,32.47531,-84.944636
1349,southface_M1053,Southface,30054,13297,,Georgia,Solar Electric,Residential,2005-06-01 13:33:00,,,,,33.689491,-83.819327
1353,southface_M1055,Southface,30040,13117,,Georgia,Solar Electric,Residential,2005-06-01 13:33:00,,,,,34.26531,-84.094711
1370,southface_M1051,Southface,30143,13227,,Georgia,Solar Electric,Residential,2005-06-01 13:33:00,,,,,34.469552,-84.428989
1382,southface_M1056,Southface,31406,13051,,Georgia,Solar Electric,Residential,2005-06-01 13:33:00,,,,,31.975155,-81.057734
1415,southface_SF125,Southface,31018,13303,,Georgia,Solar Electric,Utility,2005-06-01 13:33:00,Cobb EMC,10.0,26.0,128.0,33.006887,-82.558476
1451,southface_M1054,Southface,30269,13113,,Georgia,Solar Electric,Non-Residential,2005-06-01 13:33:00,,3.0,16.0,72.0,33.3635,-84.567805


In [21]:
print(installs_clean['zip'][36])
print(installs_clean['zip'][43])
print(installs_clean['zip'][684])
print(installs_clean['zip'][885])
print(installs_clean['zip'][1349])

30305
30305
30188
31906
30054


In [22]:
installs_clean.town[36] = 'Atlanta'
installs_clean.town[43] = 'Atlanta'
installs_clean.town[684] = 'Woodstock'
installs_clean.town[885] = 'Columbus'
installs_clean.town[1349] = 'Oxford'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [23]:
print(installs_clean['zip'][1353])
print(installs_clean['zip'][1370])
print(installs_clean['zip'][1382])
print(installs_clean['zip'][1415])
print(installs_clean['zip'][1451])

30040
30143
31406
31018
30269


In [24]:
installs_clean.town[1382] = 'Savannah'
installs_clean.town[1451] = 'Peachtree City'
installs_clean.town[1353] = 'Cumming'
installs_clean.town[1370] = 'Jasper'
installs_clean.town[1415] = 'Davisboro'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [25]:
print(installs_clean['zip'][1472])
print(installs_clean['zip'][1473])
print(installs_clean['zip'][1483])
print(installs_clean['zip'][1490])
print(installs_clean['zip'][1493])

30439
31006
31409
30230
30650


In [26]:
installs_clean.town[1472] = 'Metter'
installs_clean.town[1473] = 'Butler'
installs_clean.town[1483] = 'Savannah'
installs_clean.town[1490] = 'Hogansville'
installs_clean.town[1493] = 'Madison'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [27]:
print(installs_clean['zip'][1497])
print(installs_clean['zip'][1506])
print(installs_clean['zip'][1507])
print(installs_clean['zip'][1508])
print(installs_clean['zip'][1509])

30044
30056
30120
30222
30533


In [28]:
installs_clean.town[1497] = 'Lawrenceville'
installs_clean.town[1506] = 'Newborn'
installs_clean.town[1507] = 'Cartersville'
installs_clean.town[1508] = 'Greenville'
installs_clean.town[1509] = 'Dahlonega'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [29]:
print(installs_clean['zip'][1517])
print(installs_clean['zip'][1518])
print(installs_clean['zip'][1529])
print(installs_clean['zip'][1551])
print(installs_clean['zip'][1552])

30805
30553
30625
30030
30417


In [30]:
installs_clean.town[1517] = 'Augusta'
installs_clean.town[1518] = 'Lavonia'
installs_clean.town[1529] = 'Buckhead'
installs_clean.town[1551] = 'Decatur'
installs_clean.town[1552] = 'Claxton'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [31]:
print(installs_clean['zip'][1558])
print(installs_clean['zip'][1561])
print(installs_clean['zip'][1562])
print(installs_clean['zip'][1565])
print(installs_clean['zip'][1577])

30030
30030
31525
30549
30124


In [32]:
installs_clean.town[1558] = 'Decatur'
installs_clean.town[1561] = 'Decatur'
installs_clean.town[1562] = 'Brunswick'
installs_clean.town[1565] = 'Jefferson'
installs_clean.town[1577] = 'Cave Spring'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [33]:
print(installs_clean['zip'][1581])
print(installs_clean['zip'][1583])
print(installs_clean['zip'][1584])
print(installs_clean['zip'][1585])
print(installs_clean['zip'][1587])

30909
39845
31796
31012
30030


In [34]:
installs_clean.town[1581] = 'Augusta'
installs_clean.town[1583] = 'Donalsonville'
installs_clean.town[1584] = 'Acorn Pond'
installs_clean.town[1585] = 'Chester'
installs_clean.town[1587] = 'Decatur'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [35]:
print(installs_clean['zip'][1606])
print(installs_clean['zip'][1607])
print(installs_clean['zip'][1611])
print(installs_clean['zip'][1991])

31091
30165
30328
31006


In [36]:
installs_clean.town[1606] = 'Unadilla'
installs_clean.town[1607] = 'Rome'
installs_clean.town[1611] = 'Sandy Springs'
installs_clean.town[1991] = 'Butler'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


#### 3.2.4.3 Test

In [37]:
print(installs_clean[installs_clean['town'].isnull()])

Empty DataFrame
Columns: [sea_install_id, original_db, zip, fips, town, state, system_type, sector, install_date, utility, fed_congressional_district, state_senate_district, state_house_district, lat, long]
Index: []


### 3.2.5 Remove all observations with no latitude or longitude
#### 3.2.5.1 Define
Query the dataset and remove all NaN lat/long observations.

#### 3.2.5.2 Code

In [38]:
installs_clean[installs_clean['lat'].isnull()]

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,fed_congressional_district,state_senate_district,state_house_district,lat,long
29,nrel_NREL1,NREL,31405,13051,Savannah,Georgia,Solar Electric,Residential,2005-06-01 13:33:00,,,,,,
31,nrel_NREL2,NREL,30308,13121,Atlanta,Georgia,Solar Electric,Non-Residential,2005-06-01 13:33:00,,,,,,
90,nrel_NREL3,NREL,30022,13121,Alpharetta,Georgia,Solar Electric,Residential,2005-06-01 13:33:00,,,,,,
217,nrel_NREL4,NREL,30305,13121,Atlanta,Georgia,Solar Electric,Residential,2005-06-01 13:33:00,,,,,,
284,nrel_NREL5,NREL,31321,13029,Pembroke,Georgia,Solar Electric,Non-Residential,2005-06-01 13:33:00,,,,,,
290,nrel_NREL6,NREL,30541,13111,Epworth,Georgia,Solar Electric,Unknown,2005-06-01 13:33:00,,,,,,
339,nrel_NREL7,NREL,30513,13111,Blue Ridge,Georgia,Solar Electric,Unknown,2005-06-01 13:33:00,,,,,,
341,nrel_NREL8,NREL,30269,13113,Peachtree City,Georgia,Solar Electric,Non-Residential,2005-06-01 13:33:00,,,,,,
358,nrel_NREL9,NREL,30055,13159,Mansfield,Georgia,Solar Electric,Residential,2005-06-01 13:33:00,,,,,,
388,nrel_NREL10,NREL,30721,13313,Dalton,Georgia,Solar Electric,Unknown,2005-06-01 13:33:00,,,,,,


In [39]:
installs_clean[installs_clean['lat'].isnull()].shape

(22, 15)

In [40]:
installs_clean.drop([29,31,90,217,284,290,339,341,358,388,409,455,549,577,583,762,763,1001,1007,1260,1337,1644], inplace=True)

#### 3.2.5.2 Test

In [41]:
installs_clean[installs_clean['lat'].isnull()]

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,fed_congressional_district,state_senate_district,state_house_district,lat,long


In [42]:
installs_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2124 entries, 0 to 2146
Data columns (total 15 columns):
sea_install_id                2124 non-null object
original_db                   2124 non-null object
zip                           2124 non-null object
fips                          2124 non-null object
town                          2124 non-null object
state                         2124 non-null object
system_type                   2124 non-null object
sector                        2124 non-null object
install_date                  2124 non-null datetime64[ns]
utility                       603 non-null object
fed_congressional_district    767 non-null float64
state_senate_district         767 non-null float64
state_house_district          767 non-null float64
lat                           2124 non-null float64
long                          2124 non-null float64
dtypes: datetime64[ns](1), float64(5), object(9)
memory usage: 265.5+ KB


In [43]:
installs_clean.drop(["fed_congressional_district", "state_senate_district", "state_house_district"], axis = 1, inplace = True) 

In [44]:
installs_clean[installs_clean['utility'].isnull()]

Unnamed: 0,sea_install_id,original_db,zip,fips,town,state,system_type,sector,install_date,utility,lat,long
27,southface_SF24,Southface,30318,13121,Atlanta,Georgia,Solar Electric,Non-Residential,2005-06-01 13:33:00,,33.775772,-84.402532
28,southface_SF25,Southface,30318,13121,Atlanta,Georgia,Solar Hot Water,Non-Residential,2005-06-01 13:33:00,,33.775772,-84.402532
30,southface_SF33,Southface,30240,13285,LaGrange,Georgia,Solar Electric,Non-Residential,2005-06-01 13:33:00,,33.000525,-85.056206
32,southface_SF20,Southface,30064,13067,Marietta,Georgia,Solar Electric,Residential,2005-06-01 13:33:00,,33.934810,-84.640828
33,southface_SF96,Southface,31774,13155,Ocilla,Georgia,Solar Electric,Non-Residential,2005-06-01 13:33:00,,31.581931,-83.240312
34,southface_SF111,Southface,30290,13113,Tyrone,Georgia,Solar Electric,Non-Residential,2005-06-01 13:33:00,,33.493975,-84.573428
35,southface_SF106,Southface,30677,13219,Watkinsville,Georgia,Solar Electric,Non-Residential,2005-06-01 13:33:00,,33.861476,-83.456300
36,southface_SF131a,Southface,30305,13121,Atlanta,Georgia,Solar Electric,Residential,2005-06-01 13:33:00,,33.823333,-84.399232
37,southface_SF93,Southface,30094,13247,Conyers,Georgia,Solar Electric,Non-Residential,2005-06-01 13:33:00,,33.627055,-84.047691
38,southface_SF94,Southface,30141,13223,Hiram,Georgia,Solar Electric,Non-Residential,2005-06-01 13:33:00,,33.878397,-84.771694


In [53]:
utilities_needed = installs_clean.copy()

In [54]:
utilities_needed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2124 entries, 0 to 2146
Data columns (total 12 columns):
sea_install_id    2124 non-null object
original_db       2124 non-null object
zip               2124 non-null object
fips              2124 non-null object
town              2124 non-null object
state             2124 non-null object
system_type       2124 non-null object
sector            2124 non-null object
install_date      2124 non-null datetime64[ns]
utility           603 non-null object
lat               2124 non-null float64
long              2124 non-null float64
dtypes: datetime64[ns](1), float64(2), object(9)
memory usage: 215.7+ KB


In [47]:
utilities_needed.drop(["sea_install_id", "original_db","fips","town","state","system_type","sector","install_date","lat","long"], axis = 1, inplace = True) 

In [49]:
utilities_needed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2124 entries, 0 to 2146
Data columns (total 2 columns):
zip        2124 non-null object
utility    603 non-null object
dtypes: object(2)
memory usage: 49.8+ KB


In [50]:
utilities_needed = utilities_needed[utilities_needed['utility'].isnull()]

In [51]:
utilities_needed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1521 entries, 27 to 2146
Data columns (total 2 columns):
zip        1521 non-null object
utility    0 non-null object
dtypes: object(2)
memory usage: 35.6+ KB


In [52]:
utilities_needed.to_csv('utilities_needed.csv', encoding='utf-8', index=False)

In [55]:
utils_identified = pd.read_csv('utils_identified.csv', encoding='utf-8')
utils_identified.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Data columns (total 2 columns):
zip        354 non-null int64
utility    354 non-null object
dtypes: int64(1), object(1)
memory usage: 5.6+ KB


In [None]:
utils_identified["zip"] = utils_identified['zip'].astype(str)
utils_identified["utility_new"] = utils_identified['name']
utils_identified.drop(["name"], axis = 1, inplace = True) 
utils_identified.info()

In [None]:
installs_clean.info()

In [None]:
utils_identified_merge = pd.merge(installs_clean, utils_identified, on = 'zip', how = 'left')

In [None]:
utils_identified_merge.sample(5)

In [None]:
utils_identified_merge[utils_identified_merge['utility_new'].isnull()]

In [None]:
utils_identified_merge.utility.fillna(utils_identified_merge.utility_new, inplace=True)
del utils_identified_merge['utility_new']
utils_identified_merge.head(4)

In [None]:
utils_identified_merge[utils_identified_merge['utility'].isnull()]

In [None]:
utils_identified_merge.dropna()

In [None]:
utils_identified_merge.info()

In [None]:
utils_identified_merge.duplicated().sum()

In [None]:
utils_identified_merge.drop_duplicates(keep=False,inplace=True) 

In [None]:
utils_identified_merge.duplicated().sum()

In [None]:
utils_identified_merge.info()

In [None]:
utils_identified_merge.sample(45)