In [1]:
import numpy as np
import pandas as pd
import os

import datetime
from datetime import datetime

# 1 Data Gathering - SolarView

https://github.com/social-energy-atlas/solarview-data

## 1.2 County-level Census Data

In [2]:
census = pd.read_csv('data/sea-county-census.csv', encoding='utf-8')
census.head(5)

Unnamed: 0,fips,med-income,owner-occ,pop-tot,dem-fem-pct,dem-male-pct,dem-white-pct,dem-baa-pct,dem-aian-pct,dem-a-pct,dem-nhpi-pct,dem-two-pct,dem-hl-pct,dem-vet,dem-hh
0,13001,"$37,135.00",71.10%,18428,50.10%,49.90%,69.50%,19.40%,0.60%,0.90%,0.20%,1.20%,9.60%,1053,8381
1,13003,"$30,933.00",72.00%,8273,49.90%,50.10%,56.20%,17.50%,1.40%,0.90%,1.10%,2.10%,25.20%,385,3429
2,13005,"$37,162.00",68.60%,11372,50.70%,49.30%,73.10%,16.60%,0.30%,0.60%,0.20%,1.60%,8.70%,559,4715
3,13007,"$44,297.00",73.60%,3150,51.40%,48.60%,47.00%,45.90%,0.40%,1.10%,0.10%,1.10%,5.60%,178,1620
4,13009,"$32,460.00",53.80%,45144,49.40%,50.60%,52.60%,42.40%,0.30%,1.80%,0.10%,1.20%,2.20%,2911,20277


# 2 Data Assessment
## 2.2 County-level Census Data

In [3]:
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 15 columns):
fips             159 non-null int64
med-income       159 non-null object
owner-occ        159 non-null object
pop-tot          159 non-null object
dem-fem-pct      159 non-null object
dem-male-pct     159 non-null object
dem-white-pct    159 non-null object
dem-baa-pct      159 non-null object
dem-aian-pct     159 non-null object
dem-a-pct        159 non-null object
dem-nhpi-pct     159 non-null object
dem-two-pct      159 non-null object
dem-hl-pct       159 non-null object
dem-vet          159 non-null object
dem-hh           159 non-null object
dtypes: int64(1), object(14)
memory usage: 18.7+ KB


- owner-occ,, dem-vet, and all pct columns are objects/strings and not floats
- median income, pop-tot, dem-hh, and dem-vet are objects/strings and not integers.

In [4]:
census.sample(5)

Unnamed: 0,fips,med-income,owner-occ,pop-tot,dem-fem-pct,dem-male-pct,dem-white-pct,dem-baa-pct,dem-aian-pct,dem-a-pct,dem-nhpi-pct,dem-two-pct,dem-hl-pct,dem-vet,dem-hh
100,13205,"$31,111.00",66.20%,22459,50.30%,49.70%,72.40%,23.00%,0.30%,1.00%,0,1.30%,2.30%,2009,9589
158,13321,"$38,684.00",70.30%,20748,51.70%,48.30%,52.60%,46.00%,1.40%,0.00%,0.00%,0.00%,0.00%,1499,7873
103,13211,"$54,506.00",74.80%,18170,50.50%,49.50%,83.00%,1.20%,1.00%,0.50%,0.30%,1.30%,14.60%,1943,14266
17,13035,"$41,667.00",71.70%,23817,46.80%,53.20%,66.90%,28.20%,0.30%,0.60%,0.00%,1.40%,3.20%,1511,7774
7,13015,"$48,893.00",64.30%,103807,50.70%,49.30%,78.10%,11.10%,0.60%,0.90%,0.10%,2.00%,8.50%,6928,40196


In [5]:
census.isnull().sum()

fips             0
med-income       0
owner-occ        0
pop-tot          0
dem-fem-pct      0
dem-male-pct     0
dem-white-pct    0
dem-baa-pct      0
dem-aian-pct     0
dem-a-pct        0
dem-nhpi-pct     0
dem-two-pct      0
dem-hl-pct       0
dem-vet          0
dem-hh           0
dtype: int64

In [6]:
census.duplicated().sum()

0

# 3 Data Cleaning

## 3.2.2 Copy Dataframe

### 3.2.2.1 Define
Copy all dataframes for cleaning purposes.

#### 3.2.2.2 Code

In [7]:
census_clean = census.copy()

#### 3.2.1.3 Test

In [8]:
census_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 15 columns):
fips             159 non-null int64
med-income       159 non-null object
owner-occ        159 non-null object
pop-tot          159 non-null object
dem-fem-pct      159 non-null object
dem-male-pct     159 non-null object
dem-white-pct    159 non-null object
dem-baa-pct      159 non-null object
dem-aian-pct     159 non-null object
dem-a-pct        159 non-null object
dem-nhpi-pct     159 non-null object
dem-two-pct      159 non-null object
dem-hl-pct       159 non-null object
dem-vet          159 non-null object
dem-hh           159 non-null object
dtypes: int64(1), object(14)
memory usage: 18.7+ KB


## 3.2 Cleaning
### 3.2.1 Column Naming
#### 3.2.2.1 Define
Rename all columns so that hyphens are replaced with underscores so that future codes can be run successfully.

#### 3.2.2.2 Code

In [9]:
census_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 15 columns):
fips             159 non-null int64
med-income       159 non-null object
owner-occ        159 non-null object
pop-tot          159 non-null object
dem-fem-pct      159 non-null object
dem-male-pct     159 non-null object
dem-white-pct    159 non-null object
dem-baa-pct      159 non-null object
dem-aian-pct     159 non-null object
dem-a-pct        159 non-null object
dem-nhpi-pct     159 non-null object
dem-two-pct      159 non-null object
dem-hl-pct       159 non-null object
dem-vet          159 non-null object
dem-hh           159 non-null object
dtypes: int64(1), object(14)
memory usage: 18.7+ KB


In [10]:
census_clean = census_clean.rename(columns={"med-income": "med_income",
                                            "owner-occ": "owner_occ", 
                                            "pop-tot": "pop_tot", 
                                            "dem-fem-pct": "dem_fem_pct", 
                                            "dem-male-pct": "dem_male_pct", 
                                            "dem-white-pct": "dem_white_pct",
                                            "dem-baa-pct": "dem_baa_pct",
                                            "dem-aian-pct": "dem_aian_pct",
                                            "dem-a-pct": "dem_a_pct",
                                            "dem-nhpi-pct": "dem_nhpi_pct",
                                            "dem-two-pct": "dem_two_pct",
                                            "dem-hl-pct": "dem_hl_pct",
                                            "dem-vet": "dem_vet",
                                            "dem-hh": "dem_hh"
                                           })

#### 3.2.2.3 Test

In [11]:
census_clean.head()

Unnamed: 0,fips,med_income,owner_occ,pop_tot,dem_fem_pct,dem_male_pct,dem_white_pct,dem_baa_pct,dem_aian_pct,dem_a_pct,dem_nhpi_pct,dem_two_pct,dem_hl_pct,dem_vet,dem_hh
0,13001,"$37,135.00",71.10%,18428,50.10%,49.90%,69.50%,19.40%,0.60%,0.90%,0.20%,1.20%,9.60%,1053,8381
1,13003,"$30,933.00",72.00%,8273,49.90%,50.10%,56.20%,17.50%,1.40%,0.90%,1.10%,2.10%,25.20%,385,3429
2,13005,"$37,162.00",68.60%,11372,50.70%,49.30%,73.10%,16.60%,0.30%,0.60%,0.20%,1.60%,8.70%,559,4715
3,13007,"$44,297.00",73.60%,3150,51.40%,48.60%,47.00%,45.90%,0.40%,1.10%,0.10%,1.10%,5.60%,178,1620
4,13009,"$32,460.00",53.80%,45144,49.40%,50.60%,52.60%,42.40%,0.30%,1.80%,0.10%,1.20%,2.20%,2911,20277


## 3.2.3 Convert strings to numbers
### 3.2.3.1 Define
Convert med_income, pop_tot, dem_vet, and dem_hh to an integer and all of the census population proportions into floats.

### 3.2.3.2 Code

In [12]:
census_clean['med_income'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
census_clean['pop_tot'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
census_clean['dem_vet'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
census_clean['dem_hh'].replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
census_clean['owner_occ'].replace(regex=True,inplace=True,to_replace=r'.$',value=r'')
census_clean['dem_fem_pct'].replace(regex=True,inplace=True,to_replace=r'.$',value=r'')
census_clean['dem_male_pct'].replace(regex=True,inplace=True,to_replace=r'.$',value=r'')
census_clean['dem_white_pct'].replace(regex=True,inplace=True,to_replace=r'.$',value=r'')
census_clean['dem_baa_pct'].replace(regex=True,inplace=True,to_replace=r'.$',value=r'')
census_clean['dem_aian_pct'].replace(regex=True,inplace=True,to_replace=r'.$',value=r'')
census_clean['dem_a_pct'].replace(regex=True,inplace=True,to_replace=r'.$',value=r'')
census_clean['dem_nhpi_pct'].replace(regex=True,inplace=True,to_replace=r'.$',value=r'')
census_clean['dem_two_pct'].replace(regex=True,inplace=True,to_replace=r'.$',value=r'')
census_clean['dem_hl_pct'].replace(regex=True,inplace=True,to_replace=r'.$',value=r'')

In [13]:
census_clean.head()

Unnamed: 0,fips,med_income,owner_occ,pop_tot,dem_fem_pct,dem_male_pct,dem_white_pct,dem_baa_pct,dem_aian_pct,dem_a_pct,dem_nhpi_pct,dem_two_pct,dem_hl_pct,dem_vet,dem_hh
0,13001,3713500,71.1,18428,50.1,49.9,69.5,19.4,0.6,0.9,0.2,1.2,9.6,1053,8381
1,13003,3093300,72.0,8273,49.9,50.1,56.2,17.5,1.4,0.9,1.1,2.1,25.2,385,3429
2,13005,3716200,68.6,11372,50.7,49.3,73.1,16.6,0.3,0.6,0.2,1.6,8.7,559,4715
3,13007,4429700,73.6,3150,51.4,48.6,47.0,45.9,0.4,1.1,0.1,1.1,5.6,178,1620
4,13009,3246000,53.8,45144,49.4,50.6,52.6,42.4,0.3,1.8,0.1,1.2,2.2,2911,20277


In [14]:
census_clean['fips'] = census_clean['fips'].astype(str)
census_clean['med_income'] = census_clean['med_income'].astype(int)
census_clean['pop_tot'] = census_clean['pop_tot'].astype(int)
census_clean['dem_vet'] = census_clean['dem_vet'].astype(int)
census_clean['dem_hh'] = census_clean['dem_hh'].astype(int)
census_clean['owner_occ'] = census_clean['owner_occ'].astype(float)
census_clean['dem_fem_pct'] = census_clean['dem_fem_pct'].astype(float)
census_clean['dem_male_pct'] = census_clean['dem_male_pct'].astype(float)
census_clean['dem_white_pct'] = census_clean['dem_white_pct'].astype(float)
census_clean['dem_baa_pct'] = census_clean['dem_baa_pct'].astype(float)
census_clean['dem_aian_pct'] = census_clean['dem_aian_pct'].astype(float)
census_clean['dem_a_pct'] = census_clean['dem_a_pct'].astype(float)
census_clean['dem_two_pct'] = census_clean['dem_two_pct'].astype(float)

#### 3.2.3.3 Test

In [15]:
census_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 15 columns):
fips             159 non-null object
med_income       159 non-null int64
owner_occ        159 non-null float64
pop_tot          159 non-null int64
dem_fem_pct      159 non-null float64
dem_male_pct     159 non-null float64
dem_white_pct    159 non-null float64
dem_baa_pct      159 non-null float64
dem_aian_pct     159 non-null float64
dem_a_pct        159 non-null float64
dem_nhpi_pct     159 non-null object
dem_two_pct      159 non-null float64
dem_hl_pct       159 non-null object
dem_vet          159 non-null int64
dem_hh           159 non-null int64
dtypes: float64(8), int64(4), object(3)
memory usage: 18.7+ KB


In [16]:
census_clean.head()

Unnamed: 0,fips,med_income,owner_occ,pop_tot,dem_fem_pct,dem_male_pct,dem_white_pct,dem_baa_pct,dem_aian_pct,dem_a_pct,dem_nhpi_pct,dem_two_pct,dem_hl_pct,dem_vet,dem_hh
0,13001,3713500,71.1,18428,50.1,49.9,69.5,19.4,0.6,0.9,0.2,1.2,9.6,1053,8381
1,13003,3093300,72.0,8273,49.9,50.1,56.2,17.5,1.4,0.9,1.1,2.1,25.2,385,3429
2,13005,3716200,68.6,11372,50.7,49.3,73.1,16.6,0.3,0.6,0.2,1.6,8.7,559,4715
3,13007,4429700,73.6,3150,51.4,48.6,47.0,45.9,0.4,1.1,0.1,1.1,5.6,178,1620
4,13009,3246000,53.8,45144,49.4,50.6,52.6,42.4,0.3,1.8,0.1,1.2,2.2,2911,20277


## 3.2.4 Replace empty cells with zeros for NHPI
### 3.2.4.1 Define
Insert zeros into empty cells for the NHPI demographic proportion.
### 3.2.4.2 Code

In [17]:
census_clean[census_clean.dem_nhpi_pct =='']

Unnamed: 0,fips,med_income,owner_occ,pop_tot,dem_fem_pct,dem_male_pct,dem_white_pct,dem_baa_pct,dem_aian_pct,dem_a_pct,dem_nhpi_pct,dem_two_pct,dem_hl_pct,dem_vet,dem_hh
47,13097,5738400,65.7,142224,53.1,46.9,46.0,50.2,0.5,0.7,,0.9,2.2,628,4023
50,13103,6282000,77.0,58712,51.9,48.1,63.9,29.1,0.4,0.8,,1.2,5.6,1612,7611
51,13105,3573900,70.0,19143,50.7,49.3,59.8,34.5,0.4,0.6,,1.0,4.6,1254,8227
53,13109,4059400,64.5,10670,51.3,48.7,95.0,0.8,0.4,0.5,,1.3,2.2,2117,9927
58,13119,3729800,67.7,22320,51.6,48.4,40.1,44.5,0.3,7.0,,2.1,7.3,44718,385103
60,13123,4377500,75.2,29733,50.6,49.4,88.6,9.2,0.5,0.1,,1.5,1.5,165,1497
68,13139,5190200,65.8,196637,45.1,54.9,24.2,71.8,0.5,1.0,,0.8,2.2,489,2813
69,13141,2638600,76.9,8640,51.4,48.6,91.2,4.8,0.3,0.7,,1.5,1.7,1855,10886
71,13145,6533600,83.6,33652,50.5,49.5,75.0,19.0,0.2,1.0,,1.5,3.7,1696,10016
72,13147,3798300,72.5,25553,50.3,49.7,84.8,10.1,0.5,0.6,,1.9,2.7,886,4364


In [18]:
census_clean.dem_nhpi_pct = census_clean.dem_nhpi_pct[census_clean.dem_nhpi_pct ==''] = '0'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
census_clean['dem_hl_pct'] = census_clean['dem_hl_pct'].astype(float)

#### 3.2.4.3 Test

In [20]:
census_clean[census_clean.dem_nhpi_pct =='']

Unnamed: 0,fips,med_income,owner_occ,pop_tot,dem_fem_pct,dem_male_pct,dem_white_pct,dem_baa_pct,dem_aian_pct,dem_a_pct,dem_nhpi_pct,dem_two_pct,dem_hl_pct,dem_vet,dem_hh


In [21]:
census_clean.head()

Unnamed: 0,fips,med_income,owner_occ,pop_tot,dem_fem_pct,dem_male_pct,dem_white_pct,dem_baa_pct,dem_aian_pct,dem_a_pct,dem_nhpi_pct,dem_two_pct,dem_hl_pct,dem_vet,dem_hh
0,13001,3713500,71.1,18428,50.1,49.9,69.5,19.4,0.6,0.9,0,1.2,9.6,1053,8381
1,13003,3093300,72.0,8273,49.9,50.1,56.2,17.5,1.4,0.9,0,2.1,25.2,385,3429
2,13005,3716200,68.6,11372,50.7,49.3,73.1,16.6,0.3,0.6,0,1.6,8.7,559,4715
3,13007,4429700,73.6,3150,51.4,48.6,47.0,45.9,0.4,1.1,0,1.1,5.6,178,1620
4,13009,3246000,53.8,45144,49.4,50.6,52.6,42.4,0.3,1.8,0,1.2,2.2,2911,20277


## 3.2.5 Turn _pct columns into proportions
### 3.2.5.1 Define
Multiply all _pct columns by 0.01 in order to finish the transformation into a proportion rate.

### 3.2.5.2 Code

In [22]:
census_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 15 columns):
fips             159 non-null object
med_income       159 non-null int64
owner_occ        159 non-null float64
pop_tot          159 non-null int64
dem_fem_pct      159 non-null float64
dem_male_pct     159 non-null float64
dem_white_pct    159 non-null float64
dem_baa_pct      159 non-null float64
dem_aian_pct     159 non-null float64
dem_a_pct        159 non-null float64
dem_nhpi_pct     159 non-null object
dem_two_pct      159 non-null float64
dem_hl_pct       159 non-null float64
dem_vet          159 non-null int64
dem_hh           159 non-null int64
dtypes: float64(9), int64(4), object(2)
memory usage: 18.7+ KB


In [23]:
census_clean['owner_occ'] = census_clean['owner_occ']*0.01
census_clean['dem_fem_pct'] = census_clean['dem_fem_pct']*0.01
census_clean['dem_male_pct'] = census_clean['dem_male_pct']*0.01
census_clean['dem_white_pct'] = census_clean['dem_white_pct']*0.01
census_clean['dem_baa_pct'] = census_clean['dem_baa_pct']*0.01
census_clean['dem_aian_pct'] = census_clean['dem_aian_pct']*0.01
census_clean['dem_a_pct'] = census_clean['dem_a_pct']*0.01
census_clean['dem_two_pct'] = census_clean['dem_two_pct']*0.01
census_clean['dem_hl_pct'] = census_clean['dem_hl_pct']*0.01

#### 3.2.5.3 Test

In [24]:
census_clean.head(5)

Unnamed: 0,fips,med_income,owner_occ,pop_tot,dem_fem_pct,dem_male_pct,dem_white_pct,dem_baa_pct,dem_aian_pct,dem_a_pct,dem_nhpi_pct,dem_two_pct,dem_hl_pct,dem_vet,dem_hh
0,13001,3713500,0.711,18428,0.501,0.499,0.695,0.194,0.006,0.009,0,0.012,0.096,1053,8381
1,13003,3093300,0.72,8273,0.499,0.501,0.562,0.175,0.014,0.009,0,0.021,0.252,385,3429
2,13005,3716200,0.686,11372,0.507,0.493,0.731,0.166,0.003,0.006,0,0.016,0.087,559,4715
3,13007,4429700,0.736,3150,0.514,0.486,0.47,0.459,0.004,0.011,0,0.011,0.056,178,1620
4,13009,3246000,0.538,45144,0.494,0.506,0.526,0.424,0.003,0.018,0,0.012,0.022,2911,20277


## Store the Data

In [25]:
census_clean.to_csv('clean_data/census_clean.csv', encoding='utf-8', index=False)