In [1]:
import requests
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import cm as cm

import seaborn as sns 
import datetime
from datetime import datetime

# 1 Data Gathering - SolarView

https://github.com/social-energy-atlas/solarview-data

## 1.1 Installations

## 1.9 Zipcode Table

In [2]:
# Load the ZIP-code/county lookup table, then preview its first five rows
# (head() defaults to n=5).
zipcode = pd.read_csv(
    'data/sea-zipcode-county.csv',
    encoding='utf-8',
)
zipcode.head()

Unnamed: 0,sea-zip-id,zip,fips,year
0,zip-2018-1,31557,13001,2018
1,zip-2018-2,31513,13001,2018
2,zip-2018-3,31518,13001,2018
3,zip-2018-4,31539,13001,2018
4,zip-2018-5,31560,13001,2018


# 2 Data Assessment
## 2.1 Installations

## 2.9 Zipcode Table

In [3]:
# Summarize the raw table: row count, column dtypes, and non-null counts.
zipcode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13051 entries, 0 to 13050
Data columns (total 4 columns):
sea-zip-id    13051 non-null object
zip           13051 non-null int64
fips          13051 non-null int64
year          13051 non-null int64
dtypes: int64(3), object(1)
memory usage: 407.9+ KB


In [4]:
# Preview the first five rows of the raw table.
zipcode.head(5)

Unnamed: 0,sea-zip-id,zip,fips,year
0,zip-2018-1,31557,13001,2018
1,zip-2018-2,31513,13001,2018
2,zip-2018-3,31518,13001,2018
3,zip-2018-4,31539,13001,2018
4,zip-2018-5,31560,13001,2018


In [5]:
# Missing-value count per column (isna is the same check as isnull).
zipcode.isna().sum()

sea-zip-id    0
zip           0
fips          0
year          0
dtype: int64

In [6]:
# Count rows that exactly repeat an earlier row across all columns.
zipcode.duplicated().sum()

0

- `zip` and `fips` are stored as integers rather than strings
- `sea-zip-id` needs to be renamed (hyphens replaced with underscores)

# 3 Data Cleaning

## 3.1 Copy All Dataframes

### 3.1.1 Define
Copy all dataframes for cleaning purposes.

#### 3.1.1.2 Code

In [7]:
# Work on a copy so the cleaning steps leave the raw `zipcode` frame unchanged.
zipcode_clean = zipcode.copy()

#### 3.1.1.3 Test

In [8]:
# Confirm the copy has the same columns, dtypes, and row count as the raw table.
zipcode_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13051 entries, 0 to 13050
Data columns (total 4 columns):
sea-zip-id    13051 non-null object
zip           13051 non-null int64
fips          13051 non-null int64
year          13051 non-null int64
dtypes: int64(3), object(1)
memory usage: 407.9+ KB


## 3.2 Column Naming
### 3.2.1 Installations
#### 3.2.1.1 Define
Rename all columns, replacing hyphens with underscores so that later code runs successfully, and correct any spelling errors.

#### 3.2.1.2 Code

In [9]:
# Map each hyphenated column name to its underscore equivalent.
column_renames = {"sea-zip-id": "sea_zip_id"}
zipcode_clean = zipcode_clean.rename(columns=column_renames)

#### 3.2.1.3 Test

In [10]:
# Check that the renamed column appears in the header.
zipcode_clean.head()

Unnamed: 0,sea_zip_id,zip,fips,year
0,zip-2018-1,31557,13001,2018
1,zip-2018-2,31513,13001,2018
2,zip-2018-3,31518,13001,2018
3,zip-2018-4,31539,13001,2018
4,zip-2018-5,31560,13001,2018


### 3.2.2 zipcode and fips string
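#### 3.2.2.1 Define
Convert the `zip` and `fips` columns from integers to strings, since they are identifier codes rather than numeric quantities.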

#### 3.2.2.2 Code

In [11]:
# ZIP and county FIPS codes are fixed-width, five-character identifiers, not
# quantities. Because they were loaded as integers, any leading zeros are gone
# (e.g. ZIP 02134 is held as 2134), so a bare astype(str) would produce
# four-digit strings. Left-pad with zeros to restore the full five characters.
# Re-running this cell is safe: already-padded strings are left unchanged.
zipcode_clean['zip'] = zipcode_clean['zip'].astype(str).str.zfill(5)
zipcode_clean['fips'] = zipcode_clean['fips'].astype(str).str.zfill(5)

In [12]:
# Verify that `zip` and `fips` are now object (string) columns.
zipcode_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13051 entries, 0 to 13050
Data columns (total 4 columns):
sea_zip_id    13051 non-null object
zip           13051 non-null object
fips          13051 non-null object
year          13051 non-null int64
dtypes: int64(1), object(3)
memory usage: 407.9+ KB


DateTime

In [13]:
# Convert the integer calendar year in each row to a timestamp (January 1 of
# that year). Assigning a single datetime.strptime(...) result here would
# broadcast one constant timestamp to every row and discard the real years.
zipcode_clean['year'] = pd.to_datetime(
    zipcode_clean['year'].astype(str),
    format='%Y',
)

In [14]:
# Verify that `year` is now a datetime64[ns] column.
zipcode_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13051 entries, 0 to 13050
Data columns (total 4 columns):
sea_zip_id    13051 non-null object
zip           13051 non-null object
fips          13051 non-null object
year          13051 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 407.9+ KB


## Store Data

In [15]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
os.makedirs('clean_data', exist_ok=True)
# index=False keeps the RangeIndex out of the file so it does not come back
# as an extra unnamed column when the CSV is read again.
zipcode_clean.to_csv('clean_data/zipcode_clean.csv', encoding='utf-8', index=False)