In [1]:
import os
# Project root that all relative "data/..." paths below are resolved against.
# Set the ISYE6414_PROJECT_DIR environment variable to run the notebook on another
# machine; when it is unset the original author's location is used unchanged.
PROJECT_DIR = os.environ.get(
    "ISYE6414_PROJECT_DIR",
    "/Users/tessleggio/GoogleDrive/GeorgiaTech/2018-Fall/01-ISYE-6414/6414Project",
)
os.chdir(PROJECT_DIR)
print(os.getcwd())

In [2]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandasql as ps
import glob
from dateutil import parser

import warnings
warnings.filterwarnings('ignore')

In [3]:
# State lookup table. The emissions merge further down joins on its "Abbreviation"
# column to recover the full state name for each record.
states = pd.read_csv("data/States.csv")

In [4]:
def cap(string):
    """Normalise a county/state label to a trimmed, capitalised name.

    The label is stripped of surrounding whitespace, every " COUNTY" occurrence
    (case-insensitive, because the text is upper-cased first) is removed, and the
    result is returned with only its first character in upper case.

    Parameters
    ----------
    string : object
        The raw label. Anything that is not a ``str`` (e.g. NaN from a failed
        merge) is treated as missing.

    Returns
    -------
    str
        The cleaned label, or "" when ``string`` is not a ``str``.
    """
    if not isinstance(string, str):
        return ""

    # Strip BEFORE capitalising: str.capitalize() only upper-cases the character at
    # position 0, so a leading blank would leave the whole name in lower case.
    without_suffix = string.strip().upper().replace(" COUNTY", "")

    # Removing the suffix can expose inner whitespace at the edges; trim again.
    return without_suffix.strip().capitalize()

In [5]:
#--------------------READING AND PRE-PROCESSING GREENHOUSE GAS INCOME DATA--------------------
#---------------------------------------------------------------------------------------------

def get_emissions(filename, year):
    """Load the "Direct Emitters" sheet of one workbook and tag it with a year.

    Parameters
    ----------
    filename : str
        Path of the Excel workbook to read.
    year : int
        Value written to the added "Year" column for every row.

    Returns
    -------
    pandas.DataFrame
        The columns labelled "City" through "Stationary Combustion" (inclusive
        label slice) followed by a constant "Year" column.
    """
    # The context manager closes the workbook handle even if parsing raises.
    with pd.ExcelFile(filename) as xls:
        # The first three rows of the sheet are skipped before the header is read.
        sheet = xls.parse("Direct Emitters", skiprows=3, index_col=None, na_values=[np.nan])

    # .copy() so the "Year" assignment writes to an independent frame rather than
    # to a slice of `sheet` (which would trigger SettingWithCopyWarning).
    emissions = sheet.loc[:, "City":"Stationary Combustion"].copy()
    emissions["Year"] = year
    return emissions


def get_all_emissions():
    """Read every ``data/ghgp_*`` workbook and stack the results row-wise.

    The reporting year is the third underscore-separated token of each file
    name (``ghgp_data_2014_8_19_2018.xlsx`` -> 2014). Each path is printed as it
    is processed.

    Returns
    -------
    pandas.DataFrame
        Concatenation of ``get_emissions(path, year)`` for every matching file,
        in sorted file-name order.

    Raises
    ------
    FileNotFoundError
        If no file matches ``data/ghgp_*``.
    """
    # glob returns matches in arbitrary order; sorting makes the row order reproducible.
    files = sorted(glob.glob("data/ghgp_*"))
    if not files:
        raise FileNotFoundError("No emissions workbooks found matching 'data/ghgp_*'")

    yearly_frames = []
    for path in files:
        # Parse the year from the file name only, so underscores in a directory
        # name cannot shift the token position.
        year = int(os.path.basename(path).split("_")[2])
        print(path)
        yearly_frames.append(get_emissions(path, year))

    # A single concat at the end avoids re-copying the growing frame on every iteration.
    return pd.concat(yearly_frames)

# Load every yearly emissions workbook into a single frame.
data4 = get_all_emissions()

# Attach the full state name by joining the state code against the lookup table;
# the code column ("State_x") and the lookup's "Abbreviation" are dropped afterwards.
data4 = pd.merge(
    data4,
    states,
    how="left",
    left_on=["State"],
    right_on=["Abbreviation"],
).drop(["State_x", "Abbreviation"], axis=1)

# Positional column selection: the three key columns first, then the 14 emission
# measures. NOTE(review): positions 23, 24 and 3 are assumed to be year, full
# state name and county respectively (that is how they are renamed below) -
# confirm against the workbook layout.
emission_positions = list(range(9, 23))
data4 = data4.iloc[:, [23, 24, 3] + emission_positions]

# Friendly column names, in the same order as the positional selection above.
key_columns = ["Year", "State", "County"]
emission_columns = [
    "Total Emissions", "CO2", "Methane", "Nitrous Oxide", "HFC", "PFC", "SF6",
    "NF3", "Other Fluorane", "HFE", "Short Lived Compounds", "Other GHG",
    "Biogenic CO2", "Stationary Combustion",
]
data4.columns = key_columns + emission_columns

# Clean the county and state labels with cap(); non-string values become "".
for label_column in ("County", "State"):
    data4[label_column] = data4[label_column].apply(cap)

# Collapse to one row per (Year, State, County), summing each measure while ignoring NaN.
nan_sum_spec = {measure: [np.nansum] for measure in emission_columns}
data4 = data4.groupby(key_columns).agg(nan_sum_spec).reset_index()

# The list-valued aggregation spec creates a two-level column index; keep only the top level.
data4.columns = data4.columns.droplevel(1)

data/ghgp_data_2014_8_19_2018.xlsx
data/ghgp_data_2013_8_19_2018.xlsx
data/ghgp_data_2012_8_19_2018.xlsx
data/ghgp_data_2015_8_19_2018.xlsx
data/ghgp_data_2010_8_19_2018.xlsx
data/ghgp_data_2011_8_19_2018.xlsx
data/ghgp_data_2016_8_19_2018.xlsx


In [6]:
# Preview the first rows of the aggregated (Year, State, County) emissions table.
data4.head()

Unnamed: 0,Year,State,County,Total Emissions,CO2,Methane,Nitrous Oxide,HFC,PFC,SF6,NF3,Other Fluorane,HFE,Short Lived Compounds,Other GHG,Biogenic CO2,Stationary Combustion
0,2010,,Arecibo,302168.146,148171.2,153638.75,358.196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,148679.646
1,2010,,Barceloneta,117314.526,116917.8,117.5,279.226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117314.526
2,2010,,Carolina,24387.246,24301.8,25.25,60.196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24387.246
3,2010,,Carolina municipio,67573.35,0.6,67572.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6
4,2010,,Ceiba,1340.728,1336.2,1.25,3.278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1340.728


In [23]:
# Work on a copy so data4 stays unchanged for the sanity-check queries that read it.
ghg = data4.copy()

# Rename DC to its formal name, then lower-case and trim the state labels.
state_labels = ghg['State'].str.replace('Washington dc', 'district of columbia')
ghg['State'] = state_labels.str.lower().str.strip()

# Same lower-case / trim normalisation for the county labels.
ghg['County'] = ghg['County'].str.lower().str.strip()

# A state label of exactly 'sout' is mapped to South Dakota.
is_sout = ghg['State'] == 'sout'
ghg['State'] = np.where(is_sout, 'south dakota', ghg['State'])

# Convert empty strings to NaN in every column. Rows are kept, not dropped.
ghg = ghg.replace('', np.nan)
ghg.head()

Unnamed: 0,Year,State,County,Total Emissions,CO2,Methane,Nitrous Oxide,HFC,PFC,SF6,NF3,Other Fluorane,HFE,Short Lived Compounds,Other GHG,Biogenic CO2,Stationary Combustion
0,2010,,arecibo,302168.146,148171.2,153638.75,358.196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,148679.646
1,2010,,barceloneta,117314.526,116917.8,117.5,279.226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117314.526
2,2010,,carolina,24387.246,24301.8,25.25,60.196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24387.246
3,2010,,carolina municipio,67573.35,0.6,67572.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6
4,2010,,ceiba,1340.728,1336.2,1.25,3.278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1340.728


In [24]:
# List the distinct state labels present in ghg after cleaning
# (a blank first row in the result corresponds to records with no state value).
query="""SELECT DISTINCT State FROM ghg
"""
ps.sqldf(query=query)

Unnamed: 0,State
0,
1,alabama
2,alaska
3,arizona
4,arkansas
5,california
6,colorado
7,connecticut
8,delaware
9,florida


In [8]:
# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs("./cleaned_data", exist_ok=True)
# index=False keeps the positional row index out of the file.
ghg.to_csv("./cleaned_data/ghg_cleaning.csv", index=False, header=True)

In [9]:
#Checking the number of records for each combination of key variables.
#The inner query counts rows per (Year, State, County); the outer query lists the
#distinct counts. A single result of 1 means the key uniquely identifies every row of data4.
query="""SELECT DISTINCT Count FROM (SELECT Year, State, County, COUNT(*) AS Count FROM data4 GROUP BY Year, State, County)"""
ps.sqldf(query=query)

Unnamed: 0,Count
0,1


In [10]:
#Counting data4 rows per State. data4 holds one row per (Year, State, County), so
#this is the number of county-year records in each state, NOT the number of
#distinct counties (that would need COUNT(DISTINCT County)).
query = """SELECT State, COUNT(County) from data4 GROUP BY State"""
ps.sqldf(query=query)

Unnamed: 0,State,COUNT(County)
0,,178
1,Alabama,367
2,Alaska,119
3,Arizona,100
4,Arkansas,289
5,California,296
6,Colorado,216
7,Connecticut,56
8,Delaware,21
9,Florida,353


In [11]:
#Obtaining map from State Code/County Code to County Name:
# The context manager closes the workbook handle once the sheet has been read.
with pd.ExcelFile("data/all-geocodes-v2016.xlsx") as xls:
    geo_raw = xls.parse("Sheet1", skiprows=4, index_col=None, na_values=[np.nan])

area_name_col = "Area Name (including legal/statistical area description)"

# Rows whose county, place and consolidated-city codes are all zero are treated as
# state-level entries. The "Consolidtated" spelling is the column label this code
# looks up - keep it exactly as is.
is_state_row = (
    (geo_raw["County Code (FIPS)"] == 0)
    & (geo_raw["Place Code (FIPS)"] == 0)
    & (geo_raw["Consolidtated City Code (FIPS)"] == 0)
)
statecd = geo_raw.loc[is_state_row, ["State Code (FIPS)", area_name_col]].reset_index(drop=True)

# Every row with a non-zero county code is kept as a county entry.
# NOTE(review): if the sheet also lists sub-county areas with a non-zero county
# code they would be included here too - confirm against the workbook.
countycd = geo_raw.loc[
    geo_raw["County Code (FIPS)"] != 0,
    ["State Code (FIPS)", "County Code (FIPS)", area_name_col],
]

# Pair each county row with its state row; the shared area-name column is
# suffixed _x (county) and _y (state) by the merge.
data_geo = pd.merge(countycd, statecd, how="inner", on="State Code (FIPS)")

In [12]:
# Preview only the first rows instead of rendering the entire lookup table.
data_geo.head(10)

Unnamed: 0,State Code (FIPS),County Code (FIPS),Area Name (including legal/statistical area description)_x,Area Name (including legal/statistical area description)_y
0,1,1,Autauga County,Alabama
1,1,3,Baldwin County,Alabama
2,1,5,Barbour County,Alabama
3,1,7,Bibb County,Alabama
4,1,9,Blount County,Alabama
5,1,11,Bullock County,Alabama
6,1,13,Butler County,Alabama
7,1,15,Calhoun County,Alabama
8,1,17,Chambers County,Alabama
9,1,19,Cherokee County,Alabama
