In [1]:
import os
os.chdir("/Users/tessleggio/GoogleDrive/GeorgiaTech/2018-Fall/01-ISYE-6414/6414Project")
print(os.getcwd())

/Users/tessleggio/GoogleDrive/GeorgiaTech/2018-Fall/01-ISYE-6414/6414Project


In [2]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandasql as ps
import glob
from dateutil import parser

import warnings
warnings.filterwarnings('ignore')

In [5]:
states = pd.read_csv("data/States.csv")

In [7]:
def cap(string):
    if isinstance(string, str):
        return string.upper().replace(" COUNTY", "").capitalize().strip()
    else:
        return ""

In [8]:
#--------------------READING AND PRE-PROCESSING GREENHOUSE GAS INCOME DATA--------------------
#---------------------------------------------------------------------------------------------

def get_emissions(filename, year):
    
    #Loading data, keeping only relevant columns, and adding year as a constant variable
    xls = pd.ExcelFile(filename)
    data_test = xls.parse("Direct Emitters", skiprows=3, index_col=None, na_values=[np.nan])
    data_test = data_test.loc[:, "City":"Stationary Combustion"]
    data_test["Year"] = year
    return data_test


def get_all_emissions():
    files = glob.glob("data/ghgp_*")
    
    #Combining data from all different years
    for i in range(len(files)):
        year = int(files[i].split("_")[2])
        #print(year)
        print(files[i])
        
        if i == 0:
            data4 = get_emissions(files[i], year)
        else:
            upd = get_emissions(files[i], year)
            data4 = pd.concat([data4, upd])
    
    return data4

#Getting emissions data
data4 = get_all_emissions()

#Get full state name from abbreviation
data4 = pd.merge(data4, states, how="left", left_on=["State"], right_on=["Abbreviation"]).drop(["State_x", "Abbreviation"], axis=1)

#Dropping variables
data4 = data4.iloc[:, [23, 24, 3] + list(range(9, 23))]

#Renaming variables
data4.columns = ["Year", "State", "County", "Total Emissions", "CO2", "Methane",
                 "Nitrous Oxide", "HFC", "PFC", "SF6", "NF3", "Other Fluorane", 
                 "HFE", "Short Lived Compounds", "Other GHG", "Biogenic CO2", "Stationary Combustion"]

#Removing trailing blanks from county and state variables
data4["County"] = data4["County"].apply(cap)
data4["State"] = data4["State"].apply(cap)

#Grouping data into key variables and obtaining emission totals
data4 = data4.groupby(["Year", "State", "County"]).agg({"Total Emissions":[np.nansum],
                                               "CO2":[np.nansum],
                                              "Methane":[np.nansum],
                                               "Nitrous Oxide":[np.nansum],
                                               "HFC":[np.nansum],
                                               "PFC":[np.nansum],
                                               "SF6":[np.nansum],
                                               "NF3":[np.nansum],
                                               "Other Fluorane":[np.nansum],
                                               "HFE":[np.nansum],
                                               "Short Lived Compounds":[np.nansum],
                                               "Other GHG":[np.nansum],
                                               "Biogenic CO2":[np.nansum],
                                               "Stationary Combustion":[np.nansum]}).reset_index()

#Renaming after grouping
data4.columns = data4.columns.droplevel(1)

data/ghgp_data_2014_8_19_2018.xlsx
data/ghgp_data_2013_8_19_2018.xlsx
data/ghgp_data_2012_8_19_2018.xlsx
data/ghgp_data_2015_8_19_2018.xlsx
data/ghgp_data_2010_8_19_2018.xlsx
data/ghgp_data_2011_8_19_2018.xlsx
data/ghgp_data_2016_8_19_2018.xlsx


In [9]:
data4

Unnamed: 0,Year,State,County,Total Emissions,CO2,Methane,Nitrous Oxide,HFC,PFC,SF6,NF3,Other Fluorane,HFE,Short Lived Compounds,Other GHG,Biogenic CO2,Stationary Combustion
0,2010,,Arecibo,3.021681e+05,148171.2,153638.75000,358.196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,148679.646
1,2010,,Barceloneta,1.173145e+05,116917.8,117.50000,279.226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117314.526
2,2010,,Carolina,2.438725e+04,24301.8,25.25000,60.196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24387.246
3,2010,,Carolina municipio,6.757335e+04,0.6,67572.75000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.600
4,2010,,Ceiba,1.340728e+03,1336.2,1.25000,3.278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1340.728
5,2010,,Fajardo,3.058360e+04,6.6,30577.00000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.600
6,2010,,Guam,1.270954e+06,1266667.0,1272.00000,3014.568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1270953.568
7,2010,,Guayama,3.291173e+06,3264320.3,9826.00000,17027.124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3291173.424
8,2010,,Guayanilla,3.199043e+06,3188445.7,3131.50000,7465.794,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3199042.994
9,2010,,Guaynabo,6.413260e+04,150.1,63982.50000,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,150.350


In [10]:
#Checking Number of records for each combination of key variables
query="""SELECT DISTINCT Count FROM (SELECT Year, State, County, COUNT(*) AS Count FROM data4 GROUP BY Year, State, County)"""
ps.sqldf(query=query)

Unnamed: 0,Count
0,1


In [11]:
#Checking Number of Counties for each State
query = """SELECT State, COUNT(County) from data4 GROUP BY County"""
ps.sqldf(query=query)

Unnamed: 0,State,COUNT(County)
0,Alabama,108
1,Louisiana,7
2,Louisiana,7
3,Virginia,7
4,Idaho,7
5,Colorado,63
6,Vermont,7
7,South carolina,7
8,Florida,7
9,North carolina,7


In [12]:
#Checking that variable values are not repeating
query="""SELECT * FROM data4 WHERE State='Louisiana' AND County = 'Ascension'"""
ps.sqldf(query=query)

Unnamed: 0,Year,State,County,Total Emissions,CO2,Methane,Nitrous Oxide,HFC,PFC,SF6,NF3,Other Fluorane,HFE,Short Lived Compounds,Other GHG,Biogenic CO2,Stationary Combustion
0,2010,Louisiana,Ascension,9963890.008,7512631.8,1498.25,2449759.958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3330578.956
1,2011,Louisiana,Ascension,9932573.316,7584968.3,1513.0,2346092.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3388865.584
2,2012,Louisiana,Ascension,641313.822,641134.3,115.75,63.772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,97471.932
3,2013,Louisiana,Ascension,632636.174,632531.4,48.75,56.024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88435.08
4,2014,Louisiana,Ascension,680510.048,680312.7,130.0,67.348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,101253.456
5,2015,Louisiana,Ascension,561838.016,561746.0,42.25,49.766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88823.616
6,2016,Louisiana,Ascension,630600.554,630501.5,47.5,51.554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90208.86


In [13]:
#Obtaining map from State Code/County Code to County Name:
xls = pd.ExcelFile("data/all-geocodes-v2016.xlsx")
data_geo = xls.parse("Sheet1", skiprows=4, index_col=None, na_values=[np.nan])

statecd = data_geo[(data_geo["County Code (FIPS)"] == 0)*(data_geo["Place Code (FIPS)"] == 0)*(data_geo["Consolidtated City Code (FIPS)"] == 0)].reset_index(drop=True)

statecd = statecd[["State Code (FIPS)", "Area Name (including legal/statistical area description)"]]

countycd = data_geo[data_geo["County Code (FIPS)"] != 0][["State Code (FIPS)", "County Code (FIPS)", "Area Name (including legal/statistical area description)"]]

data_geo = pd.merge(countycd, statecd, how="inner", on="State Code (FIPS)")

In [14]:
data_geo

Unnamed: 0,State Code (FIPS),County Code (FIPS),Area Name (including legal/statistical area description)_x,Area Name (including legal/statistical area description)_y
0,1,1,Autauga County,Alabama
1,1,3,Baldwin County,Alabama
2,1,5,Barbour County,Alabama
3,1,7,Bibb County,Alabama
4,1,9,Blount County,Alabama
5,1,11,Bullock County,Alabama
6,1,13,Butler County,Alabama
7,1,15,Calhoun County,Alabama
8,1,17,Chambers County,Alabama
9,1,19,Cherokee County,Alabama
