In [1]:
import pandas as pd
import json
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

In [2]:
data = pd.read_csv('etl_filtered_v1.csv', index_col=[0]).fillna(0)
rows_before_merge = len(data)
data['AVG_TEMP'] = (data['TMAX'] - data['TMIN'])/2
data.columns = map(str.lower, data.columns)
features = ['latitude_x', 'longitude_x', 'avg_temp', 'awnd', 'prcp', 'datetime', 'fire_size']
data = data.drop(columns=[x for x in data.columns if x not in features])
data.columns.values

array(['fire_size', 'latitude_x', 'longitude_x', 'datetime', 'awnd',
       'prcp', 'avg_temp'], dtype=object)

In [3]:
def loc_match(row):
    lat, long = row[0], row[1]
    coords = row[2]
    point = Point(lat, long)
    for poly in coords:
        polygon = Polygon([(x[1], x[0]) for x in poly])
        if polygon.contains(point):
            return poly

In [4]:
with open('county.geojson', 'r') as f:
    geojson = json.load(f)
geo = pd.json_normalize(geojson['features'])
geo = geo.drop(geo[geo['geometry.coordinates'].map(len) > 1].index)
geo.astype({'properties.GEOID': 'str'}).dtypes
geo['geometry.coordinates'] = geo['geometry.coordinates'].apply(lambda x: x[0])
geo['properties.GEOID'] = geo['properties.GEOID'].apply(lambda x: x if len(x) > 4 else '0'+x)
geofeatures = ['geometry.coordinates', 'properties.GEOID']
geo = geo.drop(columns=[x for x in geo.columns if x not in geofeatures])
coord_list = geo['geometry.coordinates'].to_list()
geo.head()

Unnamed: 0,geometry.coordinates,properties.GEOID
0,"[[-85.6577, 31.8803], [-85.6488, 31.9116], [-8...",1005
1,"[[-88.4732, 31.8939], [-88.4313, 32.2277], [-8...",1023
2,"[[-86.9059, 31.753], [-86.9065, 31.6326], [-86...",1035
3,"[[-86.375, 32.7536], [-86.0072, 32.755], [-85....",1051
4,"[[-87.7157, 33.0068], [-87.4219, 33.0034], [-8...",1065


In [5]:
data['coords'] = [coord_list for _ in range(len(data))]
data['polygon'] = data[['latitude_x', 'longitude_x', 'coords']].apply(loc_match, axis=1)
data = data.dropna()
data['polygon'] = [''.join(map(str, l)) for l in data['polygon']]
geo['geometry.coordinates'] = [''.join(map(str, l)) for l in geo['geometry.coordinates']]
data = pd.merge(data, geo, how='left', left_on='polygon', right_on='geometry.coordinates', suffixes=('_data', '_geo'))
rows_after_merge = len(data)
print("rows after merge %i"%rows_after_merge)
print(rows_after_merge/rows_before_merge*100)

rows after merge 358571
97.5056017229377


In [6]:
features += ['properties.GEOID']
features = list(set(features))
data = data[features]
data.to_csv('checkpoint.csv')
data['month'] = data['datetime'].apply(lambda x: int(x.split('-')[1]))
data['year'] = data['datetime'].apply(lambda x: int(x.split('-')[0]))
data = data.groupby(['properties.GEOID', 'month', 'year'], dropna=False).mean().reset_index().rename(columns={'properties.GEOID':'geoid','latitude_x':'latitude','longitude_x':'longitude'})
data.head()

Unnamed: 0,geoid,month,year,longitude,awnd,latitude,prcp,avg_temp,fire_size
0,1001,1,2011,-86.526475,8.065464,32.429962,0.0,13.836333,3.0
1,1001,1,2012,-86.70261,7.930789,32.429285,0.046667,13.752779,0.75
2,1001,1,2013,-86.8181,7.800357,32.392246,0.0,13.670974,0.1
3,1001,1,2014,-86.603375,7.654442,32.529284,0.0,13.939294,7.977778
4,1001,1,2015,-86.616028,7.51426,32.547684,0.0,12.678677,2.06


In [31]:
data = pd.read_csv('checkpoint.csv', index_col=[0], dtype={'properties.GEOID':'string'}).fillna(0)
print(len(data))
data['month'] = data['datetime'].apply(lambda x: int(x.split('-')[1]))
data['year'] = data['datetime'].apply(lambda x: int(x.split('-')[0]))
data = data.groupby(['properties.GEOID', 'month', 'year'], dropna=False).mean().reset_index().rename(columns={'properties.GEOID':'geoid','latitude_x':'latitude','longitude_x':'longitude'})
print(len(data))
# for year in range(data['year'].min(), data['year'].max()+1):
#     for month in range(1,13):
#         data.loc[(data['year']==year) & (data['month']==month)].to_csv('data/%i_%i_firesize_by_county.csv'%(year, month), index=False)

358571
67463
