# Generating Input Files for Repast Models

This notebook generates the input files for the agents in the Repast4py-Models project. 

It combines OSM building footprints, census-tract boundaries, and tract-to-tract commuting flows to assign every synthetic agent a home building and a work building.

In [1]:
from random import random
from bisect import bisect
from itertools import repeat
import numpy as np
import pandas as pd
import geopandas as gpd

### Control Files

In [2]:
# Input files
# Building footprints (shapefile); must provide the `predicted` and `GEOID10` columns used below.
buildings_path = './buildings_assignment_input_files/buildings_CT/bldgs_ct.shp'
# Census-tract polygons (shapefile) with a `GEOID10` column.
ct_path = './buildings_assignment_input_files/ffx_county_city_ct/ffx_county_city_ct.shp'
# Commuting flows (CSV) with `h_geoid` (home tract), `w_geoid` (work tract) and `count` columns.
flows_path = './buildings_assignment_input_files/commuting_flows_gt.csv'

# Output files
# Cleaned building table written from `buildings`.
building_ffx = './repast4py/input/input_buildings_ffx_2.csv'
# Synthetic agents (agent_id, home_id, work_id) written from `input_agents`.
input_agents_ffx = './repast4py/input/input_agents_ffx_2.csv'

## Processing the buildings
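This step reads the OSM building shapefile, reprojects the geometries to compute integer centroid coordinates, and renames `predicted` and `GEOID10` to `building_type` and `ct_id`.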

In [3]:
# Centroids are taken in a projected CRS so that x/y are planar coordinates in metres.
# NOTE(review): EPSG:32610 is WGS 84 / UTM zone 10N. If the data really is Fairfax
# County, VA (as the `ffx` file names suggest), the local zone would be UTM 18N
# (EPSG:32618). Confirm with the model owners before changing it, because
# downstream inputs may already rely on the current coordinates.
CENTROID_EPSG = 32610

# Load the footprints and expose the row position as a stable `building_id`.
buildings = (
    gpd.read_file(buildings_path)
    .reset_index()
    .rename(columns={"index": "building_id"})
)

# Reproject once and reuse the result for both axes; reprojecting the whole
# geometry column is the expensive part of this cell.
centroids = buildings['geometry'].to_crs(epsg=CENTROID_EPSG).centroid
buildings['x_centroid'] = np.floor(centroids.x).astype(np.int64)
buildings['y_centroid'] = np.floor(centroids.y).astype(np.int64)

# Keep only the columns that are exported, under their final names, and store
# the tract id as an integer so it can be matched against the other tables.
buildings = (
    buildings[['building_id', 'x_centroid', 'y_centroid', 'predicted', 'GEOID10']]
    .rename(columns={"predicted": "building_type", "GEOID10": "ct_id"})
    .astype({'ct_id': np.int64})
)

##### building types: 0 - non-residential; 1 - residential

In [4]:
# Write the cleaned building table (building_id, x_centroid, y_centroid,
# building_type, ct_id) without the pandas index column.
buildings.to_csv(building_ffx, index=False)

## Counting residential and non-residential buildings per census tract

In [5]:
# One row per census tract that contains at least one building. Both counters
# start at zero here and are populated from `buildings` afterwards.
tract_ids = buildings['ct_id'].drop_duplicates().reset_index(drop=True)
n_tracts = buildings['ct_id'].nunique()

d = dict(
    ct_id=tract_ids,
    res_count=[0] * n_tracts,
    non_res_count=[0] * n_tracts,
)

buildings_per_ct = pd.DataFrame(data=d)

In [6]:
# Count buildings of each type per tract, then look the counts up by ct_id.
# A tract with no building of a given type is absent from the lookup and
# therefore ends up as NaN in the mapped column.
is_residential = buildings['building_type'] == 1
is_non_residential = buildings['building_type'] == 0

res_per_ct = buildings[is_residential].groupby('ct_id')['building_type'].count()
non_res_per_ct = buildings[is_non_residential].groupby('ct_id')['building_type'].count()

buildings_per_ct['res_count'] = buildings_per_ct['ct_id'].map(res_per_ct)
buildings_per_ct['non_res_count'] = buildings_per_ct['ct_id'].map(non_res_per_ct)

In [7]:
# Missing counts mean "no such building in this tract": turn them into 0 and
# restore integer dtypes (the NaNs had forced the columns to float).
buildings_per_ct = buildings_per_ct.fillna(0).astype(
    {'res_count': np.int64, 'non_res_count': np.int64}
)

## Creating Simulated Work <> Home trips

In [8]:
# Load the census-tract polygons; their GEOID10 values are later used to keep
# only commuting flows whose home and work tracts both appear in this file.
ct_2010 = gpd.read_file(ct_path)

In [9]:
# Inspect the column dtypes: GEOID10 is read as a string (object) column.
ct_2010.dtypes

GEOID10       object
geometry    geometry
dtype: object

In [10]:
# Store tract ids as int64, the same dtype used for buildings['ct_id'].
# NOTE(review): the isin() filters on the commuting flows presumably rely on
# h_geoid / w_geoid being integers as well - confirm against the CSV.
ct_2010['GEOID10'] = ct_2010['GEOID10'].astype(np.int64)

In [11]:
# Load the commuting flows; the columns used later are h_geoid (home tract),
# w_geoid (work tract) and count.
commuting_flows = pd.read_csv(flows_path)
# Show (rows, columns) as a baseline for the filters that follow.
commuting_flows.shape

(34366, 3)

In [12]:
# Keep only flows whose home AND work tracts both appear in the tract shapefile.
known_tracts = ct_2010['GEOID10']
home_is_known = commuting_flows['h_geoid'].isin(known_tracts)
work_is_known = commuting_flows['w_geoid'].isin(known_tracts)

commuting_flows = commuting_flows[home_is_known & work_is_known]
commuting_flows.shape

(34366, 3)

In [13]:
# A flow is only usable if its home tract has at least one residential building
# and its work tract has at least one non-residential building.
tracts_with_homes = buildings_per_ct.loc[buildings_per_ct['res_count'] != 0, 'ct_id']
tracts_with_workplaces = buildings_per_ct.loc[buildings_per_ct['non_res_count'] != 0, 'ct_id']

home_has_buildings = commuting_flows['h_geoid'].isin(tracts_with_homes)
work_has_buildings = commuting_flows['w_geoid'].isin(tracts_with_workplaces)

commuting_flows = commuting_flows[home_has_buildings & work_has_buildings]
commuting_flows.shape

(33757, 3)

In [14]:
# Start the per-tract population table from the tract list in `buildings_per_ct`:
# same ct_id rows, with the two count columns renamed and reset to zero.
# `rename` returns a new frame, so `buildings_per_ct` itself is left untouched.
# The zeros are set with `assign` rather than by writing into `.values`: the
# arrays returned by `.values` are read-only when pandas Copy-on-Write is
# active, and writing through them depends on column access returning a view.
pop_per_ct = (
    buildings_per_ct
    .rename(columns={'res_count': 'res_pop', 'non_res_count': 'work_pop'})
    .assign(res_pop=0, work_pop=0)
)

In [15]:
# Residents of a tract = commuters whose flow starts there (h_geoid);
# workers of a tract = commuters whose flow ends there (w_geoid).
residents_per_ct = commuting_flows.groupby('h_geoid')['count'].sum()
workers_per_ct = commuting_flows.groupby('w_geoid')['count'].sum()

pop_per_ct['res_pop'] = pop_per_ct['ct_id'].map(residents_per_ct)
pop_per_ct['work_pop'] = pop_per_ct['ct_id'].map(workers_per_ct)

In [16]:
# Tracts that appear in no remaining flow have no mapped value: treat them as
# zero population and restore integer dtypes.
pop_per_ct = pop_per_ct.fillna(0).astype(
    {'res_pop': np.int64, 'work_pop': np.int64}
)

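## Looping through census tracts to generate home/work links
For each home census tract, one home building is drawn (with replacement) per commuter living there. Each of those commuters is then given a work tract, drawn with probability proportional to the commuting-flow counts out of the home tract, and finally a random non-residential building inside that work tract. The weights differ per home tract, so the sampling is done tract by tract; with a large population this step can take a while.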

In [17]:
# Synthetic population generation: every commuter counted in `commuting_flows`
# becomes one agent with a home building and a work building.
RANDOM_SEED = 42  # fixed so a fresh run reproduces the same agents; None draws a new population


def _building_pools(buildings, building_type):
    """Map each ct_id to a NumPy array of the building_ids of `building_type` in that tract."""
    of_type = buildings.loc[buildings['building_type'] == building_type]
    return {ct_id: ids.to_numpy()
            for ct_id, ids in of_type.groupby('ct_id')['building_id']}


def generate_home_work_ids(buildings, commuting_flows, tract_order, seed=None):
    """Draw a (home building, work building) pair for every commuter.

    Parameters
    ----------
    buildings : DataFrame with `building_id`, `ct_id` and `building_type`
        (1 = residential, 0 = non-residential).
    commuting_flows : DataFrame with `h_geoid`, `w_geoid` and `count`.
    tract_order : iterable of ct_id values; agents are emitted tract by tract
        in this order. Tracts without outgoing flows are skipped.
    seed : seed for `numpy.random.default_rng`; None gives a non-reproducible draw.

    Returns
    -------
    (home_ids, work_ids) : two equally long lists of building ids. Position i
        in both lists describes the same agent.

    Raises
    ------
    KeyError if a home tract with flows has no residential building, or a
        sampled work tract has no non-residential building.
    """
    rng = np.random.default_rng(seed)

    # Look the candidate buildings up once per tract instead of re-filtering the
    # whole `buildings` table for every single agent.
    home_pools = _building_pools(buildings, 1)
    work_pools = _building_pools(buildings, 0)
    flows_by_home = {ct_id: flows for ct_id, flows in commuting_flows.groupby('h_geoid')}
    id_dtype = buildings['building_id'].dtype

    home_ids, work_ids = [], []
    for ct_id in tract_order:
        flows = flows_by_home.get(ct_id)
        if flows is None:
            continue  # nobody commutes out of this tract
        counts = flows['count'].to_numpy()
        n_agents = int(counts.sum())
        if n_agents == 0:
            continue

        # Home: uniform draw, with replacement, among the tract's residential buildings.
        home_ids.extend(rng.choice(home_pools[ct_id], size=n_agents).tolist())

        # Work tract: drawn with probability proportional to the flow counts.
        # Letting NumPy do the weighted draw avoids indexing a hand-built
        # cumulative-probability array, where a draw that rounds to 1.0 could
        # fall one position past the end.
        work_tracts = rng.choice(flows['w_geoid'].to_numpy(), size=n_agents,
                                 p=counts / n_agents)

        # Work building: uniform draw among the non-residential buildings of the
        # sampled tract, filled in place so each agent keeps its position.
        work_buildings = np.empty(n_agents, dtype=id_dtype)
        for w_ct in np.unique(work_tracts):
            at_tract = work_tracts == w_ct
            work_buildings[at_tract] = rng.choice(work_pools[w_ct], size=int(at_tract.sum()))
        work_ids.extend(work_buildings.tolist())

    return home_ids, work_ids


home_id, work_id = generate_home_work_ids(
    buildings, commuting_flows, pop_per_ct['ct_id'], seed=RANDOM_SEED)

In [18]:
# Assemble the agent table; the positional index becomes the agent_id column,
# placed ahead of home_id and work_id.
agent_columns = {'home_id': home_id, 'work_id': work_id}
input_agents = (
    pd.DataFrame(agent_columns)
    .rename_axis('agent_id')
    .reset_index()
)

In [19]:
# Write the agents (agent_id, home_id, work_id) without the pandas index column.
input_agents.to_csv(input_agents_ffx, index=False)

In [20]:
# Preview the generated agents as a visual sanity check.
input_agents

Unnamed: 0,agent_id,home_id,work_id
0,0,11384,31035
1,1,11384,137774
2,2,11397,8786
3,3,11395,13355
4,4,11395,4826
...,...,...,...
258431,258431,91220,307
258432,258432,91214,7362
258433,258433,91222,2544
258434,258434,91214,1719
