In [0]:
import os

import matplotlib as mpl
import numpy as np
import pandas as pd
import PIL
import PIL.Image
import statsmodels.formula.api as smf
import mectools.data as dt

In [0]:
# Build the plotting handle that later cells use as `plt` (e.g. `plt.subplots`).
# NOTE(review): `plotter` is not imported in any visible cell — presumably it
# comes from mectools; confirm and import it explicitly.
plt = plotter(backend='Agg')
# Render figures inline in the notebook output.
%matplotlib inline

In [0]:
import warnings
warnings.filterwarnings('ignore')

In [0]:
# args
# -- image geometry: `size` selects the tile directory in load_path,
#    `imsize` is the side length of the distance grid built further down
size = 1024
imsize = 256

# -- sampling / batching knobs (not referenced by the cells visible here)
seed = 2_384_923
samp = 0.01
BATCH_SIZE = 32

In [0]:
# spatial scale: metres covered by one pixel, before and after rescaling
# by the size/imsize ratio
pixel = 15  # m/pixel
impixel = (size/imsize)*pixel
print(impixel)

In [0]:
def load_path(tag, base='../tiles/density', ext='jpg'):
    """Build the path of the tile image for an integer identifier.

    The identifier is zero-padded to seven digits; its first four digits name
    the sub-directory. The module-level `size` selects the `<size>px` folder.
    """
    name = format(tag, '07d')
    folder = name[:4]
    return f'{base}/{size}px/{folder}/{name}.{ext}'

In [0]:
# evaluation
def eval_model(y, yhat, ymin=-np.inf, ymax=np.inf, nbins=10):
    """Plot predictions against observed values in two side-by-side panels.

    Left panel: hexbin density of (y, yhat), restricted to observations where
    both values lie strictly inside (ymin, ymax).
    Right panel: mean of yhat within bins of y, plotted against the bin index.

    Parameters
    ----------
    y : array-like
        Observed values.
    yhat : array-like
        Predicted values, aligned with `y`.
    ymin, ymax : float
        Bounds for the hexbin filter and for the bin edges. An infinite bound
        (the default) is replaced by the observed min/max of `y` when the bin
        edges are built.
    nbins : int
        Number of bin edges passed to `np.linspace`.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(12, 5))

    # builtin float: the np.float alias has been removed from NumPy
    res = pd.DataFrame({'y': y, 'yhat': yhat}).astype(float)

    # boolean masks instead of a query string, so the infinite default bounds
    # never have to be interpolated into an expression
    inside = (
        res['y'].gt(ymin) & res['y'].lt(ymax)
        & res['yhat'].gt(ymin) & res['yhat'].lt(ymax)
    )
    res1 = res[inside]
    ax0.hexbin(res1['y'], res1['yhat'], cmap=mpl.cm.Blues, gridsize=20)
    ax0.set(xlabel='y', ylabel='yhat')

    # linspace over an infinite range yields NaN edges; fall back to the data
    lo = ymin if np.isfinite(ymin) else res['y'].min()
    hi = ymax if np.isfinite(ymax) else res['y'].max()
    bins = np.linspace(lo, hi, nbins)

    # digitize gives 0 below the first edge and nbins at/above the last one;
    # the clamp folds that overflow bin into the top bin
    res['ybin'] = np.minimum(nbins-1, np.digitize(res['y'], bins))
    bmean = res.groupby('ybin')['yhat'].mean()
    bmean.plot(ax=ax1)
    ax1.set(xlabel='y bin', ylabel='mean yhat')

In [0]:
# firm-level data joined with the per-firm coordinates; rows with any
# missing value after the join are dropped
firms = pd.read_csv(
    '../firms/census_2004_geocode.csv',
    usecols=['id', 'industry', 'income', 'total_assets', 'employees'],
)
targ = pd.read_csv(
    '../targets/census_firms_2004.csv',
    usecols=['id', 'lat_wgs84', 'lon_wgs84'],
)
firms = firms.merge(targ, on='id', how='left').dropna()

# keep only the firms whose tile image is present on disk
firms['file'] = firms['id'].map(load_path)
firms['fexist'] = firms['file'].map(os.path.exists)
firms = firms.loc[firms['fexist']].drop(columns='fexist')

# income per employee and its log; rows without a usable log are dropped
firms['prod'] = firms['income'].div(firms['employees'])
firms['lprod'] = dt.log(firms['prod'])
firms = firms.dropna(subset=['lprod'])

# residual of log productivity after absorbing industry fixed effects
reg_ind = smf.ols(formula='lprod ~ 0 + C(industry)', data=firms).fit()
firms['lprod_resid'] = reg_ind.resid

In [0]:
def _read_tile(path):
    """Read one image file into an array, closing the file before returning."""
    with PIL.Image.open(path) as img:
        return np.array(img)

# one array per firm, stacked along a new leading axis in the row order of `firms`
density = np.stack([_read_tile(fn) for fn in firms['file']])

In [0]:
# pixel coordinate grids and each pixel's Euclidean distance (in pixels)
# from the centre point (imsize // 2, imsize // 2)
imgrid = np.arange(imsize)
grid_x, grid_y = np.meshgrid(imgrid, imgrid)
zero_x = zero_y = imsize // 2
zdist = np.sqrt(np.square(grid_x - zero_x) + np.square(grid_y - zero_y))
def radial_density(dense, rad0, rad1, dist=None):
    """Average a stack of images over a ring around the centre.

    Parameters
    ----------
    dense : ndarray, shape (n, H, W)
        Stack of images; must broadcast against the (H, W) distance grid.
    rad0, rad1 : float
        Inner and outer radius of the ring. Both bounds are inclusive, so a
        pixel whose distance equals an edge shared by two adjacent rings is
        counted in both.
    dist : ndarray, shape (H, W), optional
        Per-pixel distance from the centre. Defaults to the module-level
        `zdist` grid.

    Returns
    -------
    ndarray, shape (n,)
        For each image, the sum of the pixels inside the ring divided by the
        total number of pixels (pixels outside the ring contribute zero).
    """
    if dist is None:
        dist = zdist
    mask = (dist >= rad0) & (dist <= rad1)
    # use the argument, not the notebook-level `density` array
    return (mask[None, :, :]*dense).mean(axis=(1, 2))
# one feature column per concentric ring: 0-32, 32-64 and 64-128 pixels
for rad0, rad1 in zip((0, 32, 64), (32, 64, 128)):
    column = f'dense_{rad0}_{rad1}'
    firms[column] = radial_density(density, rad0, rad1)

In [0]:
# regress residual log productivity on the three ring-density features
reg_radial = smf.ols(
    formula='lprod_resid ~ 1 + dense_0_32 + dense_32_64 + dense_64_128',
    data=firms,
).fit()
reg_radial.summary()

In [0]:
# in-sample fitted values from the radial model, compared with the outcome
yhat_radial = reg_radial.predict()
eval_model(y=firms['lprod_resid'].values, yhat=yhat_radial, ymin=-2, ymax=2)