In [0]:
import os
import numpy as np
import pandas as pd
import PIL
import matplotlib as mpl
import seaborn as sns
from scipy.ndimage.filters import gaussian_filter
import statsmodels.formula.api as smf
from statsmodels.nonparametric.kde import kdensityfft
import sklearn.model_selection as sk
import mectools.data as dt
import tensorflow as tf
from tensorflow import keras

In [0]:
# Enable on-demand GPU memory allocation instead of reserving all device memory up front.
# Looping (rather than indexing [0]) keeps this cell working on CPU-only hosts, where the
# device list is empty, and applies the same setting to every GPU, which TensorFlow
# requires when more than one GPU is visible.
for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)

In [0]:
# Build the plotting handle used as `plt` by the cells below (subplots, hist).
# NOTE(review): `plotter` is not imported in any visible cell, so on a fresh kernel this
# fails with NameError unless it is defined elsewhere — presumably it comes from
# mectools; confirm and add the import to the import cell.
plt = plotter(backend='Agg')
%matplotlib inline

In [0]:
import warnings
# Silence every warning category for the rest of the session. This also hides
# deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [0]:
# Export switch: the CNN result cells write their figures to ../docs/images only when True.
save = False

In [0]:
# args
seed = 2384923  # shared seed for the RandomState, the firm subsample and the train/test split
samp = 0.01  # fraction of firms kept when downsampling
BATCH_SIZE = 32  # mini-batch size; equal to the Keras default for Model.fit
imsize = 256  # image side length (pixels) used for model inputs and simulated density grids
pixel = 60  # enters the per-cell area term in gen_density; presumably metres per pixel — TODO confirm

In [0]:
# random init
# Legacy NumPy generator seeded with `seed`, intended for reproducible random draws.
state = np.random.RandomState(seed)

In [0]:
# functions
def load_path(tag, base='../data/tiles_fast/census2004/density', size=1024, ext='jpg'):
    """Return the path of the tile image belonging to an integer id.

    The id is zero-padded to seven digits and its first four characters name
    the sub-directory: ``{base}/{size}px/{first4}/{padded}.{ext}``.
    """
    padded = format(tag, '07d')
    parts = (base, f'{size}px', padded[:4], f'{padded}.{ext}')
    return '/'.join(parts)

In [0]:
# evaluation
def eval_model(y, yhat, ymin=-2, ymax=2, nbins=10, axs=None):
    """Plot predictions against true outcomes in two side-by-side panels.

    Left panel: hexbin of (y, yhat), restricted to pairs where both values lie
    strictly inside (ymin, ymax). Right panel: mean prediction within bins of
    the true value.

    Parameters
    ----------
    y, yhat : 1-d array-likes of equal length
        True and predicted values.
    ymin, ymax : float
        Range used both for the hexbin filter and for the bin grid.
    nbins : int
        Number of points on the bin grid.
    axs : pair of matplotlib axes, optional
        Axes to draw on; a new 12x5 two-panel figure is created when omitted.
    """
    if axs is None:
        _, (ax0, ax1) = plt.subplots(ncols=2, figsize=(12, 5))
    else:
        ax0, ax1 = axs

    # builtin `float` instead of the `np.float` alias, which NumPy 1.24 removed
    res = pd.DataFrame({'y': y, 'yhat': yhat}).astype(float)
    res['err'] = res['yhat'] - res['y']
    res1 = res.query(f'y > {ymin} and y < {ymax} and yhat > {ymin} and yhat < {ymax}')
    ax0.hexbin(res1['y'], res1['yhat'], cmap=mpl.cm.Blues, gridsize=20)

    bins = np.linspace(ymin, ymax, nbins)
    # np.digitize yields indices 0..nbins; fold the overflow index into nbins-1
    res['ybin'] = np.digitize(res['y'], bins)
    res['ybin'] = np.minimum(nbins-1, res['ybin'])
    bmean = res.groupby('ybin')['yhat'].mean()
    # empty bins become NaN so the series lines up with the bin grid used as its index
    bmean = bmean.reindex(np.arange(nbins))
    bmean.index = bins
    bmean.plot(ax=ax1)

    ax0.set_xlabel('True Productivity')
    ax0.set_ylabel('Predicted Productivity')
    ax0.set_title('Joint Distribution')
    ax1.set_xlabel('True Productivity')
    # ax1.set_ylabel('Predicted Productivity')
    ax1.set_title(f'Binned Results ({nbins})')

In [0]:
# load in firm and location data
# Left-join the location/tile index onto the firm records by `id`, then keep only
# rows that have both an id and a prod_id.
firms = pd.read_csv('../data/firms/census2004_geocode.csv', usecols=['id', 'industry', 'income', 'total_assets', 'employees'])
targ = pd.read_csv('../index/firms/census2004_mincloud2002.csv', usecols=['id', 'lat_wgs84', 'lon_wgs84', 'prod_id'])
firms = pd.merge(firms, targ, on='id', how='left').dropna(subset=['id', 'prod_id'])

# downsample for now
# seeded, so the same `samp` fraction of rows is drawn on every run
firms = firms.sample(frac=samp, random_state=seed)
print(len(firms))

# calculate outcome stats
# productivity = income per employee; rows whose log productivity is missing are dropped
# NOTE(review): presumably dt.log returns NaN for non-positive input — confirm against mectools
firms['prod'] = firms['income']/firms['employees']
firms['lprod'] = dt.log(firms['prod'])
firms = firms.dropna(subset=['lprod'])
print(len(firms))

# calculate residual performance
# regress on industry dummies with no intercept; the residual is log productivity
# net of its industry mean
reg_ind = smf.ols('lprod ~ 0 + C(industry)', data=firms).fit()
firms['lprod_resid'] = reg_ind.resid

In [0]:
# load in image features
def load_tiles(ids, size):
    """Stack the density tiles of `ids`, rendered at `size` px, into one array."""
    tiles = []
    for i in ids:
        # the context manager closes each file handle once the pixels are copied out
        with PIL.Image.open(load_path(i, size=size)) as img:
            tiles.append(np.array(img))
    return np.stack(tiles)

# size is passed explicitly for both resolutions: load_path defaults to the 1024px
# tiles, so omitting it would make density_256 a duplicate of density_1024
density_256 = load_tiles(firms['id'], size=256)
density_1024 = load_tiles(firms['id'], size=1024)

In [0]:
# features = np.stack([density_256, density_1024], axis=-1).astype(np.float32)/255 # two-channel variant
# single channel image: append a trailing channel axis, cast to float32 and divide by 255
features = density_1024[..., np.newaxis].astype(np.float32) / 255

In [0]:
# construct outcome variable
# float32 column vectors of shape (n, 1): raw and industry-residualised log productivity
labels = firms['lprod'].to_numpy()[:, np.newaxis].astype(np.float32)
labels_resid = firms['lprod_resid'].to_numpy()[:, np.newaxis].astype(np.float32)

In [0]:
# do train/test split
# a single call splits all three arrays with the same row permutation (20% held out)
(X_train, X_valid,
 y_train, y_valid,
 yr_train, yr_valid) = sk.train_test_split(
    features, labels, labels_resid,
    random_state=seed, test_size=0.2,
)

### CNN Model

In [0]:
# CIFAR like model (1024px)
# two conv -> max-pool -> dropout stages, then a small dense head with one linear output
model = keras.Sequential()
model.add(keras.layers.Conv2D(filters=16, kernel_size=8, activation='relu'))
model.add(keras.layers.MaxPooling2D(pool_size=8))
model.add(keras.layers.Dropout(0.25))
model.add(keras.layers.Conv2D(filters=32, kernel_size=8, activation='relu'))
model.add(keras.layers.MaxPooling2D(pool_size=4))
model.add(keras.layers.Dropout(0.25))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(units=64, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(units=1))
model.compile(optimizer='adam', loss='mean_squared_error')

In [0]:
# CIFAR like model (1024px) - multichannel
# Same layout as the single-channel CNN, but with depthwise convolutions: each input
# channel is filtered on its own and yields `depth_multiplier` output channels.
# Running this cell re-binds `model`, replacing whatever the previous cell built.
model = keras.Sequential([
    keras.layers.DepthwiseConv2D(depth_multiplier=4, kernel_size=8, activation='relu'),
    keras.layers.MaxPooling2D(pool_size=8),
    keras.layers.Dropout(0.25),
    keras.layers.DepthwiseConv2D(depth_multiplier=4, kernel_size=8, activation='relu'),
    keras.layers.MaxPooling2D(pool_size=4),
    keras.layers.Dropout(0.25),
    keras.layers.Flatten(),
    keras.layers.Dense(units=64, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(units=1)
])
model.compile(loss='mean_squared_error', optimizer='adam')

In [0]:
# CIFAR like model (256px)
# This is the last cell that binds `model`, so when the notebook is run top to bottom
# it is this architecture that the training and evaluation cells below use.
model = keras.Sequential([
    keras.layers.Conv2D(filters=16, kernel_size=8, activation='relu'),
    keras.layers.MaxPooling2D(pool_size=8),
    keras.layers.Dropout(0.25),
    keras.layers.Conv2D(filters=32, kernel_size=4, activation='relu'),
    keras.layers.MaxPooling2D(pool_size=4),
    keras.layers.Dropout(0.25),
    keras.layers.Flatten(),
    keras.layers.Dense(units=32, activation='relu'),
    keras.layers.Dropout(0.5),
    # NOTE(review): no activation here, so this layer is linear — confirm that is intended
    keras.layers.Dense(units=16),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(units=1)
])
model.compile(loss='mean_squared_error', optimizer='adam')

In [0]:
# train keras model
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=BATCH_SIZE,  # use the configured batch size rather than the implicit default
    validation_data=(X_valid, y_valid),  # Model.fit documents a tuple (x_val, y_val)
)

In [0]:
# print the layer-by-layer architecture and parameter counts of `model`
model.summary()

In [0]:
# held-out performance of the CNN on log productivity
yhat_valid = model.predict(X_valid)
fig, (ax0, ax1) = plt.subplots(figsize=(12, 5), ncols=2)
eval_model(y=y_valid[:, 0], yhat=yhat_valid[:, 0], ymin=2, ymax=6, axs=(ax0, ax1))
if save:
    fig.savefig('../docs/images/cnn_results_valid.png', bbox_inches='tight', dpi=300)

In [0]:
# in-sample performance of the CNN on log productivity
yhat_train = model.predict(X_train)
fig, (ax0, ax1) = plt.subplots(figsize=(12, 5), ncols=2)
eval_model(y=y_train[:, 0], yhat=yhat_train[:, 0], ymin=2, ymax=6, axs=(ax0, ax1))
if save:
    fig.savefig('../docs/images/cnn_results_train.png', bbox_inches='tight', dpi=300)

### Classical: mean-density baseline

In [0]:
# Baseline: global average pooling reduces each image to its mean intensity per channel,
# and a single linear unit maps that mean to the outcome.
model_mean = keras.Sequential([
    keras.layers.GlobalAveragePooling2D(),
    keras.layers.Dense(1)
])
model_mean.compile(loss='mean_squared_error', optimizer='adam')
history = model_mean.fit(
    X_train, y_train,
    epochs=25,
    batch_size=BATCH_SIZE,  # use the configured batch size rather than the implicit default
    validation_data=(X_valid, y_valid),  # Model.fit documents a tuple (x_val, y_val)
)

In [0]:
# held-out performance of the mean-density baseline
yhat_valid = model_mean.predict(X_valid)
eval_model(y=y_valid[:, 0], yhat=yhat_valid[:, 0], ymin=2, ymax=6)

In [0]:
# in-sample performance of the mean-density baseline
yhat_train = model_mean.predict(X_train)
eval_model(y=y_train[:, 0], yhat=yhat_train[:, 0], ymin=2, ymax=6)

### Radial

In [0]:
class RadialPooling2D(keras.layers.Layer):
    """Soft circular average pooling with a learnable radius.

    Every image is multiplied by a smooth disc mask centred on the image and
    then averaged, giving one value per sample (output shape ``(batch, 1)``).
    The disc radius is ``max_radius * sigmoid(size0)``, where ``size0`` is the
    layer's single trainable weight, and ``softness`` is the scale (in pixels)
    of the sigmoid edge of the disc.

    Parameters
    ----------
    max_radius : float, default 128
        Upper bound of the learnable radius, in pixels.
    softness : float, default 10
        Divisor applied to (distance - radius) inside the sigmoid edge.
    """

    def __init__(self, max_radius=128, softness=10, **kwargs):
        super().__init__(**kwargs)
        self.max_radius = max_radius
        self.softness = softness

    def build(self, input_shape):
        # Keep the raw weight; the mask is derived from it in call() so that it stays trainable.
        self.size0 = self.add_weight(name='size0', shape=(1,), initializer='uniform', trainable=True)
        _, span_x, span_y, _ = input_shape
        zero_x, zero_y = int(span_x//2), int(span_y//2)
        vals_x = tf.cast(tf.range(span_x), tf.float32)
        vals_y = tf.cast(tf.range(span_y), tf.float32)
        # 'ij' indexing gives grids shaped (span_x, span_y), matching the image axes even
        # when the input is not square.
        grid_x, grid_y = tf.meshgrid(vals_x, vals_y, indexing='ij')
        # distance of every pixel from the image centre; constant for a given input size
        self.radius = tf.sqrt((grid_x-zero_x)**2+(grid_y-zero_y)**2)
        super().build(input_shape)

    def call(self, x):
        # Rebuild the mask from size0 on every call so gradients reach the radius weight.
        size = self.max_radius*keras.activations.sigmoid(self.size0)
        mask = keras.activations.sigmoid(-(self.radius-size)/self.softness)[None,:,:,None]
        # Average over height, width and channels only, so each sample keeps its own value.
        pooled = tf.reduce_mean(tf.multiply(x, mask), axis=[1, 2, 3])
        return tf.reshape(pooled, (-1, 1))

    def compute_output_shape(self, input_shape):
        return (input_shape[0], 1)

    def get_config(self):
        # include the constructor arguments so the layer can be re-created from its config
        config = super().get_config()
        config.update({'max_radius': self.max_radius, 'softness': self.softness})
        return config

In [0]:
imsize = 256
inputs = keras.layers.Input(shape=(imsize, imsize, 1))
# five independent radial pools, each with its own radius weight, feed one linear unit
pool = keras.layers.Concatenate()([RadialPooling2D()(inputs) for _ in range(5)])
outputs = keras.layers.Dense(1)(pool)
model_radial = keras.models.Model(inputs=inputs, outputs=outputs)
model_radial.compile(loss='mean_squared_error', optimizer='adam')
history = model_radial.fit(
    X_train, y_train,
    epochs=25,
    batch_size=BATCH_SIZE,  # use the configured batch size rather than the implicit default
    validation_data=(X_valid, y_valid),  # Model.fit documents a tuple (x_val, y_val)
)

In [0]:
# held-out performance of the radial-pooling model; the figure is always written to the slides folder
yhat_valid = model_radial.predict(X_valid)
fig, (ax0, ax1) = plt.subplots(figsize=(12, 5), ncols=2)
eval_model(y=y_valid[:, 0], yhat=yhat_valid[:, 0], ymin=2, ymax=6, axs=(ax0, ax1))
fig.savefig('../slides/images/radial_results.png', bbox_inches='tight', dpi=300)

In [0]:
# in-sample performance of the radial-pooling model
yhat_train = model_radial.predict(X_train)
eval_model(y=y_train[:, 0], yhat=yhat_train[:, 0], ymin=2, ymax=6)

### Classical: radial-ring OLS

In [0]:
# Distance (in pixels) of every cell of an imsize x imsize grid from the grid centre.
imgrid = np.arange(imsize)
grid_x, grid_y = np.meshgrid(imgrid, imgrid)
zero_x, zero_y = imsize // 2, imsize // 2
zdist = np.sqrt((grid_x-zero_x)**2+(grid_y-zero_y)**2)
def radial_density(dense, rad0, rad1):
    """Per-image mean of `dense` after zeroing pixels outside the ring rad0 <= r <= rad1.

    `dense` is a stack of images (n, height, width). The mean is taken over the
    whole image, not over the ring area, so the result scales with the ring size.
    """
    # NOTE(review): both bounds are inclusive, so pixels at exactly r == 32 or r == 64
    # are counted in two adjacent rings — confirm this is acceptable.
    mask = (zdist >= rad0) & (zdist <= rad1)
    return (mask[None,:,:]*dense).mean(axis=(1, 2))
# one column per ring; values are assigned positionally, which relies on `features`
# having been built in the row order of `firms`
for rad0, rad1 in ((0, 32), (32, 64), (64, 128)):
    firms[f'dense_{rad0}_{rad1}'] = radial_density(features[:,:,:,0], rad0, rad1)

In [0]:
# OLS of log productivity on the three ring densities, with an intercept
reg_radial = smf.ols(
    formula='lprod ~ 1 + dense_0_32 + dense_32_64 + dense_64_128',
    data=firms,
).fit()
reg_radial.summary()

In [0]:
# in-sample predictions of the ring-density OLS, plotted like the network results
yhat_radial = reg_radial.predict()
fig, (ax0, ax1) = plt.subplots(figsize=(12, 5), ncols=2)
eval_model(y=firms['lprod'].values, yhat=yhat_radial, ymin=2, ymax=6, axs=(ax0, ax1))
fig.savefig('../slides/images/ols_results.png', bbox_inches='tight', dpi=300)

### Residual Productivity

In [0]:
# Joint distribution of raw and industry-residualised log productivity.
# NOTE(review): dt.winsorize presumably trims/clips the 0.1% tails — confirm against mectools.
firms1 = dt.winsorize(firms[['lprod', 'lprod_resid']], level=0.001)
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# and the keyword form also works on older releases.
jp = sns.jointplot(data=firms1, x='lprod', y='lprod_resid', kind='hex');
jp.set_axis_labels('Log Productivity', 'Residual Log Productivity');
jp.savefig('../slides/images/residual_distribution.png', dpi=300, bbox_inches='tight')

In [0]:
# Fit on industry-residualised log productivity. `model` is the object already trained
# on raw log productivity earlier in the notebook, so training continues from those weights.
history = model.fit(
    X_train, yr_train,
    epochs=25,
    batch_size=BATCH_SIZE,  # use the configured batch size rather than the implicit default
    validation_data=(X_valid, yr_valid),  # Model.fit documents a tuple (x_val, y_val)
)

In [0]:
# held-out performance on residual log productivity
yrhat_valid = model.predict(X_valid)
fig, (ax0, ax1) = plt.subplots(figsize=(12, 5), ncols=2)
eval_model(y=yr_valid[:, 0], yhat=yrhat_valid[:, 0], ymin=-1.5, ymax=1.5, axs=(ax0, ax1))
fig.savefig('../slides/images/resid_results_valid.png', bbox_inches='tight', dpi=300)

In [0]:
# in-sample performance on residual log productivity
yrhat_train = model.predict(X_train)
fig, (ax0, ax1) = plt.subplots(figsize=(12, 5), ncols=2)
eval_model(y=yr_train[:, 0], yhat=yrhat_train[:, 0], ymin=-1.5, ymax=1.5, axs=(ax0, ax1))
fig.savefig('../slides/images/resid_results_train.png', bbox_inches='tight', dpi=300)

### Inspection

In [0]:
norm = 300  # scale of the saturating map x/(x+norm) in `normalize`; x == norm maps to 0.5
eps = 1e-16  # tiny offset subtracted in `quantize` before the uint8 cast

In [0]:
def normalize(x):
    """Saturating transform x/(x+norm), using the notebook-level `norm`."""
    return x/(x+norm)


def quantize(x):
    """Subtract the notebook-level `eps`, scale by 256 and cast to uint8."""
    return (256*(x-eps)).astype(np.uint8)

In [0]:
# prediction of `model` for a single all-zero image
X_empty = np.zeros(shape=(1, imsize, imsize, 1))
model.predict(X_empty)[0, 0]

In [0]:
def gen_density(dat, sigma=2):
    """Rasterise points into a smoothed, quantised density image.

    `dat` is read as two columns (dat[:, 0], dat[:, 1]) and binned on an
    imsize x imsize grid over the unit square; the histogram range excludes
    points outside it. Counts are divided by the cell-area term, Gaussian
    smoothed with `sigma`, and passed through `normalize` and `quantize`.
    """
    unit_box = ((0, 1), (0, 1))
    hist, _, _ = np.histogram2d(dat[:,0], dat[:,1], bins=imsize, range=unit_box)
    cell_area = (4*pixel/1e3)**2
    per_area = hist/cell_area # firms per square kilometer
    smoothed = gaussian_filter(per_area, sigma=sigma)
    return quantize(normalize(smoothed))

In [0]:
# Impulse response: add the density of 50 simulated firms (clustered around the image
# centre) to every image and compare predictions with and without them.
nsim = len(features)
features1 = features[:nsim,:,:,0]
# draw from the seeded generator so a top-to-bottom run reproduces the same simulation
dense = np.stack([gen_density(0.5+0.1*state.randn(50, 2)) for _ in range(nsim)])
# gen_density returns 8-bit pixel values, whereas `features` was divided by 255;
# put both on the same scale before adding
dense = dense.astype(np.float32)/255
augment = features1 + dense
# builtin `float` instead of the `np.float` alias, which NumPy 1.24 removed
aug_pred = model.predict(augment[:,:,:,None])[:,0].astype(float)
bas_pred = model.predict(features1[:,:,:,None])[:,0].astype(float)
dif_pred = aug_pred - bas_pred

In [0]:
# mean change, share of positive changes, and histogram of the predicted changes
print(np.mean(dif_pred))
print(np.mean(dif_pred > 0))
plt.hist(dif_pred);

In [0]:
# keep changes inside the open window (-2, 3.5) before estimating their density
dif_pred1 = dif_pred[(dif_pred > -2) & (dif_pred < 3.5)]
kde_vals, kde_grid, kde_bw = kdensityfft(dif_pred1)
fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(kde_grid, kde_vals)
ax.set(
    xlabel='Change in Log Productivity',
    ylabel='Density',
    title='Adding 50 Additional Firms',
)
fig.savefig('../slides/images/impulse_response.png', bbox_inches='tight', dpi=300);