In [0]:
import os

import numpy as np
import pandas as pd
import PIL
import PIL.Image
import matplotlib as mpl
import seaborn as sns
import statsmodels.formula.api as smf
import sklearn.model_selection as sk
import tensorflow as tf
from tensorflow import keras

import mectools.data as dt

In [0]:
# NOTE(review): `plotter` is not imported in any visible cell -- presumably a
# mectools helper that returns a pyplot-like object; confirm and add the import.
plt = plotter(backend='Agg')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

In [0]:
# NOTE(review): a blanket 'ignore' filter hides every warning category,
# deprecation warnings included -- consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [0]:
# args
seed = 2384923  # fixed seed used wherever the notebook needs reproducible randomness
samp = 0.01  # fraction of firms kept by the downsampling step
BATCH_SIZE = 32  # intended mini-batch size (equal to the Keras fit() default of 32)
size = 1024  # tile edge in pixels; load_path reads this global for the '{size}px' directory

In [0]:
# seeded NumPy generator for reproducible draws
state = np.random.RandomState(seed=seed)

In [0]:
# functions
def load_path(tag, base='../tiles/density', ext='jpg', px=None):
    """Build the on-disk path of the tile image for one firm.

    Parameters
    ----------
    tag : int
        Numeric identifier; it is zero-padded to 7 digits and its first four
        digits name the sub-directory.
    base : str
        Root directory of the tile tree.
    ext : str
        File extension without the leading dot.
    px : int, optional
        Tile edge length in pixels. When omitted, the notebook-level `size`
        setting is used, which keeps existing `load_path(tag)` calls unchanged.

    Returns
    -------
    str
        Path of the form '{base}/{px}px/{first 4 digits}/{7-digit tag}.{ext}'.
    """
    if px is None:
        # fall back to the global configured in the args cell
        px = size
    tag = f'{tag:07d}'
    sub = tag[:4]
    return f'{base}/{px}px/{sub}/{tag}.{ext}'

In [0]:
# evaluation
def eval_model(y, yhat, ymin=-2, ymax=2, nbins=10):
    """Draw two diagnostic panels comparing predictions to outcomes.

    Left panel: hexbin density of (y, yhat), restricted to points where both
    values lie strictly inside (ymin, ymax).
    Right panel: mean prediction within each bin of y, using `nbins` equally
    spaced edges on [ymin, ymax]; the x axis is the bin index, not the y value.

    Parameters
    ----------
    y, yhat : 1-D array-like of equal length
        Observed outcomes and model predictions.
    ymin, ymax : number
        Window for the hexbin plot and range of the bin edges.
    nbins : int
        Number of bin edges passed to np.linspace.

    Returns
    -------
    None
        The figure is drawn through the notebook-level `plt` object.
    """
    fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(12, 5))

    # builtin float replaces the np.float alias, which NumPy 1.24 removed
    res = pd.DataFrame({'y': y, 'yhat': yhat}).astype(float)
    res['err'] = res['yhat'] - res['y']
    res1 = res.query(f'y > {ymin} and y < {ymax} and yhat > {ymin} and yhat < {ymax}')
    ax0.hexbin(res1['y'], res1['yhat'], cmap=mpl.cm.Blues, gridsize=20)
    ax0.set_xlabel('y')
    ax0.set_ylabel('yhat')

    bins = np.linspace(ymin, ymax, nbins)
    # digitize returns 0..nbins; fold the overflow bin into the last regular one
    res['ybin'] = np.digitize(res['y'], bins)
    res['ybin'] = np.minimum(nbins-1, res['ybin'])
    bmean = res.groupby('ybin')['yhat'].mean()
    bmean.plot(ax=ax1)
    ax1.set_xlabel('bin of y')
    ax1.set_ylabel('mean yhat')

In [0]:
# load in firm and location data
firms = pd.read_csv(
    '../firms/census_2004_geocode.csv',
    usecols=['id', 'industry', 'income', 'total_assets', 'employees']
)
targ = pd.read_csv('../targets/census_firms_2004.csv', usecols=['id', 'lat_wgs84', 'lon_wgs84'])
firms = pd.merge(firms, targ, on='id', how='left').dropna()

# downsample for now; pass the configured seed so every run draws the same
# firms (the original call was unseeded, so the sample changed between runs)
firms = firms.sample(frac=samp, random_state=seed)

# resolve image paths and keep only firms whose tile exists on disk
firms['file'] = firms['id'].apply(load_path)
firms['fexist'] = firms['file'].apply(os.path.exists)
# .copy() so the column assignments below write to an independent frame
# rather than to a filtered view of the sampled one
firms = firms[firms['fexist']].copy()

# calculate outcome stats
firms['prod'] = firms['income']/firms['employees']
# presumably dt.log yields NaN for invalid inputs, hence the dropna below -- confirm
firms['lprod'] = dt.log(firms['prod'])
firms = firms.dropna(subset=['lprod'])

# calculate residual performance: log productivity net of industry means
reg_ind = smf.ols('lprod ~ 0 + C(industry)', data=firms).fit()
firms['lprod_resid'] = reg_ind.resid

In [0]:
# load in image features
def _read_tile(path):
    """Read one tile fully into memory and close its file handle right away."""
    with PIL.Image.open(path) as img:
        return np.array(img)

# stack to (n_firms, height, width), then add a trailing channel axis and
# rescale pixel values by 1/255
features = np.stack([_read_tile(fn) for fn in firms['file']])
features = features[:,:,:,None].astype(np.float32)/255 # single channel image

In [0]:
# construct outcome variable as a float32 column vector of shape (n_firms, 1)
# (alternative target: firms['lprod_resid'])
labels = np.asarray(firms['lprod'].values, dtype=np.float32).reshape(-1, 1)

In [0]:
# hold out 20% of the firms for validation, with a seeded shuffle
X_train, X_valid, y_train, y_valid = sk.train_test_split(
    features,
    labels,
    random_state=seed,
    test_size=0.2,
)

### CNN Model

In [0]:
# CIFAR like model (1024px)
# two conv/pool/dropout stages, then a 64-unit dense head with a scalar output
model = keras.Sequential([
    keras.layers.Conv2D(16, 8, activation='relu'),
    keras.layers.MaxPooling2D(8),
    keras.layers.Dropout(0.25),
    keras.layers.Conv2D(32, 8, activation='relu'),
    keras.layers.MaxPooling2D(4),
    keras.layers.Dropout(0.25),
    keras.layers.Flatten(),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')

In [0]:
# CIFAR like model (256px)
# NOTE(review): this rebinds `model`, replacing the network built in the previous cell.
model = keras.Sequential([
    keras.layers.Conv2D(16, 8, activation='relu'),
    keras.layers.MaxPooling2D(8),
    keras.layers.Dropout(0.25),
    keras.layers.Conv2D(32, 4, activation='relu'),
    keras.layers.MaxPooling2D(4),
    keras.layers.Dropout(0.25),
    keras.layers.Flatten(),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.5),
    # NOTE(review): no activation here, so this layer is linear -- confirm that is intended
    keras.layers.Dense(16),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')

In [0]:
# train keras model
history = model.fit(
    X_train, y_train,
    epochs=25,
    batch_size=BATCH_SIZE,  # use the configured batch size instead of the implicit default
    validation_data=(X_valid, y_valid),  # Keras documents this argument as a tuple, not a list
)

In [0]:
# out-of-sample fit of the CNN
yhat_valid = model.predict(X_valid)
eval_model(y=y_valid[:, 0], yhat=yhat_valid[:, 0], ymin=2, ymax=6)

In [0]:
# in-sample fit of the CNN
yhat_train = model.predict(X_train)
eval_model(y=y_train[:, 0], yhat=yhat_train[:, 0], ymin=2, ymax=6)

### Classical

In [0]:
# baseline: global average of the pixels feeding a single linear unit
model_mean = keras.Sequential([
    keras.layers.GlobalAveragePooling2D(),
    keras.layers.Dense(1)
])
model_mean.compile(loss='mean_squared_error', optimizer='adam')
history = model_mean.fit(
    X_train, y_train,
    epochs=25,
    batch_size=BATCH_SIZE,  # use the configured batch size instead of the implicit default
    validation_data=(X_valid, y_valid),  # Keras documents this argument as a tuple, not a list
)

In [0]:
# out-of-sample fit of the mean-intensity baseline
yhat_valid = model_mean.predict(X_valid)
eval_model(y=y_valid[:, 0], yhat=yhat_valid[:, 0])

In [0]:
# in-sample fit of the mean-intensity baseline
yhat_train = model_mean.predict(X_train)
eval_model(y=y_train[:, 0], yhat=yhat_train[:, 0])

In [0]:
# show the fitted parameters of the Dense(1) layer (second layer of model_mean)
model_mean.layers[1].get_weights()

In [0]:
# mean pixel intensity per tile: flatten each image to one row, then average
firms['dense_1024'] = features.reshape(len(firms), -1).mean(axis=1)

In [0]:
# OLS of residual log productivity on mean tile intensity
reg = smf.ols(formula='lprod_resid ~ dense_1024', data=firms).fit()
reg.summary()

In [0]:
# in-sample fit of the OLS baseline
eval_model(y=firms['lprod_resid'], yhat=reg.predict())

### Radial

In [0]:
class RadialPooling2D(keras.layers.Layer):
    """Average each image under a soft circular mask with a learnable radius.

    The mask is sigmoid((size - r) / softness), where r is the pixel distance
    from the image centre and size = max_radius * sigmoid(size0) with size0 a
    trainable scalar.

    Parameters
    ----------
    max_radius : float
        Upper bound on the learned radius, in pixels (default 128, the value
        previously hard-coded).
    softness : float
        Width of the mask edge, in pixels (default 10, previously hard-coded).

    Input shape : (batch, height, width, channels), with static height/width.
    Output shape: (batch, 1), one pooled value per sample.
    """

    def __init__(self, max_radius=128.0, softness=10.0, **kwargs):
        super().__init__(**kwargs)
        self.max_radius = float(max_radius)
        self.softness = float(softness)

    def build(self, input_shape):
        # keep the weight on the layer so call() can rebuild the mask from it
        self.size0 = self.add_weight(name='size0', shape=(1,), initializer='uniform', trainable=True)
        span_x, span_y = int(input_shape[1]), int(input_shape[2])
        off_x = np.arange(span_x, dtype=np.float32) - span_x//2
        off_y = np.arange(span_y, dtype=np.float32) - span_y//2
        # 'ij' indexing keeps the grid in (axis-1, axis-2) order, so the mask
        # also lines up with non-square inputs
        grid_x, grid_y = np.meshgrid(off_x, off_y, indexing='ij')
        # constant distance-from-centre grid, broadcastable over batch and channels
        self.radius = np.sqrt(grid_x**2 + grid_y**2)[None,:,:,None]
        super().build(input_shape)

    def call(self, x):
        # build the mask here, not in build(), so it depends on the current
        # value of size0 and gradients reach that weight
        size = self.max_radius*keras.activations.sigmoid(self.size0)
        mask = keras.activations.sigmoid((size - self.radius)/self.softness)
        # average over height, width and channels only; the batch axis is kept
        # so each sample gets its own pooled value
        pooled = tf.reduce_mean(tf.multiply(x, mask), axis=[1, 2, 3])
        return tf.reshape(pooled, (-1, 1))

    def compute_output_shape(self, input_shape):
        return (input_shape[0], 1)

    def get_config(self):
        # include the constructor arguments so the layer can be re-created from its config
        config = super().get_config()
        config.update({'max_radius': self.max_radius, 'softness': self.softness})
        return config

In [0]:
# radial pooling model: five learnable-radius pools feeding a single linear unit
# take the input shape from the training array so the model always matches the
# tiles that were actually loaded (the fixed 256 disagreed with the `size` setting)
imsize = X_train.shape[1]
inputs = keras.layers.Input(shape=X_train.shape[1:])
pool = keras.layers.Concatenate()([RadialPooling2D()(inputs) for _ in range(5)])
outputs = keras.layers.Dense(1)(pool)
model_radial = keras.models.Model(inputs=inputs, outputs=outputs)
model_radial.compile(loss='mean_squared_error', optimizer='adam')
history = model_radial.fit(
    X_train, y_train,
    epochs=25,
    validation_data=(X_valid, y_valid),  # Keras documents this argument as a tuple, not a list
)

In [0]:
# out-of-sample fit of the radial model, with a summary of the prediction spread
yhat_valid = model_radial.predict(X_valid)
print(pd.Series(yhat_valid[:, 0]).describe())
eval_model(y=y_valid[:, 0], yhat=yhat_valid[:, 0])

### Inspection