In [0]:
import os
from glob import glob
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from PIL import Image
import mectools.data as dt
import sklearn.model_selection as sk

In [0]:
import warnings
warnings.filterwarnings('ignore')

In [0]:
# `mpl` is used later for colormaps (mpl.cm.Blues in eval_model).
import matplotlib as mpl
# NOTE(review): `plotter` is not imported or defined in any visible cell, so on
# a fresh kernel this line raises NameError unless the name is provided
# elsewhere. Presumably it is a mectools helper that returns a configured
# pyplot-like module (it is used below as `plt.subplots`) — confirm and import
# it explicitly in the imports cell.
plt = plotter(backend='agg')
# Render figures inline in the notebook output.
%matplotlib inline

In [0]:
# List each physical GPU and switch it to on-demand memory growth.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpu_devices:
    print(gpu)
    tf.config.experimental.set_memory_growth(gpu, True)

In [0]:
# Image size in pixels; used as both spatial dimensions of the model input.
K = 256

In [0]:
# evaluation
def eval_model(y, yhat, ymin=-2, ymax=2, nbins=10, axs=None):
    """Plot predictions against true values in two panels.

    The left panel is a hexbin of (y, yhat) restricted to observations where
    both values lie strictly inside (ymin, ymax). The right panel shows the
    mean prediction within bins of the true value.

    Parameters
    ----------
    y : array-like
        True outcome values (one-dimensional).
    yhat : array-like
        Predicted values, aligned with `y`.
    ymin, ymax : float
        Range used for the hexbin filter and for the bin edges.
    nbins : int
        Number of evenly spaced bin edges between `ymin` and `ymax`.
    axs : pair of matplotlib axes, optional
        Axes for the (joint, binned) panels. When omitted a new 1x2 figure is
        created through the notebook-level `plt`.

    Returns
    -------
    None
    """
    if axs is None:
        _, (ax0, ax1) = plt.subplots(ncols=2, figsize=(12, 5))
    else:
        ax0, ax1 = axs

    # Builtin float instead of the `np.float` alias, which was removed in
    # NumPy 1.24 (it was only ever an alias for the builtin).
    res = pd.DataFrame({'y': y, 'yhat': yhat}).astype(float)

    # Joint distribution: keep points with both coordinates strictly in range.
    inside = (
        (res['y'] > ymin) & (res['y'] < ymax)
        & (res['yhat'] > ymin) & (res['yhat'] < ymax)
    )
    res1 = res[inside]
    ax0.hexbin(res1['y'], res1['yhat'], cmap=mpl.cm.Blues, gridsize=20)

    # Binned means. np.digitize yields 0 for y < ymin and nbins for
    # y >= ymax; the overflow index is folded into the top bin. Bin i is then
    # labelled with edge bins[i], i.e. its upper edge for i >= 1.
    bins = np.linspace(ymin, ymax, nbins)
    res['ybin'] = np.minimum(nbins - 1, np.digitize(res['y'], bins))
    bmean = res.groupby('ybin')['yhat'].mean()
    bmean = bmean.reindex(np.arange(nbins))  # empty bins become NaN gaps
    bmean.index = bins
    bmean.plot(ax=ax1)

    ax0.set_xlabel('True Productivity')
    ax0.set_ylabel('Predicted Productivity')
    ax0.set_title('Joint Distribution')
    ax1.set_xlabel('True Productivity')
    ax1.set_title(f'Binned Results ({nbins})')

In [0]:
def predict_data(data, steps, predictor=None):
    """Draw batches from `data`, predict on each, and stack the results.

    Parameters
    ----------
    data : iterable of (x, y) batches
        For example the batched dataset returned by `make_dataset`.
    steps : int
        Maximum number of batches to draw.
    predictor : object with a `predict(x)` method, optional
        Model used for prediction. Defaults to the notebook-level `model`,
        which is what this function always used before the parameter existed.

    Returns
    -------
    (x, y, yhat) : tuple of numpy arrays
        Inputs, labels and predictions concatenated along the first axis.

    Raises
    ------
    ValueError
        If no batch could be drawn (`steps` <= 0 or `data` is empty).
    """
    net = model if predictor is None else predictor

    # range() comes first in zip so that no extra batch is pulled from `data`
    # once `steps` batches have been collected.
    batches = [batch for _, batch in zip(range(steps), data)]
    if not batches:
        raise ValueError('predict_data needs at least one batch (steps > 0 and non-empty data)')

    x_test, y_test = zip(*batches)
    yh_test = [net.predict(x) for x in x_test]
    # np.concatenate works on every NumPy version; the `np.concat` alias only
    # exists from NumPy 2.0 onwards.
    return np.concatenate(x_test), np.concatenate(y_test), np.concatenate(yh_test)

In [0]:
# functions
def load_path(tag, source, size, base='../tiles', ext='jpg'):
    """Build the file path of a tile image from its integer id.

    The id is zero-padded to seven digits. The first four digits of the padded
    id name the sub-directory and the full padded id names the file:
    ``{base}/{source}/{size}px/{first4}/{padded}.{ext}``.
    """
    padded = format(tag, '07d')
    return '{}/{}/{}px/{}/{}.{}'.format(base, source, size, padded[:4], padded, ext)

In [0]:
# Read the firm census fields and the location/index table, keeping only the
# columns that are used.
firms = pd.read_csv(
    '../firms/census_2004_geocode.csv',
    usecols=['id', 'industry', 'income', 'total_assets', 'employees'],
)
targ = pd.read_csv(
    '../index/census2004_mincloud2002.csv',
    usecols=['id', 'lat_wgs84', 'lon_wgs84', 'prod_id'],
)
# Left-join on firm id, then discard rows with any missing field.
firms = firms.merge(targ, on='id', how='left').dropna()
# firms = firms.sample(n=1000)

# Outcome: income per employee and its log (via dt.log); rows whose log is
# missing are dropped.
firms = firms.assign(prod=firms['income'] / firms['employees'])
firms = firms.assign(lprod=dt.log(firms['prod'])).dropna(subset=['lprod'])
N = len(firms)
print(N)

In [0]:
# Split into training (80%) and validation (20%) sets. The fixed random_state
# keeps the split identical across kernel restarts, so validation results are
# comparable between runs.
df_train, df_valid = sk.train_test_split(firms, test_size=0.2, random_state=0)

In [0]:
def parse_function(fpden, lprod):
    """Read the JPEG at path `fpden` and decode it as a one-channel image.

    The label `lprod` is passed through unchanged, so the function can be
    mapped over a dataset of (path, label) pairs.
    """
    raw_bytes = tf.io.read_file(fpden)
    image = tf.image.decode_jpeg(raw_bytes, channels=1)
    return image, lprod

In [0]:
def make_dataset(df, size=1024, batch_size=32, shuffle_buffer=10000):
    """Build a shuffled, batched, endlessly repeating dataset of (image, label).

    Parameters
    ----------
    df : DataFrame
        Must provide an `id` column (tile ids passed to `load_path`) and an
        `lprod` column (the regression target).
    size : int
        Tile size directory passed to `load_path` (default 1024, as before).
    batch_size : int
        Number of examples per batch (default 32, as before).
    shuffle_buffer : int
        Size of the shuffle buffer (default 10000, as before).

    Returns
    -------
    tf.data.Dataset
        Yields (image batch, label batch) pairs; labels are float32 with
        shape (batch, 1). The dataset repeats indefinitely, so consumers must
        bound it with an explicit number of steps.
    """
    fpath = tf.constant([load_path(fid, 'density', size) for fid in df['id']])
    labels = tf.reshape(tf.cast(tf.constant(df['lprod']), tf.float32), (-1, 1))
    data = tf.data.Dataset.from_tensor_slices((fpath, labels))
    # Shuffle the cheap (path, label) pairs before decoding: the buffer then
    # holds path strings instead of up to `shuffle_buffer` decoded images.
    data = data.shuffle(buffer_size=shuffle_buffer)
    # Decode images in parallel; element order is preserved by default.
    data = data.map(parse_function, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    data = data.batch(batch_size)
    data = data.repeat()
    # Overlap input preparation with model execution.
    data = data.prefetch(tf.data.experimental.AUTOTUNE)
    return data

In [0]:
# Build one input pipeline per split.
train = make_dataset(df_train)
valid = make_dataset(df_valid)

In [0]:
# iterator = train.make_one_shot_iterator()
# batch = iterator.get_next()
# with tf.Session() as sess:
#     den, pr = sess.run(batch)
# print(den.shape, pr.shape)
# Image.fromarray(den[0,:,:,0])

In [0]:
# CIFAR-like convnet: two conv / max-pool / dropout stages, then a dense head
# with a single linear output, trained with mean squared error.
# NOTE(review): the original heading reads "(1024px)" and make_dataset loads
# the 1024px tiles, while the input shape here is (K, K, 1) with K = 256 —
# confirm that the decoded image size matches the model input.
model = keras.Sequential()
model.add(keras.layers.Input(shape=(K, K, 1)))
# first convolutional stage
model.add(keras.layers.Conv2D(filters=16, kernel_size=8, activation='relu'))
model.add(keras.layers.MaxPooling2D(pool_size=8))
model.add(keras.layers.Dropout(0.25))
# second convolutional stage
model.add(keras.layers.Conv2D(filters=32, kernel_size=8, activation='relu'))
model.add(keras.layers.MaxPooling2D(pool_size=4))
model.add(keras.layers.Dropout(0.25))
# dense regression head
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(units=64, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(units=1))
model.compile(optimizer='adam', loss='mean_squared_error')

In [0]:
# average density model (1024px)
# input_dense = keras.layers.Input(shape=(K, K, 1), name='density1024')
# output_prod = keras.layers.GlobalAveragePooling2D()(input_dense)
# model = keras.Model(inputs=[input_dense], outputs=[output_prod])
# model.compile(loss='mean_squared_error', optimizer='adam')

In [0]:
# train keras model
# 10 epochs of 500 training batches each; 20 validation batches are scored at
# the end of every epoch. Explicit step counts are required because both
# datasets repeat indefinitely.
history = model.fit(
    train,
    validation_data=valid,
    epochs=10,
    steps_per_epoch=500,
    validation_steps=20,
)

In [0]:
# Predict on 20 validation batches and plot predicted against true values.
x_test, y_test, yh_test = predict_data(valid, 20)
fig, (ax0, ax1) = plt.subplots(figsize=(10, 5), ncols=2)
eval_model(y_test[:, 0], yh_test[:, 0], axs=(ax0, ax1), ymin=2, ymax=6)

In [0]:
# Same evaluation on 20 training batches. The *_test names are reused here,
# so they now hold training data rather than validation data.
x_test, y_test, yh_test = predict_data(train, 20)
fig, (ax0, ax1) = plt.subplots(figsize=(10, 5), ncols=2)
eval_model(y_test[:, 0], yh_test[:, 0], axs=(ax0, ax1), ymin=2, ymax=6)