In [0]:
import os

import numpy as np
import pandas as pd
import PIL
import PIL.Image
import seaborn as sns
import sklearn.model_selection as sk
import statsmodels.formula.api as smf
import tensorflow as tf
from tensorflow import keras

import mectools.data as dt

In [0]:
# Create the plotting handle and ask IPython to render figures inline.
# NOTE(review): `plotter` is not imported or defined in any cell visible here, so this
# line raises NameError on a fresh kernel unless the name is provided elsewhere.
# Presumably it comes from the mectools package — TODO confirm and add the import
# to the imports cell.
plt = plotter(backend='Agg')
%matplotlib inline

In [0]:
# args
# -- reproducibility / sampling
seed = 2_384_923   # shared seed for the RNG and the train/test splits
samp = 0.05        # fraction of firms kept when downsampling

# -- image / training settings
size = 256         # tile edge length in pixels, used to build image paths
BATCH_SIZE = 32    # minibatch size passed to keras `fit`

In [0]:
# random init
# legacy numpy generator seeded from the notebook-level `seed`
state = np.random.RandomState(seed=seed)

In [0]:
# functions
def load_path(tag, base='tiles/density', ext='jpg', px=None):
    """Build the relative path of the tile image for an integer tag.

    The tag is zero-padded to 7 digits; its first 4 characters name the
    sub-directory, giving ``{base}/{px}px/{sub}/{tag}.{ext}``.

    Parameters
    ----------
    tag : int
        Integer identifier (formatted with ``07d``, so it must be an integer type).
    base : str
        Root directory of the tile set.
    ext : str
        File extension without the leading dot.
    px : int, optional
        Tile size in pixels. When omitted, the notebook-level ``size`` is used,
        which matches the previous behaviour.

    Returns
    -------
    str
        The path of the tile image.
    """
    if px is None:
        px = size  # fall back to the global tile size set in the args cell
    name = f'{tag:07d}'
    sub = name[:4]
    return f'{base}/{px}px/{sub}/{name}.{ext}'

In [0]:
# load in firm and location data
firms = pd.read_csv('firms/census_2004_geocode.csv', usecols=['id', 'industry', 'income', 'total_assets', 'employees'])
targ = pd.read_csv('targets/census_firms_2004.csv', usecols=['id', 'lat_wgs84', 'lon_wgs84'])
firms = pd.merge(firms, targ, on='id', how='left').dropna()

# downsample for now
# seeded (same convention as the train/test split) so that the subsample, and
# everything computed from it, is reproducible and the cell is idempotent
firms = firms.sample(frac=samp, random_state=seed)

# resolve image paths and keep only firms whose tile exists on disk
firms['file'] = firms['id'].apply(load_path)
firms['fexist'] = firms['file'].apply(os.path.exists)
# explicit copy: new columns are assigned below, and assigning on a filtered
# slice would trigger pandas' SettingWithCopyWarning
firms = firms[firms['fexist']].copy()

# calculate outcome stats
firms['prod'] = firms['income']/firms['employees']
# NOTE(review): presumably `dt.log` yields NaN for non-positive values, which the
# dropna below then removes — confirm against mectools
firms['lprod'] = dt.log(firms['prod'])
firms = firms.dropna(subset=['lprod']).copy()

# calculate residual performance: log productivity net of industry means
# (no intercept, one dummy per industry)
reg_ind = smf.ols('lprod ~ 0 + C(industry)', data=firms).fit()
firms['lprod_resid'] = reg_ind.resid

In [0]:
# load in image features
# Open each tile in a context manager so the file handle is closed as soon as the
# pixels have been copied into an array; otherwise one handle per tile stays open.
tiles = []
for fn in firms['file']:
    with PIL.Image.open(fn) as img:
        tiles.append(np.array(img))
features = np.stack(tiles)

# add a trailing channel axis and rescale 8-bit intensities to [0, 1]
# (the indexing assumes each tile decodes to a 2-d, single channel array)
features = features[:,:,:,None].astype(np.float32)/255 # single channel image

In [0]:
# construct outcome variable
# industry-residualised log productivity as a float32 column vector
labels = firms['lprod_resid'].values.astype(np.float32).reshape(-1, 1)

In [0]:
# do train/test split
# hold out 20% of the firms for validation; seeded for reproducibility
X_train, X_valid, y_train, y_valid = sk.train_test_split(
    features,
    labels,
    random_state=seed,
    test_size=0.2,
)

In [0]:
# CIFAR like model (1024px)
# Built layer by layer; input shape is left unspecified, as before.
# NOTE: the 256px cell that follows also assigns to `model`.
model = keras.Sequential()

# first convolutional stage
model.add(keras.layers.Conv2D(filters=16, kernel_size=8, activation='relu'))
model.add(keras.layers.MaxPooling2D(pool_size=8))
model.add(keras.layers.Dropout(0.25))

# second convolutional stage
model.add(keras.layers.Conv2D(filters=32, kernel_size=8, activation='relu'))
model.add(keras.layers.MaxPooling2D(pool_size=4))
model.add(keras.layers.Dropout(0.25))

# dense head with a single linear output (regression)
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(units=64, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(units=1))

model.compile(optimizer='adam', loss='mean_squared_error')

In [0]:
# CIFAR like model (256px)
# Two conv/pool/dropout stages followed by a small dense head with a scalar output.
stack_256 = [
    # convolutional stages
    keras.layers.Conv2D(filters=16, kernel_size=8, activation='relu'),
    keras.layers.MaxPooling2D(pool_size=8),
    keras.layers.Dropout(0.5),
    keras.layers.Conv2D(filters=16, kernel_size=4, activation='relu'),
    keras.layers.MaxPooling2D(pool_size=4),
    keras.layers.Dropout(0.5),
    # dense head
    keras.layers.Flatten(),
    keras.layers.Dense(units=32, activation='relu'),
    keras.layers.Dropout(0.5),
    # no activation is given here, so this layer is linear
    # NOTE(review): confirm that omitting the activation is intended
    keras.layers.Dense(units=16),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(units=1),
]
model = keras.Sequential(stack_256)
model.compile(optimizer='adam', loss='mean_squared_error')

In [0]:
# train keras model
# `validation_data` is passed as the documented (x, y) tuple; a list can be read as a
# multi-input `x` by some TensorFlow releases. The batch size is the notebook-level
# constant (32, which is also the Keras default), matching the CIFAR cell.
history = model.fit(
    X_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=10,
    validation_data=(X_valid, y_valid),
)

In [0]:
# evaluate model
# out-of-sample predictions against realised outcomes
yhat_valid = model.predict(X_valid)
# builtin `float` replaces the `np.float` alias, which was removed from NumPy
res_valid = pd.DataFrame({'y': y_valid[:,0], 'yhat': yhat_valid[:,0]}).astype(float)
res_valid['err'] = res_valid['yhat'] - res_valid['y']
# restrict the plot to the central region so that outliers do not dominate the hexbins
res_valid1 = res_valid.query('y > -2 and y < 2 and yhat > -2 and yhat < 2')
# x/y must be keywords: recent seaborn releases reject them as positional arguments
sns.jointplot(x='y', y='yhat', data=res_valid1, kind='hex');

In [0]:
# evaluate fit
# in-sample predictions against realised outcomes
yhat_train = model.predict(X_train)
# builtin `float` replaces the `np.float` alias, which was removed from NumPy
res_train = pd.DataFrame({'y': y_train[:,0], 'yhat': yhat_train[:,0]}).astype(float)
res_train['err'] = res_train['yhat'] - res_train['y']
# restrict the plot to the central region so that outliers do not dominate the hexbins
res_train1 = res_train.query('y > -2 and y < 2 and yhat > -2 and yhat < 2')
# x/y must be keywords: recent seaborn releases reject them as positional arguments
sns.jointplot(x='y', y='yhat', data=res_train1, kind='hex');

In [0]:
# mean validation prediction within bins of the realised outcome
nbins = 10
bins = np.linspace(-2, 2, nbins)
# digitize returns codes 0..nbins; the top code is folded into nbins-1
res_valid['ybin'] = np.digitize(res_valid['y'], bins)
res_valid['ybin'] = np.minimum(nbins-1, res_valid['ybin'])
bmean = res_valid.groupby('ybin')['yhat'].mean()
# Label each bin that is present by its edge. Assigning all of `bins` raised a
# length-mismatch error whenever some bin had no observations.
bmean.index = bins[bmean.index.values]
bmean.plot();

In [0]:
# mean training prediction within bins of the realised outcome
nbins = 10
bins = np.linspace(-2, 2, nbins)
# digitize returns codes 0..nbins; the top code is folded into nbins-1
res_train['ybin'] = np.digitize(res_train['y'], bins)
res_train['ybin'] = np.minimum(nbins-1, res_train['ybin'])
bmean = res_train.groupby('ybin')['yhat'].mean()
# Label each bin that is present by its edge. Assigning all of `bins` raised a
# length-mismatch error whenever some bin had no observations.
bmean.index = bins[bmean.index.values]
bmean.plot();

### CIFAR

In [0]:
from tensorflow.keras.datasets import cifar10

In [0]:
# download / load CIFAR-10
(cx_train, cy_train), (cx_test, cy_test) = cifar10.load_data()

# images: cast to float32 and rescale to [0, 1]
cx_train, cx_test = (imgs.astype('float32')/255 for imgs in (cx_train, cx_test))

# labels: one-hot encode over the 10 classes
cy_train, cy_test = (keras.utils.to_categorical(lab, 10) for lab in (cy_train, cy_test))

In [0]:
# CIFAR like model
# Two double-convolution stages followed by a dense softmax classifier.
cifar = keras.Sequential()

# stage 1: 32 filters
cifar.add(keras.layers.Conv2D(32, 3, activation='relu'))
cifar.add(keras.layers.Conv2D(32, 3, activation='relu'))
cifar.add(keras.layers.MaxPooling2D(2))
cifar.add(keras.layers.Dropout(0.25))

# stage 2: 64 filters
cifar.add(keras.layers.Conv2D(64, 3, activation='relu'))
cifar.add(keras.layers.Conv2D(64, 3, activation='relu'))
cifar.add(keras.layers.MaxPooling2D(2))
cifar.add(keras.layers.Dropout(0.25))

# classifier head over the 10 classes
cifar.add(keras.layers.Flatten())
cifar.add(keras.layers.Dense(512, activation='relu'))
cifar.add(keras.layers.Dropout(0.5))
cifar.add(keras.layers.Dense(10, activation='softmax'))

In [0]:
# compile for multi-class classification and train, validating on the test split
cifar.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
chist = cifar.fit(
    cx_train, cy_train,
    validation_data=(cx_test, cy_test),
    epochs=10,
    batch_size=BATCH_SIZE,
)

### Test: sanity check on synthetic linear data

In [0]:
# synthetic data with a known linear relationship: y = 1 + 2x
# drawn from the seeded `state` generator rather than numpy's unseeded global
# RNG, so the sanity check is reproducible on a fresh run
t_input = state.randn(10000)[:,None]
t_output = 1 + 2*t_input

In [0]:
# 80/20 split of the synthetic data, seeded like the main split
tX_train, tX_valid, ty_train, ty_valid = sk.train_test_split(
    t_input,
    t_output,
    random_state=seed,
    test_size=0.2,
)

In [0]:
# testing model
# A single dense unit, i.e. a linear regression fit by gradient descent.
# NOTE: this rebinds `model` and `history`, replacing the image model defined earlier.
model = keras.Sequential([keras.layers.Dense(units=1)])
model.compile(loss='mean_squared_error', optimizer='adam')
# `validation_data` is passed as the documented (x, y) tuple; a list can be read as a
# multi-input `x` by some TensorFlow releases
history = model.fit(tX_train, ty_train, epochs=20, validation_data=(tX_valid, ty_valid))

In [0]:
# compare held-out predictions with the synthetic targets
tyhat_valid = model.predict(tX_valid)
tres = pd.DataFrame({
    'y': ty_valid[:,0],
    'yhat': tyhat_valid[:,0],
}).assign(err=lambda d: d['yhat'] - d['y'])

# spread of targets, predictions and errors
tres.std()

In [0]:
# predicted vs. actual with a fitted regression line
# x/y must be keywords: recent seaborn releases reject them as positional arguments
sns.jointplot(x='y', y='yhat', data=tres, kind='reg');