## Models evaluation

This notebook runs different models on image features extracted from satellite images using the CNN. Models regress to predict prevalence of mental health issues within cities.

In [2]:
import pandas as pd
import numpy as np
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing import image
from keras.models import Model, Sequential
import os, sys
import glob
import matplotlib.pyplot as plt
from PIL import Image
from PIL import ImageOps
import csv

In [77]:
# Los Angeles

# Loading VGG extracted features
x_lacity = np.loadtxt('./pretrained_output/X_lacity_fc7_vggf_z18.txt')
print(x_lacity.shape)

# Loading target variables
la_df = pd.DataFrame.from_csv('./data/lacity/500_cities_lacity_mental_health.csv')
print(la_df.shape)

(993, 4096)
(994, 14)


  


In [4]:
vgg16_model = VGG16(weights='imagenet', include_top=True)


Instructions for updating:
Colocations handled automatically by placer.


In [None]:
vgg16_model.summary()

In [None]:
'''
Image processing helper function
'''
#Size of images
IMAGE_WIDTH = 224
IMAGE_HEIGHT = 224
imgdir = "./data/lacity/img/*png"

def transform_img(img, img_width=IMAGE_WIDTH, img_height=IMAGE_HEIGHT):

    r,g,b = img.split()
    #Histogram Equalization
    # img[:, :, 0] = cv2.equalizeHist(img[:, :, 0])
    # img[:, :, 1] = cv2.equalizeHist(img[:, :, 1])
    # img[:, :, 2] = cv2.equalizeHist(img[:, :, 2])
    rnew = ImageOps.equalize(r)
    gnew = ImageOps.equalize(g)
    bnew = ImageOps.equalize(b)

    #Image Resizing
    # img = cv2.resize(img, (img_width, img_height), interpolation = cv2.INTER_CUBIC)
    img = Image.merge("RGB", (rnew,gnew,bnew))
    img = img.resize((img_width, img_height), resample = Image.BICUBIC)
    return img

def vis_square(data):
    data = (data - data.min()) / (data.max() - data.min())
    n = int(np.ceil(np.sqrt(data.shape[0])))
    padding = (((0, n ** 2 - data.shape[0]),
               (0, 1), (0, 1))
               + ((0, 0),) * (data.ndim - 3))
    data = np.pad(data, padding, mode = 'constant', constant_values = 1)

    data = data.reshape((n, n) + data.shape[1:]).transpose((0,2,1,3) + tuple(range(4, data.ndim + 1)))
    data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])
    return(data)

## Process images

In [None]:
feature_extract_model = Model(inputs=vgg16_model.input,
                              outputs=vgg16_model.get_layer('fc2').output)

In [None]:
test_img_paths = [img_path for img_path in glob.glob(imgdir)]

# Extract features
raw_features = []
tractFIPS = []
finalimgs = []
count = 0
split = 0
outdir = './out'
city = 'lacity'

In [None]:
for img_path in test_img_paths:
    img = image.load_img(img_path, target_size=(224,224))
    img = transform_img(img, img_width = IMAGE_WIDTH, img_height = IMAGE_HEIGHT)
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)
    features = feature_extract_model.predict(img_data)
    raw_features.append(features)
    finalimgs.append(img_path.split()[-1])
    tractFIPS.append(img_path.split('_')[1])
    count += 1

raw_features = np.vstack(raw_features)
np.save(os.path.join(outdir, city, 'features_tracts_fc7_z18_vgg_' + str(split) + '.npy'), raw_features)
np.save(os.path.join(outdir, city, 'filenames_tracts_fc7_z18_vgg_' + str(split) + '.npy'), finalimgs)

In [None]:
test_df = pd.DataFrame(raw_features)
tractFIPS = [row.lstrip('0') for row in tractFIPS]
test_df['TractFIPS'] = tractFIPS

In [None]:
la_df.TractFIPS = la_df.TractFIPS.astype('O')

In [None]:
test_df.to_csv('la_vgg.csv')

## Visualizing layers

In [None]:
from keras.utils import plot_model
!conda install pydot

In [None]:
img = image.load_img(test_img_paths[0], target_size=(224,224))
img = transform_img(img, img_width = IMAGE_WIDTH, img_height = IMAGE_HEIGHT)
img_data = image.img_to_array(img)
img_data = np.expand_dims(img_data, axis=0)
img_data = preprocess_input(img_data)
features = feature_extract_model.predict(img_data)

In [None]:
plot_model(feature_extract_model)

### Checking to see 2 dataframes are in order

In [5]:
test_df = test_df.sort_values(by=["TractFIPS"])

NameError: name 'test_df' is not defined

In [None]:
la_df = la_df.sort_values(by=["TractFIPS"])

In [None]:
test_df.TractFIPS

In [None]:
la_df.TractFIPS

## Training regression model

In [78]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Ridge, Lasso
from sklearn import metrics
from sklearn.svm import SVR
from scipy.stats.stats import pearsonr
X = pd.DataFrame.from_csv('la_vgg.csv').sort_values(by=['TractFIPS']).reset_index().drop(["index"], axis=1)

  


In [79]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,TractFIPS
0,1.535566,1.229616,0.658255,1.507081,2.624575,1.061752,3.536164,-0.0,1.219561,-0.0,...,-0.0,-0.0,-0.0,-0.0,2.447415,0.510135,-0.0,-0.0,-0.0,6037101110
1,1.408049,3.88546,0.551711,-0.0,8.767652,-0.0,0.880536,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,0.288755,1.299185,2.104688,-0.0,0.312325,6037101122
2,1.34059,5.63362,2.135561,1.38806,4.021668,1.746387,0.374989,-0.0,1.078838,-0.0,...,-0.0,-0.0,-0.0,-0.0,1.080878,1.162551,-0.0,-0.0,-0.0,6037101210
3,4.680807,2.878426,1.926969,-0.0,3.836767,-0.0,1.900073,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,3.632332,1.926144,-0.0,3.650725,-0.0,6037101220
4,2.665981,6.372698,1.211612,1.325631,4.293569,-0.0,2.716865,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,2.999381,-0.0,-0.0,-0.0,-0.0,6037101300


In [80]:
target = la_df.sort_values(by="TractFIPS").reset_index().drop(["index"], axis=1)
target.head()

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,UniqueID,DataValueTypeID,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,PopulationCount,GeoLocation,CityFIPS,TractFIPS
0,2016,CA,California,Los Angeles,Census Tract,0644000-06037101110,CrdPrv,13.8,12.6,15.1,4731,"(34.2594736124, -118.292986888)",644000.0,6037101110
1,2016,CA,California,Los Angeles,Census Tract,0644000-06037101122,CrdPrv,10.7,9.7,11.7,3664,"(34.2677215381, -118.290147139)",644000.0,6037101122
2,2016,CA,California,Los Angeles,Census Tract,0644000-06037101210,CrdPrv,15.7,14.3,16.9,5990,"(34.2529723884, -118.29073093)",644000.0,6037101210
3,2016,CA,California,Los Angeles,Census Tract,0644000-06037101220,CrdPrv,14.1,12.9,15.3,3363,"(34.251608492, -118.281632269)",644000.0,6037101220
4,2016,CA,California,Los Angeles,Census Tract,0644000-06037101300,CrdPrv,11.0,9.8,12.3,4199,"(34.2487781261, -118.270998916)",644000.0,6037101300


In [81]:
# Check outstanding rows
s = set(target['TractFIPS']).symmetric_difference(set(X['TractFIPS']))
s

{6037930401}

In [82]:
target = target[target.TractFIPS != 6037930401]

In [83]:
print(target.shape, X.shape)

(993, 14) (993, 4097)


In [84]:
y = target.Data_Value

## Train on LA and test on LA

In [102]:
X.to_csv("processed_la_VGG_X.csv")

In [105]:
target.to_csv("processed_la_VGG_Y.csv")

In [85]:
X = X.drop(['TractFIPS'], axis=1)

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [87]:
# LR
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# SVR
svr_model = SVR(gamma='scale', C=1.0, epsilon=0.2)
svr_model.fit(X_train, y_train)

print(lr_model.intercept_, lr_model.coef_)

13.601589975513388 [ 0.02688559  0.02259937  0.10750992 ... -0.00982249 -0.03516909
  0.20924499]


In [88]:
y_pred_lr = lr_model.predict(X_test)
y_pred_svr = svr_model.predict(X_test)

In [89]:
true_pred_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_lr})
true_pred_df.head()

Unnamed: 0,Actual,Predicted
11,12.3,15.150146
139,14.0,14.896165
493,14.6,12.269419
496,9.2,10.444927
46,16.3,13.543347


In [90]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_lr))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_lr))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_lr)))

Mean Absolute Error: 2.7725461688683666
Mean Squared Error: 12.315641448723252
Root Mean Squared Error: 3.5093648212637074


In [100]:
true_pred_df_svr = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_svr})
true_pred_df_svr.head()

Unnamed: 0,Actual,Predicted
11,12.3,13.668213
139,14.0,12.336964
493,14.6,13.358708
496,9.2,9.900695
46,16.3,12.962395


In [92]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_svr))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_svr))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_svr)))

Mean Absolute Error: 2.1343798652100423
Mean Squared Error: 7.175764939817218
Root Mean Squared Error: 2.678761829617784


### K-fold

In [93]:
kfold = KFold(n_splits=5, random_state=42)
results = cross_val_score(lr_model, X, y, cv=kfold, scoring='r2')

In [94]:
print(results.mean())

-0.6080600264887941


In [95]:
kfold = KFold(n_splits=5, random_state=42)
results = cross_val_score(svr_model, X, y, cv=kfold, scoring='r2')
print(results.mean())

-0.08288060067752259


## Visualizing regression model estimates

In [31]:
import geopandas as gpd
import descartes
from shapely.geometry import Point, Polygon
crs={'init': 'epsg:4326'}
la_gdf = gpd.read_file('./data/lacity/lacity_census_tracts_2010.shp').to_crs(crs)

In [64]:
temp_df = target.reset_index().drop(["index"], axis=1)

In [57]:
X.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,TractFIPS
0,476,1.535566,1.229616,0.658255,1.507081,2.624575,1.061752,3.536164,-0.0,1.219561,...,-0.0,-0.0,-0.0,-0.0,2.447415,0.510135,-0.0,-0.0,-0.0,6037101110
1,785,1.408049,3.88546,0.551711,-0.0,8.767652,-0.0,0.880536,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,0.288755,1.299185,2.104688,-0.0,0.312325,6037101122
2,171,1.34059,5.63362,2.135561,1.38806,4.021668,1.746387,0.374989,-0.0,1.078838,...,-0.0,-0.0,-0.0,-0.0,1.080878,1.162551,-0.0,-0.0,-0.0,6037101210
3,512,4.680807,2.878426,1.926969,-0.0,3.836767,-0.0,1.900073,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,3.632332,1.926144,-0.0,3.650725,-0.0,6037101220
4,497,2.665981,6.372698,1.211612,1.325631,4.293569,-0.0,2.716865,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,2.999381,-0.0,-0.0,-0.0,-0.0,6037101300


In [44]:
# Make copy of la df
temp_df = la_df

# Change geolocation to latitude longitude
def convertToLat(row):
    tempRow = eval(row)
    return (tempRow[0]) 

def convertToLong(row):
    tempRow = eval(row)
    return (tempRow[1]) 

temp_df['Latitude'] = temp_df['GeoLocation'].apply(lambda row: convertToLat(row))
temp_df['Longitude'] = temp_df['GeoLocation'].apply(lambda row: convertToLong(row))

In [50]:
true_pred_df_svr.Predicted

13158    14.034483
8806     13.629286
5324     14.948726
25423    11.456139
14001    10.288399
2833     13.767477
27535    10.314208
9249     12.010897
24109    13.678319
7140     13.210318
7787     12.696889
17332    14.381163
7557     13.840053
14690    12.337321
8818     11.098270
10529    13.769599
15944    14.354193
6835      9.799437
5774     11.065653
15713    11.397074
1373     13.514615
28552    11.733458
12108    12.652118
5301     14.505180
26150    11.179875
22840    12.026328
28012    10.727224
6829     11.007737
12915    11.265945
2519     13.566649
           ...    
3678     14.257813
14248    11.651352
2247     12.485111
17430    11.582609
14616    13.362966
16256    14.052882
15540    12.449606
19986    11.433552
25099    13.805759
3401     14.527479
3738     12.657967
21785    15.837028
20469    10.496192
23570    14.260149
5226     14.091195
19217    13.640727
8494     15.710560
19421    14.123257
25383     9.840457
27484    17.120489
18152    14.111138
11622    12.