## Models evaluation

This notebook runs different models on image features extracted from satellite images using the CNN. Models regress to predict prevalence of mental health issues within cities.

In [3]:
import pandas as pd
import numpy as np
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing import image
from keras.models import Model, Sequential
import os, sys
import glob
import matplotlib.pyplot as plt
from PIL import Image
from PIL import ImageOps
import csv

In [4]:
# Los Angeles

# Loading VGG extracted features
x_lacity = np.loadtxt('./pretrained_output/X_lacity_fc7_vggf_z18.txt')
print(x_lacity.shape)

# Loading target variables
la_df = pd.DataFrame.from_csv('./data/lacity/500_cities_lacity_mental_health.csv')
print(la_df.shape)

(993, 4096)
(994, 14)


  


In [3]:
vgg16_model = VGG16(weights='imagenet', include_top=True)


Instructions for updating:
Colocations handled automatically by placer.


In [4]:
vgg16_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [5]:
'''
Image processing helper function
'''
#Size of images
IMAGE_WIDTH = 224
IMAGE_HEIGHT = 224
imgdir = "./data/lacity/img/*png"

def transform_img(img, img_width=IMAGE_WIDTH, img_height=IMAGE_HEIGHT):

    r,g,b = img.split()
    #Histogram Equalization
    # img[:, :, 0] = cv2.equalizeHist(img[:, :, 0])
    # img[:, :, 1] = cv2.equalizeHist(img[:, :, 1])
    # img[:, :, 2] = cv2.equalizeHist(img[:, :, 2])
    rnew = ImageOps.equalize(r)
    gnew = ImageOps.equalize(g)
    bnew = ImageOps.equalize(b)

    #Image Resizing
    # img = cv2.resize(img, (img_width, img_height), interpolation = cv2.INTER_CUBIC)
    img = Image.merge("RGB", (rnew,gnew,bnew))
    img = img.resize((img_width, img_height), resample = Image.BICUBIC)
    return img

def vis_square(data):
    data = (data - data.min()) / (data.max() - data.min())
    n = int(np.ceil(np.sqrt(data.shape[0])))
    padding = (((0, n ** 2 - data.shape[0]),
               (0, 1), (0, 1))
               + ((0, 0),) * (data.ndim - 3))
    data = np.pad(data, padding, mode = 'constant', constant_values = 1)

    data = data.reshape((n, n) + data.shape[1:]).transpose((0,2,1,3) + tuple(range(4, data.ndim + 1)))
    data = data.reshape((n * data.shape[1], n * data.shape[3]) + data.shape[4:])
    return(data)

## Process images

In [6]:
feature_extract_model = Model(inputs=vgg16_model.input,
                              outputs=vgg16_model.get_layer('fc2').output)

In [12]:
test_img_paths = [img_path for img_path in glob.glob(imgdir)]

# Extract features
raw_features = []
tractFIPS = []
finalimgs = []
count = 0
split = 0
outdir = './out'
city = 'lacity'

In [13]:
len(test_img_paths)

993

In [14]:
test_img_paths[2].split('_')[1]

'06037240010'

In [17]:
for img_path in test_img_paths:
    img = image.load_img(img_path, target_size=(224,224))
    img = transform_img(img, img_width = IMAGE_WIDTH, img_height = IMAGE_HEIGHT)
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)
    features = feature_extract_model.predict(img_data)
    raw_features.append(features)
    finalimgs.append(img_path.split()[-1])
    tractFIPS.append(img_path.split('_')[1])
    count += 1

raw_features = np.vstack(raw_features)
np.save(os.path.join(outdir, city, 'features_tracts_fc7_z18_vgg_' + str(split) + '.npy'), raw_features)
np.save(os.path.join(outdir, city, 'filenames_tracts_fc7_z18_vgg_' + str(split) + '.npy'), finalimgs)

In [20]:
test_df = pd.DataFrame(raw_features)
tractFIPS = [row.lstrip('0') for row in tractFIPS]
test_df['TractFIPS'] = tractFIPS

In [21]:
la_df.TractFIPS = la_df.TractFIPS.astype('O')

In [23]:
test_df.to_csv('la_vgg.csv')

### Checking to see 2 dataframes are in order

In [34]:
test_df = test_df.sort_values(by=["TractFIPS"])

In [35]:
la_df = la_df.sort_values(by=["TractFIPS"])

In [49]:
test_df.TractFIPS

476    6037101110
785    6037101122
171    6037101210
512    6037101220
497    6037101300
123    6037101400
114    6037102103
420    6037102104
230    6037102105
163    6037102107
6      6037103101
300    6037103102
1      6037103200
201    6037103300
661    6037103400
479    6037104103
352    6037104105
936    6037104108
225    6037104124
210    6037104201
169    6037104203
356    6037104204
777    6037104310
403    6037104320
580    6037104401
513    6037104403
949    6037104404
988    6037104500
202    6037104610
598    6037104620
          ...    
807    6037294830
309    6037294900
127    6037295103
100    6037296210
716    6037296220
736    6037296300
451    6037296401
246    6037296402
79     6037296500
278    6037296600
645    6037296901
863    6037296902
889    6037297000
261    6037297110
905    6037297120
842    6037297201
545    6037297202
739    6037297300
145    6037297400
76     6037297500
69     6037297601
363    6037297602
62     6037800204
765    6037980008
581    603

In [46]:
la_df.TractFIPS

7938     6037101110
14686    6037101122
26595    6037101210
21812    6037101220
22840    6037101300
28589    6037101400
3636     6037102103
24987    6037102104
17790    6037102105
20369    6037102107
1296     6037103101
26009    6037103102
13962    6037103200
9098     6037103300
7429     6037103400
3678     6037104103
6952     6037104105
7679     6037104108
26420    6037104124
24751    6037104201
11622    6037104203
8870     6037104204
10369    6037104310
18072    6037104320
17114    6037104401
22442    6037104403
22867    6037104404
15674    6037104500
7657     6037104610
13105    6037104620
            ...    
11796    6037294900
18871    6037295103
938      6037296210
23870    6037296220
24165    6037296300
2833     6037296401
12162    6037296402
23675    6037296500
5897     6037296600
28025    6037296901
3401     6037296902
4286     6037297000
1683     6037297110
5809     6037297120
27808    6037297201
1177     6037297202
5314     6037297300
11185    6037297400
10394    6037297500


## Training regression model

In [5]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, Ridge, Lasso
from sklearn import metrics
from sklearn.svm import SVR
from scipy.stats.stats import pearsonr
X = pd.DataFrame.from_csv('la_vgg.csv').sort_values(by=['TractFIPS']).reset_index()

  


In [27]:
target = la_df.sort_values(by="TractFIPS")
target.TractFIPS

7938     6037101110
14686    6037101122
26595    6037101210
21812    6037101220
22840    6037101300
28589    6037101400
3636     6037102103
24987    6037102104
17790    6037102105
20369    6037102107
1296     6037103101
26009    6037103102
13962    6037103200
9098     6037103300
7429     6037103400
3678     6037104103
6952     6037104105
7679     6037104108
26420    6037104124
24751    6037104201
11622    6037104203
8870     6037104204
10369    6037104310
18072    6037104320
17114    6037104401
22442    6037104403
22867    6037104404
15674    6037104500
7657     6037104610
13105    6037104620
            ...    
11796    6037294900
18871    6037295103
938      6037296210
23870    6037296220
24165    6037296300
2833     6037296401
12162    6037296402
23675    6037296500
5897     6037296600
28025    6037296901
3401     6037296902
4286     6037297000
1683     6037297110
5809     6037297120
27808    6037297201
1177     6037297202
5314     6037297300
11185    6037297400
10394    6037297500


In [None]:
# Check outstanding rows
s = set(target['TractFIPS']).symmetric_difference(set(X['TractFIPS']))
s

In [28]:
target = target[target.TractFIPS != 6037930401]

In [32]:
y = target.Data_Value

## Train on LA and test on LA

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [34]:
# LR
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# SVR
svr_model = SVR(gamma='scale', C=1.0, epsilon=0.2)
svr_model.fit(X_train, y_train)

print(lr_model.intercept_, lr_model.coef_)

12677.515540286813 [ 7.86229992e-04 -3.77012247e-02  5.05105994e-03 ...  5.83115275e-02
  8.81374536e-02 -2.09773742e-06]


In [35]:
y_pred_lr = lr_model.predict(X_test)
y_pred_svr = svr_model.predict(X_test)

In [36]:
true_pred_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_lr})
true_pred_df.head()

Unnamed: 0,Actual,Predicted
15434,13.7,11.274867
1899,14.6,16.752663
24987,10.7,14.237731
20381,8.8,9.051664
18056,10.8,11.671513


In [37]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_lr))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_lr))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_lr)))

Mean Absolute Error: 2.559877612140193
Mean Squared Error: 10.229721967079504
Root Mean Squared Error: 3.198393654176969


In [38]:
true_pred_df_svr = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_svr})
true_pred_df_svr.head()

Unnamed: 0,Actual,Predicted
15434,13.7,13.059802
1899,14.6,13.279165
24987,10.7,12.374142
20381,8.8,13.288148
18056,10.8,12.452103


In [39]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_svr))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_svr))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_svr)))

Mean Absolute Error: 2.7604386428704415
Mean Squared Error: 10.378346766053545
Root Mean Squared Error: 3.2215441586378333


### K-fold

In [40]:
kfold = KFold(n_splits=5, random_state=42)
results = cross_val_score(lr_model, X, y, cv=kfold, scoring='r2')

In [41]:
print(results.mean())

-0.8666738778252224


In [42]:
kfold = KFold(n_splits=5, random_state=42)
results = cross_val_score(svr_model, X, y, cv=kfold, scoring='r2')
print(results.mean())

-0.47754928802915064
