In [None]:
import pandas as pd 
# Show every column when a frame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)
# Load the listing table; later cells read its imgFilepath and totPurchaseAmt columns.
data = pd.read_csv("imageFilenames.csv")
data.head()

In [None]:
from PIL import Image
import numpy as np

# Want to know how we should format the height x width image data dimensions
# for inputting to a keras model
def get_size_statistics(data):
    """Print height/width statistics for every readable image listed in ``data``.

    The numbers help decide which fixed height x width to feed to the model.

    Side effect: a row whose ``imgFilepath`` is missing (NaN/None), or whose
    image cannot be opened and decoded, has ``imgFilepath`` overwritten with
    the sentinel ``"no photo"`` so that a later step can drop it.

    Args:
        data: DataFrame with an ``imgFilepath`` column. Modified in place.

    Returns:
        None. The statistics are printed; if no image is readable a short
        notice is printed instead.
    """
    heights = []
    widths = []
    for i, path in data["imgFilepath"].items():
        # Missing paths can never be opened: flag them without touching disk.
        if pd.isna(path):
            data.at[i, "imgFilepath"] = "no photo"
            continue
        # Already flagged, nothing to measure.
        if path == "no photo":
            continue
        try:
            # The context manager releases the file handle; np.array() forces a
            # full decode so truncated/corrupt files are caught here as well.
            with Image.open(path) as img:
                imgData = np.array(img)
            height, width = imgData.shape[:2]
        except Exception:
            # Best effort: an unreadable image is flagged rather than aborting
            # the scan. (Exception, not a bare except, so Ctrl-C still works.)
            data.at[i, "imgFilepath"] = "no photo"
            continue
        heights.append(height)
        widths.append(width)

    if not heights:
        # Avoid dividing by zero when nothing could be read.
        print("No readable images found.")
        return

    avg_height = sum(heights) / len(heights)
    avg_width = sum(widths) / len(widths)
    print("Average Height: " + str(avg_height))
    print("Max Height: " + str(max(heights)))
    print("Min Height: " + str(min(heights)))
    print('\n')
    print("Average Width: " + str(avg_width))
    print("Max Width: " + str(max(widths)))
    print("Min Width: " + str(min(widths)))

# Print the size statistics. Note this also mutates `data`: rows whose image
# cannot be opened get imgFilepath set to "no photo".
get_size_statistics(data)

In [None]:
# Remove, in place, the listings that were flagged as having no usable image.
noPhotoIndx = data.loc[data["imgFilepath"] == "no photo"].index
data.drop(index=noPhotoIndx, inplace=True)

# Then remove listings priced below 70000, which are treated as incorrect prices.
incorrectPrice = data.loc[data["totPurchaseAmt"] < 70000].index
data.drop(index=incorrectPrice, inplace=True)

In [None]:
# (rows, columns) remaining after the filtering above.
data.shape

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Bin the purchase amount into 10 quantile-based buckets.
data["amtBucket"] = pd.qcut(data["totPurchaseAmt"], 10)

In [None]:
# Preview the frame with the new amtBucket column.
data.head()

In [None]:
# Number of rows that fall in each bucket.
data["amtBucket"].value_counts()

In [None]:
# Distinct buckets in order of first appearance; a bucket's position in this
# list is used as its integer label.
bucketList = list(data["amtBucket"].unique())
print(bucketList)
data["amtBucketNum"] = list(map(bucketList.index, data["amtBucket"]))

In [None]:
# Preview the frame with the new amtBucketNum column.
data.head()

In [None]:
# Side length, in pixels, that every image is resized to (square images).
IMG_SIZE = 224

def resizeAndLoadData(data):
    """Load every image in ``data``, resize it and store the pixels in ``data``.

    Each file named in the ``imgFilepath`` column is opened, converted to
    3-channel RGB, resized to ``IMG_SIZE`` x ``IMG_SIZE`` and stored as a
    NumPy array in a new ``imgData`` column.

    Args:
        data: DataFrame with an ``imgFilepath`` column of readable image
            paths. Modified in place.

    Returns:
        None.

    Raises:
        Whatever ``Image.open`` raises for a path that cannot be read.
    """
    imageData = []
    for path in data["imgFilepath"]:
        # Close the file as soon as the pixels have been read.
        with Image.open(path) as img:
            # Force RGB so grayscale / RGBA / palette files also produce an
            # (IMG_SIZE, IMG_SIZE, 3) array; RGB images are unaffected.
            resized = img.convert("RGB").resize((IMG_SIZE, IMG_SIZE))
        imageData.append(np.array(resized))
    data["imgData"] = imageData

# Load and resize every image; this adds an imgData column to `data` in place.
resizeAndLoadData(data)

In [None]:
# Show a single row to check the new imgData column.
data.head(1)

In [None]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
# Regression target: the raw purchase amount. (For a classification variant,
# one-hot encode amtBucketNum with to_categorical instead.)
y = data["totPurchaseAmt"]
# Features: the per-row image arrays.
X = np.array(data["imgData"])
# Hold out 20% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
import matplotlib.pyplot as plt
# Sanity check: display the first loaded image.
plt.imshow(X[0])

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
# BatchNormalization is exported from keras.layers; importing it from
# keras.layers.normalization fails on newer Keras releases.
from keras.layers import BatchNormalization

In [None]:
from keras import applications

# VGG16 convolutional base with ImageNet weights and no classifier head.
vggBase = applications.VGG16(weights="imagenet", input_shape=(IMG_SIZE, IMG_SIZE, 3), include_top=False)

# Freeze all but the last two layers of the base. The loop has to walk the
# base's own layers: once the base is added to the Sequential model it counts
# as a single layer, so slicing vggModel.layers would select nothing and leave
# the whole network trainable.
numLayers = len(vggBase.layers)

for layer in vggBase.layers[:numLayers-2]:
    layer.trainable = False

vggModel = Sequential()

vggModel.add(vggBase)

# Regression head: flatten the feature maps, light dropout, one linear output.
vggModel.add(Flatten(input_shape=vggModel.output_shape[1:]))
vggModel.add(Dropout(.1))
vggModel.add(Dense(1, activation="linear"))

vggModel.summary()

In [None]:
# Regression setup: mean squared error loss, Adam optimizer, MAE reported as a metric.
vggModel.compile(loss='mean_squared_error', optimizer='adam', metrics = ['mae'])

In [None]:
# Turn the array of per-image arrays into one (N, IMG_SIZE, IMG_SIZE, 3) batch.
X_train = np.array(list(X_train)).reshape((-1, IMG_SIZE, IMG_SIZE, 3))

# Train, holding out 10% of the training rows for validation.
vggModel.fit(
    X_train,
    y_train,
    epochs=500,
    batch_size=50,
    validation_split=0.1,
    verbose=1,
)

In [None]:
# Turn the held-out images into one (N, IMG_SIZE, IMG_SIZE, 3) batch.
X_test = np.array(list(X_test)).reshape((-1, IMG_SIZE, IMG_SIZE, 3))

# evaluate() returns the loss followed by the compiled metric (MAE).
loss, mae = vggModel.evaluate(x=X_test, y=y_test, verbose=1)

print("Loss (MSE): ", loss)
print("MAE: ", mae)

In [None]:
# Save the trained model to ZillowCNN.h5.
vggModel.save("ZillowCNN.h5")