# Import standard libraries

This analysis has taken inspiration from the research paper:
https://arxiv.org/pdf/1609.08399.pdf

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import cv2

# Import dataset

The dataset comes in the following form:

HousesInfo.txt: 

Corresponding Images of the houses



Goal: Make the dataset in the following manner:


## Get housing Attributes

In [15]:
def get_house_attributes(iPath):
    cols = ["bedrooms", "bathrooms", "area", "zipcode", "price"]
    df = pd.read_csv(iPath, sep=" ", header=None, names=cols)
    return df

In [47]:
df = get_house_attributes("Houses Dataset/HousesInfo.txt")
df.head()

Unnamed: 0,bedrooms,bathrooms,area,zipcode,price
0,4,4.0,4053,85255,869500
1,4,3.0,3343,36372,865200
2,3,4.0,3923,85266,889000
3,5,5.0,4022,85262,910000
4,3,4.0,4116,85266,971226


In [17]:
def get_images(df, iPath):
    images = []

    for i in df.index.values:
        basePath = os.path.sep.join([iPath, "{}_*".format(i + 1)])
        paths = sorted(list(glob.glob(basePath)))
        outImage = np.zeros((64,64,3), dtype="uint8")
        inImages = []

        for p in paths:
            image = cv2.imread(p)
            image = cv2.resize(image, (32, 32))
            inImages.append(image)

        # Tile images
        outImage[0:32, 0:32] = inImages[0]
        outImage[0:32, 32:64] = inImages[1]
        outImage[32:64, 0:32] = inImages[2]
        outImage[32:64, 32:64] = inImages[3]

        images.append(outImage)
    
    return np.array(images)
    

# Explaratory Data Analysis (EDA)

In [20]:
df.head()

Unnamed: 0,bedrooms,bathrooms,area,zipcode,price
0,4,4.0,4053,85255,869500
1,4,3.0,3343,36372,865200
2,3,4.0,3923,85266,889000
3,5,5.0,4022,85262,910000
4,3,4.0,4116,85266,971226


In [22]:
df.describe()

Unnamed: 0,bedrooms,bathrooms,area,zipcode,price
count,535.0,535.0,535.0,535.0,535.0
mean,3.37757,2.664953,2364.904673,90937.768224,589362.8
std,1.160952,0.995077,1224.556982,7141.857452,509026.1
min,1.0,1.0,701.0,36372.0,22000.0
25%,3.0,2.0,1440.0,92276.0,249200.0
50%,3.0,2.5,2078.0,92880.0,529000.0
75%,4.0,3.0,3067.5,93510.0,728500.0
max,10.0,7.0,9583.0,98021.0,5858000.0


### Explore Zipcode (categorical variable)

There are many zipcodes which do not have many instances. In this case, we try to 'generalize' the zipcodes by dropping 2 (and later, 3) of the least significant digits). Our argument is that the zipcodes with the same most significant digits fall in the same geeographical area and hence might have still retain similarities. This can help the prediction get more accurate. To test this, we will also predict numbers without any processing of the zipcode as a base case. 

In [48]:
df["zipcode"] = df["zipcode"].round(decimals=-2)

In [49]:
df.head()

Unnamed: 0,bedrooms,bathrooms,area,zipcode,price
0,4,4.0,4053,85300,869500
1,4,3.0,3343,36400,865200
2,3,4.0,3923,85300,889000
3,5,5.0,4022,85300,910000
4,3,4.0,4116,85300,971226


In [50]:
zipcodes = df["zipcode"].value_counts().keys().tolist()
counts = df["zipcode"].value_counts().tolist()

In [51]:
d = dict(zip(zipcodes, counts))

In [53]:
print(d)

{92300: 102, 94500: 63, 93500: 60, 93400: 54, 92900: 49, 91900: 33, 85300: 33, 92700: 28, 81500: 12, 93100: 12, 92000: 12, 96000: 12, 62200: 12, 95200: 10, 92800: 9, 60000: 6, 98000: 4, 62000: 3, 91800: 3, 85400: 3, 94600: 2, 81400: 2, 93700: 1, 36400: 1, 92500: 1, 90800: 1, 93900: 1, 62100: 1, 95000: 1, 90300: 1, 90000: 1, 93300: 1, 90200: 1}


We only keep the zipcode areas with instances more than 25. Simultaneouly, we will need to delete the images as well.

In [55]:
for (zipcode, count) in zip(zipcodes, counts):
    if count < 25:
        idxs = df[df["zipcode"] == zipcode].index
        df.drop(idxs, inplace=True)
        

In [58]:
df.size

2110

## Get images based on remaining index

In [59]:
images = get_images(df, "Houses Dataset/")
print(len(images))

422


# Model Building

## Preprocessing

## Training

## Testing

# Conclusion