# Content-based Engines for the for-sale Homes using Python

In [1]:
import numpy as np
import pandas as pd
import requests
import json
import urllib
import datetime as dt

import os, sys, inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

from ipynb.fs.full.Credentials import *

In [None]:
DS_dir = currentdir + '/Data_Source'

try:
    os.mkdir(DS_dir, mode = 0o666)

except FileExistsError:
    
    pass

## Data Collection

In [3]:
where = urllib.parse.quote_plus("""
{
    "State": "NJ",
    "County": "Ocean County"
}
""")
url = 'https://parseapi.back4app.com/classes/US_Zip_Code?count=1&limit=35&order=US_Zip_Code&excludeKeys=Unacceptable_cities,country,Remarks,Timezone&where=%s' % where
headers = {
    'X-Parse-Application-Id': back4app_ID,
    'X-Parse-Master-Key': back4app_key
}

data_cities = json.loads(requests.get(url, headers = headers).content.decode('utf-8'))
print(json.dumps(data_cities, indent = 2))

{
  "results": [
    {
      "objectId": "meQ6vKJetk",
      "US_Zip_Code": 8005,
      "Type": "STANDARD",
      "Primary_city": "Barnegat",
      "State": "NJ",
      "County": "Ocean County",
      "Area_codes": 609,
      "Latitude": 39.75,
      "Longitude": -74.22,
      "estimated_population": 19593,
      "createdAt": "2020-02-11T17:24:50.669Z",
      "updatedAt": "2020-02-11T17:24:50.669Z"
    },
    {
      "objectId": "dtP0eU85lh",
      "US_Zip_Code": 8006,
      "Type": "PO BOX",
      "Primary_city": "Barnegat Light",
      "Acceptable_cities": "Barnegat Lgt",
      "State": "NJ",
      "County": "Ocean County",
      "Area_codes": 609,
      "Latitude": 39.75,
      "Longitude": -74.11,
      "estimated_population": 687,
      "createdAt": "2020-02-11T17:24:50.669Z",
      "updatedAt": "2020-02-11T17:24:50.669Z"
    },
    {
      "objectId": "lc4pUW4mon",
      "US_Zip_Code": 8008,
      "Type": "STANDARD",
      "Primary_city": "Beach Haven",
      "Acceptable_cities":

In [4]:
df_cities = pd.json_normalize(data_cities['results'])
city_names = list(df_cities['Primary_city'].unique())
city_names

['Barnegat',
 'Barnegat Light',
 'Beach Haven',
 'Manahawkin',
 'Tuckerton',
 'West Creek',
 'Jackson',
 'New Egypt',
 'Lakewood',
 'Bayville',
 'Beachwood',
 'Brick',
 'Forked River',
 'Island Heights',
 'Lakehurst',
 'Lanoka Harbor',
 'Lavallette',
 'Mantoloking',
 'Normandy Beach',
 'Ocean Gate',
 'Pine Beach',
 'Point Pleasant Beach',
 'Seaside Heights',
 'Seaside Park',
 'Toms River',
 'Waretown',
 'Manchester Township']

In [12]:
api_key = Realtor_API_KEY_2

features = ['single_story', 'two_or_more_stories', 'garage_1_or_more', 'garage_2_or_more', 'basement']

In [17]:
def getHome(api_key, cities, state, features):
    table = []
    url = "https://realtor.p.rapidapi.com/properties/v2/list-for-sale"
    
    for city in cities:
        for feature in features:
            querystring = {"city":city,
                           "limit":"200",
                           "offset":"0",
                           "beds_min":"1",
                           "baths_min":"1",
                           "sqft_min":"1",
                           "state_code":state,
                           "features":feature
                          }

            headers = {
                'x-rapidapi-key': api_key,
                'x-rapidapi-host': "realtor.p.rapidapi.com"
                }
            try:
        
                response = requests.request("GET", url, headers = headers, params = querystring)

                print(city, feature, len(response.json()['properties']))

                if len(response.json()['properties']) > 0:
                    for item in response.json()['properties']:
                        item['city'] = item['address']['city']
                        item['line'] = item['address']['line']
                        item['zipcode'] = item['address']['postal_code']
                        item['state'] = item['address']['state_code']
                        item['longitude'] = item['address']['lon']
                        item['latitude'] = item['address']['lat']
                        item['size(sqft)'] = item['building_size']['size']
                        item['features'] = feature
                        data = pd.DataFrame.from_dict(item, orient = 'index').T
                        row = data[['property_id', 'price', 'beds', 'baths', 'size(sqft)',
                                    'line', 'city', 'state', 'zipcode', 
                                    'longitude', 'latitude', 'features', 'last_update']]

                        table.append(row)
            except len(response.json()['properties']) == 0:
                continue

    table = pd.concat(table, axis = 0, ignore_index = True, sort = False)
    
    table['price'] = table['price'].astype(int)
    table['price'].describe().apply(lambda x: format(x, 'f'))
    table['size(sqft)'] = table['size(sqft)'].astype(int)
    table['size(sqft)'].describe().apply(lambda x: format(x, 'f'))
    table['longitude'] = table['longitude'].astype(float)
    table['latitude'] = table['latitude'].astype(float)
    table['beds'] = table['beds'].astype(int)
    table['baths'] = table['baths'].astype(int)
    table['last_update'] = pd.to_datetime(table['last_update'], format='%Y-%m-%dT%H:%M:%SZ', errors='coerce')
    table['zipcode'] = table['zipcode'].astype('category')
    table['features'] = table['features'].astype('category')
        
    return table

In [18]:
df_home = getHome(api_key, city_names, 'NJ', features)
df_home

Unnamed: 0,property_id,price,beds,baths,size(sqft),line,city,state,zipcode,longitude,latitude,last_update,features
0,M6251759680,495000,4,3,3383,187 Emerson Ln,Barnegat,NJ,08005,-74.223409,39.764946,2021-03-18 00:05:52,single_story
1,M6140700387,189000,2,2,1174,3 Tall Hedge Ct,Barnegat,NJ,08005,-74.271988,39.769174,2021-03-18 15:57:18,single_story
2,M5705044185,339000,2,2,2059,40 Lakeland Dr,Barnegat,NJ,08005,-74.264284,39.771079,2021-03-16 18:00:00,single_story
3,M6200124677,310000,2,2,1860,33 Westport Dr,Barnegat,NJ,08005,-74.274185,39.762913,2021-03-18 10:25:29,single_story
4,M6319666484,235000,3,1,1112,117 7th St,Barnegat,NJ,08005,-74.203929,39.770314,2021-03-13 07:42:37,single_story
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5486,M5350598726,299900,3,2,1682,840 Englemere Blvd,Toms River,NJ,08757,-74.268109,40.006305,2021-03-05 12:43:08,basement
5487,M5294372276,369000,2,3,2324,4 Campania Ct,Manchester,NJ,08759,-74.259830,40.045829,2021-01-23 12:05:42,basement
5488,M6100873763,290000,3,2,1404,1909 1st Ave,Toms River,NJ,08757,-74.243452,39.997435,2021-01-19 11:18:09,basement
5489,M5975617856,199900,1,1,864,2001 Manchester St,Toms River,NJ,08757,-74.274287,40.006442,2021-01-29 11:18:35,basement


In [19]:
# df_home.to_csv('./Data_Source/home.csv', index = False)

In [None]:
df_home.info()
df_home.describe().apply(lambda s: s.apply('{0:.2f}'.format))

## Clean Data

In [1]:
pre_home = pd.read_csv('./Data_Source/home.csv').drop_duplicates().reset_index(drop = True)
pre_home['last_update'] = pd.to_datetime(pre_home['last_update'])
pre_home['days_ago'] = (np.datetime64(dt.date.today()) - 
                        pre_home[['last_update']].values.astype('datetime64[D]')).astype(int)
pre_home

Unnamed: 0,property_id,price,beds,baths,size(sqft),line,city,state,zipcode,longitude,latitude,features,last_update,days_ago
0,M6251759680,495000,4,3,3383,187 Emerson Ln,Barnegat,NJ,8005,-74.223409,39.764946,single_story,2021-03-18 00:05:52,8
1,M6140700387,189000,2,2,1174,3 Tall Hedge Ct,Barnegat,NJ,8005,-74.271988,39.769174,single_story,2021-03-18 15:57:18,8
2,M5705044185,339000,2,2,2059,40 Lakeland Dr,Barnegat,NJ,8005,-74.264284,39.771079,single_story,2021-03-16 18:00:00,10
3,M6200124677,310000,2,2,1860,33 Westport Dr,Barnegat,NJ,8005,-74.274185,39.762913,single_story,2021-03-18 10:25:29,8
4,M6319666484,235000,3,1,1112,117 7th St,Barnegat,NJ,8005,-74.203929,39.770314,single_story,2021-03-13 07:42:37,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5333,M5462419635,459000,5,4,2612,320 Commonwealth Blvd,Manchester,NJ,8759,-74.277036,39.998821,basement,2021-03-16 07:40:18,10
5334,M6157159858,289900,3,2,1568,2573 Woodland Rd,Manchester,NJ,8759,-74.255452,40.034131,basement,2021-03-12 11:05:30,14
5335,M6657962521,484900,4,3,2114,1781 New York Ave,Manchester Township,NJ,8759,-74.379836,39.928933,basement,2021-03-05 16:51:31,21
5336,M5565388063,450000,4,3,2283,1661 Delaware Ave,Manchester,NJ,8759,-74.381291,39.923657,basement,2021-02-16 08:22:31,38


In [2]:
cols = pre_home.columns[5:7].tolist() + ['features']
Home_1 = pre_home.groupby(cols).agg({'days_ago':min, 'price':min}).reset_index()
Home_1

Unnamed: 0,line,city,features,days_ago,price
0,0-A Bee Way,Forked River,garage_1_or_more,37,459900
1,0-A Bee Way,Forked River,garage_2_or_more,37,459900
2,0-A Bee Way,Forked River,two_or_more_stories,37,459900
3,1 24th Ave Apt B2,Seaside Park,single_story,41,454500
4,1 24th Ave Unit C,Seaside Park,single_story,49,575000
...,...,...,...,...,...
5262,Court B Unit 1201,Brick,garage_1_or_more,53,209999
5263,Court B Unit 1201,Brick,single_story,53,209999
5264,Motor Rd,Pine Beach,basement,217,550000
5265,Motor Rd,Pine Beach,garage_1_or_more,217,550000


In [3]:
Home_1[Home_1.duplicated(cols)]

Unnamed: 0,line,city,features,days_ago,price


In [4]:
Home_2 = Home_1.groupby(['line', 'city'])['features'].agg(lambda column: ", ".join(column)).reset_index(name = "features")
Home_2

Unnamed: 0,line,city,features
0,0-A Bee Way,Forked River,"garage_1_or_more, garage_2_or_more, two_or_mor..."
1,1 24th Ave Apt B2,Seaside Park,single_story
2,1 24th Ave Unit C,Seaside Park,single_story
3,1 4th St,Manahawkin,"garage_1_or_more, two_or_more_stories"
4,1 Azalea Ct,Barnegat,"basement, garage_1_or_more, garage_2_or_more, ..."
...,...,...,...
2229,A Chatham Ct Unit 1001,Lakewood,"garage_1_or_more, single_story"
2230,B Molly Pitcher Blvd Unit 11,Whiting,single_story
2231,Bee Way,Forked River,"garage_1_or_more, two_or_more_stories"
2232,Court B Unit 1201,Brick,"basement, garage_1_or_more, single_story"


In [5]:
cols = Home_1.columns[:2].tolist() + Home_1.columns[-2:].tolist()
Home = Home_1[cols].merge(Home_2).sort_values('price').drop_duplicates(subset = Home_2.columns[:2].tolist(),
                                                                       keep = 'first').reset_index(drop = True)
Home

Unnamed: 0,line,city,days_ago,price,features
0,525 Main C St,Tuckerton,25,18500,single_story
1,469 S Green St Trlr 27,Tuckerton,78,29900,single_story
2,19D Monticello Dr,Whiting,12,35000,single_story
3,10G Bennington Ln Unit G,Whiting,25,38500,single_story
4,B Molly Pitcher Blvd Unit 11,Whiting,9,39000,single_story
...,...,...,...,...,...
2229,14 Cummins St,Mantoloking,25,4500000,"garage_1_or_more, garage_2_or_more, two_or_mor..."
2230,1324 Ocean Front,Point Pleasant Beach,19,5700000,"basement, garage_1_or_more, garage_2_or_more, ..."
2231,91 Pershing Blvd,Lavallette,139,6500000,"garage_1_or_more, garage_2_or_more, two_or_mor..."
2232,1211 Ocean Ave,Mantoloking,84,9250000,"basement, garage_1_or_more, garage_2_or_more, ..."


In [6]:
home_ult = pre_home.iloc[:,:-3].merge(Home, how = 'right').drop_duplicates(subset = Home.columns[:2].tolist(),
                                                                           keep = 'first').reset_index(drop = True)
home_ult

Unnamed: 0,property_id,price,beds,baths,size(sqft),line,city,state,zipcode,longitude,latitude,days_ago,features
0,M6602530331,18500,2,2,1000,525 Main C St,Tuckerton,NJ,8087,-74.322433,39.615565,25,single_story
1,M9930412229,29900,2,2,784,469 S Green St Trlr 27,Tuckerton,NJ,8087,-74.338429,39.590329,78,single_story
2,M9471690149,35000,1,1,782,19D Monticello Dr,Whiting,NJ,8759,-74.369100,39.956901,12,single_story
3,M5847512768,38500,1,1,537,10G Bennington Ln Unit G,Whiting,NJ,8759,-74.369306,39.959331,25,single_story
4,M9269193256,39000,2,1,850,B Molly Pitcher Blvd Unit 11,Whiting,NJ,8759,-74.375238,39.960069,9,single_story
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2229,M9886104467,4500000,6,5,4430,14 Cummins St,Mantoloking,NJ,8738,-74.057772,40.006954,25,"garage_1_or_more, garage_2_or_more, two_or_mor..."
2230,M9479483724,5700000,7,8,4480,1324 Ocean Front,Point Pleasant Beach,NJ,8742,-74.039428,40.083495,19,"basement, garage_1_or_more, garage_2_or_more, ..."
2231,M5334622826,6500000,8,8,7219,91 Pershing Blvd,Lavallette,NJ,8735,-74.083205,39.965606,139,"garage_1_or_more, garage_2_or_more, two_or_mor..."
2232,M5495064600,9250000,7,8,7700,1211 Ocean Ave,Mantoloking,NJ,8738,-74.049089,40.038775,84,"basement, garage_1_or_more, garage_2_or_more, ..."


In [7]:
pre_home[['line']][~pre_home['line'].isin(list(home_ult['line']))]

Unnamed: 0,line
1147,
1156,
1167,
1174,
1182,
1826,
2516,
3897,
3917,
3927,


In [8]:
home_ult[home_ult.duplicated(['line'])]

Unnamed: 0,property_id,price,beds,baths,size(sqft),line,city,state,zipcode,longitude,latitude,days_ago,features


In [9]:
# home_ult.to_csv('./Data_Source/home_ult.csv', index = False)