In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import requests
import os
import sys

from sklearn.feature_selection import SelectFromModel
from skimage import data as imgData, feature, exposure
from skimage.io import imread_collection, imshow, imread
from sklearn.feature_extraction import DictVectorizer, image
from PIL import Image

In [36]:
sns.set(color_codes=True)
data = pd.read_json('test.json', convert_dates=['created'])
data

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address
0,1.0,1,79780be1514f645d7e6be99a3de696c5,2016-06-11 05:29:41,Large with awesome terrace--accessible via bed...,Suffolk Street,"[Elevator, Laundry in Building, Laundry in Uni...",40.7185,7142618,-73.9865,b1b1852c416d78d7765d746cb1b8921f,[https://photos.renthop.com/2/7142618_1c45a2c8...,2950,99 Suffolk Street
1,1.0,2,0,2016-06-24 06:36:34,Prime Soho - between Bleecker and Houston - Ne...,Thompson Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7278,7210040,-74.0000,d0b5648017832b2427eeb9956d966a14,[https://photos.renthop.com/2/7210040_d824cc71...,2850,176 Thompson Street
2,1.0,0,0,2016-06-17 01:23:39,Spacious studio in Prime Location. Cleanbuildi...,Sullivan Street,"[Pre-War, Dogs Allowed, Cats Allowed]",40.7260,7174566,-74.0026,e6472c7237327dd3903b3d6f6a94515a,[https://photos.renthop.com/2/7174566_ba3a35c5...,2295,115 Sullivan Street
3,1.0,2,f9c826104b91d868e69bd25746448c0c,2016-06-21 05:06:02,For immediate access call Bryan.<br /><br />Bo...,Jones Street,"[Hardwood Floors, Dogs Allowed, Cats Allowed]",40.7321,7191391,-74.0028,41735645e0f8f13993c42894023f8e58,[https://photos.renthop.com/2/7191391_8c2f2d49...,2900,23 Jones Street
5,1.0,1,81062936e12ee5fa6cd2b965698e17d5,2016-06-16 07:24:27,Beautiful TRUE 1 bedroom in a luxury building ...,Exchange Place,"[Roof Deck, Doorman, Elevator, Fitness Center,...",40.7054,7171695,-74.0095,a742cf7dd3b2627d83417bc3a1b3ec96,[https://photos.renthop.com/2/7171695_089ffee2...,3254,20 Exchange Place
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124003,1.0,1,bd863d28a6b119ac3bc72d5f27b07f24,2016-04-26 16:09:55,BRAND NEW TO MARKET 1BDR \r107TH AND LEXINGTON...,150 EAST 107TH STREET,[],40.7925,6928108,-73.9454,453d46f8113e1f2c730c2ee5a4469c71,[https://photos.renthop.com/2/6928108_231eb983...,1700,158 EAST 107TH STREET
124005,1.0,2,9174b75c0cd978eb0e5aa93afbad754b,2016-04-21 05:06:19,Convertible 2BR apartment features a brand new...,E 33rd St.,"[Doorman, Elevator, Laundry in Building, Dishw...",40.7456,6906674,-73.9797,2983e45f7e0ad87d677dacd13e362785,[https://photos.renthop.com/2/6906674_9fe899a8...,4195,141 E 33rd St.
124006,1.0,0,0,2016-04-20 01:31:52,"Let's get you in to see this $2,400/mo, recent...",Lexington Avenue,"[Dogs Allowed, Cats Allowed]",40.7416,6897967,-73.9829,e6472c7237327dd3903b3d6f6a94515a,[],2400,95 Lexington Avenue
124007,2.0,2,c90c010e5505365676538e64d02aa1e0,2016-04-08 02:26:45,CooperCooper.com :: Web ID #171357; Access 100...,Park Avenue,"[Doorman, Elevator, Cats Allowed, Dogs Allowed]",40.7485,6842183,-73.9800,6e5c10246156ae5bdcd9b487ca99d96a,[https://photos.renthop.com/2/6842183_b1fe51f4...,6895,41 Park Avenue


# 3 feature extraction

## Basic features

- Number of photos in each posting
- Number of features user included in each posting
- length of the description text
- price per bedroom
- price per bathroom
- price per (bedroom+bathroom)
- interest rate converted to numerical value (1, 2 ,3)
- distance to New York city center in longtitude and latitude unit

In [37]:
data['num_of_photos'] = data['photos'].map(len)
data['num_of_features'] = data['features'].map(len)
data['len_of_description'] = data['description'].map(len)

# price per room
data['price_per_bedroom'] = data['price'] / data['bedrooms']
data['price_per_bathroom'] = data['price'] / data['bathrooms']
data['price_per_bed_bath_room'] = data['price'] / (data['bedrooms'] + data['bathrooms'])

# convert interest_level into numerical values
# interest_map = {'low':1, 'medium':2, 'high':3}
# data['interest_level'] = data['interest_level'].apply(lambda x: interest_map[x])

# distance to city center
city_center = [40.7128, -74.0060]
data['dist_to_city_center'] = list(map(lambda lng, lat: np.sqrt((lat - city_center[0])**2 + (lng - city_center[1])**2), data['longitude'], data['latitude']))


## Text feature extraction

### Some positive adjectives appeared in the description may have possitive effect to the posting's interest rate. Count the times that the possitive adjectives appeared. (the keywords array can be expanded in real-world practice)
- count of each possitive adjs
- count of all possitive adjs

In [38]:
keywords = ['quiet', 'new', 'close', 'spacious', 'convinient', 'safe', 'care']
for kw in keywords:
    col_name = '_'.join(['kw'] + kw.split() + ['count'])
    data[col_name] = [(s.lower().count(kw)) if s is not np.nan else 0 for s in data['description']]

def countPos(desc):
    i = 0
    desc = desc.lower()
    for kw in keywords:
        if kw in desc:
            i = i + desc.count(kw)
    return i
    
    
data['pos_count'] = data['description'].apply(countPos)
data = data.drop(['building_id','created','description', 'street_address','display_address', 'features', 'manager_id', 'photos'], axis=1)

data

Unnamed: 0,bathrooms,bedrooms,latitude,listing_id,longitude,price,num_of_photos,num_of_features,len_of_description,price_per_bedroom,...,price_per_bed_bath_room,dist_to_city_center,kw_quiet_count,kw_new_count,kw_close_count,kw_spacious_count,kw_convinient_count,kw_safe_count,kw_care_count,pos_count
0,1.0,1,40.7185,7142618,-73.9865,2950,8,6,587,2950.0,...,1475.000000,0.020316,0,1,0,0,0,0,0,1
1,1.0,2,40.7278,7210040,-74.0000,2850,3,3,245,1425.0,...,950.000000,0.016155,0,1,0,0,0,0,0,1
2,1.0,0,40.7260,7174566,-74.0026,2295,1,3,268,inf,...,2295.000000,0.013631,0,0,1,1,0,0,0,2
3,1.0,2,40.7321,7191391,-74.0028,2900,4,3,146,1450.0,...,966.666667,0.019563,0,1,0,0,0,0,0,1
5,1.0,1,40.7054,7171695,-74.0095,3254,6,10,564,3254.0,...,1627.000000,0.008186,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124003,1.0,1,40.7925,6928108,-73.9454,1700,10,0,258,1700.0,...,850.000000,0.100122,0,1,1,0,0,0,0,2
124005,1.0,2,40.7456,6906674,-73.9797,4195,4,8,701,2097.5,...,1398.333333,0.042042,0,1,1,0,0,0,0,2
124006,1.0,0,40.7416,6897967,-73.9829,2400,0,2,866,inf,...,2400.000000,0.036920,0,0,0,1,0,0,0,1
124007,2.0,2,40.7485,6842183,-73.9800,6895,8,4,816,3447.5,...,1723.750000,0.044164,0,1,0,0,0,0,0,1


In [39]:
data.to_csv('preprocessed_test_data.csv', index=False)