In [299]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
import category_encoders as ce
import requests
from geopy.distance import distance

data = pd.read_csv('data/houses2.csv', sep='|').drop(columns=['Unnamed: 0'])

In [197]:
data.head()

Unnamed: 0,condition,elevator,energy_class,floor,housetype,neighborhood,price,price_per_square_meter,number_of_rooms,rooms,square_meters,year
0,hyvä,on,G,5/5,kt,Kamppi,191050,10055,1,h,1900,1929
1,tyyd.,ei,,3/4,kt,Munkkivuori,143000,5789,1,"h, kk, kh",2470,1957
2,tyyd.,on,E2007,6/6,kt,Harju,167000,6680,1,h+kk+alk+wc,2500,1940
3,tyyd.,ei,F,1/4,kt,Ruskeasuo,157000,6181,1,"h, kk",2540,1951
4,hyvä,ei,,2/3,kt,Puotila,133000,4926,1,"h, kk, alk, vh...",2700,1961


In [198]:
# Encode condition
data.condition.loc[data.condition == '\xa0'] = None

mapping = [{'col': 'condition', 
            'mapping': 
                [('hyvä', 2), 
                 ('tyyd.', 1), 
                 ('huono', 0)]
           }]

ordinal_encoder = ce.ordinal.OrdinalEncoder(cols=['condition'])
data_cleaned = ordinal_encoder.ordinal_encoding(data, mapping=mapping)[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [199]:
# Encode elevator
elevator_data = data.elevator.copy()
elevator_data[elevator_data == 'on'] = '1'
elevator_data[elevator_data == 'ei'] = '0'

data_cleaned.elevator = pd.to_numeric(elevator_data)

In [201]:
# Return triplet 
# (floor number, top_floor, bottom_floor, floor_fraction)
def floorClean(floor):
    if floor == None:
        return (None, None, None, None)
    
    floor_frac = eval(floor)
    if floor_frac >= 0:
        floor_number = int(floor[0])
    else:
        floor_number = -int(floor[1])
        
    total_floors = int(floor[-1])
    if floor_number == total_floors:
        top_floor = True
    else:
        top_floor = False
    
    if floor_number == '1':
        bottom_floor = True
    else:
        bottom_floor = False
    
    return (floor_number, top_floor, bottom_floor, floor_frac)
    

In [202]:
# Encode floor into floor number, top_floor, floor_fraction
floor_data = data_cleaned.floor.copy()
floor_data[floor_data == '\xa0'] = None

floor_df = floor_data.apply(floorClean)
floor_df = floor_df.apply(pd.Series)
floor_df.columns = ['floor_num', 'top_floor', 
                   'bottom_floor', 'floor_frac']

In [203]:
data_cleaned = data_cleaned.drop(columns=['floor'])
data_cleaned = data_cleaned.join(floor_df)

In [204]:
# Encode house type
onehot_encoder = ce.one_hot.OneHotEncoder(cols=['housetype'],
                                         handle_unknown='ignore')
house_data = onehot_encoder.fit(data_cleaned)

tmp = house_data.transform(data_cleaned)
tmp = tmp.rename(
    columns={'housetype_0' : 'type_kt',
             'housetype_1' : 'type_rt',
             'housetype_2' : 'type_ok'})
data_cleaned = tmp

In [213]:
# Fix square meter
def fix_comma(x):
    x = x.replace(',','.')
    return float(x)

square_meters = data.square_meters.apply(fix_comma)
data_cleaned.square_meters = square_meters

In [262]:
# Find lat and long for neighborhoods
api_key = 'censored'
base_addr = 'https://maps.googleapis.com/maps/api/geocode/json?address='

neighborhood_dict = {}

neighborhoods = data.neighborhood.unique()
for n_idx, neighborhood in enumerate(neighborhoods):
    if (pd.isnull(neighborhood)):
        continue
        
    if '/' in neighborhood:
        neighborhood_query = neighborhood.split('/', 1)[1]
    else:
        neighborhood_query = neighborhood
    
    address = neighborhood_query + ',Helsinki'
    query = base_addr + address + '&key=' + api_key
    
    r = requests.get(query)
    try:
        location = r.json()['results'][0]['geometry']['location']
    except:
        location = {'lat': 0, 'lng': 0}
        
    neighborhood_dict[neighborhood] = location
    
    print('Progress: ' + str(n_idx+1) + ' of ' + str(len(neighborhoods)))
    

Progress: 1 of 155
Progress: 2 of 155
Progress: 3 of 155
Progress: 4 of 155
Progress: 5 of 155
Progress: 6 of 155
Progress: 7 of 155
Progress: 8 of 155
Progress: 9 of 155
Progress: 10 of 155
Progress: 11 of 155
Progress: 12 of 155
Progress: 13 of 155
Progress: 14 of 155
Progress: 15 of 155
Progress: 16 of 155
Progress: 17 of 155
Progress: 18 of 155
Progress: 19 of 155
Progress: 20 of 155
Progress: 21 of 155
Progress: 22 of 155
Progress: 23 of 155
Progress: 24 of 155
Progress: 25 of 155
Progress: 26 of 155
Progress: 27 of 155
Progress: 28 of 155
Progress: 29 of 155
Progress: 30 of 155
Progress: 31 of 155
Progress: 32 of 155
Progress: 33 of 155
Progress: 34 of 155
Progress: 35 of 155
Progress: 36 of 155
Progress: 37 of 155
Progress: 38 of 155
Progress: 39 of 155
Progress: 40 of 155
Progress: 41 of 155
Progress: 42 of 155
Progress: 43 of 155
Progress: 44 of 155
Progress: 45 of 155
Progress: 46 of 155
Progress: 47 of 155
Progress: 48 of 155
Progress: 49 of 155
Progress: 50 of 155
Progress:

In [277]:
def createLatLong(x):
    # x is string from neighborhood dict
    # Returns tuple (lat, long)
    if (pd.isnull(x)):
        return (np.nan, np.nan)
    elif x == 'Kuninkaantammi':
        x = 'Hakuninmaa'
        
    loc = neighborhood_dict[x]
    
    # Stupid bug
    try:
        ret = (loc['lat'], loc['lng'])
    except:
        ret = (loc['lat'], loc['long'])
        
    return ret

In [278]:
location_data = data.neighborhood.copy()

location_df = location_data.apply(createLatLong)
location_df = location_df.apply(pd.Series)
location_df.columns = ['lat', 'lng']
data_cleaned = data_cleaned.join(location_df)

In [315]:
# Add distance from city center
helsinki_center = (60.16985569999999, 24.9383791)
def distanceToCenter(x):
    lat = x['lat']
    lng = x['lng']
    loc = (lat, lng)

    dist_center = distance(helsinki_center, loc).kilometers
    
    return dist_center

In [318]:
dists = data_cleaned.apply(distanceToCenter, axis=1)
data_cleaned['dist_center'] = dists



In [321]:
data_cleaned

Unnamed: 0,type_kt,type_rt,type_ok,elevator,energy_class,neighborhood,price,price_per_square_meter,number_of_rooms,rooms,square_meters,year,condition,floor_num,top_floor,bottom_floor,floor_frac,lat,lng,dist_center
0,1,0,0,1,G,Kamppi,191050,10055,1,h,19.0,1929,2,5.0,True,False,1.000000,60.167458,24.931075,0.485608
1,1,0,0,0,,Munkkivuori,143000,5789,1,"h, kk, kh",24.7,1957,1,3.0,False,False,0.750000,60.206704,24.871212,5.544540
2,1,0,0,1,E2007,Harju,167000,6680,1,h+kk+alk+wc,25.0,1940,1,6.0,True,False,1.000000,60.188258,24.955949,2.270341
3,1,0,0,0,F,Ruskeasuo,157000,6181,1,"h, kk",25.4,1951,1,1.0,False,False,0.250000,60.202747,24.905494,4.093763
4,1,0,0,0,,Puotila,133000,4926,1,"h, kk, alk, vh...",27.0,1961,2,2.0,False,False,0.666667,60.212173,25.096109,9.939839
5,1,0,0,1,E2007,Haaga,181500,6259,1,h+kk+parv,29.0,1963,2,2.0,False,False,0.500000,60.221863,24.896194,6.249120
6,1,0,0,0,G2013,Käpylä,186000,6414,1,"h, k",29.0,1943,2,1.0,False,False,0.333333,60.214163,24.950781,4.984236
7,1,0,0,1,,Kallio,189000,6517,1,h+kk+kph,29.0,1962,1,3.0,False,False,0.428571,60.184287,24.949273,1.717779
8,1,0,0,0,D2007,Mellunmäki,98000,3267,1,"h, kk, kph",30.0,1968,1,3.0,True,False,1.000000,60.237217,25.114083,12.299261
9,1,0,0,1,E,Puotinharju,138000,4452,1,h+kk+kph,31.0,1963,2,6.0,False,False,0.857143,60.212888,25.077880,9.103941


In [324]:
# Finally, save cleaned data to file
data_cleaned.to_csv('data/cleaned_data2.csv')

In [325]:
# Here we save neighborhood dict for future use
import json
with open('data/neighborhood_dictionary.json', 'w') as outfile:
    json.dump(neighborhood_dict, outfile)