In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import os
from supervised.automl import AutoML
import category_encoders as ce
import math

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from category_encoders import TargetEncoder
from sklearn.metrics import median_absolute_error
print('Imported sci-kit learn and AutoML libraries!')

Imported sci-kit learn and AutoML libraries!


# UK Feature Engineering

Here we will follow a process similar to the one used for Dublin and Cork city:

- we will geocode each data point and remove any data point we cannot geocode successfully
- we will then track the importance of locational attributes, such as distance to the city centre
- once this is done, we will export this dataset for more prediction trials, to see whether it improves our results and to understand house price determinants in a large city outside of Ireland.

In [2]:
# Leeds property-sale extracts, read from two CSV files. The saved row-label
# column ('Unnamed: 0') becomes the index of each frame.
irl, irl1 = (
    pd.read_csv(csv_name, index_col='Unnamed: 0')
    for csv_name in ('leeds-data-1417.csv', 'leeds-data-1821.csv')
)

In [3]:
# Stack the two extracts row-wise. Each frame keeps its own index labels, so
# labels are not renumbered here.
irl2 = pd.concat([irl, irl1], axis=0)

In [4]:
# Work on an independent copy so irl2 keeps the untouched concatenation.
irl = irl2.copy()

In [5]:
# Preview the first rows of the combined frame.
irl.head()

Unnamed: 0,Transaction unique identifier,Price,Postcode,Property Type,Old/New,Duration,HouseNum,Add1,Add2,Add3,Town/City,District,County,PPDCategory Type,Month,Year
465,{50F18103-63C0-9FD5-E050-A8C063054923},210000,LS25 6NJ,D,N,F,6,,BOND INGS RISE,SHERBURN IN ELMET,LEEDS,SELBY,NORTH YORKSHIRE,A,5,2017
959,{50F18103-63D7-9FD5-E050-A8C063054923},180000,LS25 6BN,D,N,F,1,,RUDSTONE GROVE,SHERBURN IN ELMET,LEEDS,SELBY,NORTH YORKSHIRE,A,4,2017
2520,{68FEB20C-3CBE-38DA-E053-6C04A8C051AE},142500,LS19 7GL,F,Y,L,61,,DYEHOUSE WALK,YEADON,LEEDS,LEEDS,WEST YORKSHIRE,A,12,2017
2536,{68FEB20C-3CCF-38DA-E053-6C04A8C051AE},102000,LS18 5NP,F,N,L,GRESLEY HOUSE,2.0,SUSSEX AVENUE,HORSFORTH,LEEDS,LEEDS,WEST YORKSHIRE,A,12,2017
2539,{68FEB20C-3CD2-38DA-E053-6C04A8C051AE},250000,LS19 7FU,S,Y,F,3,,WEAVERS BECK WAY,YEADON,LEEDS,LEEDS,WEST YORKSHIRE,A,9,2017


In [6]:
# (rows, columns) of the combined frame before any rows are dropped.
irl.shape

(41033, 16)

In [7]:
#irl.drop(columns = ['Transaction unique identifier', 'Town/City', 'Add1', 'HouseNum'], inplace = True)

In [8]:
# True if at least one row has no Postcode value.
irl['Postcode'].isnull().values.any()

True

In [9]:
# Discard sales without a postcode: the postcode is the text sent to the geocoder.
irl = irl.dropna(subset=['Postcode'])

In [10]:
# Re-check the shape to see how many rows the postcode filter removed.
irl.shape

(40995, 16)

In [11]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2, radius_km=6371):
    """
    Calculate the great circle distance between two points
    on a sphere (specified in decimal degrees).

    Note the argument order: longitude first, then latitude, for both points.

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    radius_km : float, optional
        Radius of the sphere. The default, 6371, is the radius of the earth
        in kilometers, so the result is in kilometers.

    Returns
    -------
    float
        Distance along the surface, in the same unit as ``radius_km``.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    return radius_km * c

In [14]:
from geopy.geocoders import Nominatim
import time
# Nominatim geocoder client used for every postcode lookup in this notebook.
# `user_agent` is the name this project announces to the service.
# NOTE(review): timeout=None presumably disables geopy's client-side timeout so
# slow responses are waited for rather than raising — confirm against the
# installed geopy version.
geolocator = Nominatim(user_agent = "fypLeeds-t", timeout = None)

In [15]:
# The public Nominatim service allows at most about one request per second, so
# firing one unthrottled request per row risks being throttled or blocked.
# Many sales also share a postcode, so each distinct postcode is looked up
# only once and the result is reused for every matching row.
GEOCODE_DELAY_SECONDS = 1.3  # same pause as the commented-out loop in this notebook


def geocode_postcode(postcode):
    """Geocode one postcode with `geolocator`, pausing first to space requests out.

    Returns whatever `geolocator.geocode` returns; errors raised by the
    geocoder propagate unchanged.
    """
    time.sleep(GEOCODE_DELAY_SECONDS)
    return geolocator.geocode(postcode)


# Assumes rows with a missing Postcode have already been dropped.
postcode_locations = {
    postcode: geocode_postcode(postcode) for postcode in irl['Postcode'].unique()
}
# Use the lookup as a callable (dict.get) so each row receives the stored
# result object for its postcode as-is.
irl['geocode'] = irl['Postcode'].map(postcode_locations.get)

In [16]:
## DON'T RERUN! Earlier row-by-row geocoding attempt: it pauses 1.3 s per postcode, so a full pass takes hours.

#lat = []
#long = []
#geo = []
#for row in irl['Postcode']:
#    time.sleep(1.3)
#    addr = geolocator.geocode(row, timeout = None)
#    if addr.latitude != None:
#        latitude = addr.latitude
#        longitude = addr.longitude
#    else:
#        lat.append('None')
#        long.append('None')
    
#    lat.append(latitude)
#    long.append(longitude)

In [17]:
#lat

In [19]:
#irl['Latitude'] = lat
#irl['Longitude'] = long

In [41]:
# Keep only the rows the geocoder could resolve.
# `irl['geocode'] != None` is evaluated element-wise by pandas and comes out
# True for every row (missing results included), so it filtered nothing.
# `notna()` is the test for missing values.
irl = irl.loc[irl['geocode'].notna()]
irl.head()

Unnamed: 0,Transaction unique identifier,Price,Postcode,Property Type,Old/New,Duration,HouseNum,Add1,Add2,Add3,Town/City,District,County,PPDCategory Type,Month,Year,geocode
465,{50F18103-63C0-9FD5-E050-A8C063054923},210000,LS25 6NJ,D,N,F,6,,BOND INGS RISE,SHERBURN IN ELMET,LEEDS,SELBY,NORTH YORKSHIRE,A,5,2017,"(Sherburn in Elmet, Little Fenton, Selby, Nort..."
959,{50F18103-63D7-9FD5-E050-A8C063054923},180000,LS25 6BN,D,N,F,1,,RUDSTONE GROVE,SHERBURN IN ELMET,LEEDS,SELBY,NORTH YORKSHIRE,A,4,2017,"(Sherburn in Elmet, Little Fenton, Selby, Nort..."
2520,{68FEB20C-3CBE-38DA-E053-6C04A8C051AE},142500,LS19 7GL,F,Y,L,61,,DYEHOUSE WALK,YEADON,LEEDS,LEEDS,WEST YORKSHIRE,A,12,2017,"(Leeds, West Yorkshire, Yorkshire and the Humb..."
2536,{68FEB20C-3CCF-38DA-E053-6C04A8C051AE},102000,LS18 5NP,F,N,L,GRESLEY HOUSE,2.0,SUSSEX AVENUE,HORSFORTH,LEEDS,LEEDS,WEST YORKSHIRE,A,12,2017,"(Horsforth, Leeds, West Yorkshire, Yorkshire a..."
2539,{68FEB20C-3CD2-38DA-E053-6C04A8C051AE},250000,LS19 7FU,S,Y,F,3,,WEAVERS BECK WAY,YEADON,LEEDS,LEEDS,WEST YORKSHIRE,A,9,2017,"(Leeds, West Yorkshire, Yorkshire and the Humb..."


In [34]:
# Renumber the rows 0..n-1 so they can be addressed by position.
# NOTE: this rebinds `irl1`, which earlier held the second CSV extract.
# drop=True discards the old index directly, instead of turning it into a
# column and deleting it afterwards (which only works while the index is unnamed).
irl1 = irl.reset_index(drop=True)
irl1.head()

Unnamed: 0,Transaction unique identifier,Price,Postcode,Property Type,Old/New,Duration,HouseNum,Add1,Add2,Add3,Town/City,District,County,PPDCategory Type,Month,Year,geocode
0,{50F18103-63C0-9FD5-E050-A8C063054923},210000,LS25 6NJ,D,N,F,6,,BOND INGS RISE,SHERBURN IN ELMET,LEEDS,SELBY,NORTH YORKSHIRE,A,5,2017,"(Sherburn in Elmet, Little Fenton, Selby, Nort..."
1,{50F18103-63D7-9FD5-E050-A8C063054923},180000,LS25 6BN,D,N,F,1,,RUDSTONE GROVE,SHERBURN IN ELMET,LEEDS,SELBY,NORTH YORKSHIRE,A,4,2017,"(Sherburn in Elmet, Little Fenton, Selby, Nort..."
2,{68FEB20C-3CBE-38DA-E053-6C04A8C051AE},142500,LS19 7GL,F,Y,L,61,,DYEHOUSE WALK,YEADON,LEEDS,LEEDS,WEST YORKSHIRE,A,12,2017,"(Leeds, West Yorkshire, Yorkshire and the Humb..."
3,{68FEB20C-3CCF-38DA-E053-6C04A8C051AE},102000,LS18 5NP,F,N,L,GRESLEY HOUSE,2.0,SUSSEX AVENUE,HORSFORTH,LEEDS,LEEDS,WEST YORKSHIRE,A,12,2017,"(Horsforth, Leeds, West Yorkshire, Yorkshire a..."
4,{68FEB20C-3CD2-38DA-E053-6C04A8C051AE},250000,LS19 7FU,S,Y,F,3,,WEAVERS BECK WAY,YEADON,LEEDS,LEEDS,WEST YORKSHIRE,A,9,2017,"(Leeds, West Yorkshire, Yorkshire and the Humb..."


In [36]:
# Split each geocoding result into parallel latitude / longitude lists.
# Rows without a result get the *string* 'None' (not NaN) in both lists.
lat, long = [], []
for location in irl1['geocode']:
    if location is None:
        lat.append('None')
        long.append('None')
        continue
    lat.append(location.latitude)
    long.append(location.longitude)

In [32]:
# Spot check: latitude of the geocoding result stored under index label 465.
irl['geocode'][465].latitude

53.79569

In [37]:
# First extracted latitude.
lat[0]

53.79569

In [38]:
# Attach the coordinate lists as new columns (Latitude first, then Longitude).
irl1 = irl1.assign(Latitude=lat, Longitude=long)

In [40]:
# Preview the frame, now including the Latitude / Longitude columns.
irl1.head()

Unnamed: 0,Transaction unique identifier,Price,Postcode,Property Type,Old/New,Duration,HouseNum,Add1,Add2,Add3,Town/City,District,County,PPDCategory Type,Month,Year,geocode,Latitude,Longitude
0,{50F18103-63C0-9FD5-E050-A8C063054923},210000,LS25 6NJ,D,N,F,6,,BOND INGS RISE,SHERBURN IN ELMET,LEEDS,SELBY,NORTH YORKSHIRE,A,5,2017,"(Sherburn in Elmet, Little Fenton, Selby, Nort...",53.79569,-1.24147
1,{50F18103-63D7-9FD5-E050-A8C063054923},180000,LS25 6BN,D,N,F,1,,RUDSTONE GROVE,SHERBURN IN ELMET,LEEDS,SELBY,NORTH YORKSHIRE,A,4,2017,"(Sherburn in Elmet, Little Fenton, Selby, Nort...",53.79803,-1.25588
2,{68FEB20C-3CBE-38DA-E053-6C04A8C051AE},142500,LS19 7GL,F,Y,L,61,,DYEHOUSE WALK,YEADON,LEEDS,LEEDS,WEST YORKSHIRE,A,12,2017,"(Leeds, West Yorkshire, Yorkshire and the Humb...",53.85906,-1.68457
3,{68FEB20C-3CCF-38DA-E053-6C04A8C051AE},102000,LS18 5NP,F,N,L,GRESLEY HOUSE,2.0,SUSSEX AVENUE,HORSFORTH,LEEDS,LEEDS,WEST YORKSHIRE,A,12,2017,"(Horsforth, Leeds, West Yorkshire, Yorkshire a...",53.84793,-1.63152
4,{68FEB20C-3CD2-38DA-E053-6C04A8C051AE},250000,LS19 7FU,S,Y,F,3,,WEAVERS BECK WAY,YEADON,LEEDS,LEEDS,WEST YORKSHIRE,A,9,2017,"(Leeds, West Yorkshire, Yorkshire and the Humb...",53.85909,-1.6838


In [42]:
#irl1.to_csv('leedsallgeo.csv')

In [None]:
### Data successfully geocoded; next, add features derived from these coordinates