In this notebook we are going to use the GeoPy API in order to obtain more useful location related information from the coordinates. This technique is called "Reverse Geocoding".

Feature Engineering Geospatial Data

# Import Libraries and Set Options

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from daftpy.daftfeanalysis import (location_dict, location_dataframe, location_engineering, 
                                   geonames_dict)

from daftpy.daftfeanalysis import missing_values

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
plt.style.use('seaborn')

## Load Data

In [4]:
sale = pd.read_csv('data_available/sale_cleaned.csv', sep=',', parse_dates=['entered_renewed', 'scraping_date'])
sale.shape

(7662, 19)

In [5]:
sale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7662 entries, 0 to 7661
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   daft_id          7662 non-null   int64         
 1   url              7662 non-null   object        
 2   name             7662 non-null   object        
 3   price            7662 non-null   float64       
 4   sale_type        7662 non-null   object        
 5   floor_area       7662 non-null   int64         
 6   psr              7646 non-null   float64       
 7   ber              7467 non-null   object        
 8   entered_renewed  7662 non-null   datetime64[ns]
 9   views            7662 non-null   float64       
 10  type_house       7175 non-null   object        
 11  type             7662 non-null   object        
 12  scraping_date    7662 non-null   datetime64[ns]
 13  description      7661 non-null   object        
 14  latitude         7662 non-null   float64

# Check Missing Values

In [6]:
# Check missing values in absolute and relative terms
missing_values(sale)

Unnamed: 0,Absolute,Relative
daft_id,0,0.0
url,0,0.0
name,0,0.0
price,0,0.0
sale_type,0,0.0
floor_area,0,0.0
psr,16,0.002088
ber,195,0.02545
entered_renewed,0,0.0
views,0,0.0


If you pay attention to the `ber` column you will note that now there are 195 missing values instead 192 as in the other notebook. That is because there was 3 ads which have `NA` in that column. We can understand `NA` value as a lack of information so the the `pd.read_csv` function has fixed that for us.

# Reverse Geocoding

We do reverse geocoding with GeoPy and Nominatim geolocator. The `location_engineering` function uses two more functions, one to creating a dictionary with the extracted information and another one to add that dictionary to the DataFrame.

In [5]:
sale = location_engineering(df=sale)

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400
6600
6800
7000
7200
7400
7600
Shape before adding: (7662, 19)
Shape after adding: (7662, 32)
----------
Difference: 13 columns


It took a long time so I decided save the resulted dataframe into a csv file.

In [6]:
sale.to_csv('data_available/sale_post_reverse_geocoding.csv', 
            sep=',', index=False)

--------------

# Load Post Reverse Geocoding Data

In [7]:
sale = pd.read_csv('data_available/sale_post_reverse_geocoding.csv', sep=',')
sale.shape

(7662, 32)

In [8]:
sale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7662 entries, 0 to 7661
Data columns (total 32 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   daft_id          7662 non-null   int64  
 1   url              7662 non-null   object 
 2   name             7662 non-null   object 
 3   price            7662 non-null   float64
 4   sale_type        7662 non-null   object 
 5   floor_area       7662 non-null   int64  
 6   psr              7646 non-null   float64
 7   ber              7467 non-null   object 
 8   entered_renewed  7662 non-null   object 
 9   views            7662 non-null   float64
 10  type_house       7175 non-null   object 
 11  type             7662 non-null   object 
 12  scraping_date    7662 non-null   object 
 13  description      7661 non-null   object 
 14  latitude         7662 non-null   float64
 15  longitude        7662 non-null   float64
 16  bedroom          7662 non-null   int64  
 17  bathroom      

# Check Missing Values

In [9]:
# Check missing values in absolute and relative terms
missing_values(sale)

Unnamed: 0,Absolute,Relative
daft_id,0,0.0
url,0,0.0
name,0,0.0
price,0,0.0
sale_type,0,0.0
floor_area,0,0.0
psr,16,0.002088
ber,195,0.02545
entered_renewed,0,0.0
views,0,0.0


As we can see there are a lot of new missing values in the data recently added. We are going to deal with them in a minut but first let's see if any UK ad managed to skip our data cleaning task when we cleaned the coordinates .

In [59]:
sale.country.value_counts()

Éire / Ireland    7661
United Kingdom       1
Name: country, dtype: int64

There it is. Let's quickly drop it.

In [60]:
sale.drop(sale[sale.country == 'United Kingdom'].index, inplace=True)
sale.country.value_counts()

Éire / Ireland    7661
Name: country, dtype: int64

# Dealing With The New Missing Values

Aislamos las variables relacionadas con la localizacion para trabajar mas comodamente

In [10]:
location_features = sale[['url', 
                          'latitude', 
                          'longitude', 
                          'country_code', 
                          'country', 
                          'postcode', 
                          'state_district', 
                          'county', 
                          'municipality', 
                          'city', 
                          'town', 
                          'city_district', 
                          'locality', 
                          'road', 
                          'house_number']].copy()

missing_values(location_features)

Unnamed: 0,Absolute,Relative
url,0,0.0
latitude,0,0.0
longitude,0,0.0
country_code,0,0.0
country,0,0.0
postcode,1208,0.157661
state_district,395,0.051553
county,695,0.090707
municipality,6391,0.834116
city,5646,0.736883


Mis coordenadas estan en grados decimales, DD.

Estrategia 1 para llenar missing values:
- Urban area = city/town
- postcode -> urban area

In [195]:
# 506 no tienen ni city ni town
location_features.loc[location_features[['city', 'town']].isna().all(axis=1)].shape

(4519, 15)

In [196]:
# 
print(location_features.loc[location_features[['city', 'town']].notna().all(axis=1), ['city', 'town']].shape)
location_features.loc[location_features[['city', 'town']].notna().all(axis=1), ['city', 'town']].head()

(6, 2)


Unnamed: 0,city,town
2113,Cork,Ballincollig
3194,Cork,Blarney
4039,Cork,Blarney
6553,The Municipal District of Adare — Rathkeale,Foynes
7000,Blarney - Macroom,Macroom


In [197]:
location_features.loc[location_features[['city', 'town', 'postcode']].isna().all(axis=1)].shape

(1095, 15)

In [198]:
location_features.postcode.isna().sum()

1215

![](imgs/eircode.png)

In [200]:
geonames_dict = geonames_dict()  # solo se puede ejecutar una vez
geonames_df = pd.DataFrame(geonames_dict)
geonames_df.head(3)

Unnamed: 0,place,code,admin1,place_coordinates
0,Ballyboughal,A41,Leinster,53.52/-6.267
1,Garristown,A42,Leinster,53.566/-6.386
2,Oldtown,A45,Leinster,53.525/-6.316


In [201]:
locna = location_features.loc[location_features[['city', 'town']].isna().all(axis=1)].copy()
locna.head()

Unnamed: 0,url,latitude,longitude,country_code,country,postcode,state_district,county,municipality,city,town,city_district,locality,road,house_number
3,https://www.daft.ie/for-sale/semi-detached-hou...,52.830996,-8.972552,ie,Éire / Ireland,V95 W893,Munster,County Clare,,,,Ennis Rural ED,,Abbey Court,91.0
4,https://www.daft.ie/new-home-for-sale/townhous...,53.043874,-7.260309,ie,Éire / Ireland,R32 DTW5,Leinster,County Laois,,,,Portlaoise Rural ED,,R445,
5,https://www.daft.ie/for-sale/apartment-20-thor...,53.403388,-6.411067,ie,Éire / Ireland,D15 X925,Leinster,,Fingal,,,Blanchardstown-Blakestown ED,,Thornbury Square,
6,https://www.daft.ie/for-sale/detached-house-90...,53.374213,-6.358932,ie,Éire / Ireland,D15 W9DH,Leinster,County Dublin,Fingal,,,Blanchardstown ED,,Castleknock Park,90.0
11,https://www.daft.ie/for-sale/semi-detached-hou...,52.702058,-8.869069,ie,Éire / Ireland,V14 PV30,Munster,County Clare,,,,Clenagh ED,,,


In [202]:
locna['postcode'].str.len().value_counts()

8.0     2968
7.0      394
10.0      22
9.0       14
3.0       13
12.0       7
6.0        2
13.0       2
4.0        1
11.0       1
Name: postcode, dtype: int64

In [203]:
locna.loc[locna.postcode.str.len() == 8, 'postcode'].sample(3)

6291    V23 EH32
5181    P25 YY72
3270    A96 CD72
Name: postcode, dtype: object

In [204]:
locna.loc[locna.postcode.str.len() == 7, 'postcode'].sample(5)

4423    A92DKR0
4063    P25A062
2561    C15X4A7
5948    R32W8C5
6468    W91XW6H
Name: postcode, dtype: object

In [205]:
locna.loc[locna.postcode.str.len() == 10, 'postcode'].sample(5)

1862    CO WICKLOW
2529    CO.ATHLONE
762     CO.ATHLONE
5418    CO.ATHLONE
7484    CO.ATHLONE
Name: postcode, dtype: object

In [206]:
locna.loc[locna.postcode.str.len() == 9, 'postcode'].sample(5)

1075    DUBLIN 18
4895    DUBLIN 18
136     DUBLIN 18
4291    DUBLIN 18
3312    DUBLIN 18
Name: postcode, dtype: object

In [207]:
locna.loc[locna.postcode.str.len() == 3, 'postcode'].sample(3)

1451    D16
4927    F12
2648    W23
Name: postcode, dtype: object

In [208]:
locna.loc[locna.postcode.str.len() == 12, 'postcode']

113     CO. KILKENNY
1542    CO WESTMEATH
2330    CO WESTMEATH
4208    CO WESTMEATH
5440    CO WESTMEATH
5543    CO. KILKENNY
7549    CO WESTMEATH
Name: postcode, dtype: object

In [209]:
locna.loc[locna.postcode.str.len() == 6, 'postcode']

404     H91 DV
6731    H91 DV
Name: postcode, dtype: object

In [210]:
locna.loc[locna.postcode.str.len() == 13, 'postcode']

6936    CO. ROSCOMMON
7067    CO. ROSCOMMON
Name: postcode, dtype: object

In [211]:
locna.loc[locna.postcode.str.len() == 4, 'postcode']

772    0000
Name: postcode, dtype: object

In [212]:
locna.loc[locna.postcode.str.len() == 11, 'postcode']

6657    CO. WICKLOW
Name: postcode, dtype: object

In [7]:
import re

def eircode_homogenize(df):

    def homogenize(eircode):   # 8, 3, 7, , dublin, 6, 9, 10
        if eircode is np.nan: pass
        elif len(eircode) == 8: pass
        elif len(eircode) == 3: pass
        elif len(eircode) == 7:
            if re.match(r'\w{3} \w{3}', eircode):
                eircode = eircode[:3]
            else:            
                routing_key = re.search(r'(\b\w{3})', eircode)[0]
                unique_identifier = re.search(r'(\w{4}\b)', eircode)[0]
                eircode = f'{routing_key} {unique_identifier}'
        elif re.match(r'DUBLIN', eircode):
            num = eircode[-2:]
            try:
                if int(num) < 10:
                    eircode = f'D0{int(num)}'
                elif int(num) < 25:
                    eircode = f'D{num}'
                else:
                    eircode = np.nan
            except:  # 6w
                eircode = f'D{num}'
        elif len(eircode) == 6: eircode = eircode[:3]
        elif (len(eircode) == 9) or (len(eircode) == 10):
            if eircode == 'CO. CLARE':
                eircode = np.nan
            elif eircode == 'CO WICKLOW':
                eircode = np.nan 
            elif re.match(r'\b\w{3}\b \b\w{2}\b \b\w{2}\b', eircode):   #D20 HK 69
                eircode = eircode[:3]
            else:
                print('8( ' * 10)
                eircode = np.nan
                print(eircode)
        else:
            print('8( ' * 10) 
            print(eircode)
            eircode = np.nan
        return eircode
    
    df['postcode'] = df['postcode'].apply(homogenize)
    #df.iloc[:, 5] = df['postcode'].apply(homogenize)
 
    return df

In [8]:
def add_location(df):
    #count = 0
    for row in df.iterrows():
        #print(type(row[1]))
        row_index = row[0]
        #print(row_index)
        if row[1]['postcode'] is not np.nan:
            #routing_key = row[1]['postcode'].split(' ')[0]
            routing_key = row[1]['postcode'][:3]
        elif row[1]['postcode'] is np.nan:
            continue
        #else:
         #   routing_key = row[1]['postcode'].split(' ')[0]
        #print(routing_key)
        #try:
        geonames_row = geonames_df[geonames_df['code'].str.contains(routing_key)]
        if len(geonames_row) != 0:
            for column in geonames_df:
                #print(column)
                #print(df.loc[row_index, column])
                #print(geonames_row[column].values)
                df.loc[row_index, column] = geonames_row[column].values[0]
        #print(geonames_row)
        #except:
         #   print(geonames_row.index, '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        else: continue
        
        
        #for column in geonames_df:
         #   #print(column)
          #  #print(df.loc[row_index, column])
           # #print(geonames_row[column].values)
            #df.loc[row_index, column] = geonames_row[column].values#[0]
        
        #count += 1
        #print(count) 
    #print(count)    
    return df

In [215]:
print(locna.shape)
locna = eircode_homogenize(locna)
print(locna.shape)

(4519, 15)
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO. KILKENNY
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
0000
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO WESTMEATH
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO WESTMEATH
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO WESTMEATH
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO WESTMEATH
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO. KILKENNY
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO. WICKLOW
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO. ROSCOMMON
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO. ROSCOMMON
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 


In [216]:
locna['postcode'].str.len().value_counts()

8.0    3358
3.0      32
Name: postcode, dtype: int64

In [217]:
print(locna.shape)
add_location(locna)
print(locna.shape)

(4519, 15)
(4519, 19)


In [218]:
locna.isna().sum()

url                     0
latitude                0
longitude               0
country_code            0
country                 0
postcode             1129
state_district        287
county                230
municipality         3939
city                 4519
town                 4519
city_district          45
locality             4514
road                 1621
house_number         4118
place                1149
code                 1149
admin1               1149
place_coordinates    1149
dtype: int64

In [219]:
locna.city_district.value_counts().head()#.count()

Glencullen ED           56
Kilcoole ED             32
Navan Rural ED          31
Enniscorthy Rural ED    27
Thurles Urban           26
Name: city_district, dtype: int64

In [220]:
location_features.shape

(7692, 15)

In [221]:
print(location_features.shape)
location_features = eircode_homogenize(location_features)
print(location_features.shape)

(7692, 15)
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
9
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO. KILKENNY
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
D5
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
9
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
8
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
4
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
D5
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
D5
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
0000
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
4
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
3
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
D5
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
1
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO WESTMEATH
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO WESTMEATH
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
8
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
8
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
D5
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
1
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
22
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
4
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
1
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
1
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO WESTMEATH
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
4
8( 8( 

In [223]:
print(location_features.shape)
add_location(location_features)
print(location_features.shape)

(7692, 15)
(7692, 19)


In [224]:
location_features.isna().sum()

url                     0
latitude                0
longitude               0
country_code            0
country                 0
postcode             1302
state_district        368
county                701
municipality         6402
city                 5648
town                 6557
city_district         377
locality             7687
road                 2003
house_number         6223
place                1372
code                 1372
admin1               1372
place_coordinates    1372
dtype: int64

In [228]:
location_features.state_district.value_counts()

Leinster    4162
Munster     2156
Connacht    1006
Name: state_district, dtype: int64

In [246]:
#location_features.loc[location_features.city.notna(), ['postcode','city','town','place']]

In [252]:
#location_features.loc[location_features.place.isna(), ['city','town','place','postcode','latitude','longitude']]

----------------------

In [11]:
sale_data = pd.read_csv('data_available/sale_data_post_geos_eng.csv', sep=',')
sale_data.shape

(7695, 28)

In [12]:
sale_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7695 entries, 0 to 7694
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   daft_id          7695 non-null   int64  
 1   url              7695 non-null   object 
 2   name             7695 non-null   object 
 3   price            7695 non-null   float64
 4   sale_type        7695 non-null   object 
 5   floor_area       7695 non-null   int64  
 6   entered_renewed  7695 non-null   object 
 7   views            7695 non-null   float64
 8   type_house       7695 non-null   object 
 9   type             7695 non-null   object 
 10  scraping_date    7695 non-null   object 
 11  latitude         7695 non-null   float64
 12  longitude        7695 non-null   float64
 13  bedroom          7695 non-null   int64  
 14  bathroom         7695 non-null   int64  
 15  country_code     7695 non-null   object 
 16  country          7695 non-null   object 
 17  postcode      

In [13]:
print(sale_data.shape)
sale_data = eircode_homogenize(sale_data)
print(sale_data.shape)

(7695, 28)
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
9
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO. KILKENNY
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
D5
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
9
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
8
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
4
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
D5
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
D5
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
0000
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
4
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
3
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
D5
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
1
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO WESTMEATH
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
nan
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO WESTMEATH
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
8
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
8
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
D5
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
1
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
22
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
4
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
1
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
1
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
CO WESTMEATH
8( 8( 8( 8( 8( 8( 8( 8( 8( 8( 
4
8( 8( 

In [15]:
import requests
import lxml.html as lh

def geonames_dict():
    
    url = 'http://www.geonames.org/postalcode-search.html?q=&country=IE'
    page = requests.get(url)
    doc = lh.fromstring(page.content) 
    tr_elements = doc.xpath('//tr')
    
    #Create empty dict
    col = {}
    #For each row, store each first element (header) and an empty list
    for i, t in enumerate(tr_elements[2]):
        key = t.text_content().lower()
        #print('%d: "%s"'%(i,name))
        col[key] = [] 
    col['place_coordinates'] = []

    # Fill dict
    #print(tr_elements[-1].text_content())
    for tr in tr_elements[3:]:
        
        if len(tr) == 7:
   
            for key, td in zip(col, tr):
                td = td.text_content()
                #print(td)
                col[key].append(td)
        elif len(tr) == 2:
        
            td = tr[-1].text_content()
            #print(td)
            col['place_coordinates'].append(td)
            
    del col['']
    del col['country']
    del col['admin2']
    del col['admin3']

    return col

In [16]:
geonames_dict = geonames_dict()  # solo se puede ejecutar una vez
geonames_df = pd.DataFrame(geonames_dict)
geonames_df.head(3)

Unnamed: 0,place,code,admin1,place_coordinates
0,Ballyboughal,A41,Leinster,53.52/-6.267
1,Garristown,A42,Leinster,53.566/-6.386
2,Oldtown,A45,Leinster,53.525/-6.316


In [17]:
print(sale_data.shape)
add_location(sale_data)
print(sale_data.shape)

(7695, 28)
(7695, 32)


In [20]:
sale_data.isna().sum() / sale_data.shape[0] *100

daft_id               0.000000
url                   0.000000
name                  0.000000
price                 0.000000
sale_type             0.000000
floor_area            0.000000
entered_renewed       0.000000
views                 0.000000
type_house            0.000000
type                  0.000000
scraping_date         0.000000
latitude              0.000000
longitude             0.000000
bedroom               0.000000
bathroom              0.000000
country_code          0.000000
country               0.000000
postcode             16.920078
state_district        4.821313
county                9.109812
municipality         83.235867
city                 73.437297
town                 85.237167
city_district         4.938272
locality             99.935023
suburb               76.855101
road                 26.029890
house_number         80.909682
place                17.868746
code                 17.868746
admin1               17.868746
place_coordinates    17.868746
dtype: f

In [21]:
sale_data.to_csv('data_available/sale_data_geosp_improved.csv', sep=',', index=False)

### Assing Coordinates to Different Clusters

In [None]:
from sklearn.cluster import KMeans

def cluster(data):
    '''
    input: dataframe containing Latitude(x) and Longitude(y) coordinates
    output: series of cluster labels that each row of coordinates belongs to.
    '''
    model = KMeans(n_clusters=50)
    labels = model.fit_predict(data)
    return labels

#### Visualization

In [None]:
import matplotlib.pyplot as plt
def visualize(data):
    '''
    input: dataframe containing Latitude(x) and Longitude(y)
    '''
    plt.scatter(data['longitude'], data['latitude'], color='blue', s=1, alpha=0.1)
    #data[['longitude','latitude']].scatter(color='blue', s=1, alpha=0.1)
    plt.title('Visualization of Latitude and Longitude')
    plt.ylabel('latitude')
    plt.xlabel('longitude')
    #plt.ylim([53.350, 53.325])
    #plt.xlim([-6.24, -6.23])
    plt.show()

In [None]:
visualize(sale_data)

### Reverse-Geocode

In [None]:
import reverse_geocode

In [None]:
coordinates = (54.276114, -8.506455), (54.276114, -8.506455)
reverse_geocode.search(coordinates)

### Polar Coordinates

In [None]:
# Converting Cartesian Coordinate to Polar Coordinate
# Importing math library
import math

# Reading cartesian coordinate
x = sale_data['latitude']
y = sale_data['longitude']

# Converting cartesian to polar coordinate
# Calculating radius
#radius = math.sqrt( x * x + y * y )
radius = ((x * x) + (y * y)) ** (1 / 2)
# Calculating angle (theta) in radian
theta = np.arctan(y / x)
# Converting theta from radian to degree
theta = 180 * theta / math.pi

# Displaying polar coordinates
#print('Polar coordinate is: (radius = %0.2f,theta = %0.2f)' %(radius, theta))

In [None]:
plt.scatter(theta, radius, color='red', s=1);#alpha=0.1

### Rotational Cartesian Coordinates

In [None]:
def rotation(df):
    rot_45_x = (0.707 * df['latitude']) + (0.707 * df['longitude'])
    rot_45_y = (0.707 * df['longitude']) + (0.707 * df['latitude'])
    rot_30_x = (0.866 * df['latitude']) + (0.5 * df['longitude'])
    rot_30_y = (0.866 * df['longitude']) + (0.5 * df['latitude'])
    return rot_45_x, rot_45_y, rot_30_x, rot_30_y

In [None]:
rot_45_x, rot_45_y, rot_30_x, rot_30_y = rotation(sale_data)

In [None]:
plt.scatter(rot_45_x, rot_45_y);

In [None]:
plt.scatter(rot_30_x, rot_30_y);

### Haversine Distance

In [None]:
def haversine_dist(lat1,lng1,lat2,lng2):
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    radius = 6371  # Earth's radius taken from google
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat/2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng/2) ** 2
    h = 2 * radius * np.arcsin(np.sqrt(d))
    return h

### Manhattan Distance

In [None]:
def manhattan_dist(lat1, lng1, lat2, lng2):
    '''
    calculating two haversine distances by,
     - avoiding Latitude of one point 
     - avoiding Longitude of one point
    and adding it together.
    '''
    a = haversine_dist(lat1, lng1, lat1, lng2)
    b = haversine_dist(lat1, lng1, lat2, lng1)
    return a + b

### Bearing Degree

In [None]:
def bearing_degree(lat1, lng1, lat2, lng2):
    '''
    calculate angle between two points
    '''
    radius = 6371  # Mean radius of Earth
    diff_lng = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(diff_lng) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(diff_lng)
    return np.degrees(np.arctan2(y, x))

### Rotational Coordinates using PCA

Here, the idea is only to rotate them which would help decision tree splits in typical tree-based models.

In [None]:
from sklearn.decomposition import PCA
def pca(data):
    '''
    input: dataframe containing Latitude(x) and Longitude(y)
    '''
    coordinates = data[['x','y']].values
    pca_obj = PCA().fit(coordinates)
    pca_x = pca_obj.transform(data[['x', 'y']])[:,0]
    pca_y = pca_obj.transform(data[['x', 'y']])[:,1]
    return pca_x, pca_y