In [65]:
# Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import widgets, interact

# Step 1: Read the CSV
file_path = '../data/processed/saved_scrapes/mynest_myhome_20240924.csv'
df = pd.read_csv(file_path)

### Step 2: Clean the 'Asking Price' column (Remove currency symbols, commas, etc.)
# Remove any non-numeric characters (commas, currency symbols)
df['Asking Price'] = df['Asking Price'].replace({'[^0-9.]': ''}, regex=True)

### Convert 'Asking Price' and 'MyHome_Floor_Area_Value' to Numeric
df['Asking Price'] = pd.to_numeric(df['Asking Price'], errors='coerce')  # Convert to numeric, invalid values become NaN
df['MyHome_Floor_Area_Value'] = pd.to_numeric(df['MyHome_Floor_Area_Value'], errors='coerce')  # Convert to numeric

### Safe Division: Add Square Metres (Handling NaNs and Division by Zero)
def safe_divide(row):
    if pd.isna(row['Asking Price']) or pd.isna(row['MyHome_Floor_Area_Value']) or row['MyHome_Floor_Area_Value'] == 0:
        return None  # Return None if the value is NaN or the denominator is zero
    else:
        return row['Asking Price'] / row['MyHome_Floor_Area_Value']

# Apply the safe divide function to the DataFrame
df['price_per_square_meter'] = df.apply(safe_divide, axis=1)

# Step 3: Display the DataFrame
df.head(3)  # Displays the first few rows


Unnamed: 0,Address,Asking Price,Beds,Baths,Property Type,Energy Rating,Eircode,Local Property Tax,Agency Name,Agency Contact,...,MyHome_Latitude,MyHome_Longitude,MyHome_Monthly_Price,MyHome_Floor_Area_Unit,MyHome_Publish_Date,MyHome_Sale_Type,MyHome_Category,MyHome_Featured_Level,MyHome_Link,price_per_square_meter
0,"Taramar, Middle Third, Dublin 5, D05X8N9",750000,4 Bed,1 Bath,End of Terrace,E2,D05 X8N9,€765,Hamill Estate Agents & Valuers,Hamill Estate Agents & Valuers,...,53.373857,-6.203371,0.0,m²,,,,,https://www.myhome.ie/residential/brochure/tar...,5281.690141
1,"87 Haddington Road, Dublin 4, D04WP23",990000,5 Bed,3 Bath,Terrace,SI_666,D04 WP23,"€1,035",Turley Property Advisors,Susan Turley,...,53.335385,-6.239842,0.0,m²,,,,,https://www.myhome.ie/residential/brochure/87-...,5657.142857
2,"Apartment 79, The Northumberlands, Love Lane E...",410000,2 Bed,1 Bath,Apartment,C1,D02 X068,€405,Owen Reilly,Owen Reilly Sales,...,53.349805,-6.26031,0.0,m²,,,,,https://www.myhome.ie/residential/brochure/79-...,6406.25


In [66]:
import pandas as pd

# Subset of columns, ensuring column name is consistent
columns_subset = ['Address', 'Asking Price', 'Beds', 'Baths', 'Property Type',
                  'Energy Rating', 'Eircode', 'Agency Name', 
                  'MyHome_Latitude', 'MyHome_Longitude', 'price_per_square_meter',
                  'Price Changes', 'MyHome_Floor_Area_Value']

# Subsetting the DataFrame
df_subset = df[columns_subset]

# Filtering rows where 'price_per_square_metre' is greater than 0 (ensure consistent spelling)
df_filtered = df_subset[df_subset['price_per_square_meter'] > 0]

# Display the filtered DataFrame
df_filtered
df = df_filtered
df.head()


Unnamed: 0,Address,Asking Price,Beds,Baths,Property Type,Energy Rating,Eircode,Agency Name,MyHome_Latitude,MyHome_Longitude,price_per_square_meter,Price Changes,MyHome_Floor_Area_Value
0,"Taramar, Middle Third, Dublin 5, D05X8N9",750000,4 Bed,1 Bath,End of Terrace,E2,D05 X8N9,Hamill Estate Agents & Valuers,53.373857,-6.203371,5281.690141,"Sold, €950,000, Fri Sep 13 2024; Sale Agreed, ...",142.0
1,"87 Haddington Road, Dublin 4, D04WP23",990000,5 Bed,3 Bath,Terrace,SI_666,D04 WP23,Turley Property Advisors,53.335385,-6.239842,5657.142857,"Sold, €1,010,000, Fri Sep 13 2024; Sale Agreed...",175.0
2,"Apartment 79, The Northumberlands, Love Lane E...",410000,2 Bed,1 Bath,Apartment,C1,D02 X068,Owen Reilly,53.349805,-6.26031,6406.25,"Sold, €480,000, Fri Sep 13 2024; Sale Agreed, ...",64.0
3,"7 Parkside Heath, Clongriffin, Dublin 13, Dubl...",535000,3 Bed,3 Bath,Terrace,A3,D13 WN3C,Sherry FitzGerald Sutton,53.407653,-6.163418,4734.513274,"Sold, €569,000, Fri Sep 13 2024; Unlisted, €53...",113.0
6,"31 Tibradden Grove, Dublin 12, D12P2X4",355000,3 Bed,1 Bath,Terrace,D1,D12 P2X4,Byrne and Moore Property Consultants Limited,53.308228,-6.34192,3349.056604,"Sold, €390,000, Fri Sep 13 2024; Sale Agreed, ...",106.0


In [69]:
import pandas as pd
import re

def get_sold_price_and_date(price_changes: str):
    """
    Extracts the sold asking price and the date sold from the Price Changes column.
    
    Args:
    price_changes (str): The value from the Price Changes column.
    
    Returns:
    tuple: A tuple containing the sold price as a numeric value and the sold date as a string, or (None, None) if not found.
    """
    if isinstance(price_changes, str):  # Ensure the input is a string
        # Updated regex pattern to capture the "Sold" price and date
        match = re.search(r"Sold, €([0-9,]+), [A-Za-z]{3} ([A-Za-z]{3} \d{2} \d{4})", price_changes)
        
        if match:
            # Extract the price, remove any commas, and convert to float
            sold_price = float(match.group(1).replace(',', ''))
            sold_date = match.group(2)  # Extract the date without the day of the week
            return sold_price, sold_date
    
    return None, None

# Apply the function to extract and convert the sold price and sold date for each row using .loc
df.loc[:, 'Sold Asking Price'], df.loc[:, 'Sold Date'] = zip(*df['Price Changes'].apply(get_sold_price_and_date))

# Display the DataFrame
df.head()


Unnamed: 0,Address,Asking Price,Beds,Baths,Property Type,Energy Rating,Eircode,Agency Name,MyHome_Latitude,MyHome_Longitude,price_per_square_meter,Price Changes,MyHome_Floor_Area_Value,Sold Asking Price,Sold Date
0,"Taramar, Middle Third, Dublin 5, D05X8N9",750000,4 Bed,1 Bath,End of Terrace,E2,D05 X8N9,Hamill Estate Agents & Valuers,53.373857,-6.203371,5281.690141,"Sold, €950,000, Fri Sep 13 2024; Sale Agreed, ...",142.0,950000.0,Sep 13 2024
1,"87 Haddington Road, Dublin 4, D04WP23",990000,5 Bed,3 Bath,Terrace,SI_666,D04 WP23,Turley Property Advisors,53.335385,-6.239842,5657.142857,"Sold, €1,010,000, Fri Sep 13 2024; Sale Agreed...",175.0,1010000.0,Sep 13 2024
2,"Apartment 79, The Northumberlands, Love Lane E...",410000,2 Bed,1 Bath,Apartment,C1,D02 X068,Owen Reilly,53.349805,-6.26031,6406.25,"Sold, €480,000, Fri Sep 13 2024; Sale Agreed, ...",64.0,480000.0,Sep 13 2024
3,"7 Parkside Heath, Clongriffin, Dublin 13, Dubl...",535000,3 Bed,3 Bath,Terrace,A3,D13 WN3C,Sherry FitzGerald Sutton,53.407653,-6.163418,4734.513274,"Sold, €569,000, Fri Sep 13 2024; Unlisted, €53...",113.0,569000.0,Sep 13 2024
6,"31 Tibradden Grove, Dublin 12, D12P2X4",355000,3 Bed,1 Bath,Terrace,D1,D12 P2X4,Byrne and Moore Property Consultants Limited,53.308228,-6.34192,3349.056604,"Sold, €390,000, Fri Sep 13 2024; Sale Agreed, ...",106.0,390000.0,Sep 13 2024


In [70]:
# Check distinct values of property type
distinct_values = df['Property Type'].unique()
distinct_values

array(['End of Terrace', 'Terrace', 'Apartment', 'Semi-D', 'Detached',
       'Duplex', 'Bungalow', 'Studio', 'Townhouse', 'Houses'],
      dtype=object)

In [71]:
# List of property types to filter for (houses)
house_types = ['End of Terrace', 'Terrace', 'Apartment', 'Semi-D', 'Detached', 
               'Duplex', 'Bungalow', 'Studio', 'Townhouse', 'Houses']

# Filter rows where 'property_type' is in the house_types list
house_rows = df[df['Property Type'].isin(house_types)]
house_rows.head()

Unnamed: 0,Address,Asking Price,Beds,Baths,Property Type,Energy Rating,Eircode,Agency Name,MyHome_Latitude,MyHome_Longitude,price_per_square_meter,Price Changes,MyHome_Floor_Area_Value,Sold Asking Price,Sold Date
0,"Taramar, Middle Third, Dublin 5, D05X8N9",750000,4 Bed,1 Bath,End of Terrace,E2,D05 X8N9,Hamill Estate Agents & Valuers,53.373857,-6.203371,5281.690141,"Sold, €950,000, Fri Sep 13 2024; Sale Agreed, ...",142.0,950000.0,Sep 13 2024
1,"87 Haddington Road, Dublin 4, D04WP23",990000,5 Bed,3 Bath,Terrace,SI_666,D04 WP23,Turley Property Advisors,53.335385,-6.239842,5657.142857,"Sold, €1,010,000, Fri Sep 13 2024; Sale Agreed...",175.0,1010000.0,Sep 13 2024
2,"Apartment 79, The Northumberlands, Love Lane E...",410000,2 Bed,1 Bath,Apartment,C1,D02 X068,Owen Reilly,53.349805,-6.26031,6406.25,"Sold, €480,000, Fri Sep 13 2024; Sale Agreed, ...",64.0,480000.0,Sep 13 2024
3,"7 Parkside Heath, Clongriffin, Dublin 13, Dubl...",535000,3 Bed,3 Bath,Terrace,A3,D13 WN3C,Sherry FitzGerald Sutton,53.407653,-6.163418,4734.513274,"Sold, €569,000, Fri Sep 13 2024; Unlisted, €53...",113.0,569000.0,Sep 13 2024
6,"31 Tibradden Grove, Dublin 12, D12P2X4",355000,3 Bed,1 Bath,Terrace,D1,D12 P2X4,Byrne and Moore Property Consultants Limited,53.308228,-6.34192,3349.056604,"Sold, €390,000, Fri Sep 13 2024; Sale Agreed, ...",106.0,390000.0,Sep 13 2024


In [82]:
import pandas as pd
import numpy as np

# Assuming df = house_rows is already available
# For sold date assume that we just use everything in the dataset for now. 

# Columns for model fitting
columns_subset_training = ['Beds', 'Baths', 'MyHome_Floor_Area_Value', 'Sold Asking Price', 'MyHome_Latitude', 'MyHome_Longitude'] #'Sold Date',

# Subsetting the DataFrame
df_subset = df[columns_subset_training].copy()  # Use .copy() to avoid SettingWithCopyWarning

# Extract numeric part from 'Beds' and 'Baths' and convert to integer
df_subset['Beds'] = df_subset['Beds'].str.extract('(\d+)').astype(float)  # Extract and convert to float
df_subset['Baths'] = df_subset['Baths'].str.extract('(\d+)').astype(float)  # Extract and convert to float

# Check the updated DataFrame
df_subset.head()


Unnamed: 0,Beds,Baths,MyHome_Floor_Area_Value,Sold Asking Price,MyHome_Latitude,MyHome_Longitude
0,4.0,1.0,142.0,950000.0,53.373857,-6.203371
1,5.0,3.0,175.0,1010000.0,53.335385,-6.239842
2,2.0,1.0,64.0,480000.0,53.349805,-6.26031
3,3.0,3.0,113.0,569000.0,53.407653,-6.163418
6,3.0,1.0,106.0,390000.0,53.308228,-6.34192


In [95]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Haversine formula to calculate the distance between two points
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])  # Convert degrees to radians
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# Function to combine Haversine distance and KNN distance into a final score
def calculate_combined_score(df_subset, given_lat, given_lon, given_beds, given_baths, given_size, geo_weight=0.5):
    # Ensure the coordinates are numeric
    df_subset['MyHome_Latitude'] = pd.to_numeric(df_subset['MyHome_Latitude'], errors='coerce')
    df_subset['MyHome_Longitude'] = pd.to_numeric(df_subset['MyHome_Longitude'], errors='coerce')
    
    # Drop rows with missing values in the relevant columns
    df_subset = df_subset.dropna(subset=['MyHome_Floor_Area_Value', 'Beds', 'Baths', 'MyHome_Latitude', 'MyHome_Longitude'])
    
    # Step 1: Calculate Haversine distance using .loc[] to avoid warnings
    df_subset.loc[:, 'Geo Distance (km)'] = df_subset.apply(lambda row: haversine_distance(given_lat, given_lon, row['MyHome_Latitude'], row['MyHome_Longitude']), axis=1)

    # Step 2: Apply KNN based on other variables (Beds, Baths, Floor Area)
    knn_features = df_subset[['MyHome_Floor_Area_Value', 'Beds', 'Baths']].copy()  # Use only numeric features
    
    # Combine input values (for comparison) with the dataset
    target_features = pd.DataFrame([[given_size, given_beds, given_baths]], columns=['MyHome_Floor_Area_Value', 'Beds', 'Baths'])
    
    # Concatenate the target with the existing dataset
    combined_features = pd.concat([knn_features, target_features], axis=0)
    
    # Standardize the features to ensure they are on the same scale
    scaler = StandardScaler()
    combined_features_scaled = scaler.fit_transform(combined_features)
    
    # Separate back the target (the last row)
    target_scaled = combined_features_scaled[-1].reshape(1, -1)
    knn_features_scaled = combined_features_scaled[:-1]
    
    # Step 3: Use KNN to find distances from the target to all properties in the dataset
    nbrs = NearestNeighbors(n_neighbors=len(df_subset), algorithm='ball_tree').fit(knn_features_scaled)
    knn_distances, _ = nbrs.kneighbors(target_scaled)
    
    # Step 4: Assign the KNN distances to the dataframe
    df_subset.loc[:, 'KNN Distance'] = knn_distances.flatten()  # Use .flatten() to convert to a 1D array
    
    # Step 5: Combine Haversine distance and KNN distance using a weighted sum
    df_subset.loc[:, 'Combined Score'] = geo_weight * df_subset['Geo Distance (km)'] + (1 - geo_weight) * df_subset['KNN Distance']
    
    # Sort by the combined score
    df_sorted = df_subset.sort_values(by='Combined Score')
    
    return df_sorted

# Example latitude, longitude, and house features (beds, baths, size)
given_lat = 53.349805  # Example latitude
given_lon = -6.260310  # Example longitude
given_beds = 10.0  # Number of beds
given_baths = 10.0  # Number of baths
given_size = 1000.0  # Size in square meters
geo_weight = 0.3  # 70% weight for geo, 30% for KNN features

# Apply the function to calculate the combined score
df_sorted = calculate_combined_score(df_subset, given_lat, given_lon, given_beds, given_baths, given_size, geo_weight=geo_weight)

# Display the sorted DataFrame
df_sorted = df_sorted.sort_values(by='Combined Score', ascending=True)
df_sorted.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset.loc[:, 'Geo Distance (km)'] = df_subset.apply(lambda row: haversine_distance(given_lat, given_lon, row['MyHome_Latitude'], row['MyHome_Longitude']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset.loc[:, 'KNN Distance'] = knn_distances.flatten()  # Use .flatten() to convert to a 1D array
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/s

Unnamed: 0,Beds,Baths,MyHome_Floor_Area_Value,Sold Asking Price,MyHome_Latitude,MyHome_Longitude,Geo Distance (km),KNN Distance,Combined Score
2,2.0,1.0,64.0,480000.0,53.349805,-6.26031,3.9e-05,13.316082,9.321269
1,5.0,3.0,175.0,1010000.0,53.335385,-6.239842,2.101702,12.795811,9.587578
0,4.0,1.0,142.0,950000.0,53.373857,-6.203371,4.629047,12.795811,10.345782
43,2.0,1.0,61.0,459000.0,53.344975,-6.247985,0.978658,16.091054,11.557335
6,3.0,1.0,106.0,390000.0,53.308228,-6.34192,7.12353,13.570401,11.636339


In [96]:
# Display the sorted DataFrame
df_sorted.head(5)

Unnamed: 0,Beds,Baths,MyHome_Floor_Area_Value,Sold Asking Price,MyHome_Latitude,MyHome_Longitude,Geo Distance (km),KNN Distance,Combined Score
2,2.0,1.0,64.0,480000.0,53.349805,-6.26031,3.9e-05,13.316082,9.321269
1,5.0,3.0,175.0,1010000.0,53.335385,-6.239842,2.101702,12.795811,9.587578
0,4.0,1.0,142.0,950000.0,53.373857,-6.203371,4.629047,12.795811,10.345782
43,2.0,1.0,61.0,459000.0,53.344975,-6.247985,0.978658,16.091054,11.557335
6,3.0,1.0,106.0,390000.0,53.308228,-6.34192,7.12353,13.570401,11.636339
