In [45]:
import pandas as pd
from scipy.spatial.distance import euclidean

In [46]:
# Define categories and subcategories of public services with corresponding weights
# TODO this weights are to be gotten from a correlation with data price?
SERVICE_WEIGHTS = {
    'Education': {'Schools': 0.8, 'Libraries': 0.2},
    'Healthcare': {'Hospitals': 0.7, 'Pharmacies': 0.3},
    'Transportation': {'Bus Stops': 0.5, 'Train Stations': 0.5}
}

In [47]:
def get_distances_from_services(listing_coordinates, service_coordinates, SERVICE_WEIGHTS):
    """
    Calculates the euclidean distance from the listing to each service, only includes subcategories with defined weights
    Returns a dictionary similar to service_coordinates, but the values are distances from listing
    """
    proximity_scores = {}

    # Iterate over categories, to create a dictionary where each key is present in the define service_weights
    for category, subcategories in SERVICE_WEIGHTS.items():
        # Create an empty dictionary to store subcategory distances
        proximity_scores[category] = {}

        # Iterate over subcategories
        for subcategory in subcategories:
            # Create an empty list to store distances for the current subcategory
            proximity_scores[category][subcategory] = []

            # Iterate over service coordinates for the current subcategory
            for service_coords in service_coordinates[category][subcategory]:
                # Calculate the distance between the listing coordinates and the service coordinates
                # TODO use km not lat/lng
                distance = euclidean(listing_coordinates, service_coords)

                # Append the distance to the list of distances for the current subcategory
                proximity_scores[category][subcategory].append(distance)

    return proximity_scores


def calculate_proximity_score(distance):
    """
    Calculates a promixity score for a given distance
    """
    # TODO how do we define a maximum and minimum?
    # Define the maximum and minimum distance for a service to be considered proximate
    max_distance = 0.001 
    min_distance = 0.00001 

    if distance > max_distance: # Disance is too far
        proximity_score = 0

    elif distance < min_distance: # Distance is too close
        proximity_score = 0
    
    else:
        # Assign a score based on where the distance lies within the maximum and minimum
        proximity_score = 1 - ((distance - min_distance) / (max_distance - min_distance))

    return proximity_score


def get_subcategory_proximity_scores(services_distances):
    """
    Assigns proximity score for each subcategory
    Uses the highest public service score in each sub category
    """
    subcategory_scores = {}

    # Iterate over categories in calculated weights
    for category, subcategories in services_distances.items():
        # Create an empty dictionary to store subcategory proximity scores
        subcategory_scores[category] = {}

        # Iterate over subcategories and their corresponding distances
        for subcategory, distances in subcategories.items():
            
            """
            TODO: do we use maximum score or average score?
            Captures the presence of at least one highly proximate service in each subcategory
            If a listing is near the nearest school, it might be more desirable compared to one that is equidistant from multiple schools
            """
            # Calculate the maximum proximity score each distance
            max_proximity_score = max([calculate_proximity_score(distance) for distance in distances])

            # Store the maximum proximity score for the current subcategory in the nested dictionary
            subcategory_scores[category][subcategory] = max_proximity_score

    return subcategory_scores


def get_cummulative_category_scores(proximity_scores, SERVICE_WEIGHTS):
    """
    Calculates the weighted sum of each category based on the weights of its subcategories
    Returns a dictionary containing the cummulative scores for each category
    """

    public_service_scores = {}

    # Iterate over each category and its corresponding scores in the proximity_scores dictionary
    for category, subcategories in proximity_scores.items():
        
        # To store the cumulative score for the current category
        total_category_score = 0

        # Iterate over each subcategory and its corresponding scores
        for subcategory, subcategory_score in subcategories.items():
            
            # Get the subcategory weight
            subcategory_weight = SERVICE_WEIGHTS[category][subcategory]
            # Calculate the weighted contribution of the subcategory score
            weighted_score = subcategory_score * subcategory_weight
            # Add to the total
            total_category_score += weighted_score

        public_service_scores[category] = total_category_score

    return public_service_scores


def get_rs_score(listing_coordinates, public_service_coordinates):
    """Builder program"""

    # Calculate distances between the listings and the public services
    services_distances = get_distances_from_services(listing_coordinates, public_service_coordinates, SERVICE_WEIGHTS)

    # Determine the proximity score for the distances between the listing and the public services
    subcategory_proximity_scores = get_subcategory_proximity_scores(services_distances)
    print(subcategory_proximity_scores)

    # Calculate the weighted cummulative score
    final_category_scores = get_cummulative_category_scores(subcategory_proximity_scores, SERVICE_WEIGHTS)

    # Find the average score (final rs_score)
    rs_score = sum(value for value in final_category_scores.values()) / len(final_category_scores)

    return {"rs_score":rs_score, "category_scores":final_category_scores, "subcategory_scores":subcategory_proximity_scores}



In [48]:
# Assuming the data is stored in a DataFrame named 'df'
df = pd.DataFrame({
    'Category': ['Education', 'Education', 'Healthcare', 'Healthcare', 'Transportation', 'Transportation'],
    'Subcategory': ['Schools', 'Libraries', 'Hospitals', 'Pharmacies', 'Bus Stops', 'Train Stations'],
    'Coordinates': [(40.7120, -74.0059), (40.7125, -74.0065), (40.7130, -74.0070), (40.7122, -74.0063), (40.7132, -74.0066), (40.7123, -74.0067)]
})

# Convert the DataFrame to a dictionary
service_coordinates = {}
for _, row in df.iterrows():
    category = row['Category']
    subcategory = row['Subcategory']
    coordinates = row['Coordinates']

    if category not in service_coordinates:
        service_coordinates[category] = {}

    if subcategory not in service_coordinates[category]:
        service_coordinates[category][subcategory] = []

    service_coordinates[category][subcategory].append(coordinates)




In [49]:
# Example data
real_estate_listings = {
    'Listing 1': (40.7128, -74.0060),
    'Listing 2': (34.0522, -118.2437),
    'Listing 3': (51.5074, -0.1278)
}


for listing, listing_coordinates in real_estate_listings.items():

    scores = get_rs_score(listing_coordinates, service_coordinates)

    print(f"{listing} RS Score: {scores['rs_score']}")
    print(f"Category scores: {scores['category_scores']}")
    print(f"Subcategory scores: {scores['subcategory_scores']}")
    print()


{'Education': {'Schools': 0.19573154057735698, 'Libraries': 0.42111597021409564}, 'Healthcare': {'Hospitals': 0, 'Pharmacies': 0.33250465328607615}, 'Transportation': {'Bus Stops': 0.2817068130333038, 'Train Stations': 0.14117926595814967}}
Listing 1 RS Score: 0.18400095399541808
Category scores: {'Education': 0.24080842650470474, 'Healthcare': 0.09975139598582285, 'Transportation': 0.21144303949572674}
Subcategory scores: {'Education': {'Schools': 0.19573154057735698, 'Libraries': 0.42111597021409564}, 'Healthcare': {'Hospitals': 0, 'Pharmacies': 0.33250465328607615}, 'Transportation': {'Bus Stops': 0.2817068130333038, 'Train Stations': 0.14117926595814967}}

{'Education': {'Schools': 0, 'Libraries': 0}, 'Healthcare': {'Hospitals': 0, 'Pharmacies': 0}, 'Transportation': {'Bus Stops': 0, 'Train Stations': 0}}
Listing 2 RS Score: 0.0
Category scores: {'Education': 0.0, 'Healthcare': 0.0, 'Transportation': 0.0}
Subcategory scores: {'Education': {'Schools': 0, 'Libraries': 0}, 'Healthcare