In [1]:
import math
import pandas as pd
import numpy as np

In [2]:
def calculate_distance(lat1, lon1, lat2, lon2):
    """
    Calculates the great-circle distance between two points with the Haversine formula

    Parameters:
        lat1, lon1: latitude and longitude of the first point, in degrees
        lat2, lon2: latitude and longitude of the second point, in degrees

    Returns:
        The distance in kilometers on a sphere of radius 6371 km.
        A NaN coordinate yields a NaN distance.
    """
    # Radius of the Earth in kilometers
    RADIUS = 6371.0

    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Calculate the differences between the latitudes and longitudes
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    # Apply the Haversine formula
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2) ** 2
    )

    # Floating-point rounding can leave `a` marginally outside [0, 1], which
    # would make one of the square roots below raise ValueError. Explicit
    # comparisons are used (rather than min/max) so that NaN passes through
    # untouched and still produces a NaN distance.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return RADIUS * c


def calculate_proximity_score(distance, min_dist=None, max_dist=None):
    """
    Calculates a proximity score for a given distance, on a scale from 0 to 1

    The score is 1 at the minimum distance and falls linearly to 0 at the
    maximum distance. Distances outside that window score 0.

    Parameters:
        distance: distance to score, in kilometers
        min_dist: lower bound of the window in kilometers; when omitted, the
            notebook-level `min_distance` is used
        max_dist: upper bound of the window in kilometers; when omitted, the
            notebook-level `max_distance` is used

    Returns:
        A score between 0 and 1. A NaN distance scores 0.
    """
    # Fall back to the notebook-level settings so existing one-argument calls
    # keep working unchanged.
    if min_dist is None:
        min_dist = min_distance
    if max_dist is None:
        max_dist = max_distance

    # NaN fails both range comparisons below, so without this guard it would
    # come back as a NaN score, which compares as "not equal to zero".
    if math.isnan(distance):
        return 0.0

    # Zero score for distances that are too far or too close
    if distance > max_dist or distance < min_dist:
        return 0.0

    # Assign a score based on where the distance lies within the window
    return 1 - (distance - min_dist) / (max_dist - min_dist)


def get_public_services_weights(path="data/public_services_weights.csv"):
    """
    Reads the public services weights table

    Parameters:
        path: location of the weights CSV; defaults to the project data file

    Returns:
        The CSV contents as a DataFrame, unmodified
    """
    weights = pd.read_csv(path)
    return weights


def get_public_services_data(path="data/public_services.csv"):
    """
    Reads the public services table and cleans its category labels

    The `main_cat` and `sub_cat` labels are stripped of surrounding whitespace
    and title-cased. Rows labelled "Delete" in either column are removed.

    Parameters:
        path: location of the public services CSV; defaults to the project
            data file

    Returns:
        The cleaned DataFrame, keeping the original row index of the rows
        that survive the filter
    """
    services = pd.read_csv(path)

    # Clean data. The .str accessor leaves missing labels as NaN instead of
    # raising the way a Python-level x.strip() call would.
    for column in ("main_cat", "sub_cat"):
        services[column] = services[column].str.strip().str.title()

    # Filter out irrelevant rows: a "Delete" marker in either category column
    keep = (services["main_cat"] != "Delete") & (services["sub_cat"] != "Delete")
    return services[keep]


def get_rs_score(listings_lat, listings_long, services=None, weights=None):
    """
    Computes the rs_score for a latitude and longitude

    Every public service is scored by its distance from the given point. The
    scores are aggregated per sub-category, weighted, summed per main category
    and finally averaged across main categories.

    Parameters:
        listings_lat: latitude of the listing, in degrees
        listings_long: longitude of the listing, in degrees
        services: DataFrame of public services with `main_cat`, `sub_cat`,
            `lat` and `lng` columns; when omitted, the notebook-level
            `public_services_data` is used. The frame is not modified.
        weights: DataFrame with `main_cat`, `sub_cat` and `weight` columns;
            when omitted, the notebook-level `PUBLIC_SERVICES_WEIGHTS` is used

    Returns:
        A dict with three entries:
            "rs_score": mean of the main-category scores, rounded to 4 decimals
            "category_scores": DataFrame of summed weighted scores per main_cat
            "proximity_scores": DataFrame of sub-category aggregates and weights
    """
    # Fall back to the notebook-level frames so existing two-argument calls
    # keep working unchanged.
    if services is None:
        services = public_services_data
    if weights is None:
        weights = PUBLIC_SERVICES_WEIGHTS

    # Get distance of each public service from the listing
    distances = services.apply(
        lambda row: calculate_distance(
            lat1=listings_lat,
            lon1=listings_long,
            lat2=row["lat"],
            lon2=row["lng"],
        ),
        axis=1,
    )

    # Attach the distances and their proximity scores to a derived frame, so
    # the shared services table is not rewritten on every call.
    scored_services = services.assign(
        distance=distances,
        proximity_score=distances.apply(calculate_proximity_score),
    )

    def count_proximate(scores):
        # `> 0` rather than `!= 0`: a NaN score must not count as proximate
        return (scores > 0).sum()

    # Calculate the max proximity score and count proximate services for each sub-category
    proximity_scores_agg = (
        scored_services.groupby(["main_cat", "sub_cat"])["proximity_score"]
        .agg(
            max_proximity_score="max",
            total_public_services="size",
            proximate_public_services=count_proximate,
        )
        .reset_index()
    )

    # Find the ratio of services in each sub-category that are proximate
    proximity_scores_agg["public_services_ratio"] = (
        proximity_scores_agg["proximate_public_services"]
        / proximity_scores_agg["total_public_services"]
    )

    # Combine the best score and the proximate ratio with equal weight
    proximity_scores_agg["agg_proximity_score"] = (
        proximity_scores_agg["max_proximity_score"]
        + proximity_scores_agg["public_services_ratio"]
    ) / 2

    # Merge the weights dataframe to the proximity scores. With a left join, a
    # sub-category that has no weight row gets a NaN weight, and therefore a
    # NaN weighted score that the sum below skips.
    proximity_scores = pd.merge(
        proximity_scores_agg,
        weights,
        how="left",
        on=["main_cat", "sub_cat"],
    )

    # Calculate the final weighted scores
    proximity_scores["weighted_score"] = (
        proximity_scores["agg_proximity_score"] * proximity_scores["weight"]
    )

    # Get the scores for the main category
    category_scores = (
        proximity_scores.groupby("main_cat")["weighted_score"].sum().reset_index()
    )

    # Calculate the average category score (final rs_score)
    rs_score = round(category_scores["weighted_score"].mean(), 4)

    return {
        "rs_score": rs_score,
        "category_scores": category_scores,
        "proximity_scores": proximity_scores,
    }


In [3]:
# Load the two reference tables that get_rs_score reads as globals
public_services_data = get_public_services_data()
PUBLIC_SERVICES_WEIGHTS = get_public_services_weights()

# Distance window, in kilometers, read by calculate_proximity_score
min_distance, max_distance = 0.1, 10.0

### Test sample data

In [None]:
# Load the sample listings, keeping only rows that have both coordinates
real_estate_listings = (
    pd.read_csv("data/riyadh_sample.csv")
    .dropna(subset=["lat", "lng"])
    # Placeholder column, filled in when the scores are computed
    .assign(rs_score=np.nan)
)

In [4]:
# Calculate rs_score for all rows. Iterating the two coordinate columns
# directly avoids iterrows(), which builds a Series for every row, and the
# whole column is assigned in one step instead of one cell at a time.
real_estate_listings["rs_score"] = [
    get_rs_score(lat, lng)["rs_score"]
    for lat, lng in zip(real_estate_listings["lat"], real_estate_listings["lng"])
]

real_estate_listings.head()

Unnamed: 0,l_neighborhood,deal_price,area_size,meter_price,lng,lat,uuid,rs_score
0,rd,1080000,350.0,3086,46.757046,24.727277,c1442e7b-ddd2-48db-b7c4-79d49412754e,0.1377
1,ml,11000000,560.0,19643,46.613572,24.801113,eaab2273-b0da-407f-a0e2-170b94f19d9d,0.5585
2,ml,4590000,600.0,7650,46.594482,24.794327,2be6d20a-bf6d-4a32-8459-d80deb18979a,0.4935
3,ml,41760000,2610.0,16000,46.627284,24.799022,1f0a475e-4d34-42ce-af59-de8cebe00664,0.5676
4,nr,2000000,500.0,4000,46.656898,24.885271,4fc312b5-bbe7-4f45-9c22-25a5ad94ae79,0.4298


In [5]:
# Save the scored listings; index=False leaves the DataFrame index out of the file
real_estate_listings.to_csv("results.csv", index=False)

### Input individual coordinates

In [6]:
# Test sample inputs
input_lat = 24.8033547
input_lng = 46.6206349

# Score the point once and unpack the pieces shown in the cells below
results = get_rs_score(input_lat, input_lng)

rs_score = results["rs_score"]  # already rounded to 4 decimals by get_rs_score
category_scores = results["category_scores"]
proximity_scores = results["proximity_scores"]

# Display result
print(f"RS Score: {rs_score}")

RS Score: 0.569


In [7]:
# Print a heading, then end the cell on the frame so the notebook displays it
print("Scores across main public services categories:")
category_scores

Scores across main public services categories:


Unnamed: 0,main_cat,weighted_score
0,Education,0.598899
1,General Facility,0.619348
2,Government,0.158565
3,Health,0.673667
4,Hopspitality,0.663528
5,"Stores, Offices And Commercial Services",0.700264


In [8]:
# Print a heading, then end the cell on the sub-category frame so the notebook displays it
print("Scores across subcategories:")
proximity_scores

Scores across subcategories:


Unnamed: 0,main_cat,sub_cat,max_proximity_score,total_public_services,proximate_public_services,public_services_ratio,agg_proximity_score,weight,weighted_score
0,Education,College,0.510619,18,4,0.222222,0.366421,0.333333,0.12214
1,Education,Other Education Services,0.966839,44,27,0.613636,0.790238,0.333333,0.263413
2,Education,School,0.994672,466,133,0.285408,0.64004,0.333333,0.213347
3,General Facility,Facilities,0.852243,43,12,0.27907,0.565656,0.25,0.141414
4,General Facility,General Libarary,0.958618,32,10,0.3125,0.635559,0.25,0.15889
5,General Facility,Mosque,0.993863,680,182,0.267647,0.630755,0.25,0.157689
6,General Facility,Park,0.918499,94,35,0.37234,0.64542,0.25,0.161355
7,Government,Civil Defense,0.0,3,0,0.0,0.0,0.25,0.0
8,Government,Ministry,0.0,1,0,0.0,0.0,0.25,0.0
9,Government,Other Governmental Services,0.820285,8,2,0.25,0.535143,0.25,0.133786
