In [21]:
import osmium
from geopy.distance import geodesic
from shapely import wkb as wkblib
import json
from scipy import spatial
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [None]:
class TagHandler(osmium.SimpleHandler):
    def __init__(self):
        super().__init__()
        self.sport_places = []
        self.context_list = []
        self.geometry_gen = osmium.geom.WKBFactory()

    def node(self, n):
       
        tags_dict = dict(n.tags)
        tag_keys = list(tags_dict.keys())
        loc = (n.location.lat, n.location.lon)

        if(
            tags_dict.get("building") == "sports_centre"
            or tags_dict.get("leisure") == "sports_centre"
            or "sport" in tags_dict
        ):
            self.sport_places.append({
                "location": loc,
                "tags": tag_keys
            })

        
        if tags_dict.get("leisure") == "park":
            self.context_list.append(("park", loc))
        elif tags_dict.get("amenity") == "school":
            self.context_list.append(("school", loc))
        elif tags_dict.get("building") == "apartment":
            self.context_list.append(("apartment", loc))
        elif tags_dict.get("building") == "house":
            self.context_list.append(("house", loc))
    
    def way(self, w):
       
        tags_dict = dict(w.tags)
        tag_keys = list(tags_dict.keys())

        if(
            tags_dict.get("building") == "sports_centre"
            or tags_dict.get("building") == "sports_hall"
            or tags_dict.get("leisure") == "sports_centre"
            or tags_dict.get("leisure") == "sports_hall"
            or "sport" in tags_dict
        ):
            #this is done because these are polygons and they do not have
            #just one coordinate, so from the polygon, we calculate the center of it
            #and use that as its location
            loc = None
            try:
                wkb = self.geometry_gen.create_multipolygon(w)
                shape = wkblib.loads(wkb, hex=True)
                centroid = shape.centroid
                loc = (centroid.y, centroid.x)
                
            #if calculating the center coordinate fails, it means the polygon was incomplete
            #so we instead use a valid node that makes up the polygon as its coordinate/location
            except Exception:
                for node in w.nodes:
                    if node.location.valid():
                        loc = (node.location.lat, node.location.lon)
                        break

            if loc:
                self.sport_places.append({
                    "location": loc,
                    "tags": tag_keys
                })

        context_tags = {
            "leisure": "park",
            "amenity": "school",
            "building": ["apartments", "house", "residential"]
        }

        loc = None
        
        try:
            wkb = self.geometry_gen.create_multipolygon(w)
            shape = wkblib.loads(wkb, hex=True)
            centroid = shape.centroid
            loc = (centroid.y, centroid.x)

            
        except Exception:
             for node in w.nodes:
                if node.location.valid():
                    loc = (node.location.lat, node.location.lon)
                    break
        
        if loc:
            for key, value in context_tags.items():
                if isinstance(value, list):
                    if tags_dict.get(key) in value:
                        self.context_list.append((tags_dict.get(key), loc))
                elif tags_dict.get(key) == value:
                    self.context_list.append((value, loc))
        
       
    

osm_file = "netherlands-latest.osm.pbf"
tag_holder = TagHandler()
tag_holder.apply_file(osm_file, locations= True)

print(f"the amount of sports related places BEFORE: {len(tag_holder.sport_places)}")
#group context with all their locations
filtered_data = []
grouped_context = defaultdict(list)
for type_of_ctx, (lat, lon) in tag_holder.context_list:
    grouped_context[type_of_ctx].append((lat, lon))

#makes a kdtree for each context
context_tree = {}
for context, locations in grouped_context.items():
    coordinates = [(lon, lat) for lat, lon in locations]
    context_tree[context] = spatial.KDTree(coordinates)

#loop through the sport places
for sport_place in tag_holder.sport_places:
    location = sport_place["location"]
    lat, lon = location
    target = (lon, lat)
    tags = list(sport_place["tags"])
    #loop through each context and its related tree
    for context_type, tree in context_tree.items():
        nearby_locations = tree.query_ball_point(target, r=0.002)
        if nearby_locations:
            tags.append(f"near_{context_type}")

    filtered_data.append(tags)


print(f"the amount of sports related places AFTER: {len(filtered_data)}")

train_data, test_data = train_test_split(filtered_data, test_size=0.2, random_state=42)

with open("train_w_ctx_NL.tsv", "w", encoding="utf-8") as f:
    for tag_list in train_data:
        f.write("\t".join(tag_list) + "\n")

the amount of sports related places BEFORE: 50171
the amount of sports related places AFTER: 50171


In [None]:
import random
import requests

def evaluate_model(test_data, k):
    recall_scores = []
    precision_scores = []

    for tag_set in test_data:
        non_context_tags = []
        for t in tag_set:
            if not t.startswith("near"):
                non_context_tags.append(t)
                
        # we need at least 3 non-context tags for the back-off model, because 1 is going to be left out.
        if len(non_context_tags) < 2:
            continue

        
        temp = []
        for el in tag_set:
            if el.startswith("near"):
                temp.append(el)
                tag_set.remove(el)

        left_out_tags = random.sample(tag_set, k=1) 
        input_tags = []
        for t in tag_set:
            if t not in left_out_tags:
                input_tags.append(t)

        for i in temp:
            if len(temp) == 0:
                break
            else:
                input_tags.append(i)

            

        tags ={ "properties": input_tags,
                "types": []}
        response = requests.post("http://localhost:8080/recommender", json=tags)
        recommendations = response.json()

        recommended_tags = []
        for recom in recommendations["recommendations"]:
            tag = recom["property"]
            if not tag.startswith("near"):
                recommended_tags.append(tag)
            if len(recommended_tags) == k:
                break
        
        existing_tags = []
        for tag in recommended_tags:
            if tag in left_out_tags:
                existing_tags.append(tag)
        true_pos = len(existing_tags)
        precision = true_pos / k
        recall = true_pos / len(left_out_tags)

        precision_scores.append(precision)
        recall_scores.append(recall)
    
    average_precision_score = sum(precision_scores) / len(precision_scores)
    average_recall_score = sum(recall_scores) / len(recall_scores)

    return average_precision_score, average_recall_score


precision_model, recall_model = evaluate_model(test_data, k=1)
print(f"Precision@1: {precision_model}")
print(f"Recall@1: {recall_model}")

precision_model, recall_model = evaluate_model(test_data, k=3)
print(f"Precision@3: {precision_model}")
print(f"Recall@3: {recall_model}")

precision_model, recall_model = evaluate_model(test_data, k=5)
print(f"Precision@5: {precision_model}")
print(f"Recall@5: {recall_model}")

precision_model, recall_model = evaluate_model(test_data, k=10)
print(f"Precision@10: {precision_model}")
print(f"Recall@10: {recall_model}")


Precision@1: 0.774749498997996
Recall@1: 0.774749498997996
Precision@3: 0.30701402805613986
Recall@3: 0.9210420841683367
Precision@5: 0.18857715430864638
Recall@5: 0.9428857715430862
Precision@10: 0.096002004008031
Recall@10: 0.9600200400801603


In [None]:

def evaluate_model_backoff(test_data, strat, k):
    recall_scores = []
    precision_scores = []

    for tag_set in test_data:
        non_context_tags = []
        for t in tag_set:
            if not t.startswith("near"):
                non_context_tags.append(t)
                
        # we need at least 3 non-context tags for the back-off model, because 1 is going to be left out.
        if len(non_context_tags) < 3:
            continue

        
        temp = []
        for el in tag_set:
            if el.startswith("near"):
                temp.append(el)
                tag_set.remove(el)

        left_out_tags = random.sample(tag_set, k=1) 

        input_tags = []
        for t in tag_set:
            if t not in left_out_tags:
                input_tags.append(t)

        P1 = input_tags[::2]
        P2 = input_tags[1::2]

        for i in temp:
            if len(temp) == 0:
                break
            else:
                P1.append(i)
                P2.append(i)

            
        
        def get_clean_recommendations(input, other_input):
            tags ={ "properties": input,
                "types": []}
            response = requests.post("http://localhost:8080/recommender", json=tags)
            recommendations = response.json()
            cleaned = {}
            for rec in recommendations["recommendations"]:
                tag = rec["property"]
                score = rec["probability"]
                if not tag.startswith("near") and tag not in other_input:
                    cleaned[tag] = score
                if len(cleaned) == k:
                    break
            return cleaned
        
        recommended1 = get_clean_recommendations(P1, P2)
        recommended2 = get_clean_recommendations(P2, P1)

        merged_recommendations = defaultdict(list)
        for tag, prob in recommended1.items():
            merged_recommendations[tag].append(prob)
        for tag, prob in recommended2.items():
            merged_recommendations[tag].append(prob)
        
        
        final_recommendations = []
        for tag, prob in merged_recommendations.items():
            if strat == "max":
                final_recommendations.append((tag, max(prob)))
            if strat == "avg":
                final_recommendations.append((tag, sum(prob) / len(prob)))
            
        final_recommendations.sort(key=lambda x: x[1], reverse= True)
        final_k_recommendations = []
        for t, s in final_recommendations:
            final_k_recommendations.append(t)
            if len(final_k_recommendations) == k:
                break
                
        existing_tags = []
        for tag in final_k_recommendations:
            if tag in left_out_tags:
                existing_tags.append(tag)
        true_pos = len(existing_tags)
        precision = true_pos / k
        recall = true_pos / len(left_out_tags)

        precision_scores.append(precision)
        recall_scores.append(recall)
    
    average_precision_score = sum(precision_scores) / len(precision_scores)
    average_recall_score = sum(recall_scores) / len(recall_scores)

    return average_precision_score, average_recall_score

strat_m = "max"
strat_a = "avg"

precision_model, recall_model = evaluate_model_backoff(test_data, strat_m, k=1)
print(f"Strategy used is {strat_m}")
print(f"Precision@1: {precision_model}")
print(f"Recall@1: {recall_model}")

precision_model, recall_model = evaluate_model_backoff(test_data, strat_a, k=1)
print(f"Strategy used is {strat_a}")
print(f"Precision@1: {precision_model}")
print(f"Recall@1: {recall_model}")

print("\n")
precision_model, recall_model = evaluate_model_backoff(test_data, strat_m, k=3)
print(f"Strategy used is {strat_m}")
print(f"Precision@3: {precision_model}")
print(f"Recall@3: {recall_model}")

precision_model, recall_model = evaluate_model_backoff(test_data, strat_a, k=3)
print(f"Strategy used is {strat_a}")
print(f"Precision@3: {precision_model}")
print(f"Recall@3: {recall_model}")

print("\n")
precision_model, recall_model = evaluate_model_backoff(test_data, strat_m, k=5)
print(f"Strategy used is {strat_m}")
print(f"Precision@5: {precision_model}")
print(f"Recall@5: {recall_model}")

precision_model, recall_model = evaluate_model_backoff(test_data, strat_a, k=5)
print(f"Strategy used is {strat_a}")
print(f"Precision@5: {precision_model}")
print(f"Recall@5: {recall_model}")

print("\n")
precision_model, recall_model = evaluate_model_backoff(test_data, strat_m, k=10)
print(f"Strategy used is {strat_m}")
print(f"Precision@10: {precision_model}")
print(f"Recall@10: {recall_model}")

precision_model, recall_model = evaluate_model_backoff(test_data, strat_a, k=10)
print(f"Strategy used is {strat_a}")
print(f"Precision@10: {precision_model}")
print(f"Recall@10: {recall_model}")

Strategy used is max
Precision@1: 0.7687203791469195
Recall@1: 0.7687203791469195
Strategy used is avg
Precision@1: 0.7723222748815166
Recall@1: 0.7723222748815166


Strategy used is max
Precision@3: 0.28834123222747554
Recall@3: 0.8650236966824645
Strategy used is avg
Precision@3: 0.2928909952606498
Recall@3: 0.8786729857819905


Strategy used is max
Precision@5: 0.17827488151660223
Recall@5: 0.8913744075829384
Strategy used is avg
Precision@5: 0.17804739336494343
Recall@5: 0.8902369668246446


Strategy used is max
Precision@10: 0.0919052132701501
Recall@10: 0.9190521327014218
Strategy used is avg
Precision@10: 0.09186729857820695
Recall@10: 0.9186729857819905


#below are the models without context

In [25]:
filtered_data_no_ctx = []
for sport_place in tag_holder.sport_places:
    tags_no_ctx = list(sport_place["tags"])
    filtered_data_no_ctx.append(tags_no_ctx)

with open("filtered_data_no_ctx.tsv", "w", encoding="utf-8") as f:
    for tag_list in filtered_data_no_ctx:
        f.write("\t".join(tag_list) + "\n")

In [29]:
train_no_ctx_data, test_no_ctx_data = train_test_split(filtered_data_no_ctx, test_size=0.2, random_state=42)

with open("train_no_ctx_NL.tsv", "w", encoding="utf-8") as f:
    for tag_list in train_no_ctx_data:
        f.write("\t".join(tag_list) + "\n")

with open("test_no_ctx_NL.tsv", "w", encoding="utf-8") as f:
    for tag_list in test_no_ctx_data:
        f.write("\t".join(tag_list) + "\n")

with open("test_data_no_ctx_NL.pkl", "wb") as f:
    pickle.dump(filtered_data_no_ctx, f)

In [None]:
import random
import requests

def evaluate_model_no_ctx(test_no_ctx_data, k):
    recall_scores = []
    precision_scores = []

    for tag_set in test_no_ctx_data:
        if len(tag_set) < 2:
            continue

        left_out_tags = random.sample(tag_set, k=1) 
        input_tags = []
        for t in tag_set:
            if t not in left_out_tags:
                input_tags.append(t)


        tags ={ "properties": input_tags,
                "types": []}
        response = requests.post("http://localhost:8080/recommender", json=tags)
        recommendations = response.json()

        recommended_tags = []
        for recom in recommendations["recommendations"]:
            tag = recom["property"]
            if not tag.startswith("near"):
                recommended_tags.append(tag)
            if len(recommended_tags) == k:
                break
        
        existing_tags = []
        for tag in recommended_tags:
            if tag in left_out_tags:
                existing_tags.append(tag)
        true_pos = len(existing_tags)
        precision = true_pos / k
        recall = true_pos / len(left_out_tags)

        precision_scores.append(precision)
        recall_scores.append(recall)
    
    average_precision_score = sum(precision_scores) / len(precision_scores)
    average_recall_score = sum(recall_scores) / len(recall_scores)

    return average_precision_score, average_recall_score


precision_model, recall_model = evaluate_model_no_ctx(test_no_ctx_data, k=1)
print(f"Precision@1: {precision_model}")
print(f"Recall@1: {recall_model}")
print("\n")
precision_model, recall_model = evaluate_model_no_ctx(test_no_ctx_data, k=3)
print(f"Precision@3: {precision_model}")
print(f"Recall@3: {recall_model}")
print("\n")
precision_model, recall_model = evaluate_model_no_ctx(test_no_ctx_data, k=5)
print(f"Precision@5: {precision_model}")
print(f"Recall@5: {recall_model}")
print("\n")
precision_model, recall_model = evaluate_model_no_ctx(test_no_ctx_data, k=10)
print(f"Precision@10: {precision_model}")
print(f"Recall@10: {recall_model}")

Precision@1: 0.8596192384769539
Recall@1: 0.8596192384769539


Precision@3: 0.30838343353376235
Recall@3: 0.9251503006012024


Precision@5: 0.18873747494992898
Recall@5: 0.943687374749499


Precision@10: 0.09604208416835164
Recall@10: 0.9604208416833667


In [None]:
def evaluate_model_backoff_no_ctx(test_no_ctx_data, strat, k):
    recall_scores = []
    precision_scores = []

    for tag_set in test_no_ctx_data:
        if len(tag_set) < 3:
            continue


        left_out_tags = random.sample(tag_set, k=1) 
        input_tags = []
        for t in tag_set:
            if t not in left_out_tags:
                input_tags.append(t)

        P1 = input_tags[::2]
        P2 = input_tags[1::2]

        
        def get_clean_recommendations(input, other_input):
            tags ={ "properties": input,
                "types": []}
            response = requests.post("http://localhost:8080/recommender", json=tags)
            recommendations = response.json()
            cleaned = {}
            for rec in recommendations["recommendations"]:
                tag = rec["property"]
                score = rec["probability"]
                if tag not in other_input:
                    cleaned[tag] = score
                if len(cleaned) == k:
                    break
            return cleaned
        
        recommended1 = get_clean_recommendations(P1, P2)
        recommended2 = get_clean_recommendations(P2, P1)

        merged_recommendations = defaultdict(list)
        for tag, prob in recommended1.items():
            merged_recommendations[tag].append(prob)
        for tag, prob in recommended2.items():
            merged_recommendations[tag].append(prob)
        
        
        final_recommendations = []
        for tag, prob in merged_recommendations.items():
            if strat == "max":
                final_recommendations.append((tag, max(prob)))
            if strat == "avg":
                final_recommendations.append((tag, sum(prob) / len(prob)))
            
        final_recommendations.sort(key=lambda x: x[1], reverse= True)
        final_k_recommendations = []
        for t, s in final_recommendations:
            final_k_recommendations.append(t)
            if len(final_k_recommendations) == k:
                break
                
        existing_tags = []
        for tag in final_k_recommendations:
            if tag in left_out_tags:
                existing_tags.append(tag)
        true_pos = len(existing_tags)
        precision = true_pos / k
        recall = true_pos / len(left_out_tags)

        precision_scores.append(precision)
        recall_scores.append(recall)
    
    average_precision_score = sum(precision_scores) / len(precision_scores)
    average_recall_score = sum(recall_scores) / len(recall_scores)

    return average_precision_score, average_recall_score

strat_m = "max"
strat_a = "avg"
precision_model, recall_model = evaluate_model_backoff_no_ctx(test_no_ctx_data, strat_m, k=1)
print(f"Strategy used is {strat_m}")
print(f"Precision@1: {precision_model}")
print(f"Recall@1: {recall_model}")

precision_model, recall_model = evaluate_model_backoff_no_ctx(test_no_ctx_data, strat_a, k=1)
print(f"Strategy used is {strat_a}")
print(f"Precision@1: {precision_model}")
print(f"Recall@1: {recall_model}")

precision_model, recall_model = evaluate_model_backoff_no_ctx(test_no_ctx_data, strat_m, k=3)
print(f"Strategy used is {strat_m}")
print(f"Precision@3: {precision_model}")
print(f"Recall@3: {recall_model}")

precision_model, recall_model = evaluate_model_backoff_no_ctx(test_no_ctx_data, strat_a, k=3)
print(f"Strategy used is {strat_a}")
print(f"Precision@3: {precision_model}")
print(f"Recall@3: {recall_model}")

precision_model, recall_model = evaluate_model_backoff_no_ctx(test_no_ctx_data, strat_m, k=5)
print(f"Strategy used is {strat_m}")
print(f"Precision@5: {precision_model}")
print(f"Recall@5: {recall_model}")

precision_model, recall_model = evaluate_model_backoff_no_ctx(test_no_ctx_data, strat_a, k=5)
print(f"Strategy used is {strat_a}")
print(f"Precision@5: {precision_model}")
print(f"Recall@5: {recall_model}")


precision_model, recall_model = evaluate_model_backoff_no_ctx(test_no_ctx_data, strat_m, k=10)
print(f"Strategy used is {strat_m}")
print(f"Precision@10: {precision_model}")
print(f"Recall@10: {recall_model}")

precision_model, recall_model = evaluate_model_backoff_no_ctx(test_no_ctx_data, strat_a, k=10)
print(f"Strategy used is {strat_a}")
print(f"Precision@10: {precision_model}")
print(f"Recall@10: {recall_model}")

Strategy used is max
Precision@1: 0.7717535545023697
Recall@1: 0.7717535545023697
Strategy used is avg
Precision@1: 0.7823696682464455
Recall@1: 0.7823696682464455
Strategy used is max
Precision@3: 0.28834123222747554
Recall@3: 0.8650236966824645
Strategy used is avg
Precision@3: 0.28960505529224617
Recall@3: 0.8688151658767772
Strategy used is max
Precision@5: 0.17850236966826105
Recall@5: 0.8925118483412322
Strategy used is avg
Precision@5: 0.17630331753555914
Recall@5: 0.8815165876777251
Strategy used is max
Precision@10: 0.09186729857820695
Recall@10: 0.9186729857819905
Strategy used is avg
Precision@10: 0.09211374407583735
Recall@10: 0.9211374407582938
