# Data Exploration

###  OpenStreetMap

In [1]:
!pip install -q overpy

In [2]:
import overpy

api = overpy.Overpass()

In [3]:
query = """
[out:json];
(
  node["amenity"="restaurant"](52.229,20.944,52.410,21.222);
  node["amenity"="bank"](52.229,20.944,52.410,21.222);
);
out body;
"""

result = api.query(query)

In [4]:
len(result.nodes)

1167

In [6]:
for node in result.nodes[:5]:
    print(f"{node.tags.get('amenity', 'unknown')} at ({node.lat}, {node.lon})")

restaurant at (52.2399471, 21.0619767)
restaurant at (52.2312362, 21.0121087)
restaurant at (52.2340883, 21.0233987)
restaurant at (52.2484456, 21.0142668)
restaurant at (52.2371565, 21.1191810)


### GBIF

In [None]:
from pygbif import occurrences

In [None]:
results = occurrences.search(taxon_key=6, geometry="((52.229,20.944,52.410,21.222))")

HTTPError: 400 Client Error: Bad Request for url: https://api.gbif.org/v1/occurrence/search?geometry=%28%2852.229%2C20.944%2C52.410%2C21.222%29%29&limit=300&offset=0&taxon.key=6

In [None]:
results

{'offset': 0,
 'limit': 300,
 'endOfRecords': True,
 'count': 0,
 'results': [],
 'facets': []}

In [None]:
# Wydrukuj kilka wyników
for result in results['results']:
    print(f"Species: {result['species']}, Location: {result['decimalLatitude']}, {result['decimalLongitude']}")

## Dataset Class

In [7]:
from abc import ABC, abstractmethod
import pandas as pd
import overpy

class ColocationDataset(ABC):
    def __init__(self):
        """
        Base class for colocation datasets.
        """
        self._data = None

    @abstractmethod
    def load_data(self) -> pd.DataFrame:
        """Loads the data from the source."""
        pass

    @property
    def data(self) -> pd.DataFrame:
        """Returns the loaded data."""
        return self._data


class OSMColocationDataset(ColocationDataset):
    def __init__(self, area: tuple, poi_types: list):
        """
        Colocation dataset for OpenStreetMap (OSM) data.

        :param area: Area in the format (min_lat, min_lon, max_lat, max_lon)
        :param poi_types: List of POI types to search for (e.g., ['restaurant', 'bank'])
        """
        super().__init__()
        self._area = area
        self._poi_types = poi_types

    def load_data(self) -> pd.DataFrame:
        """Loads data from OSM using the Overpass API."""
        api = overpy.Overpass()

        query = f"""
        [out:json];
        (
            {' '.join([f'node["amenity"="{poi}"]({self._area[0]},{self._area[1]},{self._area[2]},{self._area[3]});' for poi in self._poi_types])}
        );
        out body;
        """

        result = api.query(query)

        data = []
        for node in result.nodes:
            data.append({
                "id": node.id,
                "type": node.tags.get('amenity', 'unknown'),
                "x": node.lat,
                "y": node.lon
            })

        self._data = pd.DataFrame(data)
        return self.data

    @property
    def data(self) -> pd.DataFrame:
        """Returns the loaded data."""
        if self._data is None:
            self.load_data()
        return self._data


In [8]:
area = (52.229, 20.944, 52.410, 21.222)  # Warsaw area
poi_types = ["bar", "cafe", "fast_food", "food_court", "ice_cream", "pub", 'restaurant', "college", "library", "research_institute", "school", "university", "parking", "atm", 'bank', "clinic", "doctors", "pharmacy", "veterinary", "casino", "cinema", "events_venue", "nightclub", "theatre", "	police"]

dataset = OSMColocationDataset(area, poi_types)
data = dataset.load_data()

In [9]:
data.shape

(4294, 4)

In [10]:
data.head()

Unnamed: 0,id,type,x,y
0,247441607,restaurant,52.2399471,21.0619767
1,247458167,fast_food,52.2454787,21.085803
2,247458168,fast_food,52.2454434,21.0860156
3,247458173,fast_food,52.2451877,21.0825923
4,247461210,restaurant,52.2312362,21.0121087


In [11]:
class ColocationPattern:
    def __init__(self, types: list, participation_index: float, instances: list):
        self._types = tuple(sorted(types))
        self._pi = participation_index
        self._instances = instances

    def __str__(self) -> str:
        return f"Pattern {self._types} (PI={self._pi:.2f}, Instances={len(self._instances)})"

    def to_dict(self) -> dict:
        return {
            "types": self._types,
            "participation_index": self._pi,
            "num_instances": len(self._instances)
        }

    @property
    def types(self) -> tuple:
        return self._types

    @property
    def pi(self) -> float:
        return self._pi

    @property
    def instances(self) -> list:
        return self._instances


In [12]:
from scipy.spatial import KDTree
from collections import defaultdict
from itertools import combinations
import pandas as pd


class ColocationMiner:
    def __init__(self, radius: float = 0.005, min_prevalence: float = 0.3):
        """
        Colocation miner for discovering colocation patterns.

        :param radius: Radius for neighbor search (in degrees).
        :param min_prevalence: Minimum participation index value for a pattern to be considered significant.
        """
        self.radius = radius
        self.min_prevalence = min_prevalence
        self.patterns = []

    def fit(self, df: pd.DataFrame) -> None:
        self.df = df.reset_index(drop=True)
        self.df['id'] = self.df.index
        self.instances_by_type = {
            t: self.df[self.df['type'] == t] for t in self.df['type'].unique()
        }

        self.neighbor_pairs = self._find_neighbor_pairs()
        self.patterns = self._discover_frequent_patterns()

    def _find_neighbor_pairs(self) -> dict:
        neighbor_dict = defaultdict(list)
        for t1, t2 in combinations(self.instances_by_type.keys(), 2):
            pts1 = self.instances_by_type[t1][['x', 'y']].values
            pts2 = self.instances_by_type[t2][['x', 'y']].values
            ids1 = self.instances_by_type[t1]['id'].values
            ids2 = self.instances_by_type[t2]['id'].values

            tree = KDTree(pts2)
            for i, pt in enumerate(pts1):
                idxs = tree.query_ball_point(pt, self.radius)
                for j in idxs:
                    neighbor_dict[(t1, t2)].append((ids1[i], ids2[j]))

        return neighbor_dict

    def _discover_frequent_patterns(self) -> list[ColocationPattern]:
        patterns = []

        for (t1, t2), pairs in self.neighbor_pairs.items():
            if not pairs:
                continue

            ids1 = set(a for a, _ in pairs)
            ids2 = set(b for _, b in pairs)

            pi1 = len(ids1) / len(self.instances_by_type[t1])
            pi2 = len(ids2) / len(self.instances_by_type[t2])
            pi = min(pi1, pi2)

            if pi >= self.min_prevalence:
                patterns.append(ColocationPattern((t1, t2), pi, pairs))

        return patterns

    def get_patterns(self) -> list[ColocationPattern]:
        return sorted(self.patterns, key=lambda p: -p.pi)

In [13]:
miner = ColocationMiner(radius=0.003, min_prevalence=0.3)
miner.fit(data)
patterns = miner.get_patterns()

In [14]:
patterns_df = pd.DataFrame([p.to_dict() for p in patterns])

In [15]:
patterns_df.shape

(56, 3)

In [16]:
patterns_df

Unnamed: 0,types,participation_index,num_instances
0,"(atm, fast_food)",0.784466,3214
1,"(cafe, fast_food)",0.757235,3200
2,"(cafe, restaurant)",0.730887,4571
3,"(atm, restaurant)",0.730887,3381
4,"(fast_food, restaurant)",0.729867,5724
5,"(atm, cafe)",0.695146,2013
6,"(pharmacy, restaurant)",0.69419,1199
7,"(atm, pharmacy)",0.681356,802
8,"(atm, bank)",0.673786,1022
9,"(bar, pub)",0.67033,424


In [21]:
from collections import defaultdict
from itertools import product


class ColocationMiner:
    def __init__(self, radius=0.005, min_prevalence=0.3):
        self.radius = radius
        self.min_prevalence = min_prevalence
        self.patterns = []

    def fit(self, df: pd.DataFrame):
        self.df = df.reset_index(drop=True)
        self.df['id'] = self.df.index
        self.instances_by_type = {
            t: self.df[self.df['type'] == t] for t in self.df['type'].unique()
        }

        # Zacznij od par 2-elementowych
        self.neighbor_pairs = self._find_neighbor_pairs()
        self.patterns = self._discover_frequent_patterns()

        # Iteruj dla większych wzorców (3, 4, ...)
        k = 3
        while True:
            print(f"Processing patterns of length: {k}")
            candidates = self._generate_candidates(k)
            print(f"Found {len(candidates)} candidates")
            if not candidates:
                break

            # Sprawdź, które kandydaty są częste
            new_patterns = self._discover_frequent_patterns_for_candidates(candidates)
            print(f"Found {len(new_patterns)} frequent patterns")
            if not new_patterns:
                break

            self.patterns.extend(new_patterns)
            k += 1

    def _find_neighbor_pairs(self):
        neighbor_dict = defaultdict(list)
        for t1, t2 in combinations(self.instances_by_type.keys(), 2):
            pts1 = self.instances_by_type[t1][['x', 'y']].values
            pts2 = self.instances_by_type[t2][['x', 'y']].values
            ids1 = self.instances_by_type[t1]['id'].values
            ids2 = self.instances_by_type[t2]['id'].values

            tree = KDTree(pts2)
            for i, pt in enumerate(pts1):
                idxs = tree.query_ball_point(pt, self.radius)
                for j in idxs:
                    neighbor_dict[(t1, t2)].append((ids1[i], ids2[j]))

        return neighbor_dict

    def _discover_frequent_patterns(self):
        patterns = []
        for (t1, t2), pairs in self.neighbor_pairs.items():
            if not pairs:
                continue

            ids1 = set(a for a, _ in pairs)
            ids2 = set(b for _, b in pairs)

            pi1 = len(ids1) / len(self.instances_by_type[t1])
            pi2 = len(ids2) / len(self.instances_by_type[t2])
            pi = min(pi1, pi2)

            if pi >= self.min_prevalence:
                patterns.append(ColocationPattern((t1, t2), pi, pairs))

        return patterns

    def _generate_candidates(self, k):
        previous_patterns = [p for p in self.patterns if len(p.types) == k - 1]
        seen = set()
        candidates = []

        for p1, p2 in combinations(previous_patterns, 2):
            union = set(p1.types) | set(p2.types)
            if len(union) == k:
                new_pattern = tuple(sorted(union))
                if new_pattern not in seen:
                    seen.add(new_pattern)
                    candidates.append(new_pattern)

        return candidates

    def _discover_frequent_patterns_for_candidates(self, candidates):
        new_patterns = []
        for candidate in candidates:
            # Generujemy instancje
            pairs = self._find_candidate_instances(candidate)
            if pairs:
                pi = self._calculate_participation_index(candidate, pairs)
                if pi >= self.min_prevalence:
                    new_patterns.append(ColocationPattern(candidate, pi, pairs))
        return new_patterns

    def _find_candidate_instances(self, candidate):
        # Zbuduj mapę sąsiadów dla każdej pary typów
        neighbor_map = defaultdict(set)

        for t1, t2 in combinations(candidate, 2):
            pts1 = self.instances_by_type[t1][['x', 'y']].values
            pts2 = self.instances_by_type[t2][['x', 'y']].values
            ids1 = self.instances_by_type[t1]['id'].values
            ids2 = self.instances_by_type[t2]['id'].values

            tree = KDTree(pts2)
            for i, pt in enumerate(pts1):
                idxs = tree.query_ball_point(pt, self.radius)
                for j in idxs:
                    id1, id2 = ids1[i], ids2[j]
                    neighbor_map[(t1, id1)].add((t2, id2))
                    neighbor_map[(t2, id2)].add((t1, id1))  # dla symetrii

        candidate_instances = []
        first_type = candidate[0]
        first_ids = self.instances_by_type[first_type]['id'].values

        for comb in product(*(self.instances_by_type[t]['id'].values for t in candidate)):
            id_map = {t: i for t, i in zip(candidate, comb)}
            valid = True
            for t1, t2 in combinations(candidate, 2):
                if (t2, id_map[t2]) not in neighbor_map.get((t1, id_map[t1]), set()):
                    valid = False
                    break
            if valid:
                candidate_instances.append(tuple(id_map[t] for t in candidate))

        return candidate_instances

    def _calculate_participation_index(self, candidate, instances):
        participants = {t: set() for t in candidate}
        for inst in instances:
            for t, i in zip(candidate, inst):
                participants[t].add(i)

        pis = [
            len(participants[t]) / len(self.instances_by_type[t])
            for t in candidate
            if len(self.instances_by_type[t]) > 0
        ]
        return min(pis) if pis else 0

    def get_patterns(self):
        return sorted(self.patterns, key=lambda p: -p.pi)


In [22]:
miner = ColocationMiner(radius=0.003, min_prevalence=0.3)
miner.fit(data)
patterns = miner.get_patterns()
patterns_df = pd.DataFrame([p.to_dict() for p in patterns])

Processing patterns of length: 3
Found 240 candidates


KeyboardInterrupt: 

In [18]:
# Testowanie z wcześniej załadowanymi danymi POI
miner = ColocationMiner(radius=0.003, min_prevalence=0.3)
miner.fit(data)
patterns = miner.get_patterns()
patterns_df = pd.DataFrame([p.to_dict() for p in patterns])


Processing patterns of length: 3
Found 240 candidates
Found 23 frequent patterns
Processing patterns of length: 4
Found 80 candidates
Found 80 frequent patterns
Processing patterns of length: 5
Found 195 candidates
Found 195 frequent patterns
Processing patterns of length: 6
Found 350 candidates
Found 350 frequent patterns
Processing patterns of length: 7
Found 469 candidates
Found 469 frequent patterns
Processing patterns of length: 8
Found 463 candidates
Found 463 frequent patterns
Processing patterns of length: 9
Found 330 candidates
Found 330 frequent patterns
Processing patterns of length: 10
Found 165 candidates
Found 165 frequent patterns
Processing patterns of length: 11
Found 55 candidates
Found 55 frequent patterns
Processing patterns of length: 12
Found 11 candidates
Found 11 frequent patterns
Processing patterns of length: 13
Found 1 candidates
Found 1 frequent patterns
Processing patterns of length: 14
Found 0 candidates


In [19]:
patterns_df.shape

(2198, 3)

In [20]:
patterns_df

Unnamed: 0,types,participation_index,num_instances
0,"(atm, bank, bar, cafe, clinic, doctors, fast_f...",0.890079,54157
1,"(atm, bank, bar, cafe, clinic, doctors, fast_f...",0.873079,51025
2,"(atm, bank, bar, cafe, clinic, doctors, fast_f...",0.869585,51534
3,"(atm, bank, bar, cafe, doctors, fast_food, ice...",0.868654,52575
4,"(atm, bank, bar, cafe, clinic, doctors, fast_f...",0.864695,52262
...,...,...,...
2193,"(doctors, pub)",0.304348,128
2194,"(doctors, library)",0.304348,94
2195,"(cafe, restaurant, school)",0.301816,5502
2196,"(clinic, doctors)",0.300000,90


In [23]:
# Ponowne załadowanie bibliotek i danych po resecie
import numpy as np
import pandas as pd
from scipy.spatial import KDTree
from itertools import combinations, product
from collections import defaultdict

# Klasa wzorca kolokacji
class ColocationPattern:
    def __init__(self, types, participation_index, instances):
        self.types = tuple(sorted(types))
        self.pi = participation_index
        self.instances = instances

    def to_dict(self):
        return {
            "types": self.types,
            "participation_index": self.pi,
            "num_instances": len(self.instances)
        }

# Zbiór danych testowych
def generate_synthetic_poi():
    np.random.seed(42)
    n_points = 30

    def generate_points(type_label, center_x, center_y):
        x = np.random.normal(center_x, 0.001, n_points)
        y = np.random.normal(center_y, 0.001, n_points)
        return pd.DataFrame({
            "type": type_label,
            "x": x,
            "y": y
        })

    df_A = generate_points("restaurant", 21.01, 52.23)
    df_B = generate_points("bank", 21.011, 52.231)
    df_C = generate_points("atm", 21.012, 52.232)
    return pd.concat([df_A, df_B, df_C], ignore_index=True)

preprocessed_data = generate_synthetic_poi()

# Ulepszona wersja klasy ColocationMiner
class ColocationMiner:
    def __init__(self, radius=0.005, min_prevalence=0.3):
        self.radius = radius
        self.min_prevalence = min_prevalence
        self.patterns = []

    def fit(self, df: pd.DataFrame):
        self.df = df.reset_index(drop=True)
        self.df['id'] = self.df.index
        self.instances_by_type = {
            t: self.df[self.df['type'] == t] for t in self.df['type'].unique()
        }

        self.neighbor_pairs = self._find_neighbor_pairs()
        self.patterns = self._discover_frequent_patterns()

        k = 3
        while True:
            print(f"Processing patterns of length: {k}")
            candidates = self._generate_candidates(k)
            print(f"Found {len(candidates)} candidates")
            if not candidates:
                break

            new_patterns = self._discover_frequent_patterns_for_candidates(candidates)
            print(f"Found {len(new_patterns)} frequent patterns")
            if not new_patterns:
                break

            self.patterns.extend(new_patterns)
            k += 1

    def _find_neighbor_pairs(self) -> dict[tuple[str, str], list[tuple[int, int]]]:
        neighbor_dict = defaultdict(list)
        for t1, t2 in combinations(self.instances_by_type.keys(), 2):
            pts1 = self.instances_by_type[t1][['x', 'y']].values
            pts2 = self.instances_by_type[t2][['x', 'y']].values
            ids1 = self.instances_by_type[t1]['id'].values
            ids2 = self.instances_by_type[t2]['id'].values

            tree = KDTree(pts2)
            for i, pt in enumerate(pts1):
                idxs = tree.query_ball_point(pt, self.radius)
                for j in idxs:
                    neighbor_dict[(t1, t2)].append((ids1[i], ids2[j]))

        return neighbor_dict

    def _discover_frequent_patterns(self):
        """
        Discovers frequent patterns in the data (pairs of types with PI above the threshold).

        :return: List of ColocationPattern objects.
        """
        patterns = []
        for (t1, t2), pairs in self.neighbor_pairs.items():
            if not pairs:
                continue

            ids1 = set(a for a, _ in pairs)
            ids2 = set(b for _, b in pairs)

            pi1 = len(ids1) / len(self.instances_by_type[t1])
            pi2 = len(ids2) / len(self.instances_by_type[t2])
            pi = min(pi1, pi2)

            if pi >= self.min_prevalence:
                patterns.append(ColocationPattern((t1, t2), pi, pairs))

        return patterns

    def _generate_candidates(self, k):
        previous_patterns = [p for p in self.patterns if len(p.types) == k - 1]
        seen = set()
        candidates = []

        combs = combinations(previous_patterns, 2)
        print(f"Number of possible combinations: {len(combs)}")

        for p1, p2 in combs:
            union = set(p1.types) | set(p2.types)
            if len(union) == k:
                new_pattern = tuple(sorted(union))
                if new_pattern not in seen:
                    seen.add(new_pattern)
                    candidates.append(new_pattern)

        print(f"First 5 candidates: {candidates[:5]}")

        return candidates

    def _discover_frequent_patterns_for_candidates(self, candidates):
        new_patterns = []
        for candidate in candidates:
            instances = self._find_candidate_instances(candidate)
            if instances:
                pi = self._calculate_participation_index(candidate, instances)
                if pi >= self.min_prevalence:
                    new_patterns.append(ColocationPattern(candidate, pi, instances))
        return new_patterns

    def _find_candidate_instances(self, candidate):
        neighbor_map = defaultdict(set)

        for t1, t2 in combinations(candidate, 2):
            pts1 = self.instances_by_type[t1][['x', 'y']].values
            pts2 = self.instances_by_type[t2][['x', 'y']].values
            ids1 = self.instances_by_type[t1]['id'].values
            ids2 = self.instances_by_type[t2]['id'].values

            tree = KDTree(pts2)
            for i, pt in enumerate(pts1):
                idxs = tree.query_ball_point(pt, self.radius)
                for j in idxs:
                    id1, id2 = ids1[i], ids2[j]
                    neighbor_map[(t1, id1)].add((t2, id2))
                    neighbor_map[(t2, id2)].add((t1, id1))

        candidate_instances = []
        for comb in product(*(self.instances_by_type[t]['id'].values for t in candidate)):
            id_map = {t: i for t, i in zip(candidate, comb)}
            valid = True
            for t1, t2 in combinations(candidate, 2):
                if (t2, id_map[t2]) not in neighbor_map.get((t1, id_map[t1]), set()):
                    valid = False
                    break
            if valid:
                candidate_instances.append(tuple(id_map[t] for t in candidate))

        return candidate_instances

    def _calculate_participation_index(self, candidate, instances):
        participants = {t: set() for t in candidate}
        for inst in instances:
            for t, i in zip(candidate, inst):
                participants[t].add(i)

        pis = [
            len(participants[t]) / len(self.instances_by_type[t])
            for t in candidate
            if len(self.instances_by_type[t]) > 0
        ]
        return min(pis) if pis else 0

    def get_patterns(self):
        return sorted(self.patterns, key=lambda p: -p.pi)



In [25]:
# Uruchomienie algorytmu
miner = ColocationMiner(radius=0.005, min_prevalence=0.3)
miner.fit(data)
patterns = miner.get_patterns()
patterns_df = pd.DataFrame([p.to_dict() for p in patterns])


Processing patterns of length: 3
Found 623 candidates


KeyboardInterrupt: 