# Data Exploration

###  OpenStreetMap

In [None]:
import overpy

api = overpy.Overpass()

In [None]:
query = """
[out:json];
(
  node["amenity"="restaurant"](52.229,20.944,52.410,21.222);
  node["amenity"="bank"](52.229,20.944,52.410,21.222);
);
out body;
"""

result = api.query(query)

In [6]:
len(result.nodes)

1167

In [23]:
for node in result.nodes[:5]:
    print(f"{node.tags.get('amenity', 'unknown')} at ({node.lat}, {node.lon})")

restaurant at (52.2399471, 21.0619767)
restaurant at (52.2312362, 21.0121087)
restaurant at (52.2340883, 21.0233987)
restaurant at (52.2484456, 21.0142668)
restaurant at (52.2371565, 21.1191810)


### GBIF

In [8]:
from pygbif import occurrences

In [15]:
results = occurrences.search(taxon_key=6, geometry="((52.229,20.944,52.410,21.222))")

HTTPError: 400 Client Error: Bad Request for url: https://api.gbif.org/v1/occurrence/search?geometry=%28%2852.229%2C20.944%2C52.410%2C21.222%29%29&limit=300&offset=0&taxon.key=6

In [None]:
results

{'offset': 0,
 'limit': 300,
 'endOfRecords': True,
 'count': 0,
 'results': [],
 'facets': []}

In [None]:
# Wydrukuj kilka wyników
for result in results['results']:
    print(f"Species: {result['species']}, Location: {result['decimalLatitude']}, {result['decimalLongitude']}")

## Dataset Class

In [None]:
from abc import ABC, abstractmethod
import pandas as pd
import overpy

class ColocationDataset(ABC):
    def __init__(self):
        """
        Base class for colocation datasets.
        """
        self._data = None

    @abstractmethod
    def load_data(self) -> pd.DataFrame:
        """Loads the data from the source."""
        pass

    @property
    def data(self) -> pd.DataFrame:
        """Returns the loaded data."""
        return self._data


class OSMColocationDataset(ColocationDataset):
    def __init__(self, area: tuple, poi_types: list):
        """
        Colocation dataset for OpenStreetMap (OSM) data.

        :param area: Area in the format (min_lat, min_lon, max_lat, max_lon)
        :param poi_types: List of POI types to search for (e.g., ['restaurant', 'bank'])
        """
        super().__init__()
        self._area = area
        self._poi_types = poi_types

    def load_data(self) -> pd.DataFrame:
        """Loads data from OSM using the Overpass API."""
        api = overpy.Overpass()
        
        query = f"""
        [out:json];
        (
            {' '.join([f'node["amenity"="{poi}"]({self._area[0]},{self._area[1]},{self._area[2]},{self._area[3]});' for poi in self._poi_types])}
        );
        out body;
        """
        
        result = api.query(query)
        
        data = []
        for node in result.nodes:
            data.append({
                "id": node.id,
                "type": node.tags.get('amenity', 'unknown'),
                "x": node.lat,
                "y": node.lon
            })
        
        self._data = pd.DataFrame(data)
        return self.data
    
    @property
    def data(self) -> pd.DataFrame:
        """Returns the loaded data."""
        if self._data is None:
            self.load_data()
        return self._data


In [28]:
area = (52.229, 20.944, 52.410, 21.222)  # Warsaw area
poi_types = ["bar", "cafe", "fast_food", "food_court", "ice_cream", "pub", 'restaurant', "college", "library", "research_institute", "school", "university", "parking", "atm", 'bank', "clinic", "doctors", "pharmacy", "veterinary", "casino", "cinema", "events_venue", "nightclub", "theatre", "	police"]

dataset = OSMColocationDataset(area, poi_types)
data = dataset.load_data()

In [36]:
data.shape

(4294, 3)

In [30]:
data.head()

Unnamed: 0,type,x,y
0,restaurant,52.2399471,21.0619767
1,fast_food,52.2454787,21.085803
2,fast_food,52.2454434,21.0860156
3,fast_food,52.2451877,21.0825923
4,restaurant,52.2312362,21.0121087


In [31]:
class ColocationPattern:
    def __init__(self, types: list, participation_index: float, instances: list):
        self.types = tuple(sorted(types))
        self.pi = participation_index
        self.instances = instances

    def __str__(self) -> str:
        return f"Pattern {self.types} (PI={self.pi:.2f}, Instances={len(self.instances)})"

    def to_dict(self) -> dict:
        return {
            "types": self.types,
            "participation_index": self.pi,
            "num_instances": len(self.instances)
        }

In [None]:
from scipy.spatial import KDTree
from collections import defaultdict
from itertools import combinations
import pandas as pd


class ColocationMiner:
    def __init__(self, radius=0.005, min_prevalence=0.3):
        self.radius = radius
        self.min_prevalence = min_prevalence
        self.patterns = []

    def fit(self, df: pd.DataFrame) -> None:
        self.df = df.reset_index(drop=True)
        self.df['id'] = self.df.index
        self.instances_by_type = {
            t: self.df[self.df['type'] == t] for t in self.df['type'].unique()
        }

        self.neighbor_pairs = self._find_neighbor_pairs()
        self.patterns = self._discover_frequent_patterns()

    def _find_neighbor_pairs(self) -> dict:
        neighbor_dict = defaultdict(list)
        for t1, t2 in combinations(self.instances_by_type.keys(), 2):
            pts1 = self.instances_by_type[t1][['x', 'y']].values
            pts2 = self.instances_by_type[t2][['x', 'y']].values
            ids1 = self.instances_by_type[t1]['id'].values
            ids2 = self.instances_by_type[t2]['id'].values

            tree = KDTree(pts2)
            for i, pt in enumerate(pts1):
                idxs = tree.query_ball_point(pt, self.radius)
                for j in idxs:
                    neighbor_dict[(t1, t2)].append((ids1[i], ids2[j]))

        return neighbor_dict

    def _discover_frequent_patterns(self) -> list[ColocationPattern]:
        patterns = []

        for (t1, t2), pairs in self.neighbor_pairs.items():
            if not pairs:
                continue

            ids1 = set(a for a, _ in pairs)
            ids2 = set(b for _, b in pairs)

            pi1 = len(ids1) / len(self.instances_by_type[t1])
            pi2 = len(ids2) / len(self.instances_by_type[t2])
            pi = min(pi1, pi2)

            if pi >= self.min_prevalence:
                patterns.append(ColocationPattern((t1, t2), pi, pairs))

        return patterns

    def get_patterns(self) -> list[ColocationPattern]:
        return sorted(self.patterns, key=lambda p: -p.pi)

In [37]:
miner = ColocationMiner(radius=0.002, min_prevalence=0.3)
miner.fit(data)
patterns = miner.get_patterns()

In [38]:
patterns_df = pd.DataFrame([p.to_dict() for p in patterns])

In [39]:
patterns_df.shape

(28, 3)

In [40]:
patterns_df

Unnamed: 0,types,participation_index,num_instances
0,"(atm, fast_food)",0.72233,1750
1,"(cafe, fast_food)",0.672026,1733
2,"(cafe, restaurant)",0.643221,2479
3,"(fast_food, restaurant)",0.622834,3095
4,"(bar, pub)",0.620879,268
5,"(atm, cafe)",0.613592,1071
6,"(atm, pharmacy)",0.610169,460
7,"(atm, bank)",0.578641,609
8,"(atm, restaurant)",0.573904,1721
9,"(fast_food, pharmacy)",0.542373,556
