# Data Exploration

###  OpenStreetMap

In [4]:
!pip install -q overpy


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import overpy

api = overpy.Overpass()

In [6]:
query = """
[out:json];
(
  node["amenity"="restaurant"](52.229,20.944,52.410,21.222);
  node["amenity"="bank"](52.229,20.944,52.410,21.222);
);
out body;
"""

result = api.query(query)

In [7]:
len(result.nodes)

1167

In [8]:
for node in result.nodes[:5]:
    print(f"{node.tags.get('amenity', 'unknown')} at ({node.lat}, {node.lon})")

restaurant at (52.2399471, 21.0619767)
restaurant at (52.2312362, 21.0121087)
restaurant at (52.2340883, 21.0233987)
restaurant at (52.2484456, 21.0142668)
restaurant at (52.2371565, 21.1191810)


### GBIF

In [9]:
from pygbif import occurrences

In [1]:
from abc import ABC, abstractmethod
from typing import List, Tuple
import pandas as pd
import overpy


class ColocationDataset(ABC):
    def __init__(self):
        """
        Base class for colocation datasets.
        """
        self._data = None

    @abstractmethod
    def load_data(self) -> pd.DataFrame:
        """
        Loads the data from the source.
        
        Returns:
            DataFrame with the loaded data.
        """
        pass

    @property
    def data(self) -> pd.DataFrame:
        """
        Returns the loaded data.
        
        Returns:
            DataFrame with the loaded data.
        """
        if self._data is None:
            self.load_data()
        return self._data

In [2]:
import pandas as pd
import requests
from typing import List, Tuple, Dict, Any, Optional
import time
import random
from datetime import datetime, timedelta


class GBIFColocationDataset(ColocationDataset):
    def __init__(self, 
        area: Tuple[float], 
        species_names: List[str], 
        min_year: int = 2010,
        limit_per_species: int | None = None,
    ):
        """
        Colocation dataset for Global Biodiversity Information Facility (GBIF) data.

        Args:
            area (tuple): Bounding box in the format (min_lat, min_lon, max_lat, max_lon).
            species_names (list): List of species scientific names to load from GBIF.
            limit_per_species (int): Maximum number of records per species.
            min_year (int): Minimum year for data if recent_years_only is True.
        """
        super().__init__()
        self._area = area
        self._species_names = species_names
        self._min_year = min_year
        self._limit_per_species = limit_per_species
        
    def _get_species_key(self, species_name: str) -> int:
        """
        Get the GBIF taxon key for a species name.
        
        Args:
            species_name (str): Scientific name of the species.
        
        Returns:
            GBIF taxon key for the species, or None if not found.
        """
        url = "https://api.gbif.org/v1/species/match"
        params = {"name": species_name, "strict": "false"}
        
        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
            
            data = response.json()
            if data.get('matchType') not in ['NONE', None]:
                return data.get('usageKey')
            print(f"Warning: No match found for {species_name}. Response: {data}")
        except requests.exceptions.RequestException as e:
            print(f"Error querying species {species_name}: {e}")
        
        return None
    
    def _get_all_occurrences(self, species_key: int, species_name: str) -> List[Dict[str, Any]]:
        """
        Get all occurrence records for a species with pagination and filtering.
        
        Args:
            species_key (int): GBIF taxon key for the species.
            species_name (str): Scientific name of the species.

        Returns:
            List of occurrence records with keys: id, type, x, y, year, month, day.
        """
        min_lat, min_lon, max_lat, max_lon = self._area
        
        base_params = {
            "taxonKey": species_key,
            "hasCoordinate": "true",
            "decimalLatitude": f"{min_lat},{max_lat}",
            "decimalLongitude": f"{min_lon},{max_lon}",
            "limit": 300,  # Max per page
            "year": f"{self._min_year},{datetime.now().year}",
        }

        url = "https://api.gbif.org/v1/occurrence/search"
        all_occurrences = []
        offset = 0
        total_count = None
        
        try:
            params = {**base_params, "offset": 0}
            response = requests.get(url, params=params)
            response.raise_for_status()
            data = response.json()
            total_count = data.get('count', 0)
            
            print(f"Found {total_count} records for {species_name}")
            
            if self._limit_per_species is not None:
                records_to_fetch = min(total_count, self._limit_per_species)
            else:
                records_to_fetch = total_count
            
            while len(all_occurrences) < records_to_fetch and offset < total_count:
                params = {**base_params, "offset": offset}
                response = requests.get(url, params=params)
                response.raise_for_status()
                data = response.json()
                
                for result in data.get('results', []):
                    if result.get('decimalLatitude') is not None and result.get('decimalLongitude') is not None:
                        all_occurrences.append({
                            "id": str(result.get('key')),
                            "type": species_name,
                            "x": float(result.get('decimalLatitude')),
                            "y": float(result.get('decimalLongitude')),
                            "year": result.get('year'),
                            "month": result.get('month'),
                            "day": result.get('day')
                        })
                
                offset += 300
                if len(all_occurrences) >= records_to_fetch:
                    break
                
                time.sleep(0.5)
                
        except Exception as e:
            print(f"Error retrieving occurrences for {species_name}: {e}")
        
        return all_occurrences[:records_to_fetch]

    def load_data(self) -> pd.DataFrame:
        """
        Load species occurrence data from GBIF API.
        
        Returns:
            DataFrame with the loaded data.
        """
        data = []
        
        for species_name in self._species_names:
            print(f"\nProcessing species: {species_name}")
            
            species_key = self._get_species_key(species_name)
            
            if species_key:
                print(f"Found species key: {species_key}")
                
                species_data = self._get_all_occurrences(species_key, species_name)
                print(f"Retrieved {len(species_data)} occurrences")
                
                data.extend(species_data)
                
                time.sleep(1)
            else:
                print(f"Warning: Could not find species '{species_name}' in GBIF database")
        
        df = pd.DataFrame(data)
        
        if not df.empty:
            self._data = df[["id", "type", "x", "y"]]
        else:
            print("No data found for any species in the specified area")
            self._data = pd.DataFrame(columns=["id", "type", "x", "y"])
            
        return self._data

In [11]:
# Define Poland's bounding box
poland_bbox = (49.0, 14.1, 55.0, 24.2)  # (min_lat, min_lon, max_lat, max_lon)

# Select ecologically interesting species for Poland
# These include interacting species pairs and common species
poland_species = [
    # Forest ecosystem
    "Quercus robur",         # Oak tree - provides habitat
    "Fomes fomentarius",     # Tinder fungus - grows on trees
    "Dendrocopos major",     # Great spotted woodpecker - nests in trees
    "Sciurus vulgaris",      # Red squirrel - lives in forests
    
    # Meadow ecosystem
    "Dactylis glomerata",    # Cock's-foot grass - common in meadows
    "Apis mellifera",        # Honey bee - pollinates flowers
    "Papilio machaon",       # Swallowtail butterfly
    
    # Wetland ecosystem
    "Phragmites australis",  # Common reed - wetland plant
    "Ardea cinerea",         # Grey heron - wetland bird
    "Esox lucius"            # Northern pike - predatory fish
]

# Create dataset with grid sampling to ensure good spatial distribution
gbif_dataset = GBIFColocationDataset(
    area=poland_bbox,
    species_names=poland_species,
    limit_per_species=10000,  # Adjust based on your needs
    min_year=2020,  # Only data from 2015 onwards
)

print("Starting GBIF data collection for Poland...")
df = gbif_dataset.load_data()

print(f"\nLoaded {len(df)} total occurrence records")
print("\nSample of data:")
print(df.head())
print("\nOccurrences per species:")
print(df['type'].value_counts())

# Optional: Save to CSV for later use
df.to_csv("poland_species_colocation_data.csv", index=False)

Starting GBIF data collection for Poland...

Processing species: Quercus robur
Found species key: 2878688
Found 2057 records for Quercus robur
Retrieved 2057 occurrences

Processing species: Fomes fomentarius
Found species key: 8068867
Found 1140 records for Fomes fomentarius
Retrieved 1140 occurrences

Processing species: Dendrocopos major
Found species key: 2477968
Found 70000 records for Dendrocopos major
Retrieved 10000 occurrences

Processing species: Sciurus vulgaris
Found species key: 8211070
Found 2927 records for Sciurus vulgaris
Retrieved 2927 occurrences

Processing species: Dactylis glomerata
Found species key: 2705308
Found 3804 records for Dactylis glomerata
Retrieved 3804 occurrences

Processing species: Apis mellifera
Found species key: 1341976
Found 20534 records for Apis mellifera
Retrieved 10000 occurrences

Processing species: Papilio machaon
Found species key: 8225376
Found 1015 records for Papilio machaon
Retrieved 1015 occurrences

Processing species: Phragmites 

In [7]:
from src.colocation_miner import ColocationMiner

In [12]:
miner = ColocationMiner(radius=0.05, min_prevalence=0.6)
miner.fit(df)

patterns = miner.get_patterns()
print(f"Found {len(patterns)} colocation patterns")
patterns_df = pd.DataFrame([p.to_dict() for p in patterns])

if len(patterns) > 0:
    print("\nTop patterns:")
    pd.set_option('display.max_colwidth', None)
    print(patterns_df.head(10))


Data preparation completed in 0.06 seconds
Spatial indices built in 0.02 seconds
Neighbor precomputation completed in 3.08 seconds
Found 5 patterns of size 2 in 1.91 seconds
Processing patterns of length: 3
Found 1 candidates
After pruning: 1 candidates remain
Found 0 frequent patterns in 1.18 seconds
Found 5 colocation patterns

Top patterns:
                                        types  participation_index  \
0          (Ardea cinerea, Dendrocopos major)             0.870400   
1         (Dactylis glomerata, Quercus robur)             0.681575   
2       (Phragmites australis, Quercus robur)             0.666033   
3  (Dactylis glomerata, Phragmites australis)             0.665652   
4         (Apis mellifera, Fomes fomentarius)             0.610526   

   num_instances  
0         356284  
1          27727  
2          12923  
3          23568  
4          14894  


## Synthetic Data

In [12]:
import math
import numpy as np


class SyntheticColocationDataset(ColocationDataset):
    def __init__(self, 
                 area: Tuple[float], 
                 feature_types: List[str],
                 colocation_patterns: List[List[str]],
                 instances_per_feature: Dict[str, int] = None,
                 prevalence_threshold: float = 0.8,
                 proximity_distance: float = 0.01,
                 noise_ratio: float = 0.2,
                 random_seed: int = 42):
        """
        Synthetic dataset generator for colocation pattern mining.
        
        Args:
            area (tuple): Bounding box in the format (min_lat, min_lon, max_lat, max_lon).
            feature_types (list): List of feature types to generate.
            colocation_patterns (list): List of feature type lists that should co-locate.
            instances_per_feature (dict): Number of instances per feature type. 
                                         If None, generates 100 instances per feature.
            prevalence_threshold (float): Prevalence threshold for patterns (0-1).
            proximity_distance (float): Maximum distance for co-locating instances.
            noise_ratio (float): Ratio of noise instances (0-1).
            random_seed (int): Random seed for reproducibility.
        """
        super().__init__()
        self._area = area
        self._feature_types = feature_types
        self._colocation_patterns = colocation_patterns
        self._instances_per_feature = instances_per_feature or {ft: 100 for ft in feature_types}
        self._prevalence_threshold = prevalence_threshold
        self._proximity_distance = proximity_distance
        self._noise_ratio = noise_ratio
        self._random_seed = random_seed
        
        # Validate input parameters
        self._validate_inputs()
        
        # Set random seed
        np.random.seed(self._random_seed)
        random.seed(self._random_seed)
        
    def _validate_inputs(self):
        """Validate input parameters."""
        # Check if all pattern features are in feature_types
        for pattern in self._colocation_patterns:
            for feature in pattern:
                if feature not in self._feature_types:
                    raise ValueError(f"Feature '{feature}' in colocation pattern is not in feature_types.")
        
        # Check if prevalence_threshold is between 0 and 1
        if not 0 <= self._prevalence_threshold <= 1:
            raise ValueError("prevalence_threshold must be between 0 and 1.")
        
        # Check if noise_ratio is between 0 and 1
        if not 0 <= self._noise_ratio <= 1:
            raise ValueError("noise_ratio must be between 0 and 1.")
    
    def load_data(self) -> pd.DataFrame:
        """
        Generate synthetic colocation data.
        
        Returns:
            DataFrame with the generated data in the format:
            - id: Instance ID (feature_type + index)
            - type: Feature type
            - x: Latitude
            - y: Longitude
        """
        data = []
        instance_counters = {ft: 0 for ft in self._feature_types}
        
        # First, generate pattern instances (co-located)
        for pattern_idx, pattern in enumerate(self._colocation_patterns):
            # Calculate instances needed for each feature in the pattern
            pattern_instances = {}
            for feature in pattern:
                # Calculate instances needed to meet prevalence threshold
                instances_needed = math.ceil(self._instances_per_feature[feature] * self._prevalence_threshold)
                pattern_instances[feature] = instances_needed
            
            # Generate pattern instances
            for instance_idx in range(max(pattern_instances.values())):
                # Generate a center point for this cluster/pattern instance
                center_x = np.random.uniform(self._area[0], self._area[2])
                center_y = np.random.uniform(self._area[1], self._area[3])
                
                # Generate an instance for each feature in the pattern
                for feature in pattern:
                    if instance_idx < pattern_instances[feature]:
                        # Add some jitter within proximity distance
                        jitter_distance = self._proximity_distance * 0.8  # 80% of proximity distance
                        dx = np.random.uniform(-jitter_distance, jitter_distance)
                        dy = np.random.uniform(-jitter_distance, jitter_distance)
                        
                        instance_id = f"{feature}_{instance_counters[feature]}"
                        instance_data = {
                            "id": instance_id,
                            "type": feature,
                            "x": center_x + dx,
                            "y": center_y + dy,
                            "pattern_id": f"pattern_{pattern_idx}_{instance_idx}"  # For debugging/analysis
                        }
                        data.append(instance_data)
                        instance_counters[feature] += 1
        
        # Generate non-pattern (random/noise) instances
        for feature in self._feature_types:
            remaining_instances = self._instances_per_feature[feature] - instance_counters[feature]
            for _ in range(remaining_instances):
                random_x = np.random.uniform(self._area[0], self._area[2])
                random_y = np.random.uniform(self._area[1], self._area[3])
                
                instance_id = f"{feature}_{instance_counters[feature]}"
                instance_data = {
                    "id": instance_id,
                    "type": feature,
                    "x": random_x,
                    "y": random_y,
                    "pattern_id": "noise"  # For debugging/analysis
                }
                data.append(instance_data)
                instance_counters[feature] += 1
        
        # Create DataFrame
        df = pd.DataFrame(data)
        
        # Remove pattern_id column from final output (keep only id, type, x, y)
        self._data = df[["id", "type", "x", "y"]]
        return self._data
    
    def get_patterns_info(self) -> Dict:
        """
        Returns information about the generated patterns.
        
        Returns:
            Dict containing pattern information.
        """
        if self._data is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        
        df_with_patterns = pd.DataFrame(self._data)
        if "pattern_id" not in df_with_patterns.columns:
            df_with_patterns["pattern_id"] = "unknown"
        
        # Count instances per pattern
        pattern_counts = df_with_patterns[df_with_patterns["pattern_id"] != "noise"].groupby(["pattern_id", "type"]).size()
        noise_counts = df_with_patterns[df_with_patterns["pattern_id"] == "noise"].groupby("type").size()
        
        # Total counts per feature
        total_counts = df_with_patterns.groupby("type").size()
        
        return {
            "patterns": self._colocation_patterns,
            "pattern_counts": pattern_counts,
            "noise_counts": noise_counts,
            "total_counts": total_counts,
            "prevalence_threshold": self._prevalence_threshold,
            "proximity_distance": self._proximity_distance,
            "noise_ratio": self._noise_ratio
        }

In [13]:
def generate_example_dataset():
    """
    Generate an example synthetic dataset similar to those in the Shekhar papers.
    """
    # Define area (bounding box)
    area = (0.0, 0.0, 1.0, 1.0)  # (min_lat, min_lon, max_lat, max_lon)
    
    # Define feature types (using letters as in the paper examples)
    feature_types = ["A", "B", "C", "D", "E", "F"]
    
    # Define colocation patterns (sets of features that should be found together)
    colocation_patterns = [
        ["A", "B"],           # Pattern 1: A and B colocate
        ["C", "D", "E"],      # Pattern 2: C, D, and E colocate
        ["B", "F"]            # Pattern 3: B and F colocate
    ]
    
    # Define number of instances per feature
    instances_per_feature = {
        "A": 150,
        "B": 200,
        "C": 120,
        "D": 100,
        "E": 180,
        "F": 90
    }
    
    # Create dataset
    synthetic_dataset = SyntheticColocationDataset(
        area=area,
        feature_types=feature_types,
        colocation_patterns=colocation_patterns,
        instances_per_feature=instances_per_feature,
        prevalence_threshold=0.8,  # 80% of instances participate in patterns
        proximity_distance=0.01,   # Max distance for co-located instances
        noise_ratio=0.2,           # 20% noise ratio
        random_seed=42
    )
    
    # Load (generate) data
    df = synthetic_dataset.load_data()
    
    # Get pattern information
    pattern_info = synthetic_dataset.get_patterns_info()
    
    return df, pattern_info


In [None]:
df, pattern_info = generate_example_dataset()

print(f"Generated {len(df)} instances across {len(df['type'].unique())} feature types")
print("\nSample of the dataset:")
print(df.head())

print("\nFeature type distribution:")
print(df['type'].value_counts())

print("\nDefined colocation patterns:")
for i, pattern in enumerate(pattern_info['patterns']):
    print(f"Pattern {i+1}: {' + '.join(pattern)}")

Generated 960 instances across 6 feature types

Sample of the dataset:
    id type         x         y
0  A_0    A  0.378252  0.952293
1  B_0    B  0.369036  0.945210
2  A_1    A  0.059701  0.869505
3  B_1    B  0.050413  0.873695
4  A_2    A  0.827352  0.207274

Feature type distribution:
type
B    320
E    180
A    150
C    120
D    100
F     90
Name: count, dtype: int64

Defined colocation patterns:
Pattern 1: A + B
Pattern 2: C + D + E
Pattern 3: B + F


In [18]:
from src.colocation_miner import ColocationMiner

miner = ColocationMiner(radius=0.05, min_prevalence=0.6)
miner.fit(df)
patterns = miner.get_patterns()
print(f"\nFound {len(patterns)} colocation patterns")
patterns_df = pd.DataFrame([p.to_dict() for p in patterns])
if len(patterns) > 0:
    print("\nTop patterns:")
    pd.set_option('display.max_colwidth', None)
    print(patterns_df.head(10))

Data preparation completed in 0.00 seconds
Spatial indices built in 0.00 seconds
Neighbor precomputation completed in 0.02 seconds
Found 7 patterns of size 2 in 0.00 seconds
Processing patterns of length: 3
Found 2 candidates
After pruning: 2 candidates remain
Found 1 frequent patterns in 0.00 seconds
Processing patterns of length: 4
Found 0 candidates

Found 8 colocation patterns

Top patterns:
       types  participation_index  num_instances
0     (A, B)             0.812500            484
1     (C, D)             0.808333            176
2     (C, E)             0.761111            261
3     (B, E)             0.715625            399
4     (D, E)             0.700000            221
5  (C, D, E)             0.644444            404
6     (A, E)             0.633333            185
7     (B, F)             0.609375            284


In [30]:
import pandas as pd
import numpy as np
from typing import List, Tuple, Dict, Set
import random
import math

class ColocationDataset:
    """Base class for colocation datasets."""
    
    def __init__(self):
        self._data = None
        
    @property
    def data(self) -> pd.DataFrame:
        """Returns the loaded data."""
        if self._data is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        return self._data


class SyntheticColocationDataset(ColocationDataset):
    def __init__(
        self, 
        config: str = 'C1',
        custom_params: Dict = None,
        random_seed: int = 42
    ):
        """
        Synthetic dataset generator for colocation pattern mining, following the methodology
        from 'Discovering Colocation Patterns from Spatial Data Sets: A General Approach'
        by Huang, Shekhar, and Xiong.
        
        Args:
            config (str): Configuration preset - 'C1' or 'C2' as in the original paper.
            custom_params (dict): Custom parameters to override defaults.
            random_seed (int): Random seed for reproducibility.
        """
        super().__init__()
        
        if config == 'C1':
            self.params = {
                'N_co_loc': 5,         # Number of core co-locations
                'lambda1': 5,          # Parameter of Poisson distribution for size of core co-locations
                'lambda2': 50,         # Parameter of Poisson distribution for table instance size
                'D1': 1000000,         # Width of spatial framework
                'D2': 1000000,         # Height of spatial framework
                'd': 10,               # Size of square to define a co-location
                'r_noise_f': 0.5,      # Ratio of noise features
                'r_noise_n': 50000,    # Number of noise instances
                'm_overlap': 1,        # Number of co-locations generated per core co-location
                'm_clump': 1           # Number of instances per feature in a proximity neighborhood
            }
        elif config == 'C2':
            self.params = {
                'N_co_loc': 4,         # Number of core co-locations
                'lambda1': 5,          # Parameter of Poisson distribution for size of core co-locations
                'lambda2': 50,         # Parameter of Poisson distribution for table instance size
                'D1': 250,             # Width of spatial framework
                'D2': 1000,            # Height of spatial framework
                'd': 10,               # Size of square to define a co-location
                'r_noise_f': 0.5,      # Ratio of noise features
                'r_noise_n': 1000,     # Number of noise instances
                'm_overlap': 1,        # Number of co-locations generated per core co-location
                'm_clump': 1           # Number of instances per feature in a proximity neighborhood
            }
        else:
            raise ValueError(f"Unknown configuration: {config}. Use 'C1' or 'C2'.")
        
        # Override with custom parameters if provided
        if custom_params:
            for key, value in custom_params.items():
                if key in self.params:
                    self.params[key] = value
                else:
                    raise ValueError(f"Unknown parameter: {key}")
        
        self._random_seed = random_seed
        self._feature_types = []
        self._colocation_patterns = []
        self._data = None
        
        np.random.seed(self._random_seed)
        random.seed(self._random_seed)
    
    def load_data(self) -> pd.DataFrame:
        """
        Generate synthetic colocation data based on the specified parameters.
        
        Returns:
            DataFrame with the generated data in the format:
            - id: Instance ID
            - type: Feature type
            - x: X coordinate
            - y: Y coordinate
        """
        # Generate core co-location patterns
        self._generate_colocation_patterns()
        
        # Generate instances for each co-location pattern
        data = self._generate_pattern_instances()
        
        # Step 3: Add noise instances
        data.extend(self._generate_noise_instances())
        
        # Create DataFrame
        df = pd.DataFrame(data)
        
        # Normalize coordinates to [0, 1] range for easier visualization if needed
        if max(self.params['D1'], self.params['D2']) > 1000:
            df['x'] = df['x'] / self.params['D1']
            df['y'] = df['y'] / self.params['D2']
        
        # Save data
        self._data = df
        return df
    
    def _generate_colocation_patterns(self):
        """Generate the core and extended co-location patterns."""
        # Generate feature types (use letters A, B, C, ... as in the paper examples)
        alphabet = [chr(65+i) for i in range(26)]  # A to Z
        
        # Step 1: Generate N_co_loc core co-location patterns
        core_patterns = []
        core_features = set()
        
        for i in range(self.params['N_co_loc']):
            # Size of this core pattern (from Poisson distribution with mean lambda1)
            pattern_size = max(2, np.random.poisson(self.params['lambda1']))  # At least 2 features
            
            # Select features for this pattern
            available_features = [f for f in alphabet if f not in core_features]
            if len(available_features) < pattern_size:
                # Add more letters if needed (AA, AB, etc.)
                for a in alphabet:
                    for b in alphabet:
                        available_features.append(a+b)
                        if len(available_features) >= pattern_size + 10:  # Buffer
                            break
                    if len(available_features) >= pattern_size + 10:
                        break
            
            pattern_features = random.sample(available_features, pattern_size)
            core_features.update(pattern_features)
            core_patterns.append(pattern_features)
        
        # Step 2: Generate extended patterns by appending features
        extended_patterns = []
        for core_pattern in core_patterns:
            for _ in range(self.params['m_overlap']):
                # Add one more feature to the core pattern
                available_features = [f for f in alphabet if f not in core_pattern]
                if not available_features:
                    # Add more letters if needed
                    for a in alphabet:
                        for b in alphabet:
                            new_feature = a+b
                            if new_feature not in core_pattern:
                                available_features.append(new_feature)
                                break
                        if available_features:
                            break
                
                if available_features:
                    new_feature = random.choice(available_features)
                    extended_pattern = core_pattern.copy()
                    extended_pattern.append(new_feature)
                    extended_patterns.append(extended_pattern)
        
        # Combine core and extended patterns
        self._colocation_patterns = core_patterns + extended_patterns
        
        # Create list of all feature types
        all_features = set()
        for pattern in self._colocation_patterns:
            all_features.update(pattern)
        self._feature_types = list(all_features)
        
        # Generate noise features
        num_noise_features = int(len(self._feature_types) * self.params['r_noise_f'])
        noise_features = []
        
        # Generate unique noise feature names
        feature_idx = 0
        while len(noise_features) < num_noise_features:
            if feature_idx < 26:
                # Single lowercase letter
                feature = chr(97 + feature_idx)  # a to z
            else:
                # Two lowercase letters
                feature = chr(97 + (feature_idx // 26)) + chr(97 + (feature_idx % 26))
            
            if feature not in self._feature_types and feature not in noise_features:
                noise_features.append(feature)
            
            feature_idx += 1
        
        # Add noise features to the feature list
        self._feature_types.extend(noise_features)
    
    def _generate_pattern_instances(self):
        """Generate instances for each co-location pattern."""
        data = []
        instance_counters = {ft: 0 for ft in self._feature_types}
        
        for pattern_idx, pattern in enumerate(self._colocation_patterns):
            # Determine number of table instances for this pattern (from Poisson with mean lambda2)
            num_instances = max(1, np.random.poisson(self.params['lambda2']))
            
            for instance_idx in range(num_instances):
                # Generate center location for this proximity neighborhood
                center_x = np.random.uniform(0, self.params['D1'])
                center_y = np.random.uniform(0, self.params['D2'])
                
                # Generate m_clump instances for each feature in the pattern
                for feature in pattern:
                    for _ in range(self.params['m_clump']):
                        # Place instance within square of size d
                        x = center_x + np.random.uniform(-self.params['d']/2, self.params['d']/2)
                        y = center_y + np.random.uniform(-self.params['d']/2, self.params['d']/2)
                        
                        # Ensure within bounds
                        x = max(0, min(self.params['D1'], x))
                        y = max(0, min(self.params['D2'], y))
                        
                        # Create instance
                        instance_id = f"{feature}_{instance_counters[feature]}"
                        instance_data = {
                            "id": instance_id,
                            "type": feature,
                            "x": x,
                            "y": y,
                            "pattern_id": f"pattern_{pattern_idx}_{instance_idx}"  # For debugging
                        }
                        
                        data.append(instance_data)
                        instance_counters[feature] += 1
        
        return data
    
    def _generate_noise_instances(self):
        """Generate noise instances."""
        data = []
        instance_counters = {ft: 0 for ft in self._feature_types}
        
        # Update counters based on pattern instances
        if self._data is not None:
            for _, row in self._data.iterrows():
                instance_counters[row['type']] = max(instance_counters[row['type']], 
                                                   int(row['id'].split('_')[1]) + 1)
        
        # Get noise features (features not in any pattern)
        pattern_features = set()
        for pattern in self._colocation_patterns:
            pattern_features.update(pattern)
        
        noise_features = [f for f in self._feature_types if f not in pattern_features]
        
        # If no dedicated noise features, use all features
        if not noise_features:
            noise_features = self._feature_types
        
        # Generate noise instances
        for _ in range(self.params['r_noise_n']):
            # Randomly select a feature
            feature = random.choice(noise_features)
            
            # Random location
            x = np.random.uniform(0, self.params['D1'])
            y = np.random.uniform(0, self.params['D2'])
            
            # Create instance
            instance_id = f"{feature}_{instance_counters[feature]}"
            instance_data = {
                "id": instance_id,
                "type": feature,
                "x": x,
                "y": y,
                "pattern_id": "noise"  # For debugging
            }
            
            data.append(instance_data)
            instance_counters[feature] += 1
        
        return data
    
    def get_patterns_info(self) -> Dict[str, Any]:
        """
        Returns information about the generated patterns.
        
        Returns:
            Dict containing pattern information.
        """
        if self._data is None:
            raise ValueError("Data not loaded. Call load_data() first.")
        
        pattern_info = {
            "parameters": self.params,
            "colocation_patterns": self._colocation_patterns,
            "feature_types": self._feature_types,
            "feature_counts": self._data['type'].value_counts().to_dict(),
            "pattern_counts": {}
        }
        
        if 'pattern_id' in self._data.columns:
            pattern_counts = self._data['pattern_id'].value_counts().to_dict()
            pattern_info["pattern_counts"] = pattern_counts
        
        return pattern_info

In [31]:
print("Generating dataset with C1 configuration...")
# from src.colocation_dataset import SyntheticColocationDataset
c1_dataset = SyntheticColocationDataset(config='C1')
c1_df = c1_dataset.load_data()
c1_info = c1_dataset.get_patterns_info()
    
print(f"C1 Dataset: {len(c1_df)} instances across {len(c1_df['type'].unique())} feature types")
print(f"Generated {len(c1_info['colocation_patterns'])} colocation patterns")
print(c1_info["colocation_patterns"])

Generating dataset with C1 configuration...
C1 Dataset: 52591 instances across 34 feature types
Generated 10 colocation patterns
[['U', 'D', 'A', 'X', 'I'], ['K', 'Z', 'G', 'F'], ['E', 'T', 'B', 'V'], ['H', 'L', 'W', 'Q', 'C'], ['O', 'R', 'S', 'P', 'Y'], ['U', 'D', 'A', 'X', 'I', 'K'], ['K', 'Z', 'G', 'F', 'R'], ['E', 'T', 'B', 'V', 'W'], ['H', 'L', 'W', 'Q', 'C', 'K'], ['O', 'R', 'S', 'P', 'Y', 'A']]


In [32]:


from src.colocation_miner import ColocationMiner

miner = ColocationMiner(radius=0.00001, min_prevalence=0.8)
miner.fit(c1_df)
patterns = miner.get_patterns()
print(f"\nFound {len(patterns)} colocation patterns")
patterns_df = pd.DataFrame([p.to_dict() for p in patterns])
if len(patterns) > 0:
    print("\nTop patterns:")
    pd.set_option('display.max_colwidth', None)
    print(patterns_df.head(10))
    

Data preparation completed in 0.07 seconds
Spatial indices built in 0.02 seconds
Neighbor precomputation completed in 1.54 seconds
Found 27 patterns of size 2 in 0.05 seconds
Processing patterns of length: 3
Found 17 candidates
After pruning: 17 candidates remain
Found 17 frequent patterns in 0.00 seconds
Processing patterns of length: 4
Found 4 candidates
After pruning: 4 candidates remain
Found 4 frequent patterns in 0.01 seconds
Processing patterns of length: 5
Found 0 candidates

Found 48 colocation patterns

Top patterns:
    types  participation_index  num_instances
0  (L, Q)             1.000000             87
1  (P, Y)             1.000000            106
2  (D, X)             0.990196            101
3  (H, L)             0.988506             86
4  (C, H)             0.988506             86
5  (E, T)             0.981982            109
6  (B, E)             0.981982            109
7  (B, T)             0.981982            109
8  (B, V)             0.981982            109
9  (O, 

In [4]:
def test_synthetic_dataset():
    """Test the SyntheticColocationDataset class with both configurations."""
    
    # Test C1 configuration
    print("Generating dataset with C1 configuration...")
    c1_dataset = SyntheticColocationDataset(config='C1')
    c1_df = c1_dataset.load_data()
    c1_info = c1_dataset.get_patterns_info()
    
    print(f"C1 Dataset: {len(c1_df)} instances across {len(c1_df['type'].unique())} feature types")
    print(f"Generated {len(c1_info['colocation_patterns'])} colocation patterns")
    print("\nSample of the dataset:")
    print(c1_df.head())
    
    # Test C2 configuration
    print("\n\nGenerating dataset with C2 configuration...")
    c2_dataset = SyntheticColocationDataset(config='C2')
    c2_df = c2_dataset.load_data()
    c2_info = c2_dataset.get_patterns_info()
    
    print(f"C2 Dataset: {len(c2_df)} instances across {len(c2_df['type'].unique())} feature types")
    print(f"Generated {len(c2_info['colocation_patterns'])} colocation patterns")
    print("\nSample of the dataset:")
    print(c2_df.head())
    
    # Test custom parameters
    print("\n\nGenerating dataset with custom parameters...")
    custom_params = {
        'N_co_loc': 3,
        'lambda1': 3,
        'lambda2': 20,
        'D1': 100,
        'D2': 100,
        'd': 5,
        'r_noise_n': 500
    }
    
    custom_dataset = SyntheticColocationDataset(config='C2', custom_params=custom_params)
    custom_df = custom_dataset.load_data()
    custom_info = custom_dataset.get_patterns_info()
    
    print(f"Custom Dataset: {len(custom_df)} instances across {len(custom_df['type'].unique())} feature types")
    print(f"Generated {len(custom_info['colocation_patterns'])} colocation patterns")
    print("\nSample of the dataset:")
    print(custom_df.head())
    
    return c1_dataset, c2_dataset, custom_dataset

if __name__ == "__main__":
    c1_dataset, c2_dataset, custom_dataset = test_synthetic_dataset()

Generating dataset with C1 configuration...
C1 Dataset: 52591 instances across 34 feature types
Generated 10 colocation patterns

Sample of the dataset:
    id type         x         y   pattern_id
0  U_0    U  0.607541  0.170529  pattern_0_0
1  D_0    D  0.607550  0.170527  pattern_0_0
2  A_0    A  0.607543  0.170520  pattern_0_0
3  X_0    X  0.607547  0.170524  pattern_0_0
4  I_0    I  0.607541  0.170524  pattern_0_0


Generating dataset with C2 configuration...
C2 Dataset: 2895 instances across 28 feature types
Generated 8 colocation patterns

Sample of the dataset:
    id type           x           y   pattern_id
0  U_0    U  111.014234  785.318306  pattern_0_0
1  D_0    D  114.941642  780.640466  pattern_0_0
2  A_0    A  115.092945  781.881203  pattern_0_0
3  X_0    X  109.668012  789.664817  pattern_0_0
4  I_0    I  118.673816  788.259935  pattern_0_0


Generating dataset with custom parameters...
Custom Dataset: 904 instances across 15 feature types
Generated 6 colocation patter

## Dataset Class

In [7]:
from abc import ABC, abstractmethod
import pandas as pd
import overpy

class ColocationDataset(ABC):
    def __init__(self):
        """
        Base class for colocation datasets.
        """
        self._data = None

    @abstractmethod
    def load_data(self) -> pd.DataFrame:
        """Loads the data from the source."""
        pass

    @property
    def data(self) -> pd.DataFrame:
        """Returns the loaded data."""
        return self._data


class OSMColocationDataset(ColocationDataset):
    def __init__(self, area: tuple, poi_types: list):
        """
        Colocation dataset for OpenStreetMap (OSM) data.

        :param area: Area in the format (min_lat, min_lon, max_lat, max_lon)
        :param poi_types: List of POI types to search for (e.g., ['restaurant', 'bank'])
        """
        super().__init__()
        self._area = area
        self._poi_types = poi_types

    def load_data(self) -> pd.DataFrame:
        """Loads data from OSM using the Overpass API."""
        api = overpy.Overpass()

        query = f"""
        [out:json];
        (
            {' '.join([f'node["amenity"="{poi}"]({self._area[0]},{self._area[1]},{self._area[2]},{self._area[3]});' for poi in self._poi_types])}
        );
        out body;
        """

        result = api.query(query)

        data = []
        for node in result.nodes:
            data.append({
                "id": node.id,
                "type": node.tags.get('amenity', 'unknown'),
                "x": node.lat,
                "y": node.lon
            })

        self._data = pd.DataFrame(data)
        return self.data

    @property
    def data(self) -> pd.DataFrame:
        """Returns the loaded data."""
        if self._data is None:
            self.load_data()
        return self._data


In [None]:
area = (52.229, 20.944, 52.410, 21.222)  # Warsaw area
# poi_types = ["bar", "cafe", "fast_food", "food_court", "ice_cream", "pub", 'restaurant', "college", "library", "research_institute", "school", "university", "parking", "atm", 'bank', "clinic", "doctors", "pharmacy", "veterinary", "casino", "cinema", "events_venue", "nightclub", "theatre", "	police"]
poi_types = ["bar", "cafe", "fast_food", "food_court", "pub", 'restaurant']

dataset = OSMColocationDataset(area, poi_types)
data = dataset.load_data()

In [9]:
data.shape

(4294, 4)

In [10]:
data.head()

Unnamed: 0,id,type,x,y
0,247441607,restaurant,52.2399471,21.0619767
1,247458167,fast_food,52.2454787,21.085803
2,247458168,fast_food,52.2454434,21.0860156
3,247458173,fast_food,52.2451877,21.0825923
4,247461210,restaurant,52.2312362,21.0121087


In [11]:
class ColocationPattern:
    def __init__(self, types: list, participation_index: float, instances: list):
        self._types = tuple(sorted(types))
        self._pi = participation_index
        self._instances = instances

    def __str__(self) -> str:
        return f"Pattern {self._types} (PI={self._pi:.2f}, Instances={len(self._instances)})"

    def to_dict(self) -> dict:
        return {
            "types": self._types,
            "participation_index": self._pi,
            "num_instances": len(self._instances)
        }

    @property
    def types(self) -> tuple:
        return self._types

    @property
    def pi(self) -> float:
        return self._pi

    @property
    def instances(self) -> list:
        return self._instances


In [12]:
from scipy.spatial import KDTree
from collections import defaultdict
from itertools import combinations
import pandas as pd


class ColocationMiner:
    def __init__(self, radius: float = 0.005, min_prevalence: float = 0.3):
        """
        Colocation miner for discovering colocation patterns.

        :param radius: Radius for neighbor search (in degrees).
        :param min_prevalence: Minimum participation index value for a pattern to be considered significant.
        """
        self.radius = radius
        self.min_prevalence = min_prevalence
        self.patterns = []

    def fit(self, df: pd.DataFrame) -> None:
        self.df = df.reset_index(drop=True)
        self.df['id'] = self.df.index
        self.instances_by_type = {
            t: self.df[self.df['type'] == t] for t in self.df['type'].unique()
        }

        self.neighbor_pairs = self._find_neighbor_pairs()
        self.patterns = self._discover_frequent_patterns()

    def _find_neighbor_pairs(self) -> dict:
        neighbor_dict = defaultdict(list)
        for t1, t2 in combinations(self.instances_by_type.keys(), 2):
            pts1 = self.instances_by_type[t1][['x', 'y']].values
            pts2 = self.instances_by_type[t2][['x', 'y']].values
            ids1 = self.instances_by_type[t1]['id'].values
            ids2 = self.instances_by_type[t2]['id'].values

            tree = KDTree(pts2)
            for i, pt in enumerate(pts1):
                idxs = tree.query_ball_point(pt, self.radius)
                for j in idxs:
                    neighbor_dict[(t1, t2)].append((ids1[i], ids2[j]))

        return neighbor_dict

    def _discover_frequent_patterns(self) -> list[ColocationPattern]:
        patterns = []

        for (t1, t2), pairs in self.neighbor_pairs.items():
            if not pairs:
                continue

            ids1 = set(a for a, _ in pairs)
            ids2 = set(b for _, b in pairs)

            pi1 = len(ids1) / len(self.instances_by_type[t1])
            pi2 = len(ids2) / len(self.instances_by_type[t2])
            pi = min(pi1, pi2)

            if pi >= self.min_prevalence:
                patterns.append(ColocationPattern((t1, t2), pi, pairs))

        return patterns

    def get_patterns(self) -> list[ColocationPattern]:
        return sorted(self.patterns, key=lambda p: -p.pi)

In [13]:
miner = ColocationMiner(radius=0.003, min_prevalence=0.3)
miner.fit(data)
patterns = miner.get_patterns()

In [14]:
patterns_df = pd.DataFrame([p.to_dict() for p in patterns])

In [15]:
patterns_df.shape

(56, 3)

In [16]:
patterns_df

Unnamed: 0,types,participation_index,num_instances
0,"(atm, fast_food)",0.784466,3214
1,"(cafe, fast_food)",0.757235,3200
2,"(cafe, restaurant)",0.730887,4571
3,"(atm, restaurant)",0.730887,3381
4,"(fast_food, restaurant)",0.729867,5724
5,"(atm, cafe)",0.695146,2013
6,"(pharmacy, restaurant)",0.69419,1199
7,"(atm, pharmacy)",0.681356,802
8,"(atm, bank)",0.673786,1022
9,"(bar, pub)",0.67033,424


In [21]:
from collections import defaultdict
from itertools import product


class ColocationMiner:
    def __init__(self, radius=0.005, min_prevalence=0.3):
        self.radius = radius
        self.min_prevalence = min_prevalence
        self.patterns = []

    def fit(self, df: pd.DataFrame):
        self.df = df.reset_index(drop=True)
        self.df['id'] = self.df.index
        self.instances_by_type = {
            t: self.df[self.df['type'] == t] for t in self.df['type'].unique()
        }

        # Zacznij od par 2-elementowych
        self.neighbor_pairs = self._find_neighbor_pairs()
        self.patterns = self._discover_frequent_patterns()

        # Iteruj dla większych wzorców (3, 4, ...)
        k = 3
        while True:
            print(f"Processing patterns of length: {k}")
            candidates = self._generate_candidates(k)
            print(f"Found {len(candidates)} candidates")
            if not candidates:
                break

            # Sprawdź, które kandydaty są częste
            new_patterns = self._discover_frequent_patterns_for_candidates(candidates)
            print(f"Found {len(new_patterns)} frequent patterns")
            if not new_patterns:
                break

            self.patterns.extend(new_patterns)
            k += 1

    def _find_neighbor_pairs(self):
        neighbor_dict = defaultdict(list)
        for t1, t2 in combinations(self.instances_by_type.keys(), 2):
            pts1 = self.instances_by_type[t1][['x', 'y']].values
            pts2 = self.instances_by_type[t2][['x', 'y']].values
            ids1 = self.instances_by_type[t1]['id'].values
            ids2 = self.instances_by_type[t2]['id'].values

            tree = KDTree(pts2)
            for i, pt in enumerate(pts1):
                idxs = tree.query_ball_point(pt, self.radius)
                for j in idxs:
                    neighbor_dict[(t1, t2)].append((ids1[i], ids2[j]))

        return neighbor_dict

    def _discover_frequent_patterns(self):
        patterns = []
        for (t1, t2), pairs in self.neighbor_pairs.items():
            if not pairs:
                continue

            ids1 = set(a for a, _ in pairs)
            ids2 = set(b for _, b in pairs)

            pi1 = len(ids1) / len(self.instances_by_type[t1])
            pi2 = len(ids2) / len(self.instances_by_type[t2])
            pi = min(pi1, pi2)

            if pi >= self.min_prevalence:
                patterns.append(ColocationPattern((t1, t2), pi, pairs))

        return patterns

    def _generate_candidates(self, k):
        previous_patterns = [p for p in self.patterns if len(p.types) == k - 1]
        seen = set()
        candidates = []

        for p1, p2 in combinations(previous_patterns, 2):
            union = set(p1.types) | set(p2.types)
            if len(union) == k:
                new_pattern = tuple(sorted(union))
                if new_pattern not in seen:
                    seen.add(new_pattern)
                    candidates.append(new_pattern)

        return candidates

    def _discover_frequent_patterns_for_candidates(self, candidates):
        new_patterns = []
        for candidate in candidates:
            # Generujemy instancje
            pairs = self._find_candidate_instances(candidate)
            if pairs:
                pi = self._calculate_participation_index(candidate, pairs)
                if pi >= self.min_prevalence:
                    new_patterns.append(ColocationPattern(candidate, pi, pairs))
        return new_patterns

    def _find_candidate_instances(self, candidate):
        # Zbuduj mapę sąsiadów dla każdej pary typów
        neighbor_map = defaultdict(set)

        for t1, t2 in combinations(candidate, 2):
            pts1 = self.instances_by_type[t1][['x', 'y']].values
            pts2 = self.instances_by_type[t2][['x', 'y']].values
            ids1 = self.instances_by_type[t1]['id'].values
            ids2 = self.instances_by_type[t2]['id'].values

            tree = KDTree(pts2)
            for i, pt in enumerate(pts1):
                idxs = tree.query_ball_point(pt, self.radius)
                for j in idxs:
                    id1, id2 = ids1[i], ids2[j]
                    neighbor_map[(t1, id1)].add((t2, id2))
                    neighbor_map[(t2, id2)].add((t1, id1))  # dla symetrii

        candidate_instances = []
        first_type = candidate[0]
        first_ids = self.instances_by_type[first_type]['id'].values

        for comb in product(*(self.instances_by_type[t]['id'].values for t in candidate)):
            id_map = {t: i for t, i in zip(candidate, comb)}
            valid = True
            for t1, t2 in combinations(candidate, 2):
                if (t2, id_map[t2]) not in neighbor_map.get((t1, id_map[t1]), set()):
                    valid = False
                    break
            if valid:
                candidate_instances.append(tuple(id_map[t] for t in candidate))

        return candidate_instances

    def _calculate_participation_index(self, candidate, instances):
        participants = {t: set() for t in candidate}
        for inst in instances:
            for t, i in zip(candidate, inst):
                participants[t].add(i)

        pis = [
            len(participants[t]) / len(self.instances_by_type[t])
            for t in candidate
            if len(self.instances_by_type[t]) > 0
        ]
        return min(pis) if pis else 0

    def get_patterns(self):
        return sorted(self.patterns, key=lambda p: -p.pi)


In [22]:
miner = ColocationMiner(radius=0.003, min_prevalence=0.3)
miner.fit(data)
patterns = miner.get_patterns()
patterns_df = pd.DataFrame([p.to_dict() for p in patterns])

Processing patterns of length: 3
Found 240 candidates


KeyboardInterrupt: 

In [18]:
# Testowanie z wcześniej załadowanymi danymi POI
miner = ColocationMiner(radius=0.003, min_prevalence=0.3)
miner.fit(data)
patterns = miner.get_patterns()
patterns_df = pd.DataFrame([p.to_dict() for p in patterns])


Processing patterns of length: 3
Found 240 candidates
Found 23 frequent patterns
Processing patterns of length: 4
Found 80 candidates
Found 80 frequent patterns
Processing patterns of length: 5
Found 195 candidates
Found 195 frequent patterns
Processing patterns of length: 6
Found 350 candidates
Found 350 frequent patterns
Processing patterns of length: 7
Found 469 candidates
Found 469 frequent patterns
Processing patterns of length: 8
Found 463 candidates
Found 463 frequent patterns
Processing patterns of length: 9
Found 330 candidates
Found 330 frequent patterns
Processing patterns of length: 10
Found 165 candidates
Found 165 frequent patterns
Processing patterns of length: 11
Found 55 candidates
Found 55 frequent patterns
Processing patterns of length: 12
Found 11 candidates
Found 11 frequent patterns
Processing patterns of length: 13
Found 1 candidates
Found 1 frequent patterns
Processing patterns of length: 14
Found 0 candidates


In [19]:
patterns_df.shape

(2198, 3)

In [20]:
patterns_df

Unnamed: 0,types,participation_index,num_instances
0,"(atm, bank, bar, cafe, clinic, doctors, fast_f...",0.890079,54157
1,"(atm, bank, bar, cafe, clinic, doctors, fast_f...",0.873079,51025
2,"(atm, bank, bar, cafe, clinic, doctors, fast_f...",0.869585,51534
3,"(atm, bank, bar, cafe, doctors, fast_food, ice...",0.868654,52575
4,"(atm, bank, bar, cafe, clinic, doctors, fast_f...",0.864695,52262
...,...,...,...
2193,"(doctors, pub)",0.304348,128
2194,"(doctors, library)",0.304348,94
2195,"(cafe, restaurant, school)",0.301816,5502
2196,"(clinic, doctors)",0.300000,90


In [23]:
# Ponowne załadowanie bibliotek i danych po resecie
import numpy as np
import pandas as pd
from scipy.spatial import KDTree
from itertools import combinations, product
from collections import defaultdict

# Klasa wzorca kolokacji
class ColocationPattern:
    def __init__(self, types, participation_index, instances):
        self.types = tuple(sorted(types))
        self.pi = participation_index
        self.instances = instances

    def to_dict(self):
        return {
            "types": self.types,
            "participation_index": self.pi,
            "num_instances": len(self.instances)
        }

# Zbiór danych testowych
def generate_synthetic_poi():
    np.random.seed(42)
    n_points = 30

    def generate_points(type_label, center_x, center_y):
        x = np.random.normal(center_x, 0.001, n_points)
        y = np.random.normal(center_y, 0.001, n_points)
        return pd.DataFrame({
            "type": type_label,
            "x": x,
            "y": y
        })

    df_A = generate_points("restaurant", 21.01, 52.23)
    df_B = generate_points("bank", 21.011, 52.231)
    df_C = generate_points("atm", 21.012, 52.232)
    return pd.concat([df_A, df_B, df_C], ignore_index=True)

preprocessed_data = generate_synthetic_poi()

# Ulepszona wersja klasy ColocationMiner
class ColocationMiner:
    def __init__(self, radius=0.005, min_prevalence=0.3):
        self.radius = radius
        self.min_prevalence = min_prevalence
        self.patterns = []

    def fit(self, df: pd.DataFrame):
        self.df = df.reset_index(drop=True)
        self.df['id'] = self.df.index
        self.instances_by_type = {
            t: self.df[self.df['type'] == t] for t in self.df['type'].unique()
        }

        self.neighbor_pairs = self._find_neighbor_pairs()
        self.patterns = self._discover_frequent_patterns()

        k = 3
        while True:
            print(f"Processing patterns of length: {k}")
            candidates = self._generate_candidates(k)
            print(f"Found {len(candidates)} candidates")
            if not candidates:
                break

            new_patterns = self._discover_frequent_patterns_for_candidates(candidates)
            print(f"Found {len(new_patterns)} frequent patterns")
            if not new_patterns:
                break

            self.patterns.extend(new_patterns)
            k += 1

    def _find_neighbor_pairs(self) -> dict[tuple[str, str], list[tuple[int, int]]]:
        neighbor_dict = defaultdict(list)
        for t1, t2 in combinations(self.instances_by_type.keys(), 2):
            pts1 = self.instances_by_type[t1][['x', 'y']].values
            pts2 = self.instances_by_type[t2][['x', 'y']].values
            ids1 = self.instances_by_type[t1]['id'].values
            ids2 = self.instances_by_type[t2]['id'].values

            tree = KDTree(pts2)
            for i, pt in enumerate(pts1):
                idxs = tree.query_ball_point(pt, self.radius)
                for j in idxs:
                    neighbor_dict[(t1, t2)].append((ids1[i], ids2[j]))

        return neighbor_dict

    def _discover_frequent_patterns(self):
        """
        Discovers frequent patterns in the data (pairs of types with PI above the threshold).

        :return: List of ColocationPattern objects.
        """
        patterns = []
        for (t1, t2), pairs in self.neighbor_pairs.items():
            if not pairs:
                continue

            ids1 = set(a for a, _ in pairs)
            ids2 = set(b for _, b in pairs)

            pi1 = len(ids1) / len(self.instances_by_type[t1])
            pi2 = len(ids2) / len(self.instances_by_type[t2])
            pi = min(pi1, pi2)

            if pi >= self.min_prevalence:
                patterns.append(ColocationPattern((t1, t2), pi, pairs))

        return patterns

    def _generate_candidates(self, k):
        previous_patterns = [p for p in self.patterns if len(p.types) == k - 1]
        seen = set()
        candidates = []

        combs = combinations(previous_patterns, 2)
        print(f"Number of possible combinations: {len(combs)}")

        for p1, p2 in combs:
            union = set(p1.types) | set(p2.types)
            if len(union) == k:
                new_pattern = tuple(sorted(union))
                if new_pattern not in seen:
                    seen.add(new_pattern)
                    candidates.append(new_pattern)

        print(f"First 5 candidates: {candidates[:5]}")

        return candidates

    def _discover_frequent_patterns_for_candidates(self, candidates):
        new_patterns = []
        for candidate in candidates:
            instances = self._find_candidate_instances(candidate)
            if instances:
                pi = self._calculate_participation_index(candidate, instances)
                if pi >= self.min_prevalence:
                    new_patterns.append(ColocationPattern(candidate, pi, instances))
        return new_patterns

    def _find_candidate_instances(self, candidate):
        neighbor_map = defaultdict(set)

        for t1, t2 in combinations(candidate, 2):
            pts1 = self.instances_by_type[t1][['x', 'y']].values
            pts2 = self.instances_by_type[t2][['x', 'y']].values
            ids1 = self.instances_by_type[t1]['id'].values
            ids2 = self.instances_by_type[t2]['id'].values

            tree = KDTree(pts2)
            for i, pt in enumerate(pts1):
                idxs = tree.query_ball_point(pt, self.radius)
                for j in idxs:
                    id1, id2 = ids1[i], ids2[j]
                    neighbor_map[(t1, id1)].add((t2, id2))
                    neighbor_map[(t2, id2)].add((t1, id1))

        candidate_instances = []
        for comb in product(*(self.instances_by_type[t]['id'].values for t in candidate)):
            id_map = {t: i for t, i in zip(candidate, comb)}
            valid = True
            for t1, t2 in combinations(candidate, 2):
                if (t2, id_map[t2]) not in neighbor_map.get((t1, id_map[t1]), set()):
                    valid = False
                    break
            if valid:
                candidate_instances.append(tuple(id_map[t] for t in candidate))

        return candidate_instances

    def _calculate_participation_index(self, candidate, instances):
        participants = {t: set() for t in candidate}
        for inst in instances:
            for t, i in zip(candidate, inst):
                participants[t].add(i)

        pis = [
            len(participants[t]) / len(self.instances_by_type[t])
            for t in candidate
            if len(self.instances_by_type[t]) > 0
        ]
        return min(pis) if pis else 0

    def get_patterns(self):
        return sorted(self.patterns, key=lambda p: -p.pi)



In [25]:
# Uruchomienie algorytmu
miner = ColocationMiner(radius=0.005, min_prevalence=0.3)
miner.fit(data)
patterns = miner.get_patterns()
patterns_df = pd.DataFrame([p.to_dict() for p in patterns])


Processing patterns of length: 3
Found 623 candidates


KeyboardInterrupt: 