<a href="https://colab.research.google.com/github/kendo58/uni/blob/main/Untitled57.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Import necessary libraries
import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, fcluster

# --- Function to Load and Parse the Data File ---
def load_data(file_path):
    """
    Read the plant data file and convert it into a Python dictionary.

    Each non-blank line is treated as a comma-separated record: the first
    field is the plant name and every following field is a location code.
    Whitespace around each field is removed and empty fields (for example
    the one produced by a trailing comma) are ignored.

    Args:
        file_path: Path of the text file to read; it is decoded as latin-1.

    Returns:
        dict mapping plant name -> set of location codes. If the same name
        appears on several lines, the last line wins.
    """
    plants = {}
    with open(file_path, 'r', encoding='latin-1') as f:
        for line in f:
            # Strip every field individually so that "fl, nc" or a trailing
            # comma cannot produce bogus locations such as " nc" or "".
            plant_name, *locations = (field.strip() for field in line.split(','))
            if not plant_name:
                # Blank line, or a record without a usable plant name.
                continue
            plants[plant_name] = {location for location in locations if location}
    return plants

# --- Function to Create a Binary Feature Matrix ---
def create_binary_matrix(plants):
    """
    Convert the dictionary of plants into a binary matrix (0s and 1s).

    Args:
        plants: dict mapping plant name -> iterable (normally a set) of
            location codes.

    Returns:
        A tuple ``(plant_matrix, plant_names, all_locations)`` where
        ``plant_names`` and ``all_locations`` are sorted lists and
        ``plant_matrix[i, j]`` is 1 when ``plant_names[i]`` grows in
        ``all_locations[j]`` and 0 otherwise. An empty ``plants`` dict
        yields a 0x0 matrix and two empty lists.
    """
    # set().union(...) also works when there are no plants at all, whereas
    # set.union(*[]) raises TypeError.
    all_locations = sorted(set().union(*plants.values()))
    plant_names = sorted(plants)
    location_to_index = {location: j for j, location in enumerate(all_locations)}

    plant_matrix = np.zeros((len(plant_names), len(all_locations)), dtype=int)
    for i, plant_name in enumerate(plant_names):
        for location in plants[plant_name]:
            # Every location is in the index by construction of all_locations.
            plant_matrix[i, location_to_index[location]] = 1
    return plant_matrix, plant_names, all_locations

# --- Function to Perform Clustering and Find Locations ---
def find_clusters(plant_matrix, plant_names, all_locations, min_cluster_size=1000, n_top=5):
    """
    Perform hierarchical clustering and analyze the locations within the top clusters.

    The rows of ``plant_matrix`` are clustered with average linkage on Jaccard
    distance. The tree is then cut into progressively fewer clusters, starting
    at ``len(plant_names) // min_cluster_size``, until a cut yields at least
    ``n_top`` clusters that each contain ``min_cluster_size`` or more plants.

    Args:
        plant_matrix: Binary matrix with one row per plant and one column per
            location.
        plant_names: Plant names, in the same order as the matrix rows.
        all_locations: Location names, in the same order as the matrix columns.
        min_cluster_size: Minimum number of plants a cluster must contain to be
            considered. Must be positive.
        n_top: How many clusters to require and return (default 5).

    Returns:
        ``(top_clusters, clusters)``. ``top_clusters`` holds the ``n_top``
        qualifying clusters with the smallest mean pairwise Jaccard distance,
        each as ``(cluster_id, plant_names_in_cluster, mean_distance,
        location_freq)``; ``location_freq`` is a list of ``(location, count)``
        pairs sorted by descending count. ``clusters`` is the cluster id of
        every plant for the chosen cut. ``(None, None)`` is returned when no
        cut satisfies the requirement.
    """
    n_plants = len(plant_names)
    # Bail out before the O(n^2) distance computation when the request can
    # never be met (this also covers empty and single-row input, for which
    # linkage cannot build a tree).
    if n_plants < 2 or n_plants < n_top * min_cluster_size:
        return None, None

    dist_matrix = pdist(plant_matrix, 'jaccard')
    linked = linkage(dist_matrix, method='average')

    # 'maxclust' never produces more than n_clusters clusters, so cuts below
    # n_top cannot succeed; the floor of 2 matches the historical behaviour.
    smallest_cut = max(n_top, 2)
    for n_clusters in range(n_plants // min_cluster_size, smallest_cut - 1, -1):
        # This 'clusters' array contains the cluster ID for every plant in the dataset
        clusters = fcluster(linked, n_clusters, criterion='maxclust')
        unique_clusters, counts = np.unique(clusters, return_counts=True)
        valid_cluster_ids = unique_clusters[counts >= min_cluster_size]
        if len(valid_cluster_ids) < n_top:
            continue

        tightest_clusters = []
        for cluster_id in valid_cluster_ids:
            indices = np.where(clusters == cluster_id)[0]
            cluster_plants = [plant_names[i] for i in indices]
            cluster_sub_matrix = plant_matrix[indices, :]

            # How many plants of this cluster grow in each location.
            location_counts = np.sum(cluster_sub_matrix, axis=0)
            location_freq = [
                (all_locations[j], count)
                for j, count in enumerate(location_counts)
                if count > 0
            ]
            location_freq.sort(key=lambda item: item[1], reverse=True)

            # A single-member cluster has no pairs; treat it as perfectly tight.
            if len(indices) > 1:
                intra_cluster_dist = np.mean(pdist(cluster_sub_matrix, 'jaccard'))
            else:
                intra_cluster_dist = 0.0
            tightest_clusters.append((cluster_id, cluster_plants, intra_cluster_dist, location_freq))

        tightest_clusters.sort(key=lambda item: item[2])
        # Return both the top analysis and the complete cluster assignments
        return tightest_clusters[:n_top], clusters

    # No cut produced enough sufficiently large clusters.
    return None, None

# --- Function to investigate highly localized plants ---
def investigate_rare_plants(plants_data):
    """
    Find and print statistics about plants that grow in very few locations.

    For plants growing in exactly one, two and three locations, the total
    number of such plants is printed followed by up to five examples (in the
    iteration order of ``plants_data``). Locations of each example are printed
    in sorted order so the output is identical from run to run.

    Args:
        plants_data: dict mapping plant name -> collection of location codes.

    Returns:
        None. The report is written to standard output.
    """
    # Group plants by their number of locations; other counts are ignored.
    by_location_count = {1: [], 2: [], 3: []}
    for plant, locations in plants_data.items():
        group = by_location_count.get(len(locations))
        if group is not None:
            group.append((plant, locations))

    print("\n" + "="*50)
    print("🔬 Investigation of Highly Localized Species 🔬")
    print("="*50)
    for n_locations, word in ((1, "ONE"), (2, "TWO"), (3, "THREE")):
        group = by_location_count[n_locations]
        plural = "" if n_locations == 1 else "s"
        print(f"\nFound {len(group)} plants growing in exactly {word} location{plural}.")
        print("A few examples:")
        for plant, locations in group[:5]:
            # Sets have no stable order; sort so repeated runs print the same text.
            print(f"  - {plant} (Location{plural}: {', '.join(sorted(locations))})")
    print("\n")

# --- Function to investigate the clustering of specific species ---
def investigate_specific_species(species_list, all_plant_names, cluster_assignments):
    """
    Report which cluster each plant in ``species_list`` was assigned to.

    ``all_plant_names`` and ``cluster_assignments`` are paired position by
    position. Species that are missing from ``all_plant_names`` are reported
    with a NOTE line; the remaining ones are printed grouped by cluster ID,
    with groups in the order in which their cluster was first encountered.
    """
    # Lookup table: plant name -> cluster ID.
    cluster_of = dict(zip(all_plant_names, cluster_assignments))

    rule = "="*50
    print("\n" + rule)
    print("🌿 Investigation of Specific Species Clustering 🌿")
    print(rule + "\n")

    # cluster ID -> requested species that landed in that cluster.
    grouped = {}
    for name in species_list:
        if name not in cluster_of:
            # Unknown species are reported immediately, before the grouped listing.
            print(f"- NOTE: '{name}' was not found in the dataset.")
            continue
        assigned = cluster_of[name]
        if assigned not in grouped:
            grouped[assigned] = []
        grouped[assigned].append(name)

    # Grouped listing, one section per cluster.
    for assigned, members in grouped.items():
        print(f"Found {len(members)} plant(s) in Cluster ID {assigned}:")
        for member in members:
            print(f"  - {member}")

# --- Main Execution Block ---
if __name__ == '__main__':
    # Species whose cluster membership is looked up once clustering has finished.
    species_to_investigate = [
        'allium yosemitense',
        'sabal palmetto',
        'cycas revoluta',
        'rosa gallica',
        'lagerstroemia',
        'syringa',
        'hibiscus syriacus',
        'huperzia lucidula',
        'huperzia nutans',
    ]

    # Load the raw records and report on the rarely occurring plants first.
    plants_data = load_data('plants.data')
    investigate_rare_plants(plants_data)

    # Build the plant-by-location matrix and cluster it; the clustering call
    # returns both the tightest clusters and the assignment of every plant.
    plant_feature_matrix, plant_names_list, all_locations_list = create_binary_matrix(plants_data)
    found_clusters, all_cluster_assignments = find_clusters(
        plant_feature_matrix, plant_names_list, all_locations_list
    )

    if not found_clusters:
        print("Could not find 5 clusters that contain at least 1000 plants each.")
    else:
        print("🌿 Top 5 tightest clusters with at least 1000 plants each: 🌿")
        for rank, (cluster_id, plants_in_cluster, distance, locations) in enumerate(found_clusters, start=1):
            num_plants = len(plants_in_cluster)
            print(f"\n--- Cluster {rank} (ID: {cluster_id}, Avg Jaccard Distance: {distance:.4f}) ---")
            print(f"Number of plants: {num_plants}")
            print("Most Common Locations:")
            # Only the ten most frequent locations are listed for each cluster.
            for loc, count in locations[:10]:
                percentage = (count / num_plants) * 100
                print(f"  - {loc}: {count} plants ({percentage:.1f}%)")

        # Finally, show where the species of interest ended up.
        investigate_specific_species(species_to_investigate, plant_names_list, all_cluster_assignments)


🔬 Investigation of Highly Localized Species 🔬

Found 11566 plants growing in exactly ONE location.
A few examples:
  - abies alba (Location: nc)
  - abies bracteata (Location: ca)
  - abies homolepis (Location: ny)
  - abronia alpina (Location: ca)
  - abronia ameliae (Location: tx)

Found 4874 plants growing in exactly TWO locations.
A few examples:
  - abelia (Locations: nc, fl)
  - abelia x grandiflora (Locations: nc, fl)
  - abelmoschus moschatus (Locations: hi, pr)
  - abies ×shastensis (Locations: ca, or)
  - abronia argillosa (Locations: co, ut)

Found 2954 plants growing in exactly THREE locations.
A few examples:
  - abies concolor var. lowiana (Locations: ca, nv, or)
  - abies lasiocarpa var. arizonica (Locations: co, az, nm)
  - abies procera (Locations: ca, or, wa)
  - abildgaardia (Locations: vi, pr, fl)
  - abildgaardia ovata (Locations: vi, pr, fl)


🌿 Top 5 tightest clusters with at least 1000 plants each: 🌿

--- Cluster 1 (ID: 5, Avg Jaccard Distance: 0.0955) ---
Numb