<a href="https://colab.research.google.com/github/krishna11-dot/hr-talent-ranking-system/blob/main/HR_Potential_Talents_Ranking__System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required packages for HR talent ranking system
from tabulate import tabulate
!pip install fuzzywuzzy python-Levenshtein scikit-learn pandas numpy

# Core imports for data processing and analysis
import pandas as pd  # Data manipulation
import numpy as np   # Numerical operations

# ML/NLP related imports
from sklearn.feature_extraction.text import TfidfVectorizer  # Text vectorization
from sklearn.cluster import KMeans  # Clustering algorithm
from sklearn.metrics.pairwise import cosine_similarity  # Similarity calculation
from fuzzywuzzy import fuzz  # String matching/comparison

# Standard library imports
import re  # Regular expressions for text processing
import copy  # Deep copying objects
from typing import Dict, List, Tuple  # Type hints
from collections import deque  # Fixed-size queue for performance history
import logging  # Error tracking and debugging

# Logging configuration: records at INFO level and above are emitted, each
# prefixed with a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,  # Minimum severity that is emitted
    format='%(asctime)s - %(levelname)s - %(message)s'  # Layout: timestamp - level - message
)
logger = logging.getLogger(__name__)  # Module-level logger used by the classes in this file



class GeneticTieBreaker:
    """
    Handles tie-breaking in candidate rankings using genetic algorithm optimization.

    Features:
    - Evolves candidate rankings to break ties optimally
    - Uses fitness scoring based on multiple agents
    - Implements crossover and mutation for population diversity

    Example:
    tie_breaker = GeneticTieBreaker(population_size=50, generations=10)
    optimized_df = tie_breaker.resolve_ties(candidates_df, evaluation_agents)
    """

    def __init__(self, population_size=50, generations=10, mutation_rate=0.1):
        """
        Initialize genetic algorithm parameters.

        Args:
            population_size: Number of rankings in each generation
            generations: Number of evolution cycles
            mutation_rate: Probability of random mutations (0-1)
        """
        self.population_size = population_size  # Size of population per generation
        self.generations = generations          # Number of evolution cycles
        self.mutation_rate = mutation_rate      # Mutation probability

    def fitness(self, candidate: pd.Series, agents: List) -> float:
        """
        Calculate fitness score for a candidate using weighted agent evaluations.

        Example:
        score = fitness(candidate, [experience_agent, skills_agent])
        """
        return sum(agent.evaluate(candidate) * agent.weight for agent in agents)


    def crossover(self, parent1: pd.DataFrame, parent2: pd.DataFrame) -> pd.DataFrame:
        """
        Create child ranking by combining two parent rankings.
        Splits parents at random point and merges their rankings.


        Example:
        >>> parent1 = pd.DataFrame({'id': [1, 2, 3]})
        >>> parent2 = pd.DataFrame({'id': [3, 1, 2]})
        >>> child = tie_breaker.crossover(parent1, parent2)
        >>> print(child['id'].tolist())
        [1, 2, 2]  # Combined at random crossover point
        """

        # Validate parents have same length
        if len(parent1) != len(parent2):
            return parent1

        # Select random crossover point
        crossover_point = np.random.randint(len(parent1))

        # Combine parent rankings
        child = pd.concat([
            parent1.iloc[:crossover_point],     # First part from parent1
            parent2.iloc[crossover_point:]      # Second part from parent2
        ]).reset_index(drop=True)

        return child

    def mutate(self, ranking: pd.DataFrame) -> pd.DataFrame:
        """
        Randomly swaps positions based on mutation rate.

        Example:
        >>> ranking = pd.DataFrame({'id': [1, 2, 3]})
        >>> mutated = tie_breaker.mutate(ranking)
        >>> print(mutated['id'].tolist())
        [1, 3, 2]  # Positions 2 and 3 swapped with 0.1 probability
        """
        if np.random.random() < self.mutation_rate and len(ranking) > 1:
            # Select two random positions
            idx1, idx2 = np.random.choice(len(ranking), 2, replace=False)
            # Swap candidates
            ranking.iloc[idx1], ranking.iloc[idx2] = ranking.iloc[idx2].copy(), ranking.iloc[idx1].copy()
        return ranking


    def evolve_population(self, tied_group: pd.DataFrame, agents: List) -> pd.DataFrame:
        """
        Evolves rankings through generations to find optimal ordering.

        Example:
        >>> tied_group = df[df['final_score'] == 0.95]  # Group with ties
        >>> best_ranking = tie_breaker.evolve_population(tied_group, agents)

        Process:
        1. Creates initial random population of rankings
        2. For each generation:
           - Calculates fitness scores
           - Selects parents using weighted probability
           - Creates children through crossover
           - Applies random mutations
           - Updates population with new generation
        3. Returns ranking with highest fitness
        """

        # Initialize population
        population = [tied_group.copy() for _ in range(self.population_size)]
        best_fitness = float('-inf')
        best_ranking = None

        # Evolution loop
        for generation in range(self.generations):
            # Calculate fitness for each ranking
            fitness_scores = []
            for ranking in population:
                total_fitness = sum(
                    self.fitness(candidate, agents)
                    for _, candidate in ranking.iterrows()
                )
                fitness_scores.append(total_fitness)

                # Track best ranking
                if total_fitness > best_fitness:
                    best_fitness = total_fitness
                    best_ranking = ranking.copy()

            # Calculate parent selection probabilities
            parent_probs = np.array(fitness_scores) / sum(fitness_scores)

            # Select parents for next generation
            parent_indices = np.random.choice(
                len(population),
                size=self.population_size,
                p=parent_probs
            )

            # Create new generation
            new_population = []
            for i in range(0, self.population_size, 2):
                # Get parent pairs
                parent1 = population[parent_indices[i]]
                parent2 = population[parent_indices[min(i+1, len(parent_indices)-1)]]

                # Create and mutate children
                child1 = self.crossover(parent1, parent2)
                child2 = self.crossover(parent2, parent1)
                child1 = self.mutate(child1)
                child2 = self.mutate(child2)
                new_population.extend([child1, child2])

            # Update population
            population = new_population[:self.population_size]

        return best_ranking

    def resolve_ties(self, df: pd.DataFrame, agents: List) -> pd.DataFrame:
        """
        Main method to break ties using genetic algorithm.

        Example:
        >>> df = pd.DataFrame({
        ...     'id': [1, 2, 3, 4],
        ...     'final_score': [0.95, 0.95, 0.90, 0.90]
        ... })
        >>> resolved = tie_breaker.resolve_ties(df, agents)
        >>> print(resolved['id'].tolist())
        [1, 2, 3, 4]  # Ties resolved with unique ordering

        Process:
        1. Groups candidates with identical scores
        2. For each tie group:
           - Creates initial population
           - Evolves through generations
           - Updates rankings with optimal order
        3. Returns dataframe with all ties resolved
        """

        try:
            # Find groups with tied scores
            tied_groups = df.groupby('final_score').filter(lambda x: len(x) > 1)
            if tied_groups.empty:
                return df

            logger.info(f"Found {len(tied_groups)} candidates in tie groups")
            result_df = df.copy()

            # Process each tie group
            for score in tied_groups['final_score'].unique():
                tie_group = tied_groups[tied_groups['final_score'] == score].copy()
                if len(tie_group) <= 1:
                    continue

                # Evolve optimal ranking for tie group
                optimized_ranking = self.evolve_population(tie_group, agents)

                # Ensure consistent data types
                for column in optimized_ranking.columns:
                    if column in result_df.columns:
                        optimized_ranking[column] = optimized_ranking[column].astype(
                            result_df[column].dtype
                        )

                # Update rankings
                for idx in optimized_ranking.index:
                    result_df.loc[idx, optimized_ranking.columns] = optimized_ranking.loc[idx]

            return result_df.sort_values('final_score', ascending=False)

        except Exception as e:
            logger.error(f"Tie breaking error: {str(e)}")
            return df.sort_values('final_score', ascending=False)



class TalentClusterer:
    """
    Clusters candidates based on job titles using TF-IDF and KMeans.

    Features:
    - TF-IDF vectorization of job titles
    - K-means clustering into role categories
    - Title similarity scoring
    - Enhanced role categorization

    Example:
    clusterer = TalentClusterer(n_clusters=5)
    clusters = clusterer.create_clusters(candidate_titles)
    """
    def __init__(self, n_clusters=5):
        """
        Prepare the TF-IDF vectorizer and the KMeans model.

        Args:
            n_clusters: Number of clusters requested from KMeans
        """
        # Filled in by create_clusters()
        self.tfidf_matrix = None       # TF-IDF vectors of the processed titles
        self.cluster_centroids = None  # KMeans cluster centers

        # Text features: at most 100 unigrams/bigrams, English stop words removed
        vectorizer_options = {
            'max_features': 100,
            'ngram_range': (1, 2),
            'stop_words': 'english',
        }
        self.vectorizer = TfidfVectorizer(**vectorizer_options)

        # Clustering: fixed seed for reproducibility, 10 initialisations
        kmeans_options = {
            'n_clusters': n_clusters,
            'random_state': 42,
            'n_init': 10,
        }
        self.kmeans = KMeans(**kmeans_options)

    def create_clusters(self, titles: pd.Series) -> np.ndarray:
        """
    Creates clusters of similar job titles using TF-IDF and K-means clustering.

    Args:
        titles (pd.Series): Series of job titles to cluster

    Returns:
        np.ndarray: Cluster assignments for each title

    Example:
    >>> titles = pd.Series(['HR Manager', 'HR Specialist', 'Sales Manager'])
    >>> clusters = clusterer.create_clusters(titles)
    >>> print(clusters)  # [0, 0, 1] - HR roles clustered together

    Process:
    1. Defines role categories with important terms:
       - target_primary: 'aspiring hr professional', etc.
       - target_seeking: 'seeking hr position', etc.
       - current_hr: 'hr specialist', etc.
       - senior_hr: 'director hr', etc.

    2. Processes each title:
       - Standardizes format
       - Applies term weighting
       - Handles non-HR categories

    3. Creates TF-IDF matrix:
       - Converts processed titles to vectors
       - Stores for similarity calculations

    4. Performs K-means clustering:
       - Groups similar titles
       - Stores cluster centroids
    """


        try:
            # Define key role terms for categorization
            important_terms = {
                'target_primary': [
                    'aspiring human resources professional',
                    'aspiring human resources specialist',
                    'aspiring human resources generalist',
                    'aspiring human resources manager'
                ],
                'target_seeking': [
                    'seeking human resources position',
                    'seeking human resources opportunities',
                    'seeking human resources role',
                    'seeking hr position'
                ],
                'current_hr': [
                    'human resources manager',
                    'human resources specialist',
                    'human resources coordinator',
                    'hr specialist',
                    'hr coordinator',
                    'hr generalist'
                ],
                'senior_hr': [
                    'director human resources',
                    'senior human resources',
                    'chief human resources',
                    'hr director',
                    'chro',
                    'vp hr'
                ]
            }

            # Process and standardize titles
            processed_titles = titles.apply(
                lambda x: self._process_title_for_clustering(x, important_terms)
            )

            # Create TF-IDF matrix
            self.tfidf_matrix = self.vectorizer.fit_transform(processed_titles)

            # Perform clustering and store centroids
            clusters = self.kmeans.fit_predict(self.tfidf_matrix)
            self.cluster_centroids = self.kmeans.cluster_centers_

            return clusters

        except Exception as e:
            logger.error(f"Clustering error: {str(e)}")
            return np.zeros(len(titles))

    def _process_title_for_clustering(self, title: str, important_terms: Dict) -> str:
        """
    Processes and standardizes job titles for clustering.

    Args:
        title (str): Raw job title
        important_terms (Dict): Dictionary of role terms by category

    Returns:
        str: Processed title with weighted terms

    Example:
    >>> terms = {'target_primary': ['aspiring hr']}
    >>> processed = clusterer._process_title_for_clustering(
    ...     "Aspiring HR Manager", terms
    ... )
    >>> print(processed)
    'aspiring hr professional aspiring hr professional aspiring hr professional aspiring hr professional'

    Processing Rules:
    1. Aspiring HR roles: Weighted 4x
    2. Seeking HR roles: Weighted 4x
    3. Senior roles: Weighted 3x
    4. Manager/Specialist roles: Weighted 2x
    5. Non-HR roles: Single category label
    """

        title = title.lower()
        processed_parts = []

        # Process by role category
        if 'aspiring human resources' in title:
            processed_parts.extend(['aspiring hr professional'] * 4)  # Higher weight
        elif 'seeking human resources' in title:
            processed_parts.extend(['seeking hr position'] * 4)
        elif any(x in title for x in ['chief', 'director', 'senior vice president']):
            processed_parts.extend(['senior hr executive'] * 3)
        elif any(x in title for x in ['manager', 'specialist', 'generalist']):
            role = next(x for x in ['manager', 'specialist', 'generalist'] if x in title)
            processed_parts.extend([f'hr {role}'] * 2)
        elif 'coordinator' in title:
            processed_parts.extend(['hr coordinator'] * 2)
        # Non-HR categorization
        elif any(x in title for x in ['teacher', 'education']):
            processed_parts.append('education role')
        elif any(x in title for x in ['engineer', 'programmer', 'systems']):
            processed_parts.append('technical role')
        elif 'student' in title:
            processed_parts.append('student')
        elif any(x in title for x in ['research', 'lab']):
            processed_parts.append('research role')
        elif 'business' in title:
            processed_parts.append('business role')
        else:
            processed_parts.append('other role')

        return ' '.join(processed_parts)

    def get_cluster_similarity(self, title: str, cluster_id: int) -> float:
        """
    Calculates cosine similarity between a title and cluster centroid.

    Args:
        title (str): Job title to compare
        cluster_id (int): ID of cluster to compare against

    Returns:
        float: Similarity score 0-1

    Example:
    >>> score = clusterer.get_cluster_similarity("HR Manager", 0)
    >>> print(f"{score:.2f}")  # 0.85 - High similarity to HR cluster

    Process:
    1. Converts title to TF-IDF vector
    2. Gets cluster centroid vector
    3. Calculates cosine similarity
    4. Returns normalized score (0-1)
    """

        try:
            if self.cluster_centroids is None:
                return 0.0

            # Convert title to vector and calculate similarity
            title_vec = self.vectorizer.transform([title])
            centroid = self.cluster_centroids[cluster_id].reshape(1, -1)
            return float(cosine_similarity(title_vec, centroid)[0][0])

        except Exception as e:
            logger.error(f"Similarity calculation error: {str(e)}")
            return 0.0



class RankingAgent:
    """
    Agent that evaluates candidates based on title, location, and network criteria.

    Features:
    - Title-based scoring
    - Location tier evaluation
    - Professional network scoring
    - Adaptive weights based on performance

    Example:
    agent = RankingAgent('title', processor, weight=1.0)
    score = agent.evaluate(candidate)
    """
    def __init__(self, expertise: str, processor, weight: float = 1.0):
        self.expertise = expertise        # Scoring criteria (title/location/connections)
        self.weight = weight             # Agent's importance weight
        self.initial_weight = weight     # Store initial weight
        self.performance_history = deque(maxlen=100)  # Track recent performance
        self.processor = processor       # Reference to main processor
        self.project_keywords = processor.project_keywords  # Keywords for matching

    def evaluate(self, candidate: pd.Series) -> float:
        """

        Evaluates candidate based on agent's expertise.

        Example:
        >>> title_agent = RankingAgent('title', processor)
        >>> location_agent = RankingAgent('location', processor)
        >>> scores = [
        ...     title_agent.evaluate(candidate),     # 0.90
        ...     location_agent.evaluate(candidate)   # 0.70
        ... ]

        Returns score between 0-1.
        """

        try:
            if self.expertise == 'title':
                return self._title_score(candidate)
            elif self.expertise == 'location':
                return self._location_score(candidate)
            elif self.expertise == 'connections':
                return self._connection_score(candidate)
            return 0.0
        except Exception as e:
            logger.error(f"Agent evaluation error: {str(e)}")
            return 0.0

    def _title_score(self, candidate) -> float:
        """
        Calculates job title relevance score.

        Example:
        >>> score = agent._title_score({
        ...     'cleaned_title': 'aspiring human resources manager'
        ... })
        >>> print(f"{score:.2f}")  # 0.95 - High relevance title
        """

        try:
            title = candidate.get('cleaned_title', '').lower()
            return self.processor.get_base_score(title)
        except Exception as e:
            logger.error(f"Title scoring error: {str(e)}")
            return 0.0

    def _location_score(self, candidate) -> float:
        """
        Scores location based on tech hub tiers.

        Returns:
        - 1.0: Tier 1 (Texas, California, New York)
        - 0.7: Tier 2 (Illinois, Massachusetts, NC)
        - 0.6: Focus areas (Canada, Turkey)
        - 0.3: Other locations

        Example:
        >>> score = agent._location_score({
        ...     'cleaned_location': 'texas'
        ... })
        >>> print(f"{score:.1f}")  # 1.0 - Tier 1 hub
        """

        location = candidate.get('cleaned_location', 'unknown').lower()
        if location in self.processor.tech_hubs['tier1']:
            return 1.0  # Top tech hubs
        elif location in self.processor.tech_hubs['tier2']:
            return 0.7  # Secondary tech hubs
        elif location in {'canada', 'turkey'}:
            return 0.6  # International focus areas
        return 0.3     # Other locations

    def _connection_score(self, candidate) -> float:
        """
        Score professional network size (0-1)
        Normalized to max of 500 connections
        """
        connections = candidate.get('cleaned_connections', 0)
        return min(connections / 500.0, 1.0)

    def update_weight(self, success_rate: float):
        """

        Adjusts agent weight based on performance.

        Example:
        >>> agent = RankingAgent('title', processor, weight=0.5)
        >>> agent.update_weight(0.8)  # Good performance
        >>> print(f"{agent.weight:.2f}")  # 0.53 - Weight increased
        >>> agent.update_weight(0.3)  # Poor performance
        >>> print(f"{agent.weight:.2f}")  # 0.48 - Weight decreased
        Weight increases/decreases by up to 10% based on success.
        """
        self.weight *= 1 + (success_rate - 0.5) * 0.1  # Adjust weight
        self.performance_history.append(success_rate)   # Track performance



class HRTalentProcessor:
    """
    Main processor for HR talent ranking and evaluation.

    Features:
    - Title standardization and cleaning
    - Location-based scoring
    - Professional network evaluation
    - Candidate clustering
    - Performance metrics tracking

    Example:
    processor = HRTalentProcessor()
    results = processor.process_data(candidates_df)
    """
    def __init__(self):
        """
        Build the keyword tables, location tiers, metric counters and helper
        components used by the processor.
        """
        # HR phrases: short "primary" markers plus longer title variations
        aspiring_variations = [
            'aspiring human resources professional',
            'aspiring human resources specialist',
            'aspiring human resources generalist',
            'aspiring human resources manager',
            'aspiring human resources analyst',
            'aspiring hr professional',
        ]
        seeking_variations = [
            'seeking human resources position',
            'seeking human resources opportunities',
            'seeking human resources hris',
            'seeking hr position',
            'seeking hr opportunities',
            'seeking human resource',
        ]
        self.project_keywords = {
            'primary': {
                'aspiring': ['aspiring human resources'],
                'seeking': ['seeking human resources'],
            },
            'variations': {
                'aspiring': aspiring_variations,
                'seeking': seeking_variations,
            },
        }

        # Location tiers used for location scoring and cleaning
        self.tech_hubs = {
            'tier1': {'texas', 'california', 'new york'},
            'tier2': {'illinois', 'massachusetts', 'north carolina'},
        }

        # Metric counters, all starting from zero
        role_names = (
            'aspiring_hr', 'seeking_hr', 'senior_hr', 'mid_level_hr',
            'junior_hr', 'hr_adjacent', 'other_roles',
        )
        keyword_names = ('aspiring', 'seeking', 'other_hr')
        self.metrics = {
            'total_candidates': 0,
            'hr_candidates': 0,
            'role_categories': dict.fromkeys(role_names, 0),
            'keyword_matches': dict.fromkeys(keyword_names, 0),
            'starred_profiles': [],
            'performance_metrics': {
                'initial_cutoff': 0.0,
                'final_cutoff': 0.0,
                'agent_adaptations': 0,
                'ranking_changes': 0,
                'genetic_optimizations': 0,
            },
        }

        # Components; the agents read the keyword table assigned above
        self.clusterer = TalentClusterer(n_clusters=5)
        self.agents = self._initialize_agents()
        self.tie_breaker = GeneticTieBreaker()

        # History containers, empty until data is processed
        self.final_scores, self.weight_history = [], []
        self.cluster_sizes, self.category_distribution = {}, {}

    def _initialize_agents(self) -> List[RankingAgent]:
        """
        Build the weighted ranking agents, grouped by evaluation criterion.

        Weight distribution:
        - Title (90%): 3 agents x 0.3
        - Location (5%): 2 agents x 0.025
        - Network (5%): 2 agents x 0.025

        Example:
        >>> agents = processor._initialize_agents()
        >>> len(agents)  # 7 total agents
        >>> [agent.weight for agent in agents]  # [0.3, 0.3, 0.3, 0.025, 0.025, 0.025, 0.025]
        """
        # (expertise, number of agents, weight per agent), in creation order
        roster = (
            ('title', 3, 0.3),
            ('location', 2, 0.025),
            ('connections', 2, 0.025),
        )
        return [
            RankingAgent(expertise, self, weight=weight)
            for expertise, count, weight in roster
            for _ in range(count)
        ]

    def clean_title(self, title: str) -> str:
        """
        Standardizes job titles into categories.

        Example:
        >>> processor = HRTalentProcessor()
        >>> processor.clean_title('HR Manager')
        'human resources manager'
        >>> processor.clean_title('Teacher')
        'NON-HR: Education'

        Categories:
        1. Aspiring HR:
           - professional, specialist, generalist, manager
        2. Seeking HR:
           - position, opportunities, HRIS
        3. Senior HR:
           - CHRO, director, SVP
        4. Standard HR:
           - specialist, generalist, coordinator, manager
        5. Non-HR:
           - Education, Technical, Student, Management, Research, Business
        """

        if not isinstance(title, str):
            return "NON-HR: Invalid Title"

        title = title.lower().strip()

        # Process by category
        # Aspiring HR roles
        if 'aspiring human resources' in title:
            if 'professional' in title:
                return 'aspiring human resources professional'
            elif 'specialist' in title:
                return 'aspiring human resources specialist'
            elif 'generalist' in title:
                return 'aspiring human resources generalist'
            elif 'manager' in title:
                return 'aspiring human resources manager'
            return 'aspiring human resources professional'

        # Seeking HR roles
        if 'seeking human resources' in title:
            if 'hris' in title:
                return 'seeking human resources hris position'
            elif 'position' in title:
                return 'seeking human resources position'
            return 'seeking human resources opportunities'

        # Senior HR roles
        if 'chro' in title or ('svp' in title and 'hr' in title):
            return 'chief human resources officer'
        if 'director' in title and 'human resources' in title:
            return 'director human resources'

        # HR roles
        if any(x in title for x in ['human resources', 'hr']):
            if 'senior' in title or 'sr' in title:
                return 'senior human resources specialist'
            elif 'specialist' in title:
                return 'human resources specialist'
            elif 'generalist' in title:
                return 'human resources generalist'
            elif 'coordinator' in title:
                return 'human resources coordinator'
            elif 'manager' in title:
                return 'human resources manager'
            return 'human resources professional'

        # Adjacent roles
        if 'people development' in title:
            return 'people development coordinator'

        # Non-HR categorization
        if 'teacher' in title or 'education' in title:
            return 'NON-HR: Education'
        if any(x in title for x in ['engineer', 'programmer', 'systems']):
            return 'NON-HR: Technical'
        if 'student' in title and not any(x in title for x in ['hr', 'human resources']):
            return 'NON-HR: Student'
        if 'director' in title or 'administration' in title:
            return 'NON-HR: Management'
        if 'research' in title or 'lab' in title:
            return 'NON-HR: Research'
        if 'business' in title and not any(x in title for x in ['hr', 'human resources']):
            return 'NON-HR: Business'

        return 'NON-HR: Other'

    def clean_location(self, location: str) -> str:
        """
        Clean and standardize location names.
        Maps cities to states and handles international locations.

        Example:
        'New York City' -> 'new york'
        'İzmir' -> 'turkey'
        """
        if not isinstance(location, str):
            return "unknown"

        # Clean and normalize
        location = location.lower().strip()
        location = (location.encode('utf-8', 'ignore')
                          .decode('utf-8')
                          .lower()
                          .strip())

        # Location mappings
        metro_to_state = {
            'grand rapids': 'michigan',
            'san francisco bay': 'california',
            'houston': 'texas',
            'dallas': 'texas',
            'austin': 'texas',
            'new york city': 'new york',
            'boston': 'massachusetts',
            'chicago': 'illinois',
            'chattanooga': 'tennessee',
            'virginia beach': 'virginia'
        }

        international_mapping = {
            'kanada': 'canada',
            'ä°zmir': 'turkey',
            'izmir': 'turkey',
            'tã¼rkiye': 'turkey',
            'türkiye': 'turkey',
            'amerika birleåÿik devletleri': 'united states',
            'amerika birleşik devletleri': 'united states'
        }

        # Check international names
        for old, new in international_mapping.items():
            if old in location:
                return new

        # Clean location text
        location = re.sub(r'\b(greater|area|metropolitan)\b', '', location)

        # Process city,state format
        if ',' in location:
            parts = [part.strip() for part in location.split(',')]
            city = parts[0]
            if city in metro_to_state:
                return metro_to_state[city]
            if len(parts) > 1:
                state = parts[1].strip()
                state = re.sub(r'\s+area$', '', state)
                if state in self.tech_hubs['tier1'] or state in self.tech_hubs['tier2']:
                    return state
            return city

        # Check metro areas
        for metro, state in metro_to_state.items():
            if metro in location:
                return state

        return location

    def clean_connections(self, connections) -> int:
        """
        Clean and normalize connection counts.
        Handles string formatting and caps at 500.

        Example:
        '500+' -> 500
        '250' -> 250
        """
        try:
            if isinstance(connections, str):
                connections = connections.replace('+', '').strip()
            return min(int(connections), 500)
        except (ValueError, TypeError):
            return 0




      # Scoring and normalization methods for HR talent processor

    def calculate_agent_consensus(self, candidate: pd.Series) -> float:
        """
        Combine the verdicts of every ranking agent into one weighted score.

        Each agent in ``self.agents`` scores the candidate through its
        ``evaluate`` method; that score is multiplied by the agent's
        ``weight`` and the products are added up.

        Args:
            candidate (pd.Series): Candidate row, passed unchanged to each agent.

        Returns:
            float: Sum of ``evaluate(candidate) * weight`` over all agents.
            If any agent raises, the error is logged and 0.0 is returned.

        Example:
            Agent scores 0.65, 1.0 and 0.8 with weights 0.6, 0.3 and 0.1
            give 0.65*0.6 + 1.0*0.3 + 0.8*0.1 = 0.77.

        Note: nothing here rescales the result, so the weights should sum
        to 1.0 if the consensus is expected to stay on the agents' own scale.
        """
        try:
            consensus = 0
            for agent in self.agents:
                consensus += agent.evaluate(candidate) * agent.weight
            return consensus
        except Exception as e:
            logger.error(f"Consensus calculation error: {str(e)}")
            return 0.0

    def get_base_score(self, title: str) -> float:
        """
        Map a job title to its base relevance score.

        Matching is case-insensitive and substring-based on the spelled-out
        phrase "human resources"; the abbreviation "hr" is not recognized by
        this method (presumably it is expanded by the title-cleaning step
        before scoring - verify against the caller).

        Score tiers (first match wins, checked top to bottom):
        - 0.95 / 0.92 / 0.90: "aspiring human resources" or
          "seeking human resources", refined by professional/specialist
          (0.95) or generalist/manager (0.92)
        - 0.85: "chief human resources officer" or the whole word "chro"
        - 0.82: "director human resources"
        - 0.78: "senior human resources"
        - 0.65: "human resources manager"
        - 0.62: "human resources specialist"
        - 0.60: "human resources generalist"
        - 0.55: "human resources coordinator"
        - 0.52: "human resources professional"
        - 0.45: "people development"
        - 0.05: everything else, including empty or non-string titles and
          titles carrying the 'NON-HR:' prefix

        Args:
            title (str): Job title to score.

        Returns:
            float: Base score between 0.05 and 0.95.

        Example:
        >>> processor.get_base_score('aspiring human resources professional')
        0.95
        >>> processor.get_base_score('Human Resources Manager')
        0.65
        >>> processor.get_base_score('teacher')
        0.05
        """
        if not isinstance(title, str) or not title:
            return 0.05

        # Lower-case once so that every tier is matched case-insensitively.
        text = title.lower()

        # Target profiles: people aspiring to / seeking an HR role.
        if 'aspiring human resources' in text or 'seeking human resources' in text:
            if 'professional' in text or 'specialist' in text:
                return 0.95
            if 'generalist' in text or 'manager' in text:
                return 0.92
            return 0.90

        # "chro" must be a whole word so that e.g. "chrome" does not match.
        if 'chief human resources officer' in text or re.search(r'\bchro\b', text):
            return 0.85

        # Remaining tiers in priority order; the first phrase found wins.
        phrase_scores = (
            ('director human resources', 0.82),
            ('senior human resources', 0.78),
            ('human resources manager', 0.65),
            ('human resources specialist', 0.62),
            ('human resources generalist', 0.60),
            ('human resources coordinator', 0.55),
            ('human resources professional', 0.52),
            ('people development', 0.45),
        )
        for phrase, score in phrase_scores:
            if phrase in text:
                return score

        # No HR signal at all (this also covers 'NON-HR:'-prefixed titles).
        return 0.05

    def normalize_scores(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Score every candidate from its cleaned title and bound the result.

        Works on a copy of the input. ``base_score`` receives the value of
        ``get_base_score`` for each ``cleaned_title``; ``final_score`` is set
        to that same value clipped to the 0-1 range. Any ``final_score``
        already present in the input is replaced.

        Args:
            df (pd.DataFrame): Candidates with a ``cleaned_title`` column.

        Returns:
            pd.DataFrame: Scored copy. The input itself is returned when it
            is None or empty, or when scoring fails (the error is logged).

        Example:
        >>> df = pd.DataFrame({'cleaned_title': ['human resources manager', 'teacher']})
        >>> processor.normalize_scores(df)['final_score'].tolist()
        [0.65, 0.05]
        """
        try:
            nothing_to_score = df is None or df.empty
            if nothing_to_score:
                return df

            scored = df.copy()
            title_scores = scored['cleaned_title'].apply(self.get_base_score)
            scored['base_score'] = title_scores
            scored['final_score'] = title_scores.clip(0, 1)
            return scored

        except Exception as e:
            logger.error(f"Error in normalize_scores: {str(e)}")
            return df

    def prevent_bias(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Adjust scores per cluster to counter cluster-size and keyword bias.

        The frame is modified in place and also returned.

        Process:
        1. Stores a per-cluster z-score of ``final_score`` in
           ``cluster_normalized_score``. Clusters whose standard deviation is
           zero or undefined (a single member) use a divisor of 1, so their
           z-scores are 0 rather than NaN.
        2. Multiplies ``final_score`` by
           ``1 + 0.1 * (1 - cluster_size / total) + 0.1 * keyword_ratio``,
           where ``keyword_ratio`` is the share of titles in the cluster that
           contain any of ``self.project_keywords['primary']``.
        3. Clips ``final_score`` to the 0-1 range.

        Args:
            df (pd.DataFrame): Candidates with ``cluster``, ``final_score``
                and ``cleaned_title`` columns.

        Returns:
            pd.DataFrame: The same frame. It is returned untouched when it is
            empty or has no ``cluster`` column; on an unexpected error the
            error is logged and the frame is returned as it stands.

        Example:
        >>> df['cluster'] = [0, 0, 1]  # Two clusters
        >>> unbiased = processor.prevent_bias(df)
        """
        try:
            if df.empty or 'cluster' not in df.columns:
                return df

            clusters = df['cluster']

            # Per-cluster z-score, computed before final_score is adjusted.
            grouped_scores = df.groupby('cluster')['final_score']
            cluster_mean = grouped_scores.transform('mean')
            cluster_std = grouped_scores.transform('std')
            # std is NaN for single-member clusters and 0 for constant ones;
            # both would poison the division, so fall back to 1.
            safe_std = cluster_std.where(cluster_std.notna() & (cluster_std != 0), 1.0)
            df['cluster_normalized_score'] = (df['final_score'] - cluster_mean) / safe_std

            # Smaller clusters receive the larger diversity boost.
            diversity_factor = 1 - df.groupby('cluster').size() / len(df)

            # Share of titles per cluster that mention a primary keyword.
            primary_keywords = self.project_keywords['primary']
            has_keyword = df['cleaned_title'].apply(
                lambda title: isinstance(title, str)
                and any(kw in title.lower() for kw in primary_keywords)
            ).astype(float)
            # Group by raw values so a non-unique row index cannot misalign.
            cluster_keyword_ratio = has_keyword.groupby(clusters.to_numpy()).mean()

            # Same arithmetic as one multiplier per cluster, applied row-wise.
            multiplier = (
                1
                + clusters.map(diversity_factor) * 0.1
                + clusters.map(cluster_keyword_ratio) * 0.1
            )
            # Rows without a cluster label keep their score unchanged.
            multiplier = multiplier.fillna(1.0)

            df['final_score'] = (df['final_score'] * multiplier).clip(0, 1)
            return df

        except Exception as e:
            logger.error(f"Error in bias prevention: {str(e)}")
            return df

    def display_results(self, df: pd.DataFrame):
        """
        Display analysis results in GitHub-friendly markdown format.

        Prints, in order:
        1. A distribution table counting candidates per score band
           (>= 0.90, 0.40-0.89, < 0.40) with their percentage of the total.
        2. A static score guide.
        3. Up to ten candidates per band, highest score first. The top band
           keeps only job titles with a target keyword (aspiring, seeking,
           entry-level, internship); the middle band keeps senior-looking
           titles and drops aspiring/seeking ones; the bottom band is
           unfiltered.

        Args:
            df (pd.DataFrame): Ranked candidates with ``id``, ``job_title``,
                ``cleaned_title``, ``cleaned_location``,
                ``cleaned_connections`` and ``final_score`` columns.
                None or an empty frame prints a short notice instead.
        """
        print("\n# HR TALENT RANKING ANALYSIS")
        print("=" * 80 + "\n")

        # An empty frame has nothing to tabulate and would divide by zero below.
        if df is None or df.empty:
            print("No candidates to display.")
            return

        total = len(df)
        scores = df['final_score']
        target_band = scores >= 0.9
        senior_band = (scores >= 0.4) & (scores < 0.9)
        other_band = scores < 0.4

        # Distribution summary with markdown table
        print("## CANDIDATE DISTRIBUTION\n")
        print("| Category | Count | Percentage |")
        print("|----------|--------|------------|")
        for label, band in (
            ('Target Roles (≥0.90)', target_band),
            ('Senior HR (0.40-0.89)', senior_band),
            ('Other Roles (<0.40)', other_band),
        ):
            count = int(band.sum())
            print(f"| {label} | {count} | {count / total * 100:.3f} |")

        # Score guide with markdown table
        print("\n## SCORE GUIDE\n")
        print("| Priority | Range | Category |")
        print("|----------|--------|----------|")
        print("| High | 0.90 - 1.00 | Primary Target |")
        print("| High | 0.40 - 0.89 | Senior HR |")
        print("| Low | 0.05 - 0.39 | Other/Non-HR |")

        target_keywords = ('aspiring', 'seeking', 'entry-level', 'entry level', 'internship')
        senior_keywords = ('svp', 'chro', 'chief', 'director', 'senior vp',
                           'vice president', 'senior', 'manager')

        def is_target_role(title):
            """Determines if a role is a target role (aspiring/seeking)"""
            lowered = str(title).lower()
            return any(keyword in lowered for keyword in target_keywords)

        def is_senior_role(title):
            """Determines if a role is actually senior (not aspiring/seeking)"""
            if is_target_role(title):
                return False
            lowered = str(title).lower()
            return any(keyword in lowered for keyword in senior_keywords)

        def keep_primary_targets(group):
            # A target title is never classed as senior by is_senior_role,
            # so the target check alone is sufficient.
            return group[group['job_title'].apply(is_target_role)]

        def keep_senior_hr(group):
            looks_senior = (
                group['job_title'].apply(is_senior_role)
                | group['cleaned_title'].str.contains(
                    'senior|manager|specialist', case=False, na=False)
            )
            is_aspirant = group['job_title'].str.contains(
                'aspiring|seeking', case=False, na=False)
            return group[looks_senior & ~is_aspirant]

        def display_candidate_group(candidates, title, row_filter=None):
            """Print one markdown table with the ten best rows of a band."""
            if candidates.empty:
                return

            shown = candidates if row_filter is None else row_filter(candidates)
            if shown.empty:
                return

            # head(10) is only the "top 10" when rows are ordered by score.
            # The stable sort keeps the incoming order among equal scores.
            shown = shown.sort_values(
                'final_score', ascending=False, kind='mergesort').head(10)

            print(f"\n## {title}\n")

            # Print table header
            headers = ['ID', 'Job Title', 'Cleaned Title', 'Location', 'Connections', 'Score']
            print("| " + " | ".join(headers) + " |")
            print("|-" + "-|-".join("-" * len(h) for h in headers) + "-|")

            # Print rows
            for _, row in shown.iterrows():
                print(f"| {row['id']} | {row['job_title']} | {row['cleaned_title']} | {row['cleaned_location']} | {row['cleaned_connections']} | {row['final_score']:.3f} |")

        # Display each group
        display_candidate_group(
            df[target_band],
            "TOP 10 PRIMARY TARGET CANDIDATES (Score ≥ 0.90)",
            keep_primary_targets,
        )

        display_candidate_group(
            df[senior_band],
            "TOP 10 SENIOR HR PROFESSIONALS (Score 0.40-0.89)",
            keep_senior_hr,
        )

        display_candidate_group(
            df[other_band],
            "TOP 10 OTHER ROLES (Score < 0.40)",
        )

    def _display_candidate_section(self, candidates: pd.DataFrame, title: str):
        """
        Print one titled section listing every candidate in ``candidates``.

        Each candidate is shown as a numbered entry (rank follows row order,
        starting at 1) with ID, job title, cleaned title, location,
        connection count and score to three decimals. Nothing is printed
        for an empty frame.
        """
        if candidates.empty:
            return

        section_rule = "-" * 100
        entry_rule = "-" * 50

        print(f"\n{title}")
        print(section_rule)

        rank = 0
        for _, row in candidates.iterrows():
            rank += 1
            entry = [
                f"\nRank #{rank}",
                f"Candidate ID: {row['id']}",
                f"Title: {row['job_title']}",
                f"Role/Category: {row['cleaned_title']}",
                f"Location: {row['cleaned_location']}",
                f"Network: {row['cleaned_connections']} connections",
                f"Score: {row['final_score']:.3f}",
                entry_rule,
            ]
            print("\n".join(entry))

    def _display_cluster_analysis(self, df: pd.DataFrame):
        """
        Print per-cluster statistics followed by a short profile per cluster.

        The statistics table holds, for every cluster, the mean
        ``final_score``, the number of scores and how many cleaned titles
        contain one of ``self.project_keywords['primary']``. The profiles
        list each cluster (in order of first appearance) with its member
        count and up to three of its titles.
        """
        print("\nCANDIDATE CLUSTER ANALYSIS")
        print("=" * 50)

        def count_keyword_titles(titles):
            # Number of titles mentioning at least one primary keyword.
            matches = 0
            for title in titles:
                if any(kw in title.lower() for kw in self.project_keywords['primary']):
                    matches += 1
            return matches

        # Calculate cluster stats
        aggregations = {
            'final_score': ['mean', 'count'],
            'cleaned_title': count_keyword_titles,
        }
        cluster_stats = df.groupby('cluster').agg(aggregations).round(4)
        cluster_stats.columns = ['Average Score', 'Group Size', 'Keyword Matches']

        print("\nCluster Performance Metrics:")
        print(cluster_stats)

        # Show cluster compositions
        print("\nCluster Composition Analysis:")
        for cluster_id in df['cluster'].unique():
            members = df[df['cluster'] == cluster_id]
            print(f"\nCluster {cluster_id} Profile:")
            print(f"Total Members: {len(members)}")
            if len(members) == 0:
                continue
            print("Representative Titles:")
            for sample_title in members['cleaned_title'].iloc[:3]:
                print(f"- {sample_title}")


    def display_project_challenges(self, df: pd.DataFrame):
        """
        Display key metrics and analysis results:
        - Algorithm effectiveness
        - Candidate distribution
        - Quality thresholds
        - Bias prevention measures

        Candidates are counted in three ``final_score`` bands: >= 0.90
        (primary targets), 0.40 up to but excluding 0.90 (qualified HR) and
        below 0.40 (non-HR). An empty or None frame reports zero counts and
        0.0% shares instead of raising.

        Args:
            df (pd.DataFrame): Ranked candidates with a ``final_score`` column.
        """
        print("\n ANALYSIS OF PROJECT CHALLENGES")
        print("="*100)

        # Calculate metrics
        target_score = 0.90
        total = 0 if df is None else len(df)
        if total:
            scores = df['final_score']
            high_potential = int((scores >= target_score).sum())
            qualified_hr = int(((scores >= 0.40) & (scores < target_score)).sum())
            non_hr = int((scores < 0.40).sum())
        else:
            high_potential = qualified_hr = non_hr = 0

        def share(count):
            # Percentage of the pool; guarded so an empty pool cannot divide by zero.
            return count / total * 100 if total else 0.0

        # Show algorithm performance
        print("\n1. Algorithm Performance & Effectiveness")
        print("-"*50)
        print("Our scoring system combines multiple factors with weighted importance:")
        print("• Job Title Analysis (90% of total score)")
        print("  - Identifies aspiring and seeking HR professionals")
        print("  - Evaluates current HR role levels")
        print("• Location Impact (5% of total score)")
        print("  - Considers tech hubs and major markets")
        print("• Professional Network (5% of total score)")
        print("  - Measures industry connections")

        # Show candidate distribution
        print("\n2. Candidate Pool Analysis")
        print("-"*50)
        print(f"From {total} total candidates:")
        print(f"• Primary targets: {high_potential} ({share(high_potential):.1f}%)")
        print(f"• Qualified HR professionals: {qualified_hr} ({share(qualified_hr):.1f}%)")
        print(f"• Non-HR profiles: {non_hr} ({share(non_hr):.1f}%)")

        # Show quality control
        print("\n3. Quality Control & Cutoff Analysis")
        print("-"*50)
        print("Dynamic scoring thresholds:")
        print(f"• Target threshold: {target_score:.2f}")
        print(f"• High-potential candidates: {high_potential} profiles")
        print("• Qualified HR threshold: 0.40")

        # Show bias prevention
        print("\n4. Bias Prevention & Fair Evaluation")
        print("-"*50)
        print("Multiple measures ensure unbiased assessment:")
        print("• Role-based scoring prioritizes relevant experience")
        print("• Location consideration limited to 5% impact")
        print("• Network size normalized to prevent seniority bias")
        print("• Cluster analysis promotes diverse candidate pool")
        print("• Cluster analysis promotes diverse candidate pool")


    def _get_rank_label(self, score: float) -> str:
        """
        Translate a numeric score into its rank-tier label.

        Tiers are checked from the highest floor down; the first floor the
        score reaches decides the label. Scores below every floor (and
        values that compare false against all of them, such as NaN) are
        labelled "Non-HR Role".
        """
        tier_floors = (
            (0.90, "Primary Target"),
            (0.70, "Strong HR Match"),
            (0.40, "Senior HR Professional"),
            (0.25, "Mid-Level/Junior HR"),
            (0.15, "HR Adjacent"),
        )
        for floor, label in tier_floors:
            if score >= floor:
                return label
        return "Non-HR Role"


    def display_reranking_results(self, original_df: pd.DataFrame, reranked_df: pd.DataFrame):
        """
        Display changes in candidate rankings after reranking:
        - Position changes
        - Score updates
        - Rank label changes

        The first ten positions of both rankings are compared (fewer if
        either frame is shorter). Wherever a position is now held by a
        different candidate, that candidate's previous and new rank, title,
        score and rank label are printed. Ranks are 1-based row positions,
        independent of the frames' index labels. A candidate absent from
        ``original_df`` is shown with previous rank "N/A" and is left out of
        the average movement.

        Args:
            original_df (pd.DataFrame): Ranking before reranking; needs ``id``.
            reranked_df (pd.DataFrame): Ranking after reranking; needs ``id``,
                ``cleaned_title`` and ``final_score``.
        """
        print("\n RERANKING IMPACT ANALYSIS")
        print("="*100)

        original_ids = original_df['id'].tolist()
        reranked_ids = reranked_df['id'].tolist()

        # 1-based position of every candidate in the original ranking
        # (first occurrence wins if an id is repeated).
        previous_rank = {}
        for rank, candidate_id in enumerate(original_ids, start=1):
            previous_rank.setdefault(candidate_id, rank)

        changes = []
        compared = min(10, len(original_ids), len(reranked_ids))
        for position in range(compared):
            candidate_id = reranked_ids[position]
            if original_ids[position] == candidate_id:
                continue
            score = reranked_df['final_score'].iloc[position]
            changes.append({
                'id': candidate_id,
                # Where the candidate now in this slot used to be.
                'old_rank': previous_rank.get(candidate_id),
                'new_rank': position + 1,
                'title': reranked_df['cleaned_title'].iloc[position],
                'score': score,
                'rank_label': self._get_rank_label(score),
            })

        if not changes:
            print("\nNo significant changes in rankings")
            return

        print("\nSignificant Ranking Changes:")
        print("-"*50)
        for change in changes:
            print(f"\nCandidate ID: {change['id']}")
            if change['old_rank'] is None:
                print("Previous Rank: N/A")
            else:
                print(f"Previous Rank: #{change['old_rank']}")
            print(f"New Rank: #{change['new_rank']}")
            print(f"Role: {change['title']}")
            print(f"Score: {change['score']:.3f}")
            print(f"Rank Label: {change['rank_label']}")

        print(f"\nTotal Position Changes: {len(changes)}")
        movements = [abs(c['new_rank'] - c['old_rank'])
                     for c in changes if c['old_rank'] is not None]
        if movements:
            avg_movement = sum(movements) / len(movements)
            print(f"Average Rank Movement: {avg_movement:.1f} positions")


    def process_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Process candidate data for HR talent ranking.

        Steps:
        1. Validate input data
        2. Map column headers onto ``id``, ``job_title``, ``location`` and
           ``connection`` (case-insensitive; "Job Title" with a space is
           accepted)
        3. Clean and standardize fields
        4. Calculate scores
        5. Apply clustering and bias prevention when there are at least
           five candidates

        Args:
            df (pd.DataFrame): Raw candidate data. The input is not modified.

        Returns:
            pd.DataFrame: Processed candidates, or an empty DataFrame when
            the input is None/empty, a required column is missing, or any
            step raises (the error is logged).

        Example:
        results = processor.process_data(candidates_df)
        """
        try:
            logger.info("Starting data processing pipeline...")

            # Validate DataFrame
            if df is None or df.empty:
                logger.error("Invalid input DataFrame")
                return pd.DataFrame()

            # Create working copy & map columns
            processed_df = df.copy()
            canonical_names = {
                'id': 'id',
                'job title': 'job_title',
                'job_title': 'job_title',
                'location': 'location',
                'connection': 'connection',
            }
            rename_map = {}
            taken = set(processed_df.columns)
            for column in processed_df.columns:
                target = canonical_names.get(str(column).strip().lower())
                # Never rename onto a name that already exists: that would
                # create duplicate columns.
                if target is not None and target not in taken:
                    rename_map[column] = target
                    taken.add(target)
            processed_df = processed_df.rename(columns=rename_map)

            # Validate required columns exist (under their canonical names,
            # because every access below uses exactly these names)
            required_columns = {'id', 'job_title', 'location', 'connection'}
            missing_columns = required_columns - set(processed_df.columns)
            if missing_columns:
                logger.error(f"Missing columns: {sorted(missing_columns)}")
                return pd.DataFrame()

            # Convert data types
            processed_df['id'] = processed_df['id'].fillna(0).astype(int)
            processed_df['job_title'] = processed_df['job_title'].fillna('').astype(str)
            processed_df['location'] = processed_df['location'].fillna('').astype(str)
            processed_df['connection'] = processed_df['connection'].fillna('0').astype(str)

            # Clean and standardize fields
            processed_df['cleaned_title'] = processed_df['job_title'].apply(self.clean_title)
            processed_df['cleaned_location'] = processed_df['location'].apply(self.clean_location)
            processed_df['cleaned_connections'] = processed_df['connection'].apply(self.clean_connections)

            # Calculate scores
            processed_df['agent_score'] = processed_df.apply(self.calculate_agent_consensus, axis=1)
            processed_df['final_score'] = processed_df['agent_score']

            # Apply normalization
            # NOTE(review): normalize_scores replaces final_score with the
            # title-based base score, so the agent consensus survives only
            # in 'agent_score' - confirm this is intended.
            processed_df = self.normalize_scores(processed_df)

            # Apply clustering if enough candidates
            if len(processed_df) >= 5:
                processed_df['cluster'] = self.clusterer.create_clusters(processed_df['cleaned_title'])
                processed_df['cluster_similarity'] = processed_df.apply(
                    lambda x: self.clusterer.get_cluster_similarity(
                        x['cleaned_title'],
                        int(x['cluster'])
                    ),
                    axis=1
                )
                processed_df = self.prevent_bias(processed_df)

            return processed_df

        except Exception as e:
            logger.error(f"Processing error: {str(e)}")
            return pd.DataFrame()


    def rerank_after_starring(self, df: pd.DataFrame, starred_id) -> pd.DataFrame:
        """
        Reranks all candidates based on their similarity to a starred/selected candidate.

        The input frame is left untouched; a reranked copy is returned.

        Args:
            df (pd.DataFrame): Candidates with ``id``, ``cleaned_title`` and
                ``cleaned_location`` columns.
            starred_id: ID of the selected candidate (anything ``int()`` accepts).

        Returns:
            pd.DataFrame: Copy of ``df`` sorted by the new ``final_score``
            (highest first, index reset). ``df`` itself is returned when it
            has no ``id`` column, the ID is not present, or an error occurs
            (the error is logged).

        Scoring (titles are compared case-insensitively):
        - Role type match (40%): 1 if both titles contain the same first
          entry of specialist / manager / coordinator / generalist /
          director, or neither contains any
        - Experience match (30%): same rule for senior / lead / principal /
          junior / associate / entry
        - Title similarity (30%): shared words divided by the word count of
          the longer title (0 when both titles are empty)
        - Location bonus (+0.05): identical ``cleaned_location``

        Candidates are ordered by the raw sum (up to 1.05, so an exact match
        in the same location outranks one elsewhere); ties keep their
        previous relative order. ``final_score`` is then clipped to 0-1.

        Example:
        >>> df = pd.DataFrame({
        ...     'id': [1, 2, 3],
        ...     'cleaned_title': ['HR Manager', 'HR Specialist', 'Sales Manager'],
        ...     'cleaned_location': ['Texas', 'Texas', 'California']
        ... })
        >>> reranked_df = processor.rerank_after_starring(df, starred_id=1)
        >>> reranked_df[['id', 'final_score']]
        id 1 -> 1.00  # exact match (0.40 + 0.30 + 0.30 + 0.05, clipped)
        id 3 -> 0.85  # same role type and level, half the words shared
        id 2 -> 0.50  # role type differs; same level, half the words, same location
        """
        try:
            # Get starred candidate
            starred_id = int(starred_id)
            if 'id' not in df.columns or starred_id not in df['id'].values:
                return df
            starred = df[df['id'] == starred_id].iloc[0]
            starred_location = starred['cleaned_location']

            role_types = ('specialist', 'manager', 'coordinator', 'generalist', 'director')
            levels = ('senior', 'lead', 'principal', 'junior', 'associate', 'entry')

            def normalize(title):
                # Non-string titles (e.g. NaN) are treated as empty.
                return title.lower() if isinstance(title, str) else ''

            def first_match(text, vocabulary):
                # First vocabulary entry (in list order) contained in text, '' if none.
                return next((term for term in vocabulary if term in text), '')

            # The starred side of every comparison is constant: compute it once.
            starred_text = normalize(starred['cleaned_title'])
            starred_role = first_match(starred_text, role_types)
            starred_level = first_match(starred_text, levels)
            starred_words = set(starred_text.split())

            def similarity_to_starred(title):
                text = normalize(title)
                role_score = 1.0 if first_match(text, role_types) == starred_role else 0.0
                level_score = 1.0 if first_match(text, levels) == starred_level else 0.0
                words = set(text.split())
                largest = max(len(words), len(starred_words))
                # Two empty titles share no words; avoid dividing by zero.
                overlap = len(words & starred_words) / largest if largest else 0.0
                return role_score * 0.40 + level_score * 0.30 + overlap * 0.30

            # Work on a copy so the caller's ranking and scores stay intact.
            reranked = df.copy()
            location_bonus = (
                reranked['cleaned_location'] == starred_location
            ).astype(float) * 0.05
            reranked['final_score'] = (
                reranked['cleaned_title'].apply(similarity_to_starred) + location_bonus
            )

            # Sort on the unclipped score, then bound it like every other score.
            reranked = reranked.sort_values(
                'final_score', ascending=False, kind='mergesort'
            ).reset_index(drop=True)
            reranked['final_score'] = reranked['final_score'].clip(0, 1)

            return reranked

        except Exception as e:
            logger.error(f"Reranking error: {str(e)}")
            return df

    def _update_role_categories(self, title: str) -> None:
        """
        Record one job title in the role-category counters of ``self.metrics``.

        Exactly one counter in ``self.metrics['role_categories']`` is
        incremented per call; for HR categories a counter in
        ``self.metrics['keyword_matches']`` is incremented as well.

        Args:
            title (str): Cleaned job title to categorize.

        Categories, first match wins:
        1. Empty title -> other_roles
        2. "aspiring human resources" -> aspiring_hr (keyword: aspiring)
        3. "seeking human resources" -> seeking_hr (keyword: seeking)
        4. "chief human resources", "senior vice president" or "director"
           -> senior_hr (keyword: other_hr)
        5. "specialist", "generalist" or "manager" -> mid_level_hr
           (keyword: other_hr)
        6. "coordinator" -> junior_hr (keyword: other_hr)
        7. Title starting with "NON-HR:" -> other_roles
        8. Anything else -> hr_adjacent

        Example:
        >>> processor._update_role_categories("aspiring human resources manager")
        # self.metrics['role_categories']['aspiring_hr'] += 1
        # self.metrics['keyword_matches']['aspiring'] += 1
        """
        def bump(category, keyword=None):
            # Role counter first, then the optional keyword counter.
            self.metrics['role_categories'][category] += 1
            if keyword is not None:
                self.metrics['keyword_matches'][keyword] += 1

        senior_markers = ['chief human resources', 'senior vice president', 'director']
        mid_level_markers = ['specialist', 'generalist', 'manager']

        if not title:
            bump('other_roles')
        elif 'aspiring human resources' in title:
            bump('aspiring_hr', 'aspiring')
        elif 'seeking human resources' in title:
            bump('seeking_hr', 'seeking')
        elif any(marker in title for marker in senior_markers):
            bump('senior_hr', 'other_hr')
        elif any(marker in title for marker in mid_level_markers):
            bump('mid_level_hr', 'other_hr')
        elif 'coordinator' in title:
            bump('junior_hr', 'other_hr')
        elif title.startswith('NON-HR:'):
            bump('other_roles')
        else:
            bump('hr_adjacent')


def main(
    csv_path: str = '/content/potential-talents - Aspiring human resources - seeking human resources (2).csv',
    *,
    star_rank: int = 7,
):
    """
    Main execution function for HR talent ranking system.
    Handles end-to-end process from data loading to results display.

    Args:
        csv_path (str): Location of the candidate CSV file. Defaults to the
            Colab upload path the notebook was originally written against.
        star_rank (int): 1-based position, within the initial ranking, of the
            candidate to "star" when demonstrating expert feedback.
            Defaults to 7.

    Returns:
        tuple: (processor, results) on success, where results is the initial
        ranking returned by processor.process_data. (None, None) if the file
        cannot be loaded, no candidates survive processing, or any
        unexpected error occurs.

    Implementation steps:
    1. Print system configuration and target profiles
    2. Load candidate data from CSV file
       - Handle file reading errors
    3. Initialize HRTalentProcessor
    4. Process candidate data into an initial ranking
    5. Display initial ranking analysis and project challenges
    6. Demonstrate expert feedback:
       - Select the candidate at position star_rank
       - Perform reranking based on the starred candidate
       - Display updated rankings
       - Skip (with a message) if star_rank is outside the ranking
    7. Return processor and results or (None, None) if errors
    """
    try:
        print("\nHR TALENT RANKING SYSTEM")
        print("=" * 80)
        print("\nSystem configuration focuses on project keywords:")
        print("- 'aspiring human resources': Primary target profile")
        print("- 'seeking human resources': Alternative target profile")

        print("\nInitiating data loading process...")
        try:
            # Load CSV file
            df = pd.read_csv(csv_path)
            print(f"Successfully loaded {len(df)} candidate profiles")
        except Exception as e:
            # Loading problems are reported to the user directly and end the
            # run early; nothing downstream can work without the data.
            print(f"Data loading error: {str(e)}")
            return None, None

        print("\nConfiguring HR talent processor...")
        processor = HRTalentProcessor()
        print("Processor initialized with:")
        print("- Multi-agent evaluation system")
        print("- Genetic tie-breaking algorithm")
        print("- Clustering capability")

        print("\nExecuting initial candidate evaluation...")
        results = processor.process_data(df)

        if results.empty:
            print("\nProcessing Error: No valid candidates identified")
            print("Please verify data format and content")
            return None, None

        print("\nINITIAL RANKING ANALYSIS")
        processor.display_results(results)

        # Display project challenges analysis
        processor.display_project_challenges(results)

        print("\nDEMONSTRATING EXPERT FEEDBACK MECHANISM")
        # Guard both ends: a rank below 1 would otherwise wrap around to the
        # end of the ranking through negative positional indexing.
        if 1 <= star_rank <= len(results):
            # Take the candidate at the requested rank as the example
            starred = results.iloc[star_rank - 1]
            starred_id = starred['id']

            print("\nSimulating expert feedback by starring candidate:")
            print(f"ID: {starred['id']}")
            print(f"Title: {starred['job_title']}")

            # Perform reranking
            updated_results = processor.rerank_after_starring(results, starred_id)

            print("\nUPDATED RANKING ANALYSIS (Post-Feedback)")
            processor.display_results(updated_results)
        else:
            print(f"\nSkipping feedback demonstration: rank {star_rank} "
                  f"is outside the {len(results)} ranked candidates")

        # The initial ranking (not the re-ranked one) is what callers receive.
        return processor, results

    except Exception as e:
        # logger.exception records the traceback, so the log really does hold
        # the "detailed error information" the user is pointed to below.
        logger.exception("Critical error in main execution: %s", e)
        print(f"\nSystem encountered an unexpected error: {str(e)}")
        print("Please check the log for detailed error information")
        return None, None

if __name__ == "__main__":
    # Run the full pipeline; main() hands back (None, None) on any failure.
    processor, results = main()

    # Report the outcome: success only when both objects came back.
    if processor is not None and results is not None:
        print("\nSystem execution completed successfully.")
        print(f"Processed {len(results)} candidates")
        print("Use processor.display_results(results) for detailed analysis")
    else:
        print("\nSystem execution failed. Please check error messages above.")


HR TALENT RANKING SYSTEM

System configuration focuses on project keywords:
- 'aspiring human resources': Primary target profile
- 'seeking human resources': Alternative target profile

Initiating data loading process...
Successfully loaded 104 candidate profiles

Configuring HR talent processor...
Processor initialized with:
- Multi-agent evaluation system
- Genetic tie-breaking algorithm
- Clustering capability

Executing initial candidate evaluation...

INITIAL RANKING ANALYSIS

# HR TALENT RANKING ANALYSIS

## CANDIDATE DISTRIBUTION

| Category | Count | Percentage |
|----------|--------|------------|
| Target Roles (≥0.90) | 49 | 47.115 |
| Senior HR (0.40-0.89) | 27 | 25.962 |
| Other Roles (<0.40) | 28 | 26.923 |

## SCORE GUIDE

| Priority | Range | Category |
|----------|--------|----------|
| High | 0.90 - 1.00 | Primary Target |
| High | 0.40 - 0.89 | Senior HR |
| Low | 0.05 - 0.39 | Other/Non-HR |

## TOP 10 PRIMARY TARGET CANDIDATES (Score ≥ 0.90)

| ID | Job Title | Cle