In [18]:
# Install required packages for HR talent ranking system
!pip install fuzzywuzzy python-Levenshtein scikit-learn pandas numpy

# Core imports for data processing and analysis
import pandas as pd  # Data manipulation
import numpy as np   # Numerical operations

# ML/NLP related imports
from sklearn.feature_extraction.text import TfidfVectorizer  # Text vectorization
from sklearn.cluster import KMeans  # Clustering algorithm
from sklearn.metrics.pairwise import cosine_similarity  # Similarity calculation
from fuzzywuzzy import fuzz  # String matching/comparison

# Standard library imports
import re  # Regular expressions for text processing
import copy  # Deep copying objects
from typing import Dict, List, Tuple  # Type hints
from collections import deque  # Fixed-size queue for performance history
import logging  # Error tracking and debugging

# Logging configuration
logging.basicConfig(
    level=logging.INFO,  # Set logging level to INFO
    format='%(asctime)s - %(levelname)s - %(message)s'  # Log format: timestamp - level - message
)
logger = logging.getLogger(__name__)  # Create logger instance



class GeneticTieBreaker:
    """
    Handles tie-breaking in candidate rankings using genetic algorithm optimization.

    Features:
    - Evolves candidate rankings to break ties optimally
    - Uses fitness scoring based on multiple agents
    - Implements crossover and mutation for population diversity

    Example:
    tie_breaker = GeneticTieBreaker(population_size=50, generations=10)
    optimized_df = tie_breaker.resolve_ties(candidates_df, evaluation_agents)
    """

    def __init__(self, population_size=50, generations=10, mutation_rate=0.1):
        """
        Initialize genetic algorithm parameters.

        Args:
            population_size: Number of rankings in each generation
            generations: Number of evolution cycles
            mutation_rate: Probability of random mutations (0-1)
        """
        self.population_size = population_size  # Size of population per generation
        self.generations = generations          # Number of evolution cycles
        self.mutation_rate = mutation_rate      # Mutation probability

    def fitness(self, candidate: pd.Series, agents: List) -> float:
        """
        Calculate fitness score for a candidate using weighted agent evaluations.

        Example:
        score = fitness(candidate, [experience_agent, skills_agent])
        """
        return sum(agent.evaluate(candidate) * agent.weight for agent in agents)

    def crossover(self, parent1: pd.DataFrame, parent2: pd.DataFrame) -> pd.DataFrame:
        """
        Create child ranking by combining two parent rankings.
        Splits parents at random point and merges their rankings.

        Example:
        child = crossover(ranking1, ranking2)
        """
        # Validate parents have same length
        if len(parent1) != len(parent2):
            return parent1

        # Select random crossover point
        crossover_point = np.random.randint(len(parent1))

        # Combine parent rankings
        child = pd.concat([
            parent1.iloc[:crossover_point],     # First part from parent1
            parent2.iloc[crossover_point:]      # Second part from parent2
        ]).reset_index(drop=True)

        return child

    def mutate(self, ranking: pd.DataFrame) -> pd.DataFrame:
        """
        Randomly swap positions in ranking based on mutation rate.

        Example:
        mutated = mutate(ranking)  # May swap random candidates
        """
        if np.random.random() < self.mutation_rate and len(ranking) > 1:
            # Select two random positions
            idx1, idx2 = np.random.choice(len(ranking), 2, replace=False)
            # Swap candidates
            ranking.iloc[idx1], ranking.iloc[idx2] = ranking.iloc[idx2].copy(), ranking.iloc[idx1].copy()
        return ranking

    def evolve_population(self, tied_group: pd.DataFrame, agents: List) -> pd.DataFrame:
        """
        Evolve population of rankings through multiple generations.

        Process:
        1. Create initial population
        2. Calculate fitness scores
        3. Select parents based on fitness
        4. Create new generation through crossover/mutation
        5. Repeat for specified generations
        """
        # Initialize population
        population = [tied_group.copy() for _ in range(self.population_size)]
        best_fitness = float('-inf')
        best_ranking = None

        # Evolution loop
        for generation in range(self.generations):
            # Calculate fitness for each ranking
            fitness_scores = []
            for ranking in population:
                total_fitness = sum(
                    self.fitness(candidate, agents)
                    for _, candidate in ranking.iterrows()
                )
                fitness_scores.append(total_fitness)

                # Track best ranking
                if total_fitness > best_fitness:
                    best_fitness = total_fitness
                    best_ranking = ranking.copy()

            # Calculate parent selection probabilities
            parent_probs = np.array(fitness_scores) / sum(fitness_scores)

            # Select parents for next generation
            parent_indices = np.random.choice(
                len(population),
                size=self.population_size,
                p=parent_probs
            )

            # Create new generation
            new_population = []
            for i in range(0, self.population_size, 2):
                # Get parent pairs
                parent1 = population[parent_indices[i]]
                parent2 = population[parent_indices[min(i+1, len(parent_indices)-1)]]

                # Create and mutate children
                child1 = self.crossover(parent1, parent2)
                child2 = self.crossover(parent2, parent1)
                child1 = self.mutate(child1)
                child2 = self.mutate(child2)
                new_population.extend([child1, child2])

            # Update population
            population = new_population[:self.population_size]

        return best_ranking

    def resolve_ties(self, df: pd.DataFrame, agents: List) -> pd.DataFrame:
        """
        Main method to resolve ties in candidate rankings.

        Process:
        1. Identify tied groups
        2. Evolve optimal ranking for each tie group
        3. Update final rankings

        Example:
        final_df = resolve_ties(candidates_df, evaluation_agents)
        """
        try:
            # Find groups with tied scores
            tied_groups = df.groupby('final_score').filter(lambda x: len(x) > 1)
            if tied_groups.empty:
                return df

            logger.info(f"Found {len(tied_groups)} candidates in tie groups")
            result_df = df.copy()

            # Process each tie group
            for score in tied_groups['final_score'].unique():
                tie_group = tied_groups[tied_groups['final_score'] == score].copy()
                if len(tie_group) <= 1:
                    continue

                # Evolve optimal ranking for tie group
                optimized_ranking = self.evolve_population(tie_group, agents)

                # Ensure consistent data types
                for column in optimized_ranking.columns:
                    if column in result_df.columns:
                        optimized_ranking[column] = optimized_ranking[column].astype(
                            result_df[column].dtype
                        )

                # Update rankings
                for idx in optimized_ranking.index:
                    result_df.loc[idx, optimized_ranking.columns] = optimized_ranking.loc[idx]

            return result_df.sort_values('final_score', ascending=False)

        except Exception as e:
            logger.error(f"Tie breaking error: {str(e)}")
            return df.sort_values('final_score', ascending=False)



class TalentClusterer:
    """
    Clusters candidates based on job titles using TF-IDF and KMeans.

    Features:
    - TF-IDF vectorization of job titles
    - K-means clustering into role categories
    - Title similarity scoring
    - Enhanced role categorization

    Example:
    clusterer = TalentClusterer(n_clusters=5)
    clusters = clusterer.create_clusters(candidate_titles)
    """
    def __init__(self, n_clusters=5):
        # TF-IDF vectorizer config
        self.vectorizer = TfidfVectorizer(
            max_features=100,      # Limit features
            ngram_range=(1, 2),    # Single words and pairs
            stop_words='english'   # Remove common words
        )

        # KMeans clustering config
        self.kmeans = KMeans(
            n_clusters=n_clusters, # Number of clusters
            random_state=42,       # For reproducibility
            n_init=10             # Number of initializations
        )

        # Storage for model outputs
        self.cluster_centroids = None  # Cluster centers
        self.tfidf_matrix = None       # TF-IDF vectors

    def create_clusters(self, titles: pd.Series) -> np.ndarray:
        """
        Create clusters from job titles.

        Process:
        1. Define important role terms
        2. Process titles for clustering
        3. Create TF-IDF matrix
        4. Perform k-means clustering

        Example:
        clusters = create_clusters(df['job_title'])
        """
        try:
            # Define key role terms for categorization
            important_terms = {
                'target_primary': [
                    'aspiring human resources professional',
                    'aspiring human resources specialist',
                    'aspiring human resources generalist',
                    'aspiring human resources manager'
                ],
                'target_seeking': [
                    'seeking human resources position',
                    'seeking human resources opportunities',
                    'seeking human resources role',
                    'seeking hr position'
                ],
                'current_hr': [
                    'human resources manager',
                    'human resources specialist',
                    'human resources coordinator',
                    'hr specialist',
                    'hr coordinator',
                    'hr generalist'
                ],
                'senior_hr': [
                    'director human resources',
                    'senior human resources',
                    'chief human resources',
                    'hr director',
                    'chro',
                    'vp hr'
                ]
            }

            # Process and standardize titles
            processed_titles = titles.apply(
                lambda x: self._process_title_for_clustering(x, important_terms)
            )

            # Create TF-IDF matrix
            self.tfidf_matrix = self.vectorizer.fit_transform(processed_titles)

            # Perform clustering and store centroids
            clusters = self.kmeans.fit_predict(self.tfidf_matrix)
            self.cluster_centroids = self.kmeans.cluster_centers_

            return clusters

        except Exception as e:
            logger.error(f"Clustering error: {str(e)}")
            return np.zeros(len(titles))

    def _process_title_for_clustering(self, title: str, important_terms: Dict) -> str:
        """
        Process job titles for consistent clustering.

        Rules:
        - Standardize HR role categories
        - Weight primary target roles higher
        - Categorize non-HR roles

        Example:
        processed = _process_title_for_clustering("HR Manager", terms_dict)
        """
        title = title.lower()
        processed_parts = []

        # Process by role category
        if 'aspiring human resources' in title:
            processed_parts.extend(['aspiring hr professional'] * 4)  # Higher weight
        elif 'seeking human resources' in title:
            processed_parts.extend(['seeking hr position'] * 4)
        elif any(x in title for x in ['chief', 'director', 'senior vice president']):
            processed_parts.extend(['senior hr executive'] * 3)
        elif any(x in title for x in ['manager', 'specialist', 'generalist']):
            role = next(x for x in ['manager', 'specialist', 'generalist'] if x in title)
            processed_parts.extend([f'hr {role}'] * 2)
        elif 'coordinator' in title:
            processed_parts.extend(['hr coordinator'] * 2)
        # Non-HR categorization
        elif any(x in title for x in ['teacher', 'education']):
            processed_parts.append('education role')
        elif any(x in title for x in ['engineer', 'programmer', 'systems']):
            processed_parts.append('technical role')
        elif 'student' in title:
            processed_parts.append('student')
        elif any(x in title for x in ['research', 'lab']):
            processed_parts.append('research role')
        elif 'business' in title:
            processed_parts.append('business role')
        else:
            processed_parts.append('other role')

        return ' '.join(processed_parts)

    def get_cluster_similarity(self, title: str, cluster_id: int) -> float:
        """
        Calculate similarity between title and cluster centroid.

        Example:
        score = get_cluster_similarity("HR Manager", 0)  # Returns 0-1 score
        """
        try:
            if self.cluster_centroids is None:
                return 0.0

            # Convert title to vector and calculate similarity
            title_vec = self.vectorizer.transform([title])
            centroid = self.cluster_centroids[cluster_id].reshape(1, -1)
            return float(cosine_similarity(title_vec, centroid)[0][0])

        except Exception as e:
            logger.error(f"Similarity calculation error: {str(e)}")
            return 0.0



class RankingAgent:
    """
    Agent that evaluates candidates based on title, location, and network criteria.

    Features:
    - Title-based scoring
    - Location tier evaluation
    - Professional network scoring
    - Adaptive weights based on performance

    Example:
    agent = RankingAgent('title', processor, weight=1.0)
    score = agent.evaluate(candidate)
    """
    def __init__(self, expertise: str, processor, weight: float = 1.0):
        self.expertise = expertise        # Scoring criteria (title/location/connections)
        self.weight = weight             # Agent's importance weight
        self.initial_weight = weight     # Store initial weight
        self.performance_history = deque(maxlen=100)  # Track recent performance
        self.processor = processor       # Reference to main processor
        self.project_keywords = processor.project_keywords  # Keywords for matching

    def evaluate(self, candidate: pd.Series) -> float:
        """
        Evaluate candidate based on agent's expertise area.
        Returns score between 0-1.
        """
        try:
            if self.expertise == 'title':
                return self._title_score(candidate)
            elif self.expertise == 'location':
                return self._location_score(candidate)
            elif self.expertise == 'connections':
                return self._connection_score(candidate)
            return 0.0
        except Exception as e:
            logger.error(f"Agent evaluation error: {str(e)}")
            return 0.0

    def _title_score(self, candidate) -> float:
        """Score candidate based on job title relevance"""
        try:
            title = candidate.get('cleaned_title', '').lower()
            return self.processor.get_base_score(title)
        except Exception as e:
            logger.error(f"Title scoring error: {str(e)}")
            return 0.0

    def _location_score(self, candidate) -> float:
        """
        Score candidate location based on tech hub tiers:
        - Tier 1 hubs: 1.0
        - Tier 2 hubs: 0.7
        - Canada/Turkey: 0.6
        - Other: 0.3
        """
        location = candidate.get('cleaned_location', 'unknown').lower()
        if location in self.processor.tech_hubs['tier1']:
            return 1.0  # Top tech hubs
        elif location in self.processor.tech_hubs['tier2']:
            return 0.7  # Secondary tech hubs
        elif location in {'canada', 'turkey'}:
            return 0.6  # International focus areas
        return 0.3     # Other locations

    def _connection_score(self, candidate) -> float:
        """
        Score professional network size (0-1)
        Normalized to max of 500 connections
        """
        connections = candidate.get('cleaned_connections', 0)
        return min(connections / 500.0, 1.0)

    def update_weight(self, success_rate: float):
        """
        Update agent's weight based on performance.
        Weight increases/decreases by up to 10% based on success.
        """
        self.weight *= 1 + (success_rate - 0.5) * 0.1  # Adjust weight
        self.performance_history.append(success_rate)   # Track performance



class HRTalentProcessor:
    """
    Main processor for HR talent ranking and evaluation.

    Features:
    - Title standardization and cleaning
    - Location-based scoring
    - Professional network evaluation
    - Candidate clustering
    - Performance metrics tracking

    Example:
    processor = HRTalentProcessor()
    results = processor.process_data(candidates_df)
    """
    def __init__(self):
        # Define HR role keywords
        self.project_keywords = {
            'primary': {
                'aspiring': ['aspiring human resources'],
                'seeking': ['seeking human resources']
            },
            'variations': {
                'aspiring': [
                    'aspiring human resources professional',
                    'aspiring human resources specialist',
                    'aspiring human resources generalist',
                    'aspiring human resources manager',
                    'aspiring human resources analyst'
                ],
                'seeking': [
                    'seeking human resources position',
                    'seeking human resources opportunities',
                    'seeking human resources hris',
                    'seeking hr position',
                    'seeking hr opportunities'
                ]
            }
        }

        # Define location tiers
        self.tech_hubs = {
            'tier1': {'texas', 'california', 'new york'},
            'tier2': {'illinois', 'massachusetts', 'north carolina'}
        }

        # Initialize metrics tracking
        self.metrics = {
            'total_candidates': 0,
            'hr_candidates': 0,
            'role_categories': {
                'aspiring_hr': 0,
                'seeking_hr': 0,
                'senior_hr': 0,
                'mid_level_hr': 0,
                'junior_hr': 0,
                'hr_adjacent': 0,
                'other_roles': 0
            },
            'keyword_matches': {
                'aspiring': 0,
                'seeking': 0,
                'other_hr': 0
            },
            'starred_profiles': [],
            'performance_metrics': {
                'initial_cutoff': 0.0,
                'final_cutoff': 0.0,
                'agent_adaptations': 0,
                'ranking_changes': 0,
                'genetic_optimizations': 0
            }
        }

        # Initialize components
        self.clusterer = TalentClusterer(n_clusters=5)
        self.agents = self._initialize_agents()
        self.tie_breaker = GeneticTieBreaker()

        # Track history
        self.final_scores = []
        self.weight_history = []
        self.cluster_sizes = {}
        self.category_distribution = {}

    def _initialize_agents(self) -> List[RankingAgent]:
        """
        Initialize ranking agents with weights:
        - Title (90%): 3 agents x 30% each
        - Location (5%): 2 agents x 2.5% each
        - Connections (5%): 2 agents x 2.5% each
        """
        agents = []
        # Title agents
        for _ in range(3):
            agents.append(RankingAgent('title', self, weight=0.3))
        # Location agents
        for _ in range(2):
            agents.append(RankingAgent('location', self, weight=0.025))
        # Connection agents
        for _ in range(2):
            agents.append(RankingAgent('connections', self, weight=0.025))
        return agents

    def clean_title(self, title: str) -> str:
        """
        Clean and standardize job titles into categories.
        Returns standardized title or NON-HR category.

        Example:
        'HR Manager' -> 'human resources manager'
        'Teacher' -> 'NON-HR: Education'
        """
        if not isinstance(title, str):
            return "NON-HR: Invalid Title"

        title = title.lower().strip()

        # Process by category
        # Aspiring HR roles
        if 'aspiring human resources' in title:
            if 'professional' in title:
                return 'aspiring human resources professional'
            elif 'specialist' in title:
                return 'aspiring human resources specialist'
            elif 'generalist' in title:
                return 'aspiring human resources generalist'
            elif 'manager' in title:
                return 'aspiring human resources manager'
            return 'aspiring human resources professional'

        # Seeking HR roles
        if 'seeking human resources' in title:
            if 'hris' in title:
                return 'seeking human resources hris position'
            elif 'position' in title:
                return 'seeking human resources position'
            return 'seeking human resources opportunities'

        # Senior HR roles
        if 'chro' in title or ('svp' in title and 'hr' in title):
            return 'chief human resources officer'
        if 'director' in title and 'human resources' in title:
            return 'director human resources'

        # HR roles
        if any(x in title for x in ['human resources', 'hr']):
            if 'senior' in title or 'sr' in title:
                return 'senior human resources specialist'
            elif 'specialist' in title:
                return 'human resources specialist'
            elif 'generalist' in title:
                return 'human resources generalist'
            elif 'coordinator' in title:
                return 'human resources coordinator'
            elif 'manager' in title:
                return 'human resources manager'
            return 'human resources professional'

        # Adjacent roles
        if 'people development' in title:
            return 'people development coordinator'

        # Non-HR categorization
        if 'teacher' in title or 'education' in title:
            return 'NON-HR: Education'
        if any(x in title for x in ['engineer', 'programmer', 'systems']):
            return 'NON-HR: Technical'
        if 'student' in title and not any(x in title for x in ['hr', 'human resources']):
            return 'NON-HR: Student'
        if 'director' in title or 'administration' in title:
            return 'NON-HR: Management'
        if 'research' in title or 'lab' in title:
            return 'NON-HR: Research'
        if 'business' in title and not any(x in title for x in ['hr', 'human resources']):
            return 'NON-HR: Business'

        return 'NON-HR: Other'

    def clean_location(self, location: str) -> str:
        """
        Clean and standardize location names.
        Maps cities to states and handles international locations.

        Example:
        'New York City' -> 'new york'
        'İzmir' -> 'turkey'
        """
        if not isinstance(location, str):
            return "unknown"

        # Clean and normalize
        location = location.lower().strip()
        location = (location.encode('utf-8', 'ignore')
                          .decode('utf-8')
                          .lower()
                          .strip())

        # Location mappings
        metro_to_state = {
            'grand rapids': 'michigan',
            'san francisco bay': 'california',
            'houston': 'texas',
            'dallas': 'texas',
            'austin': 'texas',
            'new york city': 'new york',
            'boston': 'massachusetts',
            'chicago': 'illinois',
            'chattanooga': 'tennessee',
            'virginia beach': 'virginia'
        }

        international_mapping = {
            'kanada': 'canada',
            'ä°zmir': 'turkey',
            'izmir': 'turkey',
            'tã¼rkiye': 'turkey',
            'türkiye': 'turkey',
            'amerika birleåÿik devletleri': 'united states'
        }

        # Check international names
        for old, new in international_mapping.items():
            if old in location:
                return new

        # Clean location text
        location = re.sub(r'\b(greater|area|metropolitan)\b', '', location)

        # Process city,state format
        if ',' in location:
            parts = [part.strip() for part in location.split(',')]
            city = parts[0]
            if city in metro_to_state:
                return metro_to_state[city]
            if len(parts) > 1:
                state = parts[1].strip()
                state = re.sub(r'\s+area$', '', state)
                if state in self.tech_hubs['tier1'] or state in self.tech_hubs['tier2']:
                    return state
            return city

        # Check metro areas
        for metro, state in metro_to_state.items():
            if metro in location:
                return state

        return location

    def clean_connections(self, connections) -> int:
        """
        Clean and normalize connection counts.
        Handles string formatting and caps at 500.

        Example:
        '500+' -> 500
        '250' -> 250
        """
        try:
            if isinstance(connections, str):
                connections = connections.replace('+', '').strip()
            return min(int(connections), 500)
        except (ValueError, TypeError):
            return 0




      # Scoring and normalization methods for HR talent processor

    def calculate_agent_consensus(self, candidate: pd.Series) -> float:
        """
        Calculate weighted average of agent scores for a candidate.
        Returns 0-1 score combining title, location, and network evaluations.
        """
        try:
            scores = [agent.evaluate(candidate) * agent.weight
                      for agent in self.agents]
            return sum(scores)
        except Exception as e:
            logger.error(f"Consensus calculation error: {str(e)}")
            return 0.0

    def get_base_score(self, title: str) -> float:
        """
        Calculate base score from job title with role-based thresholds:
        - Primary HR (0.90-1.00): Aspiring/seeking roles
        - Senior HR (0.70-0.89): Leadership roles
        - Current HR (0.50-0.69): Standard roles
        - Adjacent (0.30-0.49): Related roles
        - Non-HR (0.05): Other roles
        """
        if not isinstance(title, str) or not title:
            return 0.05

        # Score tiers
        if any(x in title.lower() for x in ['aspiring human resources', 'seeking human resources']):
            if 'professional' in title or 'specialist' in title:
                return 0.95
            if 'generalist' in title or 'manager' in title:
                return 0.92
            return 0.90

        if 'chief human resources officer' in title or 'chro' in title:
            return 0.85
        if 'director human resources' in title:
            return 0.82
        if 'senior human resources' in title:
            return 0.78

        if 'human resources manager' in title:
            return 0.65
        if 'human resources specialist' in title:
            return 0.62
        if 'human resources generalist' in title:
            return 0.60
        if 'human resources coordinator' in title:
            return 0.55
        if 'human resources professional' in title:
            return 0.52

        if 'people development' in title:
            return 0.45

        if title.startswith('NON-HR:'):
            return 0.05

        return 0.05

    def normalize_scores(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Normalize candidate scores by:
        1. Calculating base title scores
        2. Setting initial final scores
        3. Clipping to 0-1 range
        """
        try:
            if df is None or df.empty:
                return df

            result_df = df.copy()
            result_df['base_score'] = result_df['cleaned_title'].apply(self.get_base_score)
            result_df['final_score'] = result_df['base_score']
            result_df['final_score'] = result_df['final_score'].clip(0, 1)

            return result_df

        except Exception as e:
            logger.error(f"Error in normalize_scores: {str(e)}")
            return df

    def prevent_bias(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Prevent scoring bias by:
        1. Normalizing within clusters
        2. Calculating diversity factors
        3. Adjusting for keyword representation
        4. Applying balanced adjustments
        """
        try:
            if 'cluster' not in df.columns:
                return df

            # Normalize cluster scores
            df['cluster_normalized_score'] = df.groupby('cluster')['final_score'].transform(
                lambda x: (x - x.mean()) / (x.std() if x.std() != 0 else 1)
            )

            # Calculate diversity metrics
            cluster_sizes = df.groupby('cluster').size()
            total_candidates = len(df)
            diversity_factor = 1 - (cluster_sizes / total_candidates)

            # Get keyword metrics
            cluster_keyword_ratio = df.groupby('cluster').apply(
                lambda g: sum(1 for title in g['cleaned_title']
                              if any(kw in title.lower()
                                    for kw in self.project_keywords['primary'])) / len(g),
                include_groups=False
            )

            # Apply adjustments
            for cluster in df['cluster'].unique():
                mask = df['cluster'] == cluster
                diversity_adjustment = diversity_factor[cluster] * 0.1
                keyword_adjustment = cluster_keyword_ratio[cluster] * 0.1

                df.loc[mask, 'final_score'] = df.loc[mask, 'final_score'].apply(
                    lambda x: x * (1 + diversity_adjustment + keyword_adjustment)
                )

            df['final_score'] = df['final_score'].clip(0, 1)
            return df

        except Exception as e:
            logger.error(f"Error in bias prevention: {str(e)}")
            return df

    def display_results(self, df: pd.DataFrame):
        """
        Display analysis results including:
        - Candidate distribution
        - Score guide
        - Categorized candidate listings
        - Project challenges

        Example output sections:
        - Distribution summary
        - Score ranges and priorities
        - Primary target candidates
        - Senior HR professionals
        - Non-HR roles
        """
        # Format header
        print("\n" + "="*100)
        print(" "*40 + "HR TALENT RANKING ANALYSIS")
        print("="*100)

        # Show distribution
        print("\n CANDIDATE DISTRIBUTION")
        print("-"*50)
        total = len(df)
        target = len(df[df['final_score'] >= 0.9])
        senior = len(df[(df['final_score'] >= 0.4) & (df['final_score'] < 0.9)])
        others = len(df[df['final_score'] < 0.4])

        print(f"Total Candidates Analyzed: {total}")
        print(f"\nBreakdown:")
        print(f"• Target Roles (Aspiring/Seeking HR):   {target:>4} ({target/total*100:>5.1f}%)")
        print(f"• Senior HR Professionals:              {senior:>4} ({senior/total*100:>5.1f}%)")
        print(f"• Other/Non-HR Roles:                  {others:>4} ({others/total*100:>5.1f}%)")

        # Show scoring guide
        print("\n SCORE GUIDE")
        print("-"*50)
        print("High Priority:")
        print("  0.90 - 1.00  |  Primary Target (Aspiring/Seeking HR)")
        print("  0.70 - 0.89  |  Strong HR Match")
        print("\nMid Priority:")
        print("  0.40 - 0.69  |  Senior HR Professional")
        print("  0.25 - 0.39  |  Mid-Level/Junior HR")
        print("\nLow Priority:")
        print("  0.15 - 0.24  |  HR Adjacent")
        print("  0.00 - 0.14  |  Non-HR Role")

        # Display candidate sections
        self._display_candidate_section(
            df[df['final_score'] >= 0.9],
            " PRIMARY TARGET CANDIDATES (Score ≥ 0.90)"
        )

        self._display_candidate_section(
            df[(df['final_score'] >= 0.4) & (df['final_score'] < 0.9)],
            " SENIOR HR PROFESSIONALS (Score 0.40-0.89)"
        )

        self._display_candidate_section(
            df[df['final_score'] < 0.4],
            " NON-HR AND OTHER ROLES (Score < 0.40)"
        )

        self.display_project_challenges(df)


    def _display_candidate_section(self, candidates: pd.DataFrame, title: str):
        """
        Display formatted candidate details grouped by category.
        Shows rank, ID, title, role, location, network and score.
        """
        if candidates.empty:
            return

        print(f"\n{title}")
        print("-"*100)

        for idx, (_, candidate) in enumerate(candidates.iterrows(), 1):
            print(f"\nRank #{idx}")
            print(f"Candidate ID: {candidate['id']}")
            print(f"Title: {candidate['job_title']}")
            print(f"Role/Category: {candidate['cleaned_title']}")
            print(f"Location: {candidate['cleaned_location']}")
            print(f"Network: {candidate['cleaned_connections']} connections")
            print(f"Score: {candidate['final_score']:.3f}")
            print("-"*50)

    def _display_cluster_analysis(self, df: pd.DataFrame):
        """
        Display cluster statistics and composition analysis.
        Shows metrics like average scores, sizes, keyword matches.
        """
        print("\nCANDIDATE CLUSTER ANALYSIS")
        print("=" * 50)

        # Calculate cluster stats
        cluster_stats = df.groupby('cluster').agg({
            'final_score': ['mean', 'count'],
            'cleaned_title': lambda x: sum(1 for title in x if any(
                kw in title.lower() for kw in self.project_keywords['primary']
            ))
        }).round(4)

        cluster_stats.columns = ['Average Score', 'Group Size', 'Keyword Matches']
        print("\nCluster Performance Metrics:")
        print(cluster_stats)

        # Show cluster compositions
        print("\nCluster Composition Analysis:")
        for cluster in df['cluster'].unique():
            cluster_df = df[df['cluster'] == cluster]
            print(f"\nCluster {cluster} Profile:")
            print(f"Total Members: {len(cluster_df)}")
            if len(cluster_df) > 0:
                print("Representative Titles:")
                for title in cluster_df['cleaned_title'].head(3):
                    print(f"- {title}")


    def display_project_challenges(self, df: pd.DataFrame):
        """
        Display key metrics and analysis results:
        - Algorithm effectiveness
        - Candidate distribution
        - Quality thresholds
        - Bias prevention measures
        """
        print("\n ANALYSIS OF PROJECT CHALLENGES")
        print("="*100)

        # Calculate metrics
        target_score = 0.90
        high_potential = len(df[df['final_score'] >= target_score])
        qualified_hr = len(df[(df['final_score'] >= 0.40) &
                            (df['final_score'] < target_score)])
        non_hr = len(df[df['final_score'] < 0.40])

        # Show algorithm performance
        print("\n1. Algorithm Performance & Effectiveness")
        print("-"*50)
        print("Our scoring system combines multiple factors with weighted importance:")
        print("• Job Title Analysis (90% of total score)")
        print("  - Identifies aspiring and seeking HR professionals")
        print("  - Evaluates current HR role levels")
        print("• Location Impact (5% of total score)")
        print("  - Considers tech hubs and major markets")
        print("• Professional Network (5% of total score)")
        print("  - Measures industry connections")

        # Show candidate distribution
        print("\n2. Candidate Pool Analysis")
        print("-"*50)
        print(f"From {len(df)} total candidates:")
        print(f"• Primary targets: {high_potential} ({high_potential/len(df)*100:.1f}%)")
        print(f"• Qualified HR professionals: {qualified_hr} ({qualified_hr/len(df)*100:.1f}%)")
        print(f"• Non-HR profiles: {non_hr} ({non_hr/len(df)*100:.1f}%)")

        # Show quality control
        print("\n3. Quality Control & Cutoff Analysis")
        print("-"*50)
        print("Dynamic scoring thresholds:")
        print(f"• Target threshold: {target_score:.2f}")
        print(f"• High-potential candidates: {high_potential} profiles")
        print("• Qualified HR threshold: 0.40")

        # Show bias prevention
        print("\n4. Bias Prevention & Fair Evaluation")
        print("-"*50)
        print("Multiple measures ensure unbiased assessment:")
        print("• Role-based scoring prioritizes relevant experience")
        print("• Location consideration limited to 5% impact")
        print("• Network size normalized to prevent seniority bias")
        print("• Cluster analysis promotes diverse candidate pool")


    def _get_rank_label(self, score: float) -> str:
        if score >= 0.90:
            return "Primary Target"
        elif score >= 0.70:
            return "Strong HR Match"
        elif score >= 0.40:
            return "Senior HR Professional"
        elif score >= 0.25:
            return "Mid-Level/Junior HR"
        elif score >= 0.15:
            return "HR Adjacent"
        else:
            return "Non-HR Role"


    def display_reranking_results(self, original_df: pd.DataFrame, reranked_df: pd.DataFrame):
        """
        Display changes in candidate rankings after reranking:
        - Position changes
        - Score updates
        - Rank label changes
        """
        print("\n RERANKING IMPACT ANALYSIS")
        print("="*100)

        changes = []
        for i in range(min(10, len(original_df))):
            old_id = original_df.iloc[i]['id']
            new_id = reranked_df.iloc[i]['id']
            if old_id != new_id:
                old_rank = original_df[original_df['id'] == old_id].index[0] + 1
                new_rank = reranked_df[reranked_df['id'] == new_id].index[0] + 1
                changes.append({
                    'id': new_id,
                    'old_rank': old_rank,
                    'new_rank': new_rank,
                    'title': reranked_df[reranked_df['id'] == new_id]['cleaned_title'].iloc[0],
                    'score': reranked_df[reranked_df['id'] == new_id]['final_score'].iloc[0],
                    'rank_label': self._get_rank_label(reranked_df[reranked_df['id'] == new_id]['final_score'].iloc[0])
                })

        if changes:
            print("\nSignificant Ranking Changes:")
            print("-"*50)
            for change in changes:
                print(f"\nCandidate ID: {change['id']}")
                print(f"Previous Rank: #{change['old_rank']}")
                print(f"New Rank: #{change['new_rank']}")
                print(f"Role: {change['title']}")
                print(f"Score: {change['score']:.3f}")
                print(f"Rank Label: {change['rank_label']}")

            print(f"\nTotal Position Changes: {len(changes)}")
            avg_movement = sum(abs(c['new_rank'] - c['old_rank']) for c in changes) / len(changes)
            print(f"Average Rank Movement: {avg_movement:.1f} positions")
        else:
            print("\nNo significant changes in rankings")


    def process_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Process candidate data for HR talent ranking.

        Steps:
        1. Validate input data
        2. Clean and standardize fields
        3. Calculate scores
        4. Apply clustering
        5. Prevent bias

        Example:
        results = processor.process_data(candidates_df)
        """
        try:
            logger.info("Starting data processing pipeline...")

            # Validate DataFrame
            if df is None or df.empty:
                logger.error("Invalid input DataFrame")
                return pd.DataFrame()

            # Create working copy & map columns
            processed_df = df.copy()
            processed_df = processed_df.rename(columns={
                'Job Title': 'job_title',
                'Location': 'location',
                'Connection': 'connection',
                'ID': 'id'
            })

            # Validate required columns exist
            required_columns = {'id', 'job_title', 'location', 'connection'}
            if not required_columns.issubset(set(map(str.lower, processed_df.columns))):
                logger.error(f"Missing columns: {required_columns - current_columns}")
                return pd.DataFrame()

            # Convert data types
            processed_df['id'] = processed_df['id'].fillna(0).astype(int)
            processed_df['job_title'] = processed_df['job_title'].fillna('').astype(str)
            processed_df['location'] = processed_df['location'].fillna('').astype(str)
            processed_df['connection'] = processed_df['connection'].fillna('0').astype(str)

            # Clean and standardize fields
            processed_df['cleaned_title'] = processed_df['job_title'].apply(self.clean_title)
            processed_df['cleaned_location'] = processed_df['location'].apply(self.clean_location)
            processed_df['cleaned_connections'] = processed_df['connection'].apply(self.clean_connections)

            # Calculate scores
            processed_df['agent_score'] = processed_df.apply(self.calculate_agent_consensus, axis=1)
            processed_df['final_score'] = processed_df['agent_score']

            # Apply normalization
            processed_df = self.normalize_scores(processed_df)

            # Apply clustering if enough candidates
            if len(processed_df) >= 5:
                processed_df['cluster'] = self.clusterer.create_clusters(processed_df['cleaned_title'])
                processed_df['cluster_similarity'] = processed_df.apply(
                    lambda x: self.clusterer.get_cluster_similarity(
                        x['cleaned_title'],
                        int(x['cluster'])
                    ),
                    axis=1
                )
                processed_df = self.prevent_bias(processed_df)

            return processed_df

        except Exception as e:
            logger.error(f"Processing error: {str(e)}")
            return pd.DataFrame()


    def rerank_after_starring(self, df: pd.DataFrame, starred_id) -> pd.DataFrame:
        """
        Rerank candidates based on similarity to a starred candidate.

        Similarity factors:
        - Role type match (40%)
        - Experience level match (30%)
        - Role title match (30%)

        Example:
        reranked_df = rerank_after_starring(df, starred_id=123)
        """
        try:
            # Get starred candidate
            starred_id = int(starred_id)
            if 'id' not in df.columns or starred_id not in df['id'].values:
                return df
            starred = df[df['id'] == starred_id].iloc[0]

            # Calculate similarities
            df['role_similarity'] = df['cleaned_title'].apply(
                lambda x: get_role_similarity(x, starred['cleaned_title'])
            )
            df['location_similarity'] = df['cleaned_location'].apply(
                lambda x: 1.0 if x == starred['cleaned_location'] else
                        0.8 if x in self.tech_hubs['tier1'] else
                        0.6 if x in self.tech_hubs['tier2'] else 0.4
            )
            df['rank_preservation'] = 1 - (df.index / len(df))

            # Calculate weighted score
            df['reranked_score'] = (
                df['final_score'] * 0.50 +        # Original score
                df['role_similarity'] * 0.25 +    # Role match
                df['location_similarity'] * 0.15 + # Location match
                df['rank_preservation'] * 0.10    # Rank stability
            )

            # Update scores and sort
            max_original = df['final_score'].max()
            df['final_score'] = df.apply(
                lambda x: max(x['final_score'],
                            x['reranked_score'] * max_original),
                axis=1
            )
            df = df.sort_values('final_score', ascending=False).reset_index(drop=True)

            return df

        except Exception as e:
            logger.error(f"Reranking error: {str(e)}")
            return df

    def _update_role_categories(self, title: str) -> None:
        """Update role category metrics based on cleaned title"""
        if not title:
            self.metrics['role_categories']['other_roles'] += 1
            return

        # Target Roles
        if 'aspiring human resources' in title:
            self.metrics['role_categories']['aspiring_hr'] += 1
            self.metrics['keyword_matches']['aspiring'] += 1
            return

        if 'seeking human resources' in title:
            self.metrics['role_categories']['seeking_hr'] += 1
            self.metrics['keyword_matches']['seeking'] += 1
            return

        # Senior HR
        if any(x in title for x in ['chief human resources', 'senior vice president', 'director']):
            self.metrics['role_categories']['senior_hr'] += 1
            self.metrics['keyword_matches']['other_hr'] += 1
            return

        # Mid-Level HR
        if any(x in title for x in ['specialist', 'generalist', 'manager']):
            self.metrics['role_categories']['mid_level_hr'] += 1
            self.metrics['keyword_matches']['other_hr'] += 1
            return

        # Junior HR
        if 'coordinator' in title:
            self.metrics['role_categories']['junior_hr'] += 1
            self.metrics['keyword_matches']['other_hr'] += 1
            return

        # HR Adjacent/Other
        if title.startswith('NON-HR:'):
            self.metrics['role_categories']['other_roles'] += 1
        else:
            self.metrics['role_categories']['hr_adjacent'] += 1


def main():
    try:
        print("\nHR TALENT RANKING SYSTEM")
        print("=" * 80)
        print("\nSystem configuration focuses on project keywords:")
        print("- 'aspiring human resources': Primary target profile")
        print("- 'seeking human resources': Alternative target profile")

        print("\nInitiating data loading process...")
        try:
            # Load CSV file
            df = pd.read_csv('/content/potential-talents - Aspiring human resources - seeking human resources (2).csv')
            print(f"Successfully loaded {len(df)} candidate profiles")
        except Exception as e:
            print(f"Data loading error: {str(e)}")
            return None, None

        print("\nConfiguring HR talent processor...")
        processor = HRTalentProcessor()
        print("Processor initialized with:")
        print("- Multi-agent evaluation system")
        print("- Genetic tie-breaking algorithm")
        print("- Clustering capability")

        print("\nExecuting initial candidate evaluation...")
        results = processor.process_data(df)

        if results.empty:
            print("\nProcessing Error: No valid candidates identified")
            print("Please verify data format and content")
            return None, None

        print("\nINITIAL RANKING ANALYSIS")
        processor.display_results(results)

        # Display project challenges analysis
        processor.display_project_challenges(results)

        print("\nDEMONSTRATING EXPERT FEEDBACK MECHANISM")
        if len(results) >= 7:
            # Take 7th candidate as example
            starred_id = results.iloc[6]['id']
            starred = results.iloc[6]

            print(f"\nSimulating expert feedback by starring candidate:")
            print(f"ID: {starred['id']}")
            print(f"Title: {starred['job_title']}")

            # Perform reranking
            updated_results = processor.rerank_after_starring(results, starred_id)

            print("\nUPDATED RANKING ANALYSIS (Post-Feedback)")
            processor.display_results(updated_results)

        return processor, results

    except Exception as e:
        logger.error(f"\nCritical error in main execution: {str(e)}")
        print(f"\nSystem encountered an unexpected error: {str(e)}")
        print("Please check the log for detailed error information")
        return None, None

if __name__ == "__main__":
    processor, results = main()

    if processor is None or results is None:
        print("\nSystem execution failed. Please check error messages above.")
    else:
        print("\nSystem execution completed successfully.")
        print(f"Processed {len(results)} candidates")
        print("Use processor.display_results(results) for detailed analysis")


HR TALENT RANKING SYSTEM

System configuration focuses on project keywords:
- 'aspiring human resources': Primary target profile
- 'seeking human resources': Alternative target profile

Initiating data loading process...
Successfully loaded 104 candidate profiles

Configuring HR talent processor...
Processor initialized with:
- Multi-agent evaluation system
- Genetic tie-breaking algorithm
- Clustering capability

Executing initial candidate evaluation...

INITIAL RANKING ANALYSIS

                                        HR TALENT RANKING ANALYSIS

 CANDIDATE DISTRIBUTION
--------------------------------------------------
Total Candidates Analyzed: 104

Breakdown:
• Target Roles (Aspiring/Seeking HR):     49 ( 47.1%)
• Senior HR Professionals:                27 ( 26.0%)
• Other/Non-HR Roles:                    28 ( 26.9%)

 SCORE GUIDE
--------------------------------------------------
High Priority:
  0.90 - 1.00  |  Primary Target (Aspiring/Seeking HR)
  0.70 - 0.89  |  Strong HR Ma

ERROR:__main__:Reranking error: name 'get_role_similarity' is not defined



1. Algorithm Performance & Effectiveness
--------------------------------------------------
Our scoring system combines multiple factors with weighted importance:
• Job Title Analysis (90% of total score)
  - Identifies aspiring and seeking HR professionals
  - Evaluates current HR role levels
• Location Impact (5% of total score)
  - Considers tech hubs and major markets
• Professional Network (5% of total score)
  - Measures industry connections

2. Candidate Pool Analysis
--------------------------------------------------
From 104 total candidates:
• Primary targets: 49 (47.1%)
• Qualified HR professionals: 27 (26.0%)
• Non-HR profiles: 28 (26.9%)

3. Quality Control & Cutoff Analysis
--------------------------------------------------
Dynamic scoring thresholds:
• Target threshold: 0.90
• High-potential candidates: 49 profiles
• Qualified HR threshold: 0.40

4. Bias Prevention & Fair Evaluation
--------------------------------------------------
Multiple measures ensure unbiased ass