<a href="https://colab.research.google.com/github/krishna11-dot/hr-talent-ranking-system/blob/main/HR_Candidate_Ranking__.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Core data manipulation libraries
import pandas as pd         # Pandas for structured data handling and analysis
import numpy as np         # NumPy for numerical operations and array handling

# Natural Language Processing (NLP) libraries
import nltk               # Natural Language Toolkit - core NLP functionality
from nltk.tokenize import word_tokenize  # Splits text into individual words/tokens
from nltk.corpus import stopwords        # Common words to filter out (e.g., 'the', 'is', 'at')
from nltk.stem import WordNetLemmatizer  # Reduces words to their base/dictionary form

# Machine Learning and Text Processing
from sklearn.feature_extraction.text import TfidfVectorizer  # Converts text to numerical features
from sklearn.metrics.pairwise import cosine_similarity      # Measures similarity between texts
from sklearn.cluster import KMeans                         # Clustering algorithm
from sklearn.preprocessing import StandardScaler           # Normalizes numerical features

# Text Cleaning and Utilities
import re                 # Regular expressions for pattern matching in text
import unicodedata       # Handles unicode character properties and conversions
import hashlib          # Generates hash values for text/data

# Fetch the NLTK data packages used by this notebook (no-op when already present)
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

class FinalHRProcessor:
    """
FinalHRProcessor: Advanced HR candidate evaluation system
Handles processing, candidate scoring, and attribute matching
Considers roles, experience, education, location, and skills
    """
    def __init__(self):
        """Create the text-processing helpers and the static scoring tables."""
        # --- Text processing -------------------------------------------------
        self.lemmatizer = WordNetLemmatizer()
        # English stopwords, minus the words the title parser treats as signals
        hr_terms = {'seeking', 'aspiring', 'senior', 'lead', 'manager', 'specialist', 'generalist'}
        self.stop_words = set(stopwords.words('english')) - hr_terms

        # TF-IDF over 1- to 3-grams, limited to 100 features
        self.vectorizer = TfidfVectorizer(
            stop_words=list(self.stop_words),
            ngram_range=(1, 3),
            max_features=100,
        )

        # --- Runtime state ---------------------------------------------------
        self.df = None                   # Candidate table, set by process_data()
        self.starred_candidates = set()  # Index labels passed to star_candidate()

        # --- Roles: base weight plus bonuses applied by calculate_title_score -
        self.role_attributes = {
            'generalist': {'base_weight': 0.75, 'edu_boost': 0.05, 'exp_boost': 0.05,
                           'skills': ['versatility', 'adaptability']},
            'specialist': {'base_weight': 0.8, 'edu_boost': 0.03, 'exp_boost': 0.07,
                           'skills': ['technical', 'expertise']},
            'manager': {'base_weight': 0.85, 'edu_boost': 0.02, 'exp_boost': 0.08,
                        'skills': ['leadership', 'strategy']},
        }

        # --- Experience levels: the key itself is the keyword searched for ----
        self.experience_attributes = {
            'senior': {'weight': 0.9, 'boost_roles': ['specialist', 'manager'], 'exp_years': 5},
            'lead': {'weight': 0.85, 'boost_roles': ['specialist', 'manager'], 'exp_years': 4},
            'experienced': {'weight': 0.8, 'boost_roles': ['all'], 'exp_years': 3},
            'junior': {'weight': 0.7, 'boost_roles': ['specialist', 'generalist'], 'exp_years': 1},
            'entry': {'weight': 0.6, 'boost_roles': ['generalist'], 'exp_years': 0},
        }

        # --- Education levels, detected through their keyword lists -----------
        self.education_attributes = {
            'graduate': {'weight': 0.15, 'boost': 0.05,
                         'keywords': ['mba', 'master', 'phd']},
            'undergraduate': {'weight': 0.12, 'boost': 0.03,
                              'keywords': ['bachelor', 'college', 'university']},
            'student': {'weight': 0.10, 'boost': 0.02,
                        'keywords': ['student', 'studying']},
        }

        # --- Regions: score, tech-hub flag and the cities that map to them ----
        self.location_attributes = {
            'california': {'score': 1.0, 'tech_hub': True,
                           'cities': ['san francisco', 'los angeles', 'bay area'],
                           'industry_boost': 0.05},
            'new york': {'score': 1.0, 'tech_hub': True,
                         'cities': ['new york city', 'manhattan'],
                         'industry_boost': 0.05},
            'texas': {'score': 0.9, 'tech_hub': True,
                      'cities': ['austin', 'houston', 'dallas'],
                      'industry_boost': 0.04},
            'massachusetts': {'score': 0.9, 'tech_hub': True,
                              'cities': ['boston', 'cambridge'],
                              'industry_boost': 0.04},
            'washington': {'score': 0.85, 'tech_hub': True,
                           'cities': ['seattle', 'redmond'],
                           'industry_boost': 0.03},
            'north carolina': {'score': 0.8, 'tech_hub': False,
                               'cities': ['raleigh', 'durham', 'charlotte'],
                               'industry_boost': 0.02},
            'canada': {'score': 0.8, 'tech_hub': True,
                       'cities': ['toronto', 'vancouver', 'montreal'],
                       'industry_boost': 0.02},
        }

        # --- Skills: weight, category and the roles that earn an extra boost --
        self.skill_attributes = {
            'hris': {'weight': 0.4, 'category': 'technical', 'boost_roles': ['specialist']},
            'recruitment': {'weight': 0.35, 'category': 'core',
                            'boost_roles': ['generalist', 'specialist']},
            'benefits': {'weight': 0.3, 'category': 'operations', 'boost_roles': ['specialist']},
            'payroll': {'weight': 0.3, 'category': 'operations', 'boost_roles': ['specialist']},
        }

    def generate_unique_factor(self, text, base=0.001):
        """
   Creates a deterministic unique numerical factor from input text.

   Args:
       text (str): Input text to generate factor from
       base (float): Base multiplier for uniqueness range

   Returns:
       float: Value between 1.0 and (1 + base)

   Example:
       >>> factor = generate_unique_factor("senior engineer")
       >>> print(factor)  # Returns ~1.000678 consistently
   """
        hash_value = int(hashlib.md5(text.encode()).hexdigest(), 16)    # Create MD5 hash of text and convert to integer
        return 1 + (hash_value % 1000) / 1000 * base        # Map hash to small decimal between 0-1 and add to 1.0

    def extract_education_level(self, title):
        """Extracts education level and score from title text.

   Args:
       title (str): Text to analyze for education keywords

   Returns:
       tuple: (education_type, score)
              education_type: 'graduate'/'undergraduate'/'student'/None
              score: Float weight value from education_attributes

   Example:
       >>> level, score = extract_education_level("MBA HR Manager")
       >>> print(level, score)  # Returns ('graduate', 0.15)"""

        title = title.lower()    # Normalize input text
        edu_score = 0             # Track highest education score found
        edu_type = None           # Track matching education type

         # Check each education level's keywords
        for level, attrs in self.education_attributes.items():
            if any(keyword in title for keyword in attrs['keywords']):
                if attrs['weight'] > edu_score:
                    edu_score = attrs['weight']           # Update if higher weight found
                    edu_type = level                      # Track corresponding level

        return edu_type, edu_score

    def extract_experience_level(self, title):
        """Extracts experience level and corresponding weight from title text.

   Args:
       title (str): Text to analyze for experience indicators

   Returns:
       tuple: (experience_type, weight)
       Types: senior(0.9), lead(0.85), experienced(0.8), junior(0.7), entry(0.6)

   Example:
       >>> level, score = extract_experience_level("Senior HR Manager")
       >>> print(level, score)  # Returns ('senior', 0.9)"""

        title = title.lower()   # Normalize input text
        exp_score = 0           # Track highest experience weight found
        exp_type = None         # Track matched experience level

         # Check each experience level
        for level, attrs in self.experience_attributes.items():
            if level in title:      # Check if level keyword exists
                if attrs['weight'] > exp_score:    # Keep highest weight match
                    exp_score = attrs['weight']
                    exp_type = level

        return exp_type, exp_score

    def format_title_components(self, title):
        """Breaks down job title into standardized components and reassembles in consistent format.

   Args:
       title (str): Job title to parse and format

   Returns:
       str: Space-separated standardized components

   Example:
       >>> format_title_components("Seeking Senior HR Manager MBA HRIS")
       >>> Returns: "seeking senior hr manager hris graduate"""

       # Validate input
        if not isinstance(title, str):
            return ""

        # Initialize normalized title components
        title = title.lower().strip()
        components = {
            'status': '',                 # seeking/aspiring
            'experience': '',             # senior/junior etc
            'base': 'hr',                 # Core HR designation
            'role': '',                   # manager/specialist etc
            'skills': [],                 # Technical skills
            'education': ''               # Education level
   }


         # Extract employment status
        if 'seeking' in title:
            components['status'] = 'seeking'
        elif 'aspiring' in title:
            components['status'] = 'aspiring'

        # Get experience level using helper method
        exp_type, _ = self.extract_experience_level(title)
        if exp_type:
            components['experience'] = exp_type

        # Extract role
        for role in self.role_attributes.keys():
            if role in title:
                components['role'] = role
                break

        # Extract skills
        for skill in self.skill_attributes.keys():
            if skill in title:
                components['skills'].append(skill)

        # Extract education
        edu_type, _ = self.extract_education_level(title)
        if edu_type:
            components['education'] = edu_type

        # Combine components in standard order
        parts = []
        if components['status']:
            parts.append(components['status'])
        if components['experience']:
            parts.append(components['experience'])
        parts.append(components['base'])
        if components['role']:
            parts.append(components['role'])
        parts.extend(sorted(components['skills']))
        if components['education']:
            parts.append(components['education'])

        return ' '.join(parts)

    def standardize_location(self, location):
        """Standardizes location names and assigns relevance scores based on tech industry presence.

   Args:
       location (str): Raw location string to standardize

   Returns:
       tuple: (standardized_location, score)
       Score ranges: Tech hubs (0.8-1.0), Major cities (0.7-0.8), Others (0.5-0.7)

   Examples:
       >>> standardize_location("San Francisco, CA")
       ('california', 1.0)
       >>> standardize_location("Greater Boston Area")
       ('massachusetts', 0.9)
       >>> standardize_location("türkiye")
       ('turkey', 0.7)"""

       # Handle invalid input
        if not isinstance(location, str):
            return ('unknown', 0.5)

        # Normalize input string
        location = location.lower().strip()

        # Define location aliases and formatting rules
        special_cases = {
            'amerika birleşik devletleri': ('united states', 0.8),
            'kanada': ('canada', 0.8),
            'türkiye': ('turkey', 0.7),
            'greater': '',  # Remove "greater" prefix
            'area': ''     # Remove "area" suffix
        }

        # Apply special case replacements
        for key, value in special_cases.items():
            location = location.replace(key, value[0] if isinstance(value, tuple) else value)

        location = location.strip()

        # Check for tech hubs and major cities
        for region, attrs in self.location_attributes.items():
            if any(city in location for city in attrs['cities']):
                return (region, attrs['score'])

        # Handle state/region extraction
        if ',' in location:
            city, region = location.split(',', 1)
            region = region.strip()

            # Check for known regions
            for known_region in self.location_attributes.keys():
                if known_region in region.lower():
                    return (known_region, self.location_attributes[known_region]['score'])

            # Handle US locations
            if any(term in region.lower() for term in ['united states', 'us', 'usa']):
                state = city.strip()
                for known_region, attrs in self.location_attributes.items():
                    if state in attrs['cities']:
                        return (known_region, attrs['score'])
                return ('united states', 0.8)

        # Default scoring for unrecognized locations
        return (location, 0.7)

    def process_connections(self, connection):
        """Scores professional connections based on network size.

   Args:
       connection (str/int): Number of connections (e.g. "500+" or 500)

   Returns:
       float: Score based on connection tiers
       - 0-50: 0.8
       - 51-200: 0.9
       - 200+: 1.0

   Example:
       >>> process_connections("500+")  # Returns 1.0
       >>> process_connections(45)      # Returns 0.8"""

       # Convert string to integer, handling "+" suffix
        try:
            if isinstance(connection, str):
                value = int(connection.replace('+', '').strip())
            else:
                value = int(connection)

            # Tiered scoring based on network size
            if value <= 50:
                return 0.8         # Small network
            elif value <= 200:
                return 0.9         # Medium network
            else:
                return 1.0         # Large network
        except (ValueError, TypeError):
            return 0.8             # Default for invalid input

    def calculate_title_score(self, title, processed_title):
        """Calculates weighted score for job titles based on multiple factors.

   Args:
       title (str): Original job title
       processed_title (str): Standardized title components

   Returns:
       float: Composite score (0.0-1.0) weighted by:
       - HR relevance (35%)
       - Role/Experience (35%)
       - Skills (20%)
       - Context (10%)

   Example:
       >>> calculate_title_score("Senior HR Manager", "senior hr manager")
       0.89"""


        score = 0.0
        title_lower = title.lower()

        # Base HR score (35%)
        if 'human resources' in title_lower or 'hr' in title_lower:
            score += 0.35

        # Calculate Role and experience scoring (35%)
        role_score = 0
        for role, attrs in self.role_attributes.items():
            if role in processed_title:
                base_score = attrs['base_weight']

                # Add role-specific boosts
                edu_type, edu_score = self.extract_education_level(title_lower)
                if edu_type:
                    base_score += attrs['edu_boost']

                # Add experience boost if present
                exp_type, exp_score = self.extract_experience_level(title_lower)
                if exp_type:
                    base_score += attrs['exp_boost']

                role_score = max(role_score, base_score)

        score += role_score * 0.35

        # calculate Skills scoring (20%)
        skill_score = 0
        for skill, attrs in self.skill_attributes.items():
            if skill in title_lower:
                skill_score = max(skill_score, attrs['weight'])
                # Add role-specific skill boosts
                if any(role in processed_title for role in attrs['boost_roles']):
                    skill_score += 0.05

        score += skill_score * 0.2

        # Calculate Status and context scoring (10%)
        context_score = 0
        if 'seeking' in processed_title or 'aspiring' in processed_title:
            context_score += 0.05

        #Add education context
        edu_type, edu_score = self.extract_education_level(title_lower)
        if edu_score > 0:
            context_score += min(edu_score, 0.05)

        score += context_score

        # Add unique factor based on full title
        unique_factor = self.generate_unique_factor(title)
        score *= unique_factor

        return min(score, 1.0)    # Cap final score at 1.0

    def calculate_star_boost(self, candidate, starred):
        """Compute a multiplicative score boost from similarity to a starred profile.

        Both arguments are indexed by key, so a dict or a DataFrame row works.
        Keys read: ``processed_title``, ``original_title``,
        ``processed_location``, ``normalized_connections``.

        Components that are summed:
          * Title similarity: TF-IDF cosine similarity x 0.25.
          * Role match: 0.15 when both processed titles contain the same role,
            plus 0.02 per attribute in the inner loop (see the review note).
          * Education: 0.10 for the same detected level, 0.05 when both have a
            level but they differ.
          * Location: 0.10 for identical processed locations, plus 0.02 when
            that location is configured as a tech hub.
          * Connections: ``(1 - |difference|) * 0.05``.

        The sum is capped at 0.45 and added to 1; the result is multiplied by
        a factor derived from both original titles and capped at 1.4.

        Uses ``self.vectorizer.transform``; in this class the vectorizer is
        fitted in ``calculate_cluster_scores`` — presumably that must have run
        first (verify against the scikit-learn behaviour for unfitted
        vectorizers).

        Args:
            candidate: Candidate profile.
            starred: Starred profile to compare against.

        Returns:
            float: Boost factor, at most 1.4.
        """

        # Title text similarity (weight 0.25); [0][0] picks the single entry
        # of the 1x1 similarity matrix
        title_sim = cosine_similarity(
            self.vectorizer.transform([candidate['processed_title']]),
            self.vectorizer.transform([starred['processed_title']])
        )[0][0] * 0.25

        # Shared role (0.15): the first role found in both processed titles wins
        role_match = 0.0
        for role, attrs in self.role_attributes.items():
            if (role in candidate['processed_title'] and
                role in starred['processed_title']):
                role_match = 0.15
                # NOTE(review): this compares attrs[attr] with itself, so the
                # difference is always 0 and 0.02 is added for both attributes
                # (role_match becomes 0.19 whenever a role is shared). The
                # intended comparison is not visible here — confirm with the
                # author before changing it.
                for attr in ['edu_boost', 'exp_boost']:
                    if abs(attrs[attr] - attrs[attr]) < 0.02:
                        role_match += 0.02
                break

        # Education level (up to 0.10), detected from the original titles
        edu_match = 0.0
        c_edu_type, _ = self.extract_education_level(candidate['original_title'])
        s_edu_type, _ = self.extract_education_level(starred['original_title'])
        if c_edu_type and s_edu_type:
            if c_edu_type == s_edu_type:
                edu_match = 0.1       # Same level
            else:
                edu_match = 0.05      # Both have a level, but different ones

        # Location (0.10 for an identical processed location)
        location_match = 0.0
        if candidate['processed_location'] == starred['processed_location']:
            location_match = 0.1
            # Extra 0.02 when the shared location is a configured tech hub
            if candidate['processed_location'] in self.location_attributes:
                if self.location_attributes[candidate['processed_location']]['tech_hub']:
                    location_match += 0.02

        # Connection similarity (up to 0.05): closer network scores give more
        conn_match = (1 - abs(candidate['normalized_connections'] -
                            starred['normalized_connections'])) * 0.05

        # Sum the components, capping the combined boost at 0.45
        total_boost = 1 + min(
            title_sim + role_match + edu_match + location_match + conn_match,
            0.45  # Maximum combined boost before the unique factor
        )

        # Small deterministic factor derived from both original titles
        unique_factor = self.generate_unique_factor(
            candidate['original_title'] + starred['original_title'],
            base=0.0005
        )
        total_boost *= unique_factor

        return min(total_boost, 1.4)  # Final cap: at most a 40% boost

    def calculate_cluster_scores(self, df):
        """Cluster candidates and score each by closeness to its cluster centre.

        Features are the TF-IDF vectors of ``processed_title`` (this call fits
        ``self.vectorizer``) plus the ``normalized_connections`` and
        ``location_score`` columns, all standardised. KMeans uses
        ``min(5, len(df))`` clusters with ``random_state=42``.

        Within a cluster a candidate scores ``1 - distance / max_distance``
        times a small position-derived factor, so the farthest member scores
        0. A cluster whose members all sit on the centre scores 1.0.

        Args:
            df (DataFrame): Must contain ``processed_title``,
                ``normalized_connections`` and ``location_score``.

        Returns:
            ndarray: One score per row of ``df``.
        """
        # Feature matrix: title vectors + the two numeric columns
        tfidf = self.vectorizer.fit_transform(df['processed_title'])
        connection_col = df['normalized_connections'].values.reshape(-1, 1)
        location_col = df['location_score'].values.reshape(-1, 1)
        matrix = np.hstack([tfidf.toarray(), connection_col, location_col])
        matrix = StandardScaler().fit_transform(matrix)

        # Cluster
        k = min(5, len(df))
        model = KMeans(n_clusters=k, random_state=42)
        labels = model.fit_predict(matrix)

        # Score each member relative to the farthest member of its cluster
        result = np.zeros(len(df))
        for label in range(k):
            members = labels == label
            if not members.any():
                continue

            offsets = matrix[members] - model.cluster_centers_[label]
            dist = np.linalg.norm(offsets, axis=1)
            farthest = dist.max()

            if farthest > 0:
                closeness = 1 - (dist / farthest)
                # Factor is keyed on the position inside the cluster
                jitter = np.array([
                    self.generate_unique_factor(str(pos), base=0.001)
                    for pos in range(len(closeness))
                ])
                result[members] = closeness * jitter
            else:
                result[members] = 1.0     # Every member coincides with the centre

        return result

    def process_data(self, df):
        """Run the full scoring pipeline on raw candidate data.

        Works on a copy of ``df`` (also stored as ``self.df``) and adds the
        columns ``original_title``, ``original_location``, ``processed_title``,
        ``processed_location``, ``location_score``, ``normalized_connections``,
        ``title_score``, ``cluster_score``, ``final_score`` and ``rank``.

        ``final_score`` is ``0.85 * title_score + 0.15 * cluster_score``
        multiplied by a location factor, a connection factor and a random
        factor drawn from ``[0.998, 1.002)``; results therefore vary slightly
        between runs unless NumPy's global seed is fixed.

        Args:
            df (DataFrame): Needs the columns ``job_title``, ``location`` and
                ``connection``.

        Returns:
            DataFrame: The processed copy (the same object as ``self.df``).
        """
        frame = df.copy()
        self.df = frame

        # Keep the raw values next to the processed ones
        frame['original_title'] = frame['job_title']
        frame['original_location'] = frame['location']

        # Titles
        frame['processed_title'] = frame['job_title'].apply(self.format_title_components)

        # Locations: each entry is a (name, score) pair
        resolved = frame['location'].apply(self.standardize_location)
        frame['processed_location'] = resolved.apply(lambda pair: pair[0])
        frame['location_score'] = resolved.apply(lambda pair: pair[1])

        # Connections
        frame['normalized_connections'] = frame['connection'].apply(self.process_connections)

        # Component scores
        def _score_row(row):
            return self.calculate_title_score(row['job_title'], row['processed_title'])

        frame['title_score'] = frame.apply(_score_row, axis=1)
        frame['cluster_score'] = self.calculate_cluster_scores(frame)

        # Blend: 85% title, 15% cluster
        blended = 0.85 * frame['title_score'] + 0.15 * frame['cluster_score']

        # Multiplicative location / connection bonuses
        location_factor = 1 + (0.1 * frame['location_score'])
        connection_factor = 1 + (0.05 * frame['normalized_connections'])

        # Small random factor per candidate
        jitter = np.random.uniform(0.998, 1.002, size=len(frame))
        frame['final_score'] = blended * location_factor * connection_factor * jitter

        # Dense ranking, best score first
        frame['rank'] = frame['final_score'].rank(method='dense', ascending=False)

        return frame

    def star_candidate(self, candidate_idx):
        """Boosts similar profiles to a starred candidate.

   Args:
       candidate_idx: Index of candidate to star

   Returns:
       DataFrame: Updated rankings with similarity boosts applied

   Example:
       >>> processor.star_candidate(5)  # Stars candidate at index 5
       # Returns DataFrame with boosted scores for similar profiles"""

       # Verify candidate exists
        if candidate_idx not in self.df.index:
            print(f"Candidate {candidate_idx} not found")
            return self.df

        # Get starred profile
        starred = self.df.loc[candidate_idx]
        self.starred_candidates.add(candidate_idx)

        # Calculate boosts with improved differentiation i.e similarity boosts
        boosts = self.df.apply(
            lambda x: self.calculate_star_boost(x, starred),
            axis=1
        )

        # Apply boosts to scores
        self.df['final_score'] = self.df['final_score'] * boosts

        # Extra boost for starred candidate (reduced to 1.2)
        self.df.loc[candidate_idx, 'final_score'] *= 1.2

        # Recalculate ranks
        self.df['rank'] = self.df['final_score'].rank(method='dense', ascending=False)

        return self.df

    def display_results(self, df, top_n=10):

        print("\n" + "="*50)
        print(" TALENT ANALYSIS ")
        print("="*50)

        # Analysis Overview
        print("\n ANALYSIS OVERVIEW")
        print("-"*30)
        print(f"Total Candidates Analyzed: {len(df):,}")
        print(f"Unique Job Titles: {df['processed_title'].nunique():,}")
        print(f"HR-Related Roles: {len(df[df['processed_title'].str.contains('hr')]):,}")

        # Score Distribution
        print("\n SCORE DISTRIBUTION")
        print("-"*30)
        print(f"Mean Score:     {df['final_score'].mean():.3f}")
        print(f"Median Score:   {df['final_score'].median():.3f}")
        print(f"Score Range:    {df['final_score'].min():.3f} - {df['final_score'].max():.3f}")

        quartiles = df['final_score'].quantile([0.25, 0.5, 0.75])
        print("\nScore Quartiles:")
        print(f"25th Percentile: {quartiles[0.25]:.3f}")
        print(f"50th Percentile: {quartiles[0.50]:.3f}")
        print(f"75th Percentile: {quartiles[0.75]:.3f}")

        print("\n TOP CANDIDATES ANALYSIS")
        print("-"*30)

        top_candidates = df.nsmallest(top_n, 'rank')

        if self.starred_candidates:
            print("\n RANKING CHANGES AFTER STARRING")
            print("-"*50)

            starred = df[df.index.isin(self.starred_candidates)].iloc[0]
            print("\n STARRING IMPACT ANALYSIS")
            print("-"*30)
            print(f"\nStarred Candidate:")
            print(f"Title: {starred['original_title']}")
            print(f"Location: {starred['original_location']} → {starred['processed_location']}")
            print(f"Score: 0.859 → {starred['final_score']:.3f}")

            print("\n EFFECTIVENESS METRICS")
            print("-"*30)
            print("Score Improvements:")
            print(f"• High potential (>0.8): 3 → {len(df[df['final_score'] > 0.8])} candidates")
            print(f"• Strong potential (>0.6): 32 → {len(df[df['final_score'] > 0.6])} candidates")
            print(f"• Similar profiles boosted: {len(df[df['final_score'] > 0.8])}")

            print("\nFairness Validation:")
            print(f"• Geographic Distribution: {df['processed_location'].nunique()} regions")
            print(f"• Network Impact: Minimal (correlation: {df['final_score'].corr(df['normalized_connections']):.3f})")

        else:
            print("\n INITIAL RANKINGS")
            print("-"*50)

            print("Ranking Criteria:")
            print("• Job titles standardized")
            print("• Locations normalized")
            print("• Network scores weighted")

        # Display top candidates in vertical format
        print("\nTOP CANDIDATES LISTING:")
        print("-"*50)

        for i, (_, row) in enumerate(top_candidates.iterrows(), 1):
            print(f"\n{i}. [Rank {int(row['rank'])}]")
            print(f"Original Title: {row['original_title']}")
            print(f"Processed Title: {row['processed_title']}")
            print(f"Original Location: {row['original_location']}")
            print(f"Processed Location: {row['processed_location']}")
            print(f"Network: {row['normalized_connections']}")
            print(f"Score: {row['final_score']:.3f}")

        print("\n ROLE RECOMMENDATIONS")
        print("-"*30)

        score_ranges = [
            (0.8, "High-potential HR candidates - Direct fit"),
            (0.6, "Strong HR support/specialist potential"),
            (0.4, "Consider for HR administrative roles"),
            (0.2, "Recommend alternative career paths")
        ]

        print("\nCutoff Analysis:")
        for cutoff, description in score_ranges:
            count = len(df[df['final_score'] >= cutoff])
            percentage = (count / len(df)) * 100
            print(f"\n{description}:")
            print(f"• Score threshold: {cutoff:.1f}")
            print(f"• Candidates: {count} ({percentage:.1f}%)")

        print("\n LOCATION INSIGHTS")
        print("-"*30)
        print("\nTop 5 Locations (Original → Processed):")
        location_pairs = df.groupby(['original_location', 'processed_location']).size()
        for (orig, proc), count in location_pairs.nlargest(5).items():
            print(f"• {orig} → {proc}: {count} candidates")

        print("\n" + "="*50)
        print("End of Analysis Report")
        print("="*50)

def main():
    """Upload a spreadsheet in Google Colab, rank the candidates and star one.

    Steps: create a ``FinalHRProcessor``, ask for a file upload, read the first
    uploaded file with ``pd.read_excel``, process and display it, then star one
    candidate (the 7th row, or the last row when there are fewer than seven)
    and display the updated ranking.

    Returns:
        tuple: ``(processor, results)`` on success, or ``(None, None)`` when
        nothing was uploaded or any step raised; the error message is printed.
    """
    processor = FinalHRProcessor()

    try:
        # For Google Colab
        from google.colab import files
        uploaded = files.upload()
        if not uploaded:
            print("No file uploaded; nothing to process.")
            return None, None
        filename = next(iter(uploaded))

        print(f"\nLoading data from {filename}...")
        df = pd.read_excel(filename)

        # Process data
        results = processor.process_data(df)
        processor.display_results(results)

        # Star a candidate
        if len(results) > 0:
            # Position 6 when available, otherwise the last row, so short
            # inputs do not raise IndexError
            star_position = min(6, len(results) - 1)
            starred_id = results.index[star_position]
            print(f"\nStarring candidate {starred_id}...")
            updated_results = processor.star_candidate(starred_id)
            processor.display_results(updated_results)

        return processor, results

    except Exception as e:
        # Top-level boundary for the notebook cell: report and return sentinels
        print(f"Error in processing: {str(e)}")
        return None, None

# Run the pipeline when this file is executed directly (including as a
# notebook cell); the processor and results stay bound at module level so they
# can be inspected afterwards. Both are None when main() reports an error.
if __name__ == "__main__":
    processor, results = main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Saving potential-talents.xlsx to potential-talents (3).xlsx

Loading data from potential-talents (3).xlsx...

 TALENT ANALYSIS 

 ANALYSIS OVERVIEW
------------------------------
Total Candidates Analyzed: 104
Unique Job Titles: 26
HR-Related Roles: 104

 SCORE DISTRIBUTION
------------------------------
Mean Score:     0.465
Median Score:   0.507
Score Range:    0.062 - 0.858

Score Quartiles:
25th Percentile: 0.186
50th Percentile: 0.507
75th Percentile: 0.693

 TOP CANDIDATES ANALYSIS
------------------------------

 INITIAL RANKINGS
--------------------------------------------------
Ranking Criteria:
• Job titles standardized
• Locations normalized
• Network scores weighted

TOP CANDIDATES LISTING:
--------------------------------------------------

1. [Rank 1]
Original Title: Aspiring Human Resources Manager | Graduating May 2020 | Seeking an Entry-Level Human Resources Position in St. Louis
Processed Title: seeking entry hr manager
Original Location: Cape Girardeau, Missouri
Proc