In [6]:
"""

A comprehensive system for processing and analyzing HR candidate data, with focus on
identifying potential HR talent through natural language processing and machine learning.

Example usage:
    # Initialize processor
    processor = HRDataProcessor()

    # Load data (in Google Colab)
    from google.colab import files
    uploaded = files.upload()
    filename = list(uploaded.keys())[0]
    df = pd.read_excel(filename)

    # Process data
    results = processor.process_data(df)
"""

# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import re
import unicodedata

# Fetch the NLTK resources the pipeline relies on: tokenizer models
# ('punkt', 'punkt_tab'), the stop-word lists and the WordNet lemmatizer data.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(_nltk_resource)

class HRDataProcessor:
    """
    A class for processing HR candidate data and ranking potential talent.

    This class implements various NLP and ML techniques to:
    1. Clean and standardize job titles and locations
    2. Process connection data
    3. Calculate similarity scores
    4. Implement unsupervised ranking
    5. Handle starred candidates

    Example:
        processor = HRDataProcessor()
        df = pd.read_excel('candidates.xlsx')
        results = processor.process_data(df)
        processor.star_candidate(results.iloc[0]['id'])
    """

    def __init__(self):
        """
        Create the NLP helpers, pipeline state and lookup tables.

        Attributes set here:
        - lemmatizer / stop_words / vectorizer: NLTK and scikit-learn helpers
        - starred_candidates: ids starred so far (starts empty)
        - df: last processed DataFrame (None until process_data is called)
        - target_phrases: titles treated as exact, top-priority matches
        - hr_mappings: abbreviations/synonyms -> canonical HR wording
        - hr_job_titles: role keyword -> standardized HR job title
        - remove_terms: single words dropped during preprocessing
        - patterns_to_remove: regexes stripped from raw titles
        - location_mappings: substring -> standardized location
        """
        # --- NLP tools and pipeline state ---------------------------------
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.vectorizer = TfidfVectorizer()
        self.starred_candidates = set()
        self.df = None

        # --- High-priority target phrases ---------------------------------
        self.target_phrases = [
            f'{intent} human resources' for intent in ('aspiring', 'seeking')
        ]

        # --- HR vocabulary -------------------------------------------------
        # Abbreviations expand to their long form; generic synonyms all
        # collapse to plain 'human resources'.
        abbreviations = {
            'hr': 'human resources',
            'chro': 'chief human resources officer',
            'hro': 'human resources officer',
            'hrbp': 'human resources business partner',
        }
        hr_synonyms = (
            'people and culture',
            'people operations',
            'talent acquisition',
            'recruitment',
            'personnel',
        )
        self.hr_mappings = {
            **abbreviations,
            **dict.fromkeys(hr_synonyms, 'human resources'),
        }

        # Role keyword -> standardized title. Insertion order is kept because
        # the first matching role wins when a title is classified.
        role_keywords = (
            'generalist', 'coordinator', 'specialist', 'manager',
            'director', 'assistant', 'analyst',
        )
        self.hr_job_titles = {
            role: f'human resources {role}' for role in role_keywords
        }

        # --- Noise removal -------------------------------------------------
        noise_words = (
            'current', 'student', 'graduate', 'member',
            'learning', 'development', 'teacher', 'instructor',
            'trainer', 'professor', 'educator',
        )
        self.remove_terms = set(noise_words)

        institution_words = ('university', 'college', 'institute', 'school')
        named_institution = '|'.join(institution_words + ('academy',))
        self.patterns_to_remove = [
            r'\b[A-Z]\.[A-Z]\.',                    # Remove initials
            r'\b\d{4}\b',                           # Remove years
            rf'\b\w+\s+({named_institution})\b',    # "<name> university" etc.
            *(rf'\b{word}\b' for word in institution_words),
            r'\blearning and development\b',
        ]

        # --- Location standardization --------------------------------------
        # Built group by group in the original key order, since lookup is a
        # first-match substring scan.
        texas = dict.fromkeys(
            ('houston', 'dallas', 'austin'), 'texas, united states'
        )
        california = dict.fromkeys(
            ('san francisco', 'los angeles', 'bay area'),
            'california, united states',
        )
        new_york = dict.fromkeys(
            ('new york city', 'greater new york city area'),
            'new york, united states',
        )
        self.location_mappings = {
            **texas,
            **california,
            'kanada': 'canada',
            **new_york,
        }

    def preprocess_text(self, text):
        """
        Normalize a free-text title into a cleaned, lemmatized token string.

        Steps: lowercase, strip accents to ASCII, remove the regexes in
        ``self.patterns_to_remove``, drop non-letters, tokenize, then walk
        the tokens replacing any phrase found in ``self.hr_mappings`` with
        its canonical wording. Remaining tokens are kept (lemmatized) unless
        they are stop words or listed in ``self.remove_terms``.

        Phrase matching is longest-first over every key length present in
        ``self.hr_mappings``, so multi-word keys such as
        'people and culture' are recognized, not just two-word keys.

        Args:
            text (str): Input text to process

        Returns:
            str: Cleaned and preprocessed text ("" for non-string input)

        Example:
            >>> processor = HRDataProcessor()
            >>> processor.preprocess_text("HR Manager at ABC University")
            'human resources manager'
        """
        if not isinstance(text, str):
            return ""

        # Basic cleaning
        text = text.lower().strip()
        text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')

        # Remove patterns
        for pattern in self.patterns_to_remove:
            text = re.sub(pattern, ' ', text, flags=re.IGNORECASE)

        # Remove special characters and extra spaces
        text = re.sub(r'[^a-z\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()

        tokens = word_tokenize(text)
        lemmatized_tokens = []

        # Longest mapping key, in words; bounds the phrase window below.
        max_phrase_len = max(
            (len(key.split()) for key in self.hr_mappings), default=1
        )

        i = 0
        while i < len(tokens):
            # Try multi-word phrases first, longest to shortest, so that
            # e.g. 'people and culture' wins over any shorter overlap.
            matched_len = 0
            for size in range(min(max_phrase_len, len(tokens) - i), 1, -1):
                phrase = ' '.join(tokens[i:i + size])
                if phrase in self.hr_mappings:
                    lemmatized_tokens.extend(self.hr_mappings[phrase].split())
                    matched_len = size
                    break

            if matched_len:
                i += matched_len
                continue

            token = tokens[i]
            if token in self.hr_mappings:
                # Mapped wording is inserted verbatim (not lemmatized).
                lemmatized_tokens.extend(self.hr_mappings[token].split())
            elif token not in self.stop_words and token not in self.remove_terms:
                lemmatized_tokens.append(self.lemmatizer.lemmatize(token))
            i += 1

        return ' '.join(lemmatized_tokens)

    def clean_job_title(self, title):
        """
        Clean and standardize job titles with focus on HR roles.

        Args:
            title (str): Input job title

        Returns:
            str: Standardized job title

        Example:
            >>> processor = HRDataProcessor()
            >>> processor.clean_job_title("Aspiring HR Professional")
            'aspiring human resources'
        """
        if not isinstance(title, str):
            return ""

        processed_title = self.preprocess_text(title)

        # First priority: Aspiring/seeking HR roles
        if 'aspiring' in processed_title and ('hr' in processed_title or 'human resource' in processed_title):
            return 'aspiring human resources'

        if 'seeking' in processed_title and ('hr' in processed_title or 'human resource' in processed_title):
            return 'seeking human resources'

        # Second priority: Standard HR roles
        if any(term in processed_title for term in ['hr', 'human resource']):
            for role, full_title in self.hr_job_titles.items():
                if role in processed_title:
                    return full_title
            return 'human resources'

        return ''

    def standardize_location(self, location):
        """
        Standardize location names for consistency.

        Args:
            location (str): Input location string

        Returns:
            str: Standardized location string

        Example:
            >>> processor = HRDataProcessor()
            >>> processor.standardize_location("NYC")
            'new york, united states'
        """
        if not isinstance(location, str):
            return "unknown"

        location = location.lower().strip()

        # Handle specific mappings
        for key, value in self.location_mappings.items():
            if key in location:
                return value

        # General location processing
        if ',' in location:
            city, region = location.split(',', 1)
            region = region.strip()
            if region.lower() not in ['canada', 'turkey', 'united states']:
                return f"{region.strip()}, united states"

        return location

    def process_connections(self, connection):
        """
        Process connection counts with gentle normalization.

        Args:
            connection (str/int): Connection count

        Returns:
            float: Normalized connection score

        Example:
            >>> processor = HRDataProcessor()
            >>> processor.process_connections("500+")
            1.0
        """
        try:
            if isinstance(connection, str):
                value = int(connection.replace('+', '').strip())
            else:
                value = int(connection)

            # Gentle normalization
            if value <= 50:
                return 0.8  # Base score for new profiles
            elif value <= 200:
                return 0.9
            else:
                return 1.0

        except (ValueError, TypeError):
            return 0.8

    def calculate_similarity_scores(self, df):
        """
        Calculate similarity scores based on job titles.

        Args:
            df (pandas.DataFrame): Input DataFrame

        Returns:
            numpy.array: Array of similarity scores
        """
        processed_titles = df['processed_job_title'].fillna('')
        scores = np.zeros(len(df))

        # Exact matches get highest score
        scores[processed_titles == 'aspiring human resources'] = 1.0
        scores[processed_titles == 'seeking human resources'] = 1.0

        # Partial matches get intermediate scores
        for title in processed_titles.unique():
            if 'human resources' in title and title not in self.target_phrases:
                mask = processed_titles == title
                scores[mask] = 0.5

        return scores

    def create_feature_matrix(self, df):
        """
        Build the standardized feature matrix used for clustering.

        Columns are the TF-IDF weights of the processed job titles (at most
        20 terms) followed by one column holding the normalized connection
        score; the combined matrix is then passed through StandardScaler.

        Args:
            df (pandas.DataFrame): Frame with 'processed_job_title' and
                'normalized_connections' columns

        Returns:
            numpy.array: Standardized feature matrix, one row per candidate
        """
        # Text part: dense TF-IDF matrix of the (NaN-free) titles
        titles = df['processed_job_title'].fillna('')
        tfidf = TfidfVectorizer(max_features=20)
        dense_title_block = tfidf.fit_transform(titles).toarray()

        # Numeric part: connection score as a single column
        connection_column = df['normalized_connections'].values.reshape(-1, 1)

        combined = np.hstack([dense_title_block, connection_column])

        scaler = StandardScaler()
        return scaler.fit_transform(combined)

    def implement_unsupervised_ranking(self, df):
        """
        Implement unsupervised ranking using K-means clustering.

        Args:
            df (pandas.DataFrame): Input DataFrame

        Returns:
            numpy.array: Array of cluster-based scores
        """
        if len(df) == 0:
            return np.array([])

        feature_matrix = self.create_feature_matrix(df)

        n_clusters = min(5, len(df))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)

        cluster_labels = kmeans.fit_predict(feature_matrix)
        cluster_scores = np.zeros(len(df))

        for cluster in range(n_clusters):
            cluster_mask = cluster_labels == cluster
            if np.any(cluster_mask):
                cluster_scores[cluster_mask] = -np.linalg.norm(
                    feature_matrix[cluster_mask] -
                    kmeans.cluster_centers_[cluster],
                    axis=1
                )

        cluster_scores = (cluster_scores - cluster_scores.min()) / (
            cluster_scores.max() - cluster_scores.min()
        )

        return cluster_scores

    def star_candidate(self, candidate_id):
        """
        Star a candidate and boost similar profiles.

        Args:
            candidate_id (int): ID of candidate to star

        Returns:
            pandas.DataFrame: Updated DataFrame with new scores

        Example:
            >>> processor = HRDataProcessor()
            >>> df = processor.process_data(input_df)
            >>> processor.star_candidate(75)  # Stars candidate with ID 75
        """
        if self.df is None:
            print("Please process data first before starring candidates")
            return

        if candidate_id not in self.df['id'].values:
            print(f"Candidate {candidate_id} not found")
            return

        print(f"\nBefore starring - Candidate {candidate_id}:")
        before_data = self.df[self.df['id'] == candidate_id][
            ['id', 'processed_job_title', 'final_score', 'rank']
        ].iloc[0]
        print(before_data)

        # Get starred candidate's features
        starred_candidate = self.df[self.df['id'] == candidate_id].iloc[0]

        # Add to starred set
        self.starred_candidates.add(candidate_id)

        # Calculate title similarity with starred candidate
        all_titles = list(self.df['processed_job_title']) + [starred_candidate['processed_job_title']]
        title_vectors = self.vectorizer.fit_transform(all_titles)
        similarities = cosine_similarity(
            title_vectors[:-1],
            title_vectors[-1:]
        ).flatten()

        # Calculate location boost
        location_boost = (self.df['processed_location'] ==
                         starred_candidate['processed_location']).astype(float) * 0.1

        # Calculate connection similarity
        connection_diff = abs(
            self.df['normalized_connections'] -
            starred_candidate['normalized_connections']
        )
        connection_boost = (1 - connection_diff) * 0.05

        # Calculate total boost factor
        boost_factor = 1 + (0.7 * similarities +
                           0.2 * location_boost +
                           0.1 * connection_boost)

        # Apply boost to scores
        self.df['final_score'] = self.df['final_score'] * boost_factor

        # Extra boost for the starred candidate
        self.df.loc[self.df['id'] == candidate_id, 'final_score'] *= 1.3

        # Recalculate ranks
        self.df['rank'] = self.df['final_score'].rank(method='dense', ascending=False)

        self._display_starring_results(candidate_id, before_data, boost_factor)
        return self.df

    def _display_starring_results(self, candidate_id, before_data, boost_factor):
        """
        Display results after starring a candidate (helper method).

        Args:
            candidate_id (int): ID of starred candidate
            before_data (pandas.Series): Candidate data before starring
            boost_factor (numpy.array): Boost factors for all candidates
        """
        print(f"\nAfter starring - Candidate {candidate_id}:")
        after_data = self.df[self.df['id'] == candidate_id][
            ['id', 'processed_job_title', 'final_score', 'rank']
        ].iloc[0]
        print(after_data)

        # Show influenced candidates
        print("\nTop candidates after starring:")
        top_df = self.df.nsmallest(10, 'rank')[
            ['id', 'processed_job_title', 'processed_location', 'final_score', 'rank']
        ]
        print(top_df)

        # Show similar candidates that were boosted
        boosted_mask = boost_factor > 1.1  # Show candidates with >10% boost
        if boosted_mask.any():
            print("\nCandidates most similar to starred candidate:")
            similar_df = self.df[boosted_mask].sort_values('final_score', ascending=False)[
                ['id', 'processed_job_title', 'processed_location', 'final_score', 'rank']
            ].head()
            print(similar_df)

    def calculate_final_score(self, row):
        """
        Calculate final score for a candidate with reduced connection impact.

        Args:
            row (pandas.Series): Row containing candidate data

        Returns:
            float: Final calculated score

        Example:
            >>> processor = HRDataProcessor()
            >>> score = processor.calculate_final_score(candidate_row)
        """
        base_score = (
            0.85 * row['title_similarity'] +
            0.15 * row['cluster_score']
        )

        # Only slightly modify base score with connections
        connection_modifier = 1 + (0.05 * (row['normalized_connections'] - 0.8))
        return base_score * connection_modifier

    def process_data(self, df):
        """
        Process input DataFrame through the complete pipeline.

        Args:
            df (pandas.DataFrame): Input DataFrame with candidate data

        Returns:
            pandas.DataFrame: Processed and ranked DataFrame

        Example:
            >>> processor = HRDataProcessor()
            >>> df = pd.read_excel('candidates.xlsx')
            >>> results = processor.process_data(df)
        """
        print("\nInput DataFrame Structure:")
        print("-" * 30)
        print("\nColumns:", df.columns.tolist())
        print("\nShape:", df.shape)
        print("\nSample of input data:")
        print(df.head())

        print("Starting data processing...")

        self.df = df.copy()

        # Basic preprocessing
        self.df['processed_job_title'] = df['job_title'].apply(self.clean_job_title)
        self.df = self.df[self.df['processed_job_title'] != '']
        self.df['processed_location'] = df['location'].apply(self.standardize_location)
        self.df['normalized_connections'] = df['connection'].apply(self.process_connections)

        # Calculate scores
        self.df['title_similarity'] = self.calculate_similarity_scores(self.df)
        self.df['cluster_score'] = self.implement_unsupervised_ranking(self.df)
        self.df['final_score'] = self.df.apply(self.calculate_final_score, axis=1)

        # Handle starred candidates
        if self.starred_candidates:
            self.df = self.star_candidate(list(self.starred_candidates)[0])

        # Final ranking
        self.df['rank'] = self.df['final_score'].rank(method='dense', ascending=False)

        self.display_results(self.df)
        return self.df.sort_values('rank')

    def display_results(self, df):
        """
        Display comprehensive results of the processing pipeline.

        Args:
            df (pandas.DataFrame): Processed DataFrame to display results for

        Example:
            >>> processor = HRDataProcessor()
            >>> processor.display_results(results_df)


        """


        print("\n=== Data Processing Steps ===")
        print("-" * 50)

    # Format the transformations in a clear table
        cleaning_example = pd.DataFrame({
            'Original Location': df['location'].head(3),
            'Standardized Location': df['processed_location'].head(3),
            'Original Connection': df['connection'].head(3),
            'Normalized Connection': df['normalized_connections'].head(3)
        })

        print("\nLocation and Connection Transformations:")
        print(cleaning_example.to_string(index=True))

        print("\nConnection Normalization Rules:")
        print("- Connections ≤ 50: Score = 0.8")
        print("- Connections 51-200: Score = 0.9")
        print("- Connections > 200: Score = 1.0")



        print("\nData Quality Report:")
        print("=" * 50)

        print(f"\nTotal records: {len(df)}")
        print(f"\nUnique job titles: {df['processed_job_title'].nunique()}")
        print("\nMost common job titles:")
        print(df['processed_job_title'].value_counts().head())

        print(f"\nUnique locations: {df['processed_location'].nunique()}")
        print("\nLocation distribution:")
        print(df['processed_location'].value_counts().head())

        print("\nScore Distribution:")
        print("\nTitle Similarity Scores:")
        print(f"Mean: {df['title_similarity'].mean():.3f}")
        print(f"Median: {df['title_similarity'].median():.3f}")
        print(f"Max: {df['title_similarity'].max():.3f}")

        if self.starred_candidates:
            print("\nStarred Candidates:")
            starred_df = df[df['id'].isin(self.starred_candidates)]
            print(starred_df[['id', 'processed_job_title', 'final_score', 'rank']])

        print("\nTop 10 Candidates:")
        display_cols = [
            'id', 'processed_job_title', 'title_similarity',
            'normalized_connections', 'final_score', 'rank'
        ]
        print(df.sort_values('rank')[display_cols].head(10))

def main():
    """
    Main function to run the HR Data Processor in Google Colab.

    Prompts for an Excel upload, processes the first uploaded file and,
    as a demonstration, stars the candidate in the 7th row of the ranked
    results. The starring demo is skipped (with a message) when fewer
    than seven candidates were ranked, so a small but valid result set is
    still returned.

    Note: the returned DataFrame is the ranking produced by
    ``process_data`` before the demo star is applied; the processor's own
    ``df`` attribute reflects the boosted scores.

    Returns:
        tuple: ``(processor, results_df)`` on success, ``(None, None)`` if
        nothing was uploaded or any step raised.

    Example:
        >>> processor, results = main()
    """
    processor = HRDataProcessor()

    # Zero-based row of the candidate starred in the demo (7th row).
    demo_row = 6

    try:
        # Upload and load data (import kept local: only available in Colab)
        from google.colab import files
        print("Please upload your Excel file...")
        uploaded = files.upload()

        if not uploaded:
            print("No file was uploaded; nothing to process.")
            return None, None

        # Get the filename from uploaded files (first one if several)
        filename = next(iter(uploaded))

        # Load and process data
        print(f"\nLoading data from {filename}...")
        df = pd.read_excel(filename)

        # Initial processing
        results_df = processor.process_data(df)

        # Example: Star a candidate (7th row of the ranked results)
        if len(results_df) > demo_row:
            seventh_candidate = results_df.iloc[demo_row]['id']
            print(f"\nStarring candidate {seventh_candidate} (7th in initial ranking)...")
            processor.star_candidate(seventh_candidate)
        else:
            print(
                f"\nSkipping starring example: only {len(results_df)} "
                "candidate(s) were ranked."
            )

        print("\nProcessing complete!")

        return processor, results_df

    except Exception as e:
        # Top-level notebook boundary: report the problem instead of
        # surfacing a traceback to the user.
        print(f"Error in processing: {str(e)}")
        return None, None

# Entry point: run the upload-and-rank workflow only when this module is the
# main program. The results are bound to the module-level names `processor`
# and `results`.
if __name__ == "__main__":
    processor, results = main()

Please upload your Excel file...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Saving potential-talents.xlsx to potential-talents (3).xlsx

Loading data from potential-talents (3).xlsx...

Input DataFrame Structure:
------------------------------

Columns: ['id', 'job_title', 'location', 'connection', 'fit']

Shape: (104, 5)

Sample of input data:
   id                                          job_title  \
0   1  2019 C.T. Bauer College of Business Graduate (...   
1   2  Native English Teacher at EPIK (English Progra...   
2   3              Aspiring Human Resources Professional   
3   4             People Development Coordinator at Ryan   
4   5    Advisory Board Member at Celal Bayar University   

                              location connection  fit  
0                       Houston, Texas         85  NaN  
1                               Kanada      500+   NaN  
2  Raleigh-Durham, North Carolina Area         44  NaN  
3                        Denton, Texas      500+   NaN  
4                       İzmir, Türkiye      500+   NaN  
Starting data processing..