In [None]:
import PyPDF2
import os
import re
import faiss
from unidecode import unidecode
import numpy as np
import pandas as pd
from openai import Client
from typing import List
from confighowdy import PROCESSED_FILE


Functions are ordered by appearance, except for prompt-related functions, which are placed at the beginning for easy access and modification.

# Prompt functions:

In [None]:
def create_profile_prompt(input_dataframe: pd.DataFrame) -> str:
    """
    Build the free-text profile description of a single professional.

    Args:
        input_dataframe (pd.DataFrame): Any object indexable by the keys 'Name', 'Partner',
                                        'Industry', 'Technologies' and 'Processed_CV'. Callers
                                        in this file pass one row taken with `DataFrame.iloc[n]`.

    Returns:
        str: The profile text with the five values interpolated.
    """
    # Read the fields in the order in which they appear in the text.
    name = input_dataframe["Name"]
    partner = input_dataframe["Partner"]
    industry = input_dataframe["Industry"]
    technologies = input_dataframe["Technologies"]
    expertise = input_dataframe["Processed_CV"]

    # The template is written line by line so its exact whitespace (leading
    # indentation and trailing blanks) is explicit rather than hidden in a
    # triple-quoted literal.
    text_lines = [
        f" {name} works in ",
        f"    {partner}, a company of {industry}",
        f"    which works with these technologies {technologies}.",
        f"    He has expertise in {expertise}",
        "    ",
    ]

    return "\n".join(text_lines)


In [None]:
def create_prompt(input_cv: str, prompt_version: int) -> str:
    """
    Generates a prompt based on the input CV and the selected prompt version.

    Parameters:
    input_cv (str): The curriculum vitae provided as input.
    prompt_version (int): The version of the prompt to be created.
                    Version 1 extracts detailed information, while Version 2 provides a brief specialization list.

    Returns:
    str: A formatted prompt string with instructions based on the chosen version.

    Raises:
    ValueError: If `prompt_version` is neither 1 nor 2.
    """

    # Version 1 prompt: Extracts specific information from the CV.
    if prompt_version == 1:
        return f"""I want you to read the following CV: {input_cv}

        From this, please extract the following information (dates are not important):
        1 - Age, seniority, and current job
        2 - Education
        3 - Previous jobs
        4 - Areas of expertise
        5 - General knowledge
        6 - A TECHNICAL evaluation of one paragraph regarding the individual
        7 - Possible gaps in their education

        Do not add any additional sections.
        """

    # Version 2 prompt: Requests a brief list of five areas of specialization from the CV.
    if prompt_version == 2:
        return f"""I want you to read the following CV: {input_cv}

        Provide a brief list of five areas of specialization. No need to describe the specifics of each area.
        """

    # Any other version used to fall through to an unbound local variable;
    # fail with an explicit message instead.
    raise ValueError(
        f"Unsupported prompt_version {prompt_version!r}; expected 1 or 2."
    )


# PDF processing functions:

In [None]:

def process_all_pdfs(folder: str) -> dict:
    """
    Read every PDF directly inside `folder` and collect its text.

    Parameters:
    folder (str): The path to the folder containing the PDF files.

    Returns:
    dict: Maps each filename ending in '.pdf' to the concatenated text of all of its pages.
          Files with any other extension are ignored.
    """
    extracted_by_file = {}

    for entry in os.listdir(folder):
        # Only names with a lowercase '.pdf' suffix are treated as CVs
        if not entry.endswith('.pdf'):
            continue

        with open(os.path.join(folder, entry), 'rb') as handle:
            pdf = PyPDF2.PdfReader(handle)
            # Pages are read while the file is still open and joined in page order
            extracted_by_file[entry] = "".join(page.extract_text() for page in pdf.pages)

    return extracted_by_file


# Process CVs:

In [None]:
def summarize_cv(client: Client, query_gpt: str) -> str:
    """
    Ask the "gpt-4o" chat model a single user question and return the reply text.

    Parameters:
    client: Object exposing `chat.completions.create(...)` (the OpenAI client).
    query_gpt (str): The full prompt, sent as one message with role "user".

    Returns:
    str: The `content` of the first choice in the model's response.
    """
    user_message = {
        "role": "user",
        "content": query_gpt,
    }

    completion = client.chat.completions.create(
        messages=[user_message],
        model="gpt-4o",
    )

    # Only the first choice is used
    return completion.choices[0].message.content


In [None]:
def process_partners_cvs(cvs_dict: dict, save: bool = True) -> dict:
    """
    Summarize every CV in `cvs_dict` with the version-2 prompt and optionally persist the result.

    Note: the chat client is not a parameter; the function reads a module-level
    name `client`, which must exist before it is called with a non-empty dictionary.

    Args:
        cvs_dict (dict): Maps a key identifying each CV (e.g. its filename) to the CV text.
        save (bool): When True, the summaries are written as CSV to `PROCESSED_FILE`. Default is True.

    Returns:
        dict: Maps each key of `cvs_dict` to the summary text returned by the model.
    """
    # Every CV is summarized with the same prompt version
    prompt_version = 2
    summaries = {}

    for partner_name, raw_cv in cvs_dict.items():
        query_gpt = create_prompt(raw_cv, prompt_version)
        summaries[partner_name] = summarize_cv(client, query_gpt)

    # One row per CV, indexed by its key, with a single 'Processed_CV' column
    summary_frame = pd.DataFrame.from_dict(summaries, orient='index', columns=['Processed_CV'])

    if save:
        summary_frame.to_csv(PROCESSED_FILE)

    return summaries


In [None]:
def organize_dataframe(df_cvs_processed: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the processed-CV table in place: derive a readable 'Name' and split each summary into sections.

    Args:
        df_cvs_processed (pd.DataFrame): Table with an 'Unnamed: 0' column (the CV key) and a
                                         'Processed_CV' column. It is modified in place.

    Returns:
        pd.DataFrame: The same DataFrame object, with 'Name' cleaned and 'Processed_CV_list' added.
    """
    # 'Unnamed: 0' becomes 'Name' (NOTE(review): presumably the unnamed index column
    # produced when the CSV was written and read back — confirm against the loader)
    df_cvs_processed.rename(columns={'Unnamed: 0': 'Name'}, inplace=True)

    cleaned_names = (
        df_cvs_processed['Name']
        # underscores -> spaces
        .str.replace('_', ' ')
        # drop the trailing 11 characters; after the replacement above this is
        # presumably the " Resume.pdf" suffix (leading space included)
        .str[:-11]
        # transliterate accented characters to plain ASCII
        .apply(unidecode)
    )
    df_cvs_processed['Name'] = cleaned_names

    # Split every summary into its numbered sections
    section_lists = df_cvs_processed['Processed_CV'].apply(extract_sections)
    df_cvs_processed['Processed_CV_list'] = section_lists

    return df_cvs_processed


In [None]:

def extract_sections(text: str) -> List[str]:
    """
    Split a text into the pieces that follow each standalone single digit.

    A section starts right after a lone digit (one that is not part of a longer
    word or number) and runs until the next lone digit or the end of the string.

    Args:
        text (str): The input string from which to extract sections.

    Returns:
        List[str]: The sections, each with leading/trailing '.', spaces and newlines removed.
    """
    # DOTALL lets a section span several lines
    section_regex = re.compile(r"\b\d\b\s*(.*?)(?=\b\d\b|$)", re.DOTALL)

    return [found.group(1).strip('. \n') for found in section_regex.finditer(text)]


# Partner database processing functions

In [None]:

def get_partner_information(file_path: str) -> pd.DataFrame:
    """
    Combine the 'Developers' and 'Partners' sheets of an Excel workbook into one table.

    Args:
        file_path (str): The path to the Excel file; it must contain sheets named
                         'Developers' and 'Partners', both with a 'Partner' column.

    Returns:
        pd.DataFrame: The inner join of both sheets on 'Partner', with accents removed
                      from the 'Name' column.
    """
    # sheet_name=None loads every sheet, keyed by sheet name
    workbook = pd.read_excel(file_path, sheet_name=None)
    developers = workbook["Developers"]
    partners = workbook["Partners"]

    # Inner join: rows whose 'Partner' value is missing from either sheet are dropped
    combined = developers.merge(partners, on="Partner", how="inner")

    # Transliterate names to plain ASCII
    combined['Name'] = combined['Name'].apply(unidecode)

    return combined


# Embedding and similarity related functions:

In [None]:
def create_embedding(client: Client, cv_to_embed: str) -> list:
    """
    Embed one text with the "text-embedding-ada-002" model.

    Parameters:
    client: Object exposing `embeddings.create(...)` (the OpenAI client).
    cv_to_embed (str): The text to convert into an embedding.

    Returns:
    list: The `embedding` of the first item in the response.
    """
    response = client.embeddings.create(
        input=cv_to_embed,
        model="text-embedding-ada-002",
    )

    # A single input yields a single entry in `data`
    first_item = response.data[0]
    return first_item.embedding


In [None]:

def process_and_normalize_embeddings(client: Client, profile_df: pd.DataFrame, save: bool = True) -> np.ndarray:
    """
    Generates one embedding per profile row, normalizes them, and returns the normalized embedding matrix.

    Each row of `profile_df` is turned into a profile text with `create_profile_prompt`,
    embedded with `create_embedding`, and the stacked result is passed through `normalize`.

    Parameters:
    client: The OpenAI client object used to generate embeddings via the `create_embedding` function.
    profile_df (pd.DataFrame): One row per professional, with the columns read by `create_profile_prompt`.
    save (bool): When True, the normalized matrix is written to "normalized_embeddings.npy"
                 in the current working directory. Default is True.

    Returns:
    np.ndarray: A 2D float32 array of normalized embeddings with shape `(len(profile_df), embedding_dimension)`.
                Row `i` corresponds to `profile_df.iloc[i]`.

    Raises:
    ValueError: If `profile_df` has no rows (there would be nothing to embed or normalize).
    """
    n_profiles = len(profile_df)
    if n_profiles == 0:
        # Without this guard the failure surfaces later as an obscure axis error
        # when normalizing an empty 1-D array.
        raise ValueError("profile_df is empty: there are no profiles to embed.")

    # Rows are accessed by position so the matrix row order matches `iloc` lookups
    embedding_rows = [
        create_embedding(client, create_profile_prompt(profile_df.iloc[position]))
        for position in range(n_profiles)
    ]

    # float32 is the dtype stored on disk and later handed to the similarity search
    embedding_matrix_array = np.array(embedding_rows).astype('float32')

    # Normalize the embeddings (just in case, to ensure they have unit norm)
    normalized_embeddings = normalize(embedding_matrix_array)

    if save:
        np.save("normalized_embeddings.npy", normalized_embeddings)

    return normalized_embeddings

# Similarity search functions:

In [None]:

def normalize(vectors: np.ndarray) -> np.ndarray:
    """
    Normalizes a set of vectors so that each vector has a magnitude (or length) of 1.

    Rows whose norm is exactly zero cannot be scaled to unit length; they are
    returned unchanged (all zeros) instead of becoming NaN.

    Parameters:
    vectors (np.ndarray): A 2D NumPy array where each row is a vector to be normalized.

    Returns:
    np.ndarray: A 2D NumPy array of the same shape where each non-zero row has unit length.
    """

    # Calculate the norms (magnitudes) of each vector along the rows (axis=1);
    # keepdims gives shape (n, 1) so the division broadcasts row-wise
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)

    # Divide zero rows by 1 rather than 0 to avoid 0/0 = NaN (and the runtime warning);
    # ones_like keeps the dtype of `norms`, so float32 input stays float32
    safe_norms = np.where(norms == 0, np.ones_like(norms), norms)

    return vectors / safe_norms


In [None]:

def cosine_similarity(v1: np.ndarray, v2: np.ndarray) -> float:
    """
    Computes the cosine similarity between two vectors.

    Cosine similarity is a measure of similarity between two vectors of an inner product space
    that measures the cosine of the angle between them. The value ranges from -1 to 1, where:
    - 1 indicates that the two vectors are identical in direction,
    - 0 indicates that the vectors are orthogonal (no similarity),
    - -1 indicates that the vectors are diametrically opposed.

    The angle is undefined when either vector has zero length; in that case 0.0
    ("no similarity") is returned instead of NaN.

    Parameters:
    v1 (np.ndarray): The first vector.
    v2 (np.ndarray): The second vector.

    Returns:
    float: The cosine similarity between `v1` and `v2`.
    """

    # Calculate the norms (magnitudes) of both vectors
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)

    # Guard against 0/0 for zero vectors
    if norm_v1 == 0 or norm_v2 == 0:
        return 0.0

    # Dot product divided by the product of the magnitudes
    return np.dot(v1, v2) / (norm_v1 * norm_v2)


In [None]:

def find_nearest_neighbors_faiss(normalized_embeddings: np.ndarray, unit_vector: np.ndarray, k: int = 4) -> tuple:
    """
    Creates a FAISS index using L2 distance (which ranks like cosine similarity for normalized vectors),
    adds normalized embeddings to the index, and finds the k nearest neighbors to a query vector.

    Both inputs are converted to contiguous float32 arrays before being handed to FAISS,
    and `k` is capped at the number of indexed vectors so the result never contains
    the `-1` placeholders FAISS uses when fewer than `k` neighbors exist.

    Parameters:
    normalized_embeddings (np.ndarray): A 2D array of normalized embeddings to be added to the FAISS index.
    unit_vector (np.ndarray): The query vector, which should also be normalized; it is reshaped to (1, dimension).
    k (int): The maximum number of nearest neighbors to find. Default is 4.

    Returns:
    tuple: A tuple containing two elements:
        - indices (np.ndarray): Shape (1, min(k, n_embeddings)); row positions of the nearest neighbors.
        - distances (np.ndarray): Same shape; the corresponding distances reported by the index.
    """

    # FAISS operates on float32 data
    embeddings = np.ascontiguousarray(normalized_embeddings, dtype='float32')

    # Step 1: Get the dimensionality of the embeddings
    dimension = embeddings.shape[1]

    # Step 2: Create a FAISS index based on L2 distance
    index = faiss.IndexFlatL2(dimension)

    # Step 3: Add normalized embeddings to the FAISS index
    index.add(embeddings)

    # Step 4: Ensure the query vector is float32 and reshaped to (1, dimension)
    query_vector = np.ascontiguousarray(unit_vector, dtype='float32').reshape(1, -1)

    # Step 5: Never ask for more neighbors than there are vectors; otherwise the
    # missing slots come back as index -1, which callers using `iloc` would
    # silently interpret as "the last row"
    effective_k = min(k, index.ntotal)
    if effective_k <= 0:
        return np.empty((1, 0), dtype='int64'), np.empty((1, 0), dtype='float32')

    # Step 6: Perform the search for the nearest neighbors
    distances, indices = index.search(query_vector, effective_k)

    # Return the indices of the nearest neighbors and their corresponding distances
    return indices, distances

# Response functions

In [None]:
def gpt_completion(model: str, prompt: str) -> dict:
    """
    Send `prompt` as a single system message to a chat model and return the response.

    The request goes through the module-level name `client`, which must be defined
    before this function is called.

    Args:
        model (str): The model to be used, such as "gpt-4".
        prompt (str): The text sent as the only message, with role "system".

    Returns:
        dict: Whatever `client.chat.completions.create` returns. Callers in this file
              read the reply text from it via `.choices[0].message.content`.
    """
    system_message = {
        "role": "system",
        "content": prompt,
    }

    return client.chat.completions.create(
        model=model,
        messages=[system_message],
    )


In [None]:
def build_system_prompt(user_prompt: str, selected_professionals_string: str) -> str:
    """
    Builds the system prompt used to answer an expert-search request.

    Args:
        user_prompt (str): The question asked by the user.
        selected_professionals_string (str): The profiles of the candidate experts,
                                             already formatted as text.

    Returns:
        str: The instructions for the chat model, embedding the user query and the expert list.
    """
    system_prompt = f""" You are a chatbot for a staff enhancement company. 
        Your goal is to assist employees in finding professionals inside the company who can help them if they have questions.
        
        The user query in this case is: {user_prompt}
        
        And you have the next list of experts who can help him:
        {selected_professionals_string}.

        Answer cordially with the name of the experts and a brief description of their expertise and technology knowledge. 
        In the end, make a subjective evaluation of the expert, and say how much do you think he/she can help.
    """
    return system_prompt

In [None]:
# Function to classify the user's intent using GPT directly
def classify_user_intent_with_gpt(user_prompt: str) -> str:
    """
    Uses the GPT model to classify the user's intent.

    The request goes through the module-level name `client`. The model's reply is
    normalized before being returned, because replies such as '"Search expert".'
    (quoted, capitalized or punctuated) would otherwise not match the exact labels
    callers compare against.

    Args:
        user_prompt (str): The question asked by the user.

    Returns:
        str: Exactly "search expert" or "general inquiry". Any reply that does not
             clearly select "search expert" is reported as "general inquiry".
    """
    classification_prompt = f"""You are a helpful assistant. The user has asked: '{user_prompt}'.
    
    Please classify this query as one of the following:
    1. "search expert" (The user is looking for an expert to assist them with a specific topic.)
    2. "general inquiry" (The user is asking a general question that doesn't require a company expert.)
    
    Respond only with one of the two options: "search expert" or "general inquiry".
    """

    # Call to GPT to classify the intent
    classification_response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": "You are an assistant that classifies user queries."
            },
            {
                "role": "user",
                "content": classification_prompt
            }
        ]
    )

    # Extract the classification from the model; the content may be missing
    raw_intent = classification_response.choices[0].message.content or ""
    lowered_intent = raw_intent.lower()

    # Accept the expert label regardless of quoting, casing or trailing punctuation,
    # but not when the reply mentions both labels (ambiguous answer)
    if "search expert" in lowered_intent and "general inquiry" not in lowered_intent:
        return "search expert"

    return "general inquiry"

def build_general_prompt(user_prompt: str) -> str:
    """
    Compose the prompt used for questions that do not need a company expert.

    Args:
        user_prompt (str): The question asked by the user.

    Returns:
        str: Two lines of instructions with the user's question embedded in the first.
    """
    # The two lines are kept separate so the trailing blank of the first and the
    # indentation of the second are explicit.
    first_line = f"You are a helpful assistant. The user asked: {user_prompt}. "
    second_line = "        Please respond with a useful and informative answer."
    return first_line + "\n" + second_line



In [None]:
# Function to generate embeddings for the user's prompt
def generate_user_embeddings(user_prompt: str) -> np.ndarray:
    """
    Generates the embedding of the user prompt.

    Delegates to `create_embedding` (same model as the stored expert embeddings) using
    the module-level name `client`, and converts the result to float32 so it has the
    same dtype as the saved embedding matrix and can be passed to FAISS directly.

    Args:
        user_prompt (str): The question asked by the user.

    Returns:
        np.ndarray: A 1D float32 array holding the embedding.
    """
    embedding_user = create_embedding(client, user_prompt)
    return np.asarray(embedding_user, dtype="float32")

# Function to find the nearest neighbors using FAISS
def find_similar_experts(embedding_user_array: np.ndarray, k: int = 4):
    """
    Look up the `k` stored expert embeddings closest to the user's embedding.

    The expert matrix is read from "normalized_embeddings.npy" in the current
    working directory on every call.

    Args:
        embedding_user_array (np.ndarray): The embedding of the user's query.
        k (int): Number of neighbors requested. Default is 4.

    Returns:
        tuple: `(indices, distances)` as produced by `find_nearest_neighbors_faiss`.
    """
    stored_embeddings = np.load("normalized_embeddings.npy")

    neighbor_indices, neighbor_distances = find_nearest_neighbors_faiss(
        stored_embeddings, embedding_user_array, k=k
    )

    return neighbor_indices, neighbor_distances

# Function to select the most similar expert profiles
def select_professionals(indices: np.ndarray, profile_df: pd.DataFrame) -> str:
    """
    Selects the profiles of the most similar experts based on the provided indices.

    Args:
        indices (np.ndarray): 2D array of row positions as returned by the FAISS search;
                              only the first row (the single query) is used.
        profile_df (pd.DataFrame): The expert profiles; positions refer to `profile_df.iloc`.

    Returns:
        str: One profile text per valid index, separated by new lines. Empty when
             there is no valid index.
    """
    # FAISS pads missing neighbors with -1; `iloc[-1]` would silently return the
    # last profile, so negative positions are skipped.
    selected_professionals = [
        create_profile_prompt(profile_df.iloc[n])
        for n in indices[0]
        if n >= 0
    ]

    # Convert the list into a single string, separated by new lines
    return "\n".join(selected_professionals)

# Function to handle the expert search logic
def handle_expert_search(user_prompt: str) -> str:
    """
    Turn a user question into the system prompt that lists the best-matching experts.

    Steps: embed the question, find the nearest stored expert embeddings, read the
    expert profiles from "experts_profile.csv" and embed the selected profiles in
    the prompt text.

    Args:
        user_prompt (str): The question asked by the user.

    Returns:
        str: The system prompt containing the user query and the selected expert profiles.
    """
    query_embedding = generate_user_embeddings(user_prompt)

    # The distances are not used; only the positions of the neighbors matter here
    neighbor_indices, _neighbor_distances = find_similar_experts(query_embedding)

    # Profiles are loaded from disk on every call
    experts_df = pd.read_csv("experts_profile.csv")
    experts_text = select_professionals(neighbor_indices, experts_df)

    return f""" You are a chatbot for a staff enhancement company. 
        Your goal is to assist employees in finding professionals inside the company who can help them if they have questions.
        
        The user query in this case is: {user_prompt}
        
        And you have the next list of experts who can help him:
        {experts_text}.

        Answer cordially with the name of the experts and a brief description of their expertise and technology knowledge. 
        In the end, make a subjective evaluation of the expert, and say how much do you think he/she can help.
    """

# Main function to generate the response
def generate_response(user_prompt: str) -> str:
    """
    Generates the complete response depending on whether the user is looking for an expert or has a general inquiry.

    Args:
        user_prompt (str): The question asked by the user.

    Returns:
        str: The text of the model's answer.
    """
    # Classify the user's intent using GPT directly
    user_intent = classify_user_intent_with_gpt(user_prompt)

    # The label comes from a language model, so tolerate quoting, casing and
    # punctuation instead of requiring an exact string match
    normalized_intent = (user_intent or "").lower()
    wants_expert = (
        "search expert" in normalized_intent
        and "general inquiry" not in normalized_intent
    )

    if wants_expert:
        print("Wait a few moments while I search the database")
        print("This shouldn't take more than 20 seconds")
        print()

        # Build the prompt listing the most similar experts
        final_prompt = handle_expert_search(user_prompt)
    else:
        # If it's a general inquiry, we construct a generic prompt
        final_prompt = build_general_prompt(user_prompt)

    # Both paths are answered by the GPT-4 model
    chat = gpt_completion("gpt-4", final_prompt)

    return chat.choices[0].message.content
