### 1. Setup

In [15]:
%%capture
!pip install langchain-openai
!pip install python-dotenv

### 2. Imports

In [17]:
from typing import Optional
import json
import sys
import requests

from dotenv import load_dotenv, find_dotenv
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field


In [18]:
# Env variable
# Make modules located in the parent directory importable (presumably where the
# `config` module imported by later cells lives — TODO confirm).
sys.path.append('../')
# Locate a .env file and load its entries into the process environment.
load_dotenv(find_dotenv())

True

### 3. Load profiles db

In [31]:
def load_profiles(path: str = "data/profiles.json"):
    """Load the profiles database from a UTF-8 encoded JSON file.

    Args:
        path: Location of the JSON file. Defaults to ``data/profiles.json`` so
            existing zero-argument calls keep working.

    Returns:
        The decoded JSON content, exactly as produced by ``json.load``.
    """
    with open(path, "r", encoding="utf-8") as profiles_file:
        return json.load(profiles_file)

In [32]:
# Raw decoded JSON first, then the same data as a DataFrame so it can be filtered.
profiles = load_profiles()
profiles_pd = pd.DataFrame.from_dict(profiles)

In [33]:
profiles_pd.head()

Unnamed: 0,id,full_name,title,location,spoken_languages,skills,experience_in_years,level_of_study
0,1,Mohamed Smith,Consultant Cloud,Rabat,"[Anglais, Arabe]","[Python, SQL, BigQuery]",5,Bac +5
1,2,Fatima Ben Ali,Développeur JavaScript,Casablanca,"[Espagnol, Français]","[JavaScript, React, Bootstrap]",5,Bac +5
2,3,Kamal Martin,Expert Frontend,Casablanca,[Anglais],"[Angular, JavaScript, Figma]",8,Doctorat
3,4,David El Fassi,Ingénieur Data,Casablanca,"[Anglais, Français]","[Python, Java, SQL, BigQuery]",3,Bac +5
4,5,John Lahmar,Développeur PHP,Casablanca,[Anglais],"[PHP, Laravel, Angular]",2,Bac +3


### 4. Define Search Query

In [11]:
# Requested experience range, in years.
# NOTE: the class docstring and the Field descriptions are deliberately left in French
# and untouched: pydantic exposes them as `description` entries of the model's JSON
# schema, so they are runtime text rather than plain documentation.
class Experience(BaseModel):
    """Expérience requise"""
    # Lower bound in years; None leaves the range open on that side.
    min: Optional[int] = Field(default=None, description="Minimum d'expérience en années.")
    # Upper bound in years; None leaves the range open on that side.
    max: Optional[int] = Field(default=None, description="Maximum d'expérience en années.")

In [12]:
# Structured form of a profile search request; every field is optional.
# NOTE: the class docstring and the Field descriptions are deliberately left in French
# and untouched: pydantic exposes them as `description` entries of the model's JSON
# schema, which Parser passes to the LLM through the parser's format instructions.
class SearchQuery(BaseModel):
    """La requête de recherche"""
    # Wanted location; PandasSearch compares it with the "location" column.
    localisation: Optional[str] = Field(default=None, description="Localisation souhaitée.")
    # Wanted level of study; PandasSearch compares it with the "level_of_study" column.
    niveau_etudes: Optional[str] = Field(default=None, description="Niveau d'étude souhaité.")
    # Wanted experience range; PandasSearch applies it to the "experience_in_years" column.
    experience: Optional[Experience] = Field(default=None, description="Expérience souhaité.")
    # Skills a profile must all have; PandasSearch checks them against the "skills" column.
    competences: Optional[list[str]] = Field(default=[], description="Compétences requises.")
    # Languages a profile must all speak; PandasSearch checks them against "spoken_languages".
    langues_parlees: Optional[list[str]] = Field(default=[], description="Langues parlées requises.")

### 5. Build Search with pandas

In [10]:
class PandasSearch:
    def __init__(self, df: pd.DataFrame):
        self.df = df

    def search(self, search_query: SearchQuery) -> pd.DataFrame:
        filtered_df = self.df.copy()
        if search_query.localisation:
            filtered_df = self._filter_by_value(filtered_df, "location", search_query.localisation)
        if search_query.niveau_etudes:
            filtered_df = self._filter_by_value(filtered_df, "level_of_study", search_query.niveau_etudes)
        if search_query.experience:
            range_values = [search_query.experience.min, search_query.experience.max]
            filtered_df = self._filter_by_range(filtered_df, "experience_in_years", range_values)
        if search_query.competences:
            filtered_df = self._filter_by_list(filtered_df, "skills", search_query.competences)
        if search_query.langues_parlees:
            filtered_df = self._filter_by_list(filtered_df, "spoken_languages", search_query.langues_parlees)
        return filtered_df

    @staticmethod
    def _filter_by_value(df: pd.DataFrame, column: str, value: str | int | float) -> pd.DataFrame:
        return df[df[column].apply(lambda val: str(val).lower() == str(value).lower())]

    @staticmethod
    def _filter_by_list(df: pd.DataFrame, column: str, items: list[any]) -> pd.DataFrame:
        if items:
            return df[df[column].apply(lambda x: bool(set(items) <= set(x)))]
        return df

    @staticmethod
    def _filter_by_range(df: pd.DataFrame, column: str, range_values: list[int]) -> pd.DataFrame:
        if not range_values:
            raise ValueError("range_values should not be None or empty")
        if len(range_values) != 2:
            raise ValueError("range_values should contain 2 values a min and a max")

        min_value, max_value = range_values
        if min_value is None and max_value is None:
            return df

        if min_value is None:
            return df[df[column].apply(lambda val: val <= max_value)]
        elif max_value is None:
            return df[df[column].apply(lambda val: val >= min_value)]
        else:
            return df[df[column].apply(lambda val: (min_value <= val <= max_value))]


### 6. Transcription

In [13]:
# Form fields sent alongside the audio file in the transcription request.
# The "prompt" value is French text handed to the service as-is; it describes the
# kind of request being transcribed (IT profile search, profiles based in Morocco).
# NOTE(review): some speech-to-text APIs expect an ISO-639-1 code ("fr") for
# "language" rather than the language name — confirm against the target service.
TRANSCRIPTION_DATA = {
  "response_format": "json",
  "language": "french",
  "prompt": ("Ce transcript il s'agit d'une requête de recherche des profils IT contenant leurs compétences, "
              "les langues, la disponibilité et le niveau d'étude. Les profils sont basés sur le Maroc.")
}

In [23]:
from config import TRANSCRIPTION_API_URL, TRANSCRIPTION_API_KEY

class Transcribe:
    """Client for the speech-to-text HTTP endpoint configured in ``config``."""

    @staticmethod
    def transcribe(audio_file, timeout=120):
        """Upload an audio recording and return the service's HTTP response.

        Args:
            audio_file: Binary file object holding the recording; it is
                uploaded as a WAV file named ``test.wav``.
            timeout: Seconds to wait for the service before ``requests`` raises
                a timeout error. ``None`` waits indefinitely, which was the
                previous behaviour and can hang the notebook.

        Returns:
            The ``requests.Response`` of the POST call. The status is not
            checked here; callers should verify it before reading the body.
        """
        headers = {"Authorization": f"Bearer {TRANSCRIPTION_API_KEY}"}
        files = {"file": ("test.wav", audio_file, "audio/wav")}
        return requests.post(
            TRANSCRIPTION_API_URL,
            headers=headers,
            data=TRANSCRIPTION_DATA,
            files=files,
            timeout=timeout,
        )


In [24]:
# Send the recorded request to the transcription endpoint; the file is opened in
# binary mode because its bytes are uploaded unchanged.
with open("audio_request.wav", "rb") as audio_file:
  transcription = Transcribe.transcribe(audio_file)

In [26]:
## Transcription
# Fail with the HTTP status when the service rejected the request, instead of an
# opaque KeyError on "text" further down.
transcription.raise_for_status()
search_query = transcription.json()["text"]

### 7. Filters Extraction (Query Parsing)

In [27]:
# Model identifier and generation settings handed to ChatOpenAI by Parser.
PARSING_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
PARSING_PARAMS = {
    "max_tokens": 512,
    "temperature": 0,
}

In [28]:
from config import PARSING_API_URL, PARSING_API_KEY

class Parser:
    """Turn a free-text search request into a structured ``SearchQuery``.

    The work is done by a LangChain pipeline: prompt -> chat model -> pydantic
    output parser.
    """

    def __init__(self):
        """Build the chat model, the output parser, the prompt and the chain."""
        self.llm = self._get_llm()
        self.parser = self._get_parser()
        # The parser's format instructions are bound once to the {schema}
        # placeholder; only {text} remains to be supplied per call.
        self.prompt = self._get_template().partial(schema=self.parser.get_format_instructions())
        self.chain = self.prompt | self.llm | self.parser

    def parse(self, query: str) -> SearchQuery:
        """Run the chain on ``query`` and return the parsed search query."""
        return self.chain.invoke({"text": query})

    @staticmethod
    def _get_llm() -> ChatOpenAI:
        """Create the chat model client from the config and parsing constants."""
        return ChatOpenAI(
            base_url=PARSING_API_URL,
            api_key=PARSING_API_KEY,
            model=PARSING_MODEL,
            max_tokens=PARSING_PARAMS["max_tokens"],
            temperature=PARSING_PARAMS["temperature"],
        )

    @staticmethod
    def _get_template() -> ChatPromptTemplate:
        """Build the prompt: French extraction instructions, then the user text.

        The system message expects a ``schema`` variable and the human message
        a ``text`` variable.
        """
        return ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    "Analyser et extraire les informations pertinentes de la requête de recherche suivante et les structurer selon le schéma ci-dessous en format JSON :\n"
                    "`json\n{schema}\n`\n"
                    "Instructions :\n"
                    "- Niveau d'études : Déterminer le niveau d'éducation (ex. : 'Licence', 'Master', 'Doctorat').\n"
                    "- Expérience : Extraire les années d’expérience minimum et maximum. Si la forme de plus X ans "
                    "est utilisée remplacer mettez la valeur X dans le min et le max null. Le contraire pour la form de moins X ans.\n"
                    "- Compétences : Identifier les compétences techniques et comportementales (ex. : 'Python', 'gestion de projet').\n"
                    "- Langues parlées : Extraire les langues parlées mentionnées dans la requête (ex. : 'Anglais', 'Français', 'Allemand').\n"
                    "- Répondez avec la sortie structurée selon le schéma donné sans explications.\n"
                    # Backslashes are written escaped: the prompt text is unchanged, but a bare
                    # "\ " or "\_" is an invalid escape sequence that Python warns about.
                    "- N'ajoutez pas de \\ aux champs du schéma de sortie comme langues\\_parlees il faut préserver le format langues_parlees.\n"
                    "- Répondez sans explications."
                ),
                ("human", "{text}"),
            ]
        )

    @staticmethod
    def _get_parser() -> PydanticOutputParser:
        """Create the output parser that validates the reply as a ``SearchQuery``."""
        return PydanticOutputParser(pydantic_object=SearchQuery)

In [29]:
# Convert the transcribed sentence into a structured SearchQuery through the LLM chain.
parsed_filters = Parser().parse(search_query)

In [30]:
parsed_filters

SearchQuery(localisation='Casablanca', niveau_etudes='bac plus 5', experience=Experience(min=3, max=5), competences=['Java', 'Python'], langues_parlees=['Anglais'])

### 8. Apply parsed filters on profiles db

In [38]:
## Map bac plus 5 to Bac +5
# The parsed level comes back in its spoken form (e.g. "bac plus 5") while the
# profiles use the written form ("Bac +5"); translate the spoken forms we know.
LEVEL_OF_STUDY_MAPPING = {
    "Bac plus 5": "Bac +5",
    "Bac plus 3": "Bac +3"
}

# Levels missing from the mapping (e.g. "Doctorat", or a value this cell already
# converted) are kept unchanged, so the cell can be re-run safely; a query without
# any level of study is left alone.
if parsed_filters.niveau_etudes:
    parsed_filters.niveau_etudes = LEVEL_OF_STUDY_MAPPING.get(
        parsed_filters.niveau_etudes.capitalize(),
        parsed_filters.niveau_etudes,
    )

In [39]:
parsed_filters

SearchQuery(localisation='Casablanca', niveau_etudes='Bac +5', experience=Experience(min=3, max=5), competences=['Java', 'Python'], langues_parlees=['Anglais'])

In [40]:
PandasSearch(profiles_pd).search(parsed_filters)

Unnamed: 0,id,full_name,title,location,spoken_languages,skills,experience_in_years,level_of_study
3,4,David El Fassi,Ingénieur Data,Casablanca,"[Anglais, Français]","[Python, Java, SQL, BigQuery]",3,Bac +5
