<a href="https://colab.research.google.com/github/joshuaalpuerto/ML-guide/blob/main/user_data_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qU langchain --progress-bar off
!pip install -qU langchainhub --progress-bar off
!pip install -qU duckduckgo-search --progress-bar off
!pip install -qU fireworks-ai --progress-bar off
!pip install -qU openai --progress-bar off
!pip install -qU langchain-fireworks --progress-bar off

In [2]:
# @title load fireworks API key
#connect to google drive
from google.colab import drive
import json
import os


# Mount Google Drive so the credentials file stored there becomes readable.
drive.mount('/content/drive')

# Parse the JSON secrets file into `env`; later cells read the Fireworks key from it.
with open('/content/drive/MyDrive/env/env.json') as env_file:
    env = json.loads(env_file.read())

# Also publish the key through the process environment.
os.environ.update(FIREWORKS_API_KEY=env['fireworks.ai']['apiKey'])


Mounted at /content/drive


In [3]:
import openai
# Module-level settings, required only by the legacy openai SDK (versions < 1):
# point the SDK at the Fireworks inference endpoint and authenticate with the
# Fireworks API key loaded into `env`.
openai.api_base = "https://api.fireworks.ai/inference/v1"
openai.api_key = env['fireworks.ai']['apiKey']

In [4]:
from langchain.globals import set_llm_cache, set_debug
from langchain.cache import InMemoryCache

# Register an in-memory cache for LLM calls.
set_llm_cache(InMemoryCache())
# Debug mode prints every chain/prompt/LLM step. Turn it on only while
# debugging; otherwise the verbose trace makes the conversations hard to read.
set_debug(True)

In [8]:
from pydantic import BaseModel, Field
from langchain.agents import tool
from langchain.tools import BaseTool
from typing import Optional, Literal, Type
from typing_extensions import Annotated

from enum import Enum
from pydantic import BaseModel, Field


# Allowed answer sets for the extraction schema. Despite the "Enum" suffix these
# are typing.Literal aliases; every set includes an explicit "n/a" option.
YesNoEnum = Literal[
    "yes",
    "no",
    "unsure",
    "n/a",
]
ResidenceEnum = Literal[
    "european",
    "non european",
    "portugal",
    "unsure",
    "n/a",
]
StayPeriodEnum = Literal[
    "more than 12 months",
    "less than 12 months",
    "n/a",
]
BlueCardDurationEnum = Literal[
    "more than 18 months",
    "less than 18 months",
    "n/a",
]
SalaryEnum = Literal[
    "above 1500",
    "below 1500",
    "n/a",
]
WorkExpEnum = Literal[
    "more than 4 years",
    "less than 4 years",
    "n/a",
]
EmploymentTypeEnum = Literal[
    "contractor",
    "new employment",
    "extension of employment",
    "n/a",
]
CurrentlyEmployedEnum = Literal[
    "more than 12 months",
    "less than 12 months",
    "n/a",
]

class ApplicantDataInput(BaseModel):
    """Applicant facts to be extracted from a free-text user message.

    This model is used as the argument schema of the ``visa_recommender`` tool.
    Every field is optional and restricted to a small set of literal answers.

    ``default=None`` is given explicitly on each field: under pydantic v2 an
    ``Optional[...]`` annotation only makes a field nullable, not optional, so
    without a default every field would still be required.
    """

    has_work_contract: Optional[YesNoEnum] = Field(
        default=None,
        description="Indicates whether the applicant has a work contract.",
    )

    is_european: Optional[YesNoEnum] = Field(
        default=None,
        description="Indicates whether the applicant's nationality is in any country that is a member of European member state.",
    )

    residence: Optional[ResidenceEnum] = Field(
        default=None,
        description="Indicates whether the applicant is residence in any country that is member of European member state.",
    )

    intended_stay_in_portugal: Optional[StayPeriodEnum] = Field(
        default=None,
        description="Specifies the intended duration of the applicant's stay in Portugal, if applicable",
    )

    has_blue_card: Optional[YesNoEnum] = Field(
        default=None,
        description="Indicates whether the applicant holds an EU Blue Card.",
    )

    blue_card_duration: Optional[BlueCardDurationEnum] = Field(
        default=None,
        description="Specifies the duration of the EU Blue Card, if applicable.",
    )

    salary: Optional[SalaryEnum] = Field(
        default=None,
        description="Specifies the applicant's salary level.",
    )

    work_experience: Optional[WorkExpEnum] = Field(
        default=None,
        description="Specifies the applicant's years of work experience, if applicable.",
    )

    is_work_related: Optional[YesNoEnum] = Field(
        default=None,
        description="Indicates whether the applicant's work experience is related to the current application."
    )

    employment_type: Optional[EmploymentTypeEnum] = Field(
        default=None,
        description="Specifies the type of employment sought by the applicant, if applicable.",
    )

    duration_of_current_employment: Optional[CurrentlyEmployedEnum] = Field(
        default=None,
        description="Specifies the duration of the applicant's current employment, if applicable."
    )



class Visa_Recommender(BaseTool):
    """Tool whose arguments are the applicant facts described by ``ApplicantDataInput``.

    The class attributes carry explicit type annotations because pydantic-v2
    based ``BaseTool`` versions reject un-annotated overrides of model fields.
    """

    name: str = "visa_recommender"
    description: str = "Recommend possible visa the user is applicable base on current data provided."
    args_schema: Type[BaseModel] = ApplicantDataInput
    return_direct: bool = True

    def _run(
        self,
        has_work_contract='n/a',
        is_european='n/a',
        residence='n/a',
        intended_stay_in_portugal='n/a',
        has_blue_card='n/a',
        blue_card_duration='n/a',
        salary='n/a',
        work_experience='n/a',
        is_work_related='n/a',
        employment_type='n/a',
        duration_of_current_employment='n/a',
        *,
        is_european_citizen=None,
        current_residence=None,
    ) -> str:
        """Return the applicant facts that were actually provided, as a JSON string.

        The parameter names mirror the fields of ``ApplicantDataInput`` because
        the tool is invoked with the schema's field names as keyword arguments;
        any name missing from this signature would raise ``TypeError``.

        No visa-recommendation rules are implemented yet, so this is a
        placeholder that echoes the extracted data.

        Args:
            has_work_contract .. duration_of_current_employment: Values for the
                matching ``ApplicantDataInput`` fields; ``'n/a'`` or ``None``
                means "not provided".
            is_european_citizen: Legacy keyword alias for ``is_european``.
            current_residence: Legacy keyword alias for ``residence``.

        Returns:
            A JSON object string holding only the fields whose value is
            neither ``None`` nor ``'n/a'``.
        """
        # Keep callers that still use the previous keyword names working.
        if is_european_citizen is not None:
            is_european = is_european_citizen
        if current_residence is not None:
            residence = current_residence

        applicant = {
            "has_work_contract": has_work_contract,
            "is_european": is_european,
            "residence": residence,
            "intended_stay_in_portugal": intended_stay_in_portugal,
            "has_blue_card": has_blue_card,
            "blue_card_duration": blue_card_duration,
            "salary": salary,
            "work_experience": work_experience,
            "is_work_related": is_work_related,
            "employment_type": employment_type,
            "duration_of_current_employment": duration_of_current_employment,
        }
        provided = {
            field: value
            for field, value in applicant.items()
            if value not in (None, 'n/a')
        }
        return json.dumps(provided)

In [9]:
from langchain.tools.render import format_tool_to_openai_function
# `convert_to_openai_tool` returns the complete
# {"type": "function", "function": {...}} entry for a tool, replacing the
# hand-built wrapper around the deprecated `format_tool_to_openai_function`.
from langchain_core.utils.function_calling import convert_to_openai_tool

visa_recommender = Visa_Recommender()
functions = [visa_recommender]
tools = [convert_to_openai_tool(tool_instance) for tool_instance in functions]

print(tools)

[{'type': 'function', 'function': {'name': 'visa_recommender', 'description': 'Recommend possible visa the user is applicable base on current data provided.', 'parameters': {'properties': {'has_work_contract': {'anyOf': [{'enum': ['yes', 'no', 'unsure', 'n/a'], 'type': 'string'}, {'type': 'null'}], 'description': 'Indicates whether the applicant has a work contract.'}, 'is_european': {'anyOf': [{'enum': ['yes', 'no', 'unsure', 'n/a'], 'type': 'string'}, {'type': 'null'}], 'description': "Indicates whether the applicant's nationality is in any country that is a member of European member state."}, 'residence': {'anyOf': [{'enum': ['european', 'non european', 'portugal', 'unsure', 'n/a'], 'type': 'string'}, {'type': 'null'}], 'description': 'Indicates whether the applicant is residence in any country that is member of European member state.'}, 'intended_stay_in_portugal': {'anyOf': [{'enum': ['more than 12 months', 'less than 12 months', 'n/a'], 'type': 'string'}, {'type': 'null'}], 'desc

In [11]:
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import StdOutCallbackHandler
from langchain.callbacks.base import BaseCallbackHandler
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools.render import format_tool_to_openai_function
from textwrap import dedent


# Fireworks is reached through the ChatOpenAI client: for function calling the
# ChatFireworks integration can't be used, as it doesn't properly pass functions.
forced_tool_choice = {"type": "function", "function": {"name": visa_recommender.name}}

llm = ChatOpenAI(
    model="accounts/fireworks/models/firefunction-v1",
    openai_api_key=env['fireworks.ai']['apiKey'],
    openai_api_base="https://api.fireworks.ai/inference/v1",
    temperature=0,
    max_tokens=4096,
    # Send the tool schema with every request and force a call to the extractor tool.
    model_kwargs={"tools": tools, "tool_choice": forced_tool_choice},
)

# Single user-role message carrying the extraction instructions and the input.
extraction_template = dedent("""\
You are an intelligent extractor. You will receive a user message.
From the user message you need carefully extract the values for the corresponding fields.

If you do your BEST WORK, I'll give you a $10,000 tip!

User Message: {input}
""")

prompt = ChatPromptTemplate.from_messages([("user", extraction_template)])

chain = prompt | llm

applicant_message = dedent("""
    I have an applicant who has a valid work contract for portugal, with spanish nationality, residence of Philippines and with salary of 4000 euros.
    """)

# NOTE: the template only declares {input}; the "tools" key is passed along
# but is not referenced by the prompt.
chain.invoke({
    "input": applicant_message,
    "tools": [function.name for function in functions],
})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "\nI have an applicant who has a valid work contract for portugal, with spanish nationality,\n",
  "tools": [
    "visa_recommender"
  ]
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] Entering Prompt run with input:
[0m{
  "input": "\nI have an applicant who has a valid work contract for portugal, with spanish nationality,\n",
  "tools": [
    "visa_recommender"
  ]
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] [1ms] Exiting Prompt run with output:
[0m[outputs]
[32;1m[1;3m[llm/start][0m [1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: You are an intelligent extractor. You will receive a user message.\nFrom the user message you need carefully extract the values for the corresponding fields.\n\nIf you do your BEST WORK, I

AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_xnm27iPLDfdEqXR5udVZKX29', 'function': {'arguments': '{"has_work_contract": "yes", "is_european": "no", "residence": "portugal", "intended_stay_in_portugal": "less than 12 months", "has_blue_card": "n/a", "blue_card_duration": "n/a", "salary": "n/a", "work_experience": "n/a", "is_work_related": "n/a", "employment_type": "n/a", "duration_of_current_employment": "n/a"}', 'name': 'visa_recommender'}, 'type': 'function', 'index': 0}]}, response_metadata={'finish_reason': 'tool_calls', 'logprobs': None})

# Mistral testing

NOTE: Plain prompting with this model extracts the data better than the function-calling approach.

In [None]:
from langchain import PromptTemplate

# Instruction prompt for the extraction task, wrapped in [INST] ... [/INST] tags.
# It lists each field with its allowed values and ends with the user message.
# Doubled braces ({{ and }}) escape the literal JSON braces, so {input} is the
# only template variable.
# NOTE(review): rule 1 asks for an explanation while rule 2 asks for JSON output;
# the recorded response contains both JSON and prose, so downstream parsing must
# cope with mixed text.
SYSTEM_PROMPT = """<s>[INST] You are an intelligent extractor. You will receive a user message.
From the user message you need carefully extract the values for the corresponding fields.
You MUST not include the field in your final output if user does not provide the information for that specific field.
{{
       "has_work_contract": "Indicates whether the applicant has a work contract. (Yes | No)",
       "is_european_citizen": "Indicates whether the nationality of the applicant is part of  European union. (Yes | No)",
       "residence": "Indicates whether the applicant is residence in any European member state country. (European | Non European)",
       "intended_stay_in_portugal": "Specifies the intended duration of the applicant's stay in Portugal. (more than 12 months | less than 12 months)",
       "has_blue_card": "Indicates whether the applicant are holding an EU Blue Card. (Yes | No)",
       "blue_card_duration": "Specifies the duration of the EU Blue Card. (more than 18 months | less than 18 months)",
       "salary": "Specifies the applicant's salary level. (above 1500 | below 1500)",
       "work_experience": "Specifies the applicant's years of work experience. (more than 4 years | less than 4 years)",
       "is_work_related": "Indicates whether the applicant's work experience is related to the current application. (Yes | No)",
       "employment_type": "Specifies the type of employment sought by the applicant. (contractor | new employment | extension of employment)",
       "duration_of_current_employment": "Specifies the duration of the applicant's current employment. (more than 12 months | less than 12 months)"
}}

You MUST follow the rules below when generating your answer:
1. You must explain your answer first.
2. Your output should always be a json with the fields and values extracted from the user message.
3. Your output should remove fields which has null values.

If you do your BEST WORK, I'll give you a $10,000 tip!

User Message: {input}[/INST]"""

# Rebinds `prompt` (previously the chat prompt of the function-calling cell) to
# a plain string template whose single variable is the user message.
prompt = PromptTemplate(template=SYSTEM_PROMPT, input_variables=['input'])

In [None]:
from langchain_fireworks import Fireworks

# Fireworks LLM client for Mixtral, pointed at the completions endpoint.
# No tools are attached here; extraction relies on the prompt text alone.
mixtral_llm = Fireworks(
    model="accounts/fireworks/models/mixtral-8x7b-instruct",
    fireworks_api_key=env['fireworks.ai']['apiKey'],
    base_url="https://api.fireworks.ai/inference/v1/completions",
    temperature=0,
    max_tokens=4096,
)

chain = prompt | mixtral_llm

# Another message that was tried:
#   "I have an applicant who has a valid work contract, a greek national, currently residing in the Philippines and salary of 4000 euros."
# The message used below was annotated "(confused)" in the original notes.
user_message = "My applicant is in Japan and will be back to his residence is Spain next month. He has a salary of 4000 euros."

chain.invoke({"input": user_message})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "My applicant is in Japan and will be back to his residence is Spain next month. He has a salary of 4000 euros."
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:prompt:PromptTemplate] Entering Prompt run with input:
[0m{
  "input": "My applicant is in Japan and will be back to his residence is Spain next month. He has a salary of 4000 euros."
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:prompt:PromptTemplate] [1ms] Exiting Prompt run with output:
[0m[outputs]
[32;1m[1;3m[llm/start][0m [1m[1:chain:RunnableSequence > 3:llm:Fireworks] Entering LLM run with input:
[0m{
  "prompts": [
    "<s>[INST] You are an intelligent extractor. You will receive a user message. \nFrom the user message you need carefully extract the values for the corresponding fields. \nYou MUST not include the field in your final output if user does not provide the 

' Based on the user message, here is the extracted information in JSON format:\n\n{\n  "residence": "Non European",\n  "salary": "above 1500"\n}\n\nExplanation:\n\n1. The applicant is currently in Japan, so we cannot determine if they are a resident of a European member state country. Therefore, we set "residence" to "Non European".\n2. The user mentioned the applicant\'s salary is 4000 euros, so we set "salary" to "above 1500".\n3. Other fields like "has_work_contract", "is_european_citizen", "intended_stay_in_portugal", "has_blue_card", "blue_card_duration", "work_experience", "is_work_related", "employment_type", and "duration_of_current_employment" are not mentioned or cannot be determined from the user message, so they are not included in the output.'