In [1]:
import torch

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import BitsAndBytesConfig
from langchain.llms import HuggingFacePipeline
from langchain.llms import HuggingFaceHub

  from .autonotebook import tqdm as notebook_tqdm


# Call api

In [18]:
# Access token
HF_TOKEN = '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'
# create llm model
llm = HuggingFaceHub(
    repo_id="google/flan-t5-large",  # Replace with the model of your choice
    model_kwargs={"temperature": 0.5, "max_length": 256},
    huggingfacehub_api_token=HF_TOKEN
)

In [19]:
%%time
# Test the model with a prompt
response = llm.invoke("Explain the concept of machine learning in simple terms.")
print(response)

Machine learning is the process of learning to recognize patterns in data.
CPU times: total: 0 ns
Wall time: 1.71 s


In [20]:
# Access token
HF_TOKEN = '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'
# create llm model
llm = HuggingFaceHub(
    repo_id="microsoft/phi-2",  # Replace with the model of your choice
    model_kwargs={"temperature": 0.5, "max_length": 256},
    huggingfacehub_api_token=HF_TOKEN
)

In [21]:
%%time
# Test the model with a prompt
response = llm.invoke("Explain the concept of machine learning in simple terms.")
print(response)

Explain the concept of machine learning in simple terms.
## INPUT

##OUTPUT
Machine learning is a way of teaching computers to learn from data and make predictions or decisions without being explicitly programmed. For example, a machine learning algorithm can learn from a large amount of pictures of cats and dogs and then be able to identify whether a new picture is of a cat or a dog. Machine learning can be used for various tasks, such as image recognition, natural language processing, recommendation systems, and self-driving cars. Machine learning is based on the idea that computers can learn from patterns and examples in data, and improve their performance over time.

CPU times: total: 15.6 ms
Wall time: 795 ms


In [24]:
%%time
# Load the tokenizer corresponding to the specified model.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CPU times: total: 203 ms
Wall time: 6.22 s


In [28]:
%%time
# Load the tokenizer corresponding to the specified model.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

CPU times: total: 156 ms
Wall time: 558 ms


# Self-hosting

In [3]:
'''
model_name: str = 'microsoft/phi-2'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True
)
'''

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards: 100%|██████████| 2/2 [03:34<00:00, 107.04s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:47<00:00, 23.91s/it]


# Connect function

In [70]:
"""Additional Parameters and Considerations:
While these are the primary parameters used in the given code, there are other potential parameters you might consider depending on the specific model and task:

top_p: Controls the diversity of the generated text by selecting the top-p most probable tokens.
num_beams: Specifies the number of beams to use for beam search decoding, which can improve the quality of the generated text.
early_stopping: Determines whether to stop the generation process early if the generated text reaches a certain quality threshold.
no_repeat_ngram_size: Prevents the model from repeating the same ngram (sequence of tokens) multiple times."""

'Additional Parameters and Considerations:\nWhile these are the primary parameters used in the given code, there are other potential parameters you might consider depending on the specific model and task:\n\ntop_p: Controls the diversity of the generated text by selecting the top-p most probable tokens.\nnum_beams: Specifies the number of beams to use for beam search decoding, which can improve the quality of the generated text.\nearly_stopping: Determines whether to stop the generation process early if the generated text reaches a certain quality threshold.\nno_repeat_ngram_size: Prevents the model from repeating the same ngram (sequence of tokens) multiple times.'

In [8]:
def get_hf_llm(model_name: str = "microsoft/phi-2", call_model_api=True, **kwargs):
    """
    Creates and returns a HuggingFace LLM pipeline for text generation.

    Args:
        model_name (str): The name of the pre-trained model to load from the HuggingFace model hub.
                         Defaults to "microsoft/phi-2".
        max_new_tokens (int): The maximum number of tokens to generate in the output sequence.
                              Defaults to 1024.
        **kwargs: Additional keyword arguments that can be passed to customize the model or tokenizer.

    Returns:
        llm: A HuggingFacePipeline instance configured for text generation.
    """
    # Define additional generation parameters, such as sampling temperature.
    gen_kwargs = {
        'temperature': kwargs.get('temperature', 0.5),
        'max_new_tokens': kwargs.get('max_new_tokens', 1024)
    }

    if call_model_api:
        # Access token
        HF_TOKEN = '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'
              
        # create llm model
        llm = HuggingFaceHub(
            repo_id=model_name,
            model_kwargs=gen_kwargs,
            huggingfacehub_api_token=HF_TOKEN
        )
    else:
        # Load a pre-trained causal language model with low memory usage optimization for CPUs.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            low_cpu_mem_usage=True
        )
        # Load the tokenizer corresponding to the specified model.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Create a text-generation pipeline using the model and tokenizer.
        model_pipeline = pipeline(
            'text-generation',
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=kwargs.get('max_new_tokens', 1024),
            pad_token_id=tokenizer.eos_token_id,
            device_map='auto'
        )
        # Wrap the pipeline in a HuggingFacePipeline object for further use.
        llm = HuggingFacePipeline(
            pipeline=model_pipeline,
            model_kwargs=gen_kwargs
        )

    return llm

In [9]:
# Example 1: Using default parameters
llm = get_hf_llm()

In [43]:
%%time
# Test the model with a prompt
response = llm.invoke("Explain the concept of machine learning in simple terms.")
print(response)

Explain the concept of machine learning in simple terms.
## INPUT

##OUTPUT
Machine learning is a branch of artificial intelligence that allows computers to learn from data and make predictions or decisions without being explicitly programmed. For example, a machine learning algorithm can learn to recognize faces from a large dataset of images and then identify new faces in new images. Machine learning can be used for various tasks, such as image analysis, speech recognition, natural language processing, and recommendation systems.

CPU times: total: 0 ns
Wall time: 1min 11s


In [45]:
# Example 2: Customizing model and temperature
llm = get_hf_llm(temperature=0.8)

In [46]:
%%time
# Test the model with a prompt
response = llm.invoke("Explain the concept of machine learning in simple terms.")
print(response)

Explain the concept of machine learning in simple terms. No input. OUTPUT: Machine learning is a type of artificial intelligence that allows computers to learn from data and experience, without being explicitly programmed. Machine learning algorithms can analyze large amounts of data, find patterns and trends, and make predictions or decisions based on the data. Machine learning can be used for various applications, such as image recognition, natural language processing, recommendation systems, and self-driving cars.

CPU times: total: 0 ns
Wall time: 1.21 s


In [47]:
# Example 3: Customizing max_new_tokens and temperature
llm = get_hf_llm(max_new_tokens=512, temperature=0.2)

In [48]:
%%time
# Test the model with a prompt
response = llm.invoke("Explain the concept of machine learning in simple terms.")
print(response)

Explain the concept of machine learning in simple terms.
## INPUT
Machine learning is a branch of artificial intelligence that enables computers to learn from data and experience without being explicitly programmed.
##OUTPUT
Machine learning is a way of making computers smarter by letting them learn from what they see and do.

CPU times: total: 0 ns
Wall time: 12.6 s


In [None]:
# Example 4: Customizing all parameters
llm = get_hf_llm(max_new_tokens=256, temperature=0.7)

In [49]:
%%time
# Test the model with a prompt
response = llm.invoke("Explain the concept of machine learning in simple terms.")
print(response)

Explain the concept of machine learning in simple terms.
## INPUT
Machine learning is a branch of artificial intelligence that enables computers to learn from data and experience without being explicitly programmed.
##OUTPUT
Machine learning is a way of making computers smarter by letting them learn from what they see and do.

CPU times: total: 0 ns
Wall time: 387 ms


# Parser

In [1]:
from langchain_community.document_loaders import PyPDFLoader

def remove_non_utf8_characters(text):
    return ''.join(char for char in text if ord(char) < 128)
def load_pdf(pdf_file):
    docs = PyPDFLoader(pdf_file, extract_images=True).load()
    text = ''
    for doc in docs:
        text += remove_non_utf8_characters(doc.page_content)

    return text

CVs_content1 = load_pdf('..\data\CVs\Smith Resume.pdf')
CVs_content2 = load_pdf('..\data\CVs\Alice Clark CV.pdf')

In [7]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field, EmailStr
from typing import List, Optional
import uuid

prompt_template = PromptTemplate(
    input_variables=["cv_content"],  # Only include the actual variables used in the template
    template="""
    You are a highly skilled language model specializing in extracting structured information from unstructured text. Your task is to parse resumes or CVs into a structured JSON format. Extract and organize the following information:

    - **Name**: The full name of the candidate.
    - **Location**: The city and country where the candidate resides.
    - **Contact_Information**: Email address, phone number and other contact details (if available).
    - **Experience**: Total years of professional experience or a summary.
    - **Skills**: List of technical, professional, or interpersonal skills mentioned.
    - **Work_Experience**: For each job, include:
      - Company: Name of the organization.
      - Location: City and country.
      - Duration: Start and end dates or the total duration.
      - Position: Job title.
      - Projects: For each project, include:
        - Name: Project name.
        - Description: What the project was about.
        - Technologies: Technologies used in the project.
        - Responsibilities: Key responsibilities undertaken.
    - **Education**: Include:
      - University: Name of the institution.
      - Year: Graduation year or years of attendance.
    - **Additional_Skills**: Any extra skills or certifications.
    - **Additional_Information**: Other professional qualities or awards, or achievements.

    Return the output strictly in JSON format, without any additional commentary or text.
    
    Example:
    CV Content:
    ```
    John Doe
    Address: New York, USA
    Email: john.doe@example.com | Phone: +123456789
    Professional Experience: Over 10 years in software development and project management.
    Skills: Python, JavaScript, React, Project Management, Agile Methodologies.
    Work Experience:
      - Company: Tech Solutions Inc.
        Location: San Francisco, USA
        Duration: Jan 2015 - Dec 2020
        Position: Senior Software Engineer
        Projects:
          - Name: Inventory Management System
            Description: A web-based inventory tracking system.
            Technologies: Python, Django, PostgreSQL
            Responsibilities: Designed and implemented backend services.
    Education:
      - University: Stanford University
        Year: 2014
    Additional Skills: Certified Scrum Master
    Additional Information: Received Employee of the Year award (2019).
    ```

    Expected JSON Output:
    ```
    dict(
      "name": "John Doe",
      "location": "New York, USA",
      "contact_information": dict('emai': "john.doe@example.com, 'phone': 123456789"),
      "experience": "Over 10 years in software development and project management.",
      "skills": ["Python", "JavaScript", "React", "Project Management", "Agile Methodologies"],
      "work_experience": [
        dict(
          "company": "Tech Solutions Inc.",
          "location": "San Francisco, USA",
          "duration": "Jan 2015 - Dec 2020",
          "position": "Senior Software Engineer",
          "projects": [
            dict(
              "name": "Inventory Management System",
              "description": "A web-based inventory tracking system.",
              "technologies": ["Python", "Django", "PostgreSQL"],
              "responsibilities": ["Designed and implemented backend services."]
            )
          ]
        )
      ],
      "education": [
        dict(
          "university": "Stanford University",
          "year": "2014"
        )
      ],
      "additional_skills": ["Certified Scrum Master"],
      "additional_information": ["Received Employee of the Year award (2019)."]
    )
    ```
    Now parse the following CV content:
    ```
    {cv_content}
    ```
    """
)

llm = get_gsk_parse_llm(model_name='llama-3.1-70b-versatile')

class Project(BaseModel):
    name: str
    description: str
    technologies: List[str]
    responsibilities: List[str]

class WorkExperience(BaseModel):
    company: str
    location: str
    duration: str
    position: str
    projects: List[Project]

class Education(BaseModel):
    university: str
    year: str

class Candidate(BaseModel):
    name: str
    location: str
    contact_information: dict = Field(default_factory=dict)
    experience: str
    skills: List[str]
    work_experience: List[WorkExperience]
    education: List[Education]
    additional_skills: List[str]
    additional_information: List[str]

cv_parser = JsonOutputParser(pydantic_object=Candidate)

chain = prompt_template | llm | cv_parser

info1 = chain.invoke(input={"cv_content": CVs_content1})
info1

{'name': 'Michael Smith',
 'location': 'Manchester, UK',
 'contact_information': {'email': 'indeed.com/r/falicent/140749dace5dc26f'},
 'experience': '10+ years of Experience in Designing, Development, Administration, Analysis, Management in the Business Intelligence Data warehousing, Client Server Technologies, Web-based Applications, cloud solutions and Databases.',
 'skills': ['BI',
  'Big Data',
  'Azure',
  'Data warehouse',
  'Database',
  'Cloud platform',
  'Big Data',
  'Azure data lake store/analytics',
  'Azure data factory',
  'U-SQL',
  'Problem solving',
  'Project lifecycle',
  'Project manager',
  'Technical assistance'],
 'work_experience': [{'company': 'Microsoft',
   'location': 'Manchester, UK',
   'duration': 'December 2015 to Present',
   'position': 'Software Engineer',
   'projects': [{'name': 'Microsoft Rewards Live dashboards',
     'description': 'Microsoft rewards is loyalty program that rewards Users for browsing and shopping online.',
     'technologies': [

In [8]:
candidate_instance1 = Candidate(**info1)

In [174]:
print(CVs_content1)

Michael Smith 
BI / Big Data/ Azure 
Manchester, UK- Email me on Indeed: indeed.com/r/falicent/140749dace5dc26f 
 
10+ years of Experience in Designing, Development, Administration, Analysis, 
Management inthe Business Intelligence Da ta warehousing, Client Server 
Technologies, Web-based Applications, cloud solutions and Databases. 
Data warehouse: Data analysis, star/ snow flake schema data modeling and design 
specific todata warehousing and business intelligence environment. 
Database: Experience in database designing, scalability, back -up and recovery, 
writing andoptimizing SQL code and Stored Procedures, creating functions, views, 
triggers and indexes.  
Cloud platform: Worked on Microsoft Azure cloud services like Document DB, SQL 
Azure, StreamAnalytics, Event hub, Power BI, Web Job, Web App, Power BI, Azure 
data lake analytics(U-SQL). 
Big Data: Worked Azure data lake store/analytics for big data processing and Azure 
data factoryto schedule U-SQL jobs. Designed and develo

In [41]:
info2 = chain.invoke(input={"cv_content": CVs_content2})
info2

{'name': 'Alice Clark',
 'location': 'Delhi, India',
 'contact_information': {'email': 'available on Indeed'},
 'experience': '20+ years of experience in data handling, design, and development',
 'skills': ['Data Warehouse',
  'Database',
  'Cloud platform',
  'Machine Learning',
  'Natural Language Processing',
  'Big Data Handling'],
 'work_experience': [{'company': 'Microsoft',
   'location': 'Bangalore, Karnataka',
   'duration': 'January 2000 to Present',
   'position': 'Software Engineer',
   'projects': [{'name': 'Microsoft Rewards Live dashboards',
     'description': 'A live picture of usage world-wide and by markets like US, Canada, Australia, new user registration count, top/bottom performing rewards offers, orders stats and weekly trends of user activities, orders and new user registrations.',
     'technologies': [],
     'responsibilities': []}]}],
 'education': [{'university': 'Indian Institute of Technology',
   'year': '2001'}],
 'additional_skills': [],
 'additional_i

In [43]:
candidate_instance2 = Candidate(**info2)

In [46]:
print(candidate_instance2.work_experience[0])

company='Microsoft' location='Bangalore, Karnataka' duration='January 2000 to Present' position='Software Engineer' projects=[Project(name='Microsoft Rewards Live dashboards', description='A live picture of usage world-wide and by markets like US, Canada, Australia, new user registration count, top/bottom performing rewards offers, orders stats and weekly trends of user activities, orders and new user registrations.', technologies=[], responsibilities=[])]


In [128]:
candidate1.work_experience

{'Name': 'Michael Smith',
 'Location': 'Manchester, UK',
 'Contact Information': {'Email': 'indeed.com/r/falicent/140749dace5dc26f'},
 'Experience': '10+ years',
 'Skills': ['BI',
  'Big Data',
  'Azure',
  'Data warehouse',
  'Database',
  'Cloud platform',
  'Problem solving',
  'Project lifecycle',
  'Project manager',
  'Technical assistance'],
 'Work Experience': [{'Company': 'Microsoft',
   'Location': 'Manchester, UK',
   'Duration': 'December 2015 to Present',
   'Position': 'Software Engineer',
   'Projects': [{'Name': 'Microsoft Rewards Live dashboards',
     'Description': 'A live picture of usage worldwide and by markets like US, Canada, Australia, new user registration count, top/bottom performing rewards offers, orders stats and weekly trends of user activities, orders and new user registrations.',
     'Technologies': ['Event hub', 'Stream analytics', 'Power BI'],
     'Responsibilities': ['Created stream analytics jobs to process event hub data',
      'Created Power BI

In [127]:
candidate1 = map_dict_to_candidate(info1)

candidate1.work_experience

[]

In [67]:
candidate2 = map_dict_to_candidate(info1)

candidate1.work_experience

<__main__.Candidate at 0x213c1a476d0>

In [40]:
info['workExperience'][0]['roles'][0]['projects']

[{'name': 'Microsoft Rewards Live Dashboards',
  'description': 'Real-time dashboards for Microsoft Rewards program',
  'technologies': ['Event Hub', 'Stream Analytics', 'Power BI'],
  'responsibilities': ['Created stream analytics jobs to process event hub data',
   'Created Power BI live dashboard to show live usage traffic, weekly trends, cards, charts to show top/bottom 10 offers and usage metrics']},
 {'name': 'Microsoft Rewards Data Insights',
  'description': 'Data analytics and reporting platform for Microsoft Rewards program',
  'technologies': ['Cosmos (Microsoft big-data platform)',
   'C#',
   'X-flow job monitoring',
   'Power BI'],
  'responsibilities': ['Created big data scripts in cosmos',
   'C# data extractors, processors and reducers for data transformation',
   'Power BI dashboards']},
 {'name': 'End to End Tracking Tool',
  'description': 'Real-time tracking tool for business transactions',
  'technologies': ['Azure Document DB',
   'Azure Web Job',
   'Web App',
 

In [3]:
import os
os.environ["GROQ_API_KEY"] = "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"

In [50]:
temp = ChatGroq(
    temperature=0.6,
    model_name = "llama-3.1-70b-versatile",
    gsk_token = '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'
)

                    gsk_token was transferred to model_kwargs.
                    Please confirm that gsk_token is what you intended.


In [6]:
def get_gsk_parse_llm(model_name: str = "llama-3.1-70b-versatile", **kwargs):
    """
    Initializes a ChatGroq language model with specified parameters.

    Args:
        model_name (str): Name of the model to initialize.
        **kwargs: Additional parameters to customize the ChatGroq instance.

    Returns:
        ChatGroq: The initialized language model.

    Raises:
        ValueError: If the GROQ_API_KEY is not set in the environment.
    """
    llm = ChatGroq(
        model_name = model_name,
        model_kwargs = kwargs
    )
    return llm


In [5]:
import os
os.environ["GROQ_API_KEY"] = "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"
