# Creating the Training Dataset

**Using the free LLaMA 3 70B model (served through Groq) together with Pydantic validation to create clear and consistent training data.**

## Import dataset with job offers

In [None]:
import pandas as pd

In [None]:
# Read the combined job-offers CSV into a DataFrame.
jobs_csv_path = '/content/combined_jobs.csv'
df = pd.read_csv(jobs_csv_path)

In [None]:
# Display (rows, columns) of the loaded job-offers table.
df.shape

(1079, 6)

In [None]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Position,Company_Name,Location,Post_Month,Post_Year,Details
0,IT Manager,10 Percent Recruiting Ltd.,"Vancouver, British Columbia, Canada",June,2024,Position Title: IT Manager\n\nLocation: Vancou...
1,"Manager, IT Support",Procom,"Toronto, Ontario, Canada",June,2024,"On behalf of our public sector client, PROCOM ..."
2,Director of Information Technology,Southampton Financial Inc,"Toronto, Ontario, Canada",June,2024,Southampton Financials’ Mission: Bring Clarity...
3,"Manager, Information Technology Services",Town of Tillsonburg,"Tillsonburg, Ontario, Canada",June,2024,The Town of Tillsonburg is looking for a Manag...
4,Systems Manager,Accor,"Winnipeg, Manitoba, Canada",June,2024,"Company Description\n\n""Why work for Accor?""\n..."


In [None]:
# Keep only the first 500 offers. `.copy()` detaches the result from the full
# frame so that adding a column to it later does not raise pandas'
# SettingWithCopyWarning.
df = df.head(500).copy()

## Generative AI processing

In [None]:
# System prompt for the LLM: it describes the JSON object to produce for each
# job description. The per-row result is stored in a new DataFrame column.

system_message_prompt = """
You are an AI assistant that converts job descriptions into structured JSON data to assist recruiters. Your task is to extract key details from a given job description and format them into a JSON object with the following structure:

{
  "role_summary": "A concise, non-technical summary of the job role. It should describe the primary responsibilities of the role in simple language, avoiding jargon. Focus on what the role does rather than listing requirements.",
  "key_terms": [
    {
      "term": "Technical Term or jargon from the job description",
      "explanation": "A simple explanation of the term in the context of the role. This helps recruiters understand technical jargon without needing domain expertise."
    }
  ],
  "skill_priorities": {
    "must_have": ["List of essential skills required for the role. These are non-negotiable and should be explicitly mentioned in the job description."],
    "nice_to_have": ["List of preferred skills that are beneficial but not mandatory. These are often marked as 'preferred,' 'a plus,' or 'optional' in the job description."]
  },
  "proposed_screening_questions_with_answers": [
       {
           "question": "A role-specific question to assess candidate expertise...",
           "example_answer": "An example of a strong candidate response..."
       },
       {
           "question": "Another relevant question...",
           "example_answer": "A strong answer to the second question..."
       }
  ],
  "red_flags": [
    "Indicators of potential mismatches for the role, offering actionable insights for recruiters based on the job offer. Examples include 'Avoid candidates without cloud experience'."
  ],
  "confidence_score": "A numerical score between 0 and 100 indicating the model's confidence in the accuracy of its analysis."
}

Guidelines:
- Ensure the role summary is written in simple, non-technical language, focusing on primary responsibilities rather than qualifications.
- Extract key technical terms and provide clear, non-technical explanations.
- Identify and categorize skills into 'must-have' and 'nice-to-have' based on explicit mentions in the job description.
- Generate screening questions that assess relevant competencies and provide an example of a strong answer.
- Highlight red flags that indicate potential mismatches, ensuring they are actionable.
- Assign a confidence score (0-100) reflecting the reliability of the extracted insights.

Output only the JSON object without additional text.

"""

In [None]:
!pip install groq



In [None]:
import re
import groq
import os
import json
import time
from typing import List, Optional
import json
from pydantic import BaseModel
from google.colab import userdata
from pprint import pprint

In [None]:
# Model identifier passed as `model` on every chat-completion request.
llama3_model = 'llama3-70b-8192'

In [None]:
# Fetch the three Groq API keys from the Colab secret store, in order, and
# collect them into the list that the retry logic rotates through.
key_1, key_2, key_3 = (
    userdata.get(f'groq_key_{slot}') for slot in (1, 2, 3)
)

api_keys = [key_1, key_2, key_3]

In [None]:
from typing import List, Optional
from pydantic import BaseModel

# One entry of `key_terms`: a technical term from the job description paired
# with a plain-language explanation for recruiters.
class KeyTerm(BaseModel):
    term: str  # the technical term or jargon as it appears in the posting
    explanation: str  # simple explanation of the term in the context of the role

# Skills extracted from the posting, split by how mandatory they are.
class SkillPriorities(BaseModel):
    must_have: List[str]  # essential, non-negotiable skills
    nice_to_have: List[str]  # preferred / optional skills

# One proposed screening question together with a sample strong answer.
class ScreeningQuestion(BaseModel):
    question: str  # role-specific question for the candidate
    example_answer: str  # example of a strong candidate response

# Top-level structure the LLM response is validated against. Its JSON schema
# is also embedded in the system message sent to the model.
# NOTE(review): documented with `#` comments on purpose; a class docstring is
# presumably emitted as a "description" in `model_json_schema()` and would
# change the prompt — confirm before converting to docstrings.
class JobAnalysis(BaseModel):
    role_summary: str  # concise, non-technical summary of the role
    key_terms: List[KeyTerm]  # jargon with explanations
    skill_priorities: SkillPriorities  # must-have vs nice-to-have skills
    proposed_screening_questions_with_answers: List[ScreeningQuestion]
    red_flags: List[str]  # indicators of a potential candidate mismatch
    confidence_score: float  # prompt asks for 0-100; the range is not enforced here

In [None]:
from pydantic import ValidationError
from groq import Groq, APIError, APIConnectionError, RateLimitError
from pydantic import ValidationError
from typing import Union

def analyze_job_with_retries(
    job_description: str,
    prompt: str,
    model_name: str,
    api_keys: list,
    max_retries: int = 3,
    initial_retry_delay: float = 1.0
) -> str:
    """
    Analyze a job description with the LLM, rotating API keys on failure.

    Each failed request (API error, or a response that does not validate
    against ``JobAnalysis``) counts as one attempt. After a failure the next
    key in ``api_keys`` is tried following an exponentially growing delay.

    Args:
        job_description: The job description text to analyze.
        prompt: System prompt for the AI model.
        model_name: Name of the LLM model to use.
        api_keys: List of API keys to rotate through.
        max_retries: Maximum number of failed attempts across all keys.
        initial_retry_delay: Delay before the first retry, in seconds; doubled
            after every failure.

    Returns:
        str: The validated analysis serialized as a JSON string
        (``JobAnalysis.model_dump_json()``), or the sentinel string 'Error'
        if all attempts fail or ``api_keys`` is empty.
    """
    if not api_keys:
        # With no keys the inner loop would never execute and the attempt
        # counter would never advance, so the outer loop would spin forever.
        print("All API keys and retries exhausted. Returning error response.")
        return 'Error'

    # The request payload does not depend on the key, so build it once.
    messages = [
        {
            "role": "system",
            "content": f"{prompt}\n"
                       f"The JSON object must use the schema: {json.dumps(JobAnalysis.model_json_schema(), indent=2)}",
        },
        {
            "role": "user",
            "content": f"Analyze this job description:\n\n{job_description}",
        },
    ]

    retry_delay = initial_retry_delay
    total_attempts = 0

    while total_attempts < max_retries:
        for i, api_key in enumerate(api_keys):
            client = Groq(api_key=api_key)
            try:
                chat_completion = client.chat.completions.create(
                    messages=messages,
                    model=model_name,
                    temperature=0,
                    response_format={"type": "json_object"},
                )
                job_object = JobAnalysis.model_validate_json(
                    chat_completion.choices[0].message.content
                )
                # Report success only once the response has passed validation.
                print(f"Success from {model_name} using API key {i + 1}")
                return job_object.model_dump_json()

            except Exception as e:
                if isinstance(e, (APIError, APIConnectionError, RateLimitError)):
                    print(f"API error with key {i + 1}: {str(e)}. Rotating to next key...")
                else:
                    print(f"Unexpected error: {str(e)}. Rotating to next key...")
                total_attempts += 1
                if total_attempts >= max_retries:
                    break  # Attempt budget spent; skip the pointless final sleep
                time.sleep(retry_delay)
                retry_delay *= 2  # Exponential backoff

    print("All API keys and retries exhausted. Returning error response.")
    return 'Error'  # Sentinel checked by callers

In [None]:
# Example usage: run the analysis once on a sample job posting.
job_desc = """
Note: By applying to this position you will have an opportunity to share your preferred working location from the following: Cambridge, MA, USA; Atlanta, GA, USA; Austin, TX, USA; Seattle, WA, USA.Minimum qualifications:

Master's degree in a quantitative discipline such as Statistics, Engineering, Sciences, or equivalent practical experience
4 years of experience using analytics to solve product or business problems, coding (e.g., Python, R, SQL), querying databases or statistical analysis.
4 years of experience in business visualization tools like Looker, Tableau, etc.

Preferred qualifications:

Master's degree in Computer Science, Economics, or Mathematics, or a related field.
5 years of work experience in data science or quantitative analytics with focus on statistical modeling, Machine Learning, and AI.
Experience with both SQL and Python.
Experience with Machine Learning, AI or AI Pipeline.
Experience in Google Cloud Platform.

About The Job

Cloud Learning Services (CLS) is revolutionizing direct cloud learning. We empower users of all levels with interactive labs and guided experiences to build practical skills on Google Cloud Platform and other leading technologies. Our mission is to make the cloud accessible, engaging, and enjoyable to learn.

As a Business Data Scientist, you will play a key role in uncovering valuable insights from diverse data sources to solve critical business tests. You will address a wide range of exciting projects, from establishing new measurement frameworks to identifying meaningful patterns in large datasets, ultimately empowering stakeholders to make data-driven decisions.

In this role, you will need to be a detail-oriented problem-solver with a strong foundation in data analytics, data visualization, and Artificial Intelligence/Machine Learning. You will solve the real-world problems and a commitment to continuous learning. Exceptional communication and stakeholder management skills are essential, as you will collaborate extensively with cross-functional teams. If you succeed in dynamic environments and are eager to make a real impact with data, we encourage you to apply!

The US base salary range for this full-time position is $166,000-$244,000 + bonus + equity + benefits. Our salary ranges are determined by role, level, and location. Within the range, individual pay is determined by work location and additional factors, including job-related skills, experience, and relevant education or training. Your recruiter can share more about the specific salary range for your preferred location during the hiring process.

Please note that the compensation details listed in US role postings reflect the base salary only, and do not include bonus, equity, or benefits. Learn more about benefits at Google .

Responsibilities

Extract actionable insights from large, complex datasets and build data products like dashboards to operationalize them, driving measurable improvements in Key Performance Indicators (KPIs).
Present and communicate actionable insights and recommendations to executives, leaders, and cross-functional partners, including Product, Engineering, and Marketing teams.
Serve as a peer-reviewer and consultant to other members of the team, fostering a collaborative and knowledge-sharing environment.
Learn and share knowledge of the latest advancements in AI/ML and data science that are relevant to our work.

"""

analysis = analyze_job_with_retries(job_desc, system_message_prompt, llama3_model, api_keys)


# `analysis` is a JSON string on success or the sentinel 'Error' on failure.
# Decode it first so pprint shows the nested structure rather than wrapped
# fragments of one long string.
if analysis == 'Error':
    print("Analysis of the example job description failed.")
else:
    pprint(json.loads(analysis))


Success from llama3-70b-8192 using API key 1
('{"role_summary":"As a Business Data Scientist, you will uncover valuable '
 'insights from diverse data sources to solve critical business tests, '
 'empowering stakeholders to make data-driven '
 'decisions.","key_terms":[{"term":"Machine Learning","explanation":"A type of '
 'Artificial Intelligence that enables computers to learn from data without '
 'being explicitly programmed."},{"term":"Artificial '
 'Intelligence","explanation":"The development of computer systems that can '
 'perform tasks that typically require human intelligence, such as visual '
 'perception, speech recognition, and decision-making."},{"term":"Google Cloud '
 'Platform","explanation":"A suite of cloud computing services offered by '
 'Google, including data analytics, machine learning, and '
 'storage."},{"term":"SQL","explanation":"A programming language designed for '
 'managing and manipulating data in relational database management '
 'systems."},{"term":"P

In [None]:
from tqdm.notebook import tqdm
import time

def process_dataframe(df, system_message_prompt, llama3_model, api_keys):
    """
    Run the LLM analysis on every row of ``df`` and attach the results.

    Args:
        df: DataFrame with a 'Details' column (text sent to the model) and a
            'Position' column (used in progress messages).
        system_message_prompt: System prompt forwarded to the model.
        llama3_model: Model name forwarded to the API.
        api_keys: List of API keys to rotate through.

    Returns:
        A copy of ``df`` with an added 'parsed_description' column holding
        the JSON string returned by ``analyze_job_with_retries``, or None for
        rows where it returned 'Error'. The input frame is left unmodified.
    """
    tqdm.pandas(desc="Processing job descriptions")

    def safe_analyze(row):
        """Analyze one row; map the 'Error' sentinel to None."""
        job_desc = row['Details']
        position = row['Position']
        print(f"Processing position: {position}")

        result = analyze_job_with_retries(job_desc, system_message_prompt, llama3_model, api_keys)

        if result == 'Error':
            print(f"Failed to analyze position: {position}")
            return None  # Shows up as a missing value in the new column
        return result  # JSON string of the validated analysis

    # Work on a copy: the caller's frame may be a slice of a larger one, and
    # assigning a column to it in place triggers SettingWithCopyWarning.
    result_df = df.copy()
    result_df['parsed_description'] = result_df.progress_apply(safe_analyze, axis=1)
    return result_df

# Run the analysis over every row of `df`.
analysed_df = process_dataframe(
    df=df,
    system_message_prompt=system_message_prompt,
    llama3_model=llama3_model,
    api_keys=api_keys,
)

Processing job descriptions:   0%|          | 0/500 [00:00<?, ?it/s]

Processing position: IT Manager
Success from llama3-70b-8192 using API key 1
Processing position: Manager, IT Support
API error with key 1: Error code: 400 - {'error': {'message': "Failed to generate JSON. Please adjust your prompt. See 'failed_generation' for more details.", 'type': 'invalid_request_error', 'code': 'json_validate_failed', 'failed_generation': '{\n   "role_summary": "The Manager, IT Support is a leadership role responsible for managing the daily operations of IT Support, ensuring effective vendor governance, and achieving premium customer service and operational excellence.",\n   "key_terms": [\n      {\n         "term": "ITIL",\n         "explanation": "ITIL is a framework for IT service management that provides best practices for delivering high-quality IT services."\n      },\n      {\n         "term": "ITSM",\n         "explanation": "ITSM is a set of policies, procedures, and processes for managing IT services, focusing on delivering value to customers."\n      },

In [None]:
# Preview the first rows together with the new `parsed_description` column.
# NOTE(review): the saved output shows row labels 0, 3, 6, 8, 10, which
# suggests this cell was last run after the later `dropna()` cell — confirm
# with Restart & Run All.
analysed_df.head()

Unnamed: 0,Position,Company_Name,Location,Post_Month,Post_Year,Details,parsed_description
0,IT Manager,10 Percent Recruiting Ltd.,"Vancouver, British Columbia, Canada",June,2024,Position Title: IT Manager\n\nLocation: Vancou...,"{""role_summary"":""The IT Manager will oversee I..."
3,"Manager, Information Technology Services",Town of Tillsonburg,"Tillsonburg, Ontario, Canada",June,2024,The Town of Tillsonburg is looking for a Manag...,"{""role_summary"":""The Manager, Information Tech..."
6,Information Technology (IT) Manager,Alpha Auto Group,"Ottawa, Ontario, Canada",June,2024,Job Description\n\nInterested in developing or...,"{""role_summary"":""The Information Technology Ma..."
8,IT Support Manager,Logistec Corporation,"Montreal, Quebec, Canada",June,2024,CAREER OPPORTUNITY AT LOGISTEC\n\nLOGISTEC off...,"{""role_summary"":""Oversee the day-to-day operat..."
10,IT Service Center Director,Leclerc Foods,"Brockville, Ontario, Canada",June,2024,Leclerc is a family business with 117 years of...,"{""role_summary"":""Manage the IT Service Center,..."


In [None]:
# Display (rows, columns) after adding `parsed_description`.
analysed_df.shape

(500, 7)

In [None]:
# Count missing values per column; a missing `parsed_description` marks a
# job whose analysis failed.
analysed_df.isna().sum()

Unnamed: 0,0
Position,0
Company_Name,0
Location,0
Post_Month,0
Post_Year,0
Details,0
parsed_description,130


In [None]:
# Drop only the rows whose analysis failed. A bare dropna() would also discard
# rows that merely lack a value in some other metadata column.
analysed_df = analysed_df.dropna(subset=['parsed_description'])

In [None]:
# Display (rows, columns) after dropping rows with missing values.
analysed_df.shape

(370, 7)

In [None]:
# Write the analysed DataFrame to CSV without the index column.
output_csv_path = 'analysed_jobs_with_analysis.csv'
analysed_df.to_csv(output_csv_path, index=False)