## First, Load the Data

In [41]:
import pandas as pd
import numpy as np
# Read the flagged job descriptions from disk, then wrap the result in a DataFrame.
jobs = pd.read_csv("Descriptions_and_roles_AI_flagged.csv")
jobs_df = pd.DataFrame(data=jobs)
# Keep only the free-text column and limit the batch to the first three rows.
jobs_df_des = jobs_df.loc[:, ['description']].head(3)
# Serialise the batch as JSONL: one {"description": ...} object per line.
batch_jsonl = jobs_df_des.to_json(lines=True, orient="records")
print(batch_jsonl)

{"description":"\u2022\tIdentify and fix over 400 disruptive transportation shipments per day to increase trucking customer\u2019s revenue and Shiplify's efficiency\u2022\tHelped increase from a 93% success rate to 96% success rate over 3 months for the machine learning software which calculates over 300,000 shipments per day"}
{"description":"Digital Self-Service Launch: Successfully launched over 90% Straight-Through Processing (STP) menus for self-service, integrating Chat-bot technology across Thailand, Indonesia and Singapore.Operational Efficiency: Achieved a 15% reduction in agent inquiries through increased deflection rate, contributing to improved operational efficiency.Strategic Partnerships: Drove strategic partnerships with Fintechs like Avatech.AI,  Walkme, Bancassurance fintechs."}
{"description":"- Prototyped AI Chatbot and Agents ingested with vector-embedded knowledge base and production feature store using Anthropic Claude and AWS Bedrock.- Built Urgently\u2019s award

In [113]:
import openai
from openai import OpenAI
import os
import boto3
import json
import re
from botocore.exceptions import ClientError

# Credentials come from the environment; each name is None when its variable is unset.
descriptions_token = os.environ.get('description_API')
AWS_API = os.environ.get('AWS_BEARER_TOKEN_BEDROCK')

# Collects lists of role labels; a later cell appends to it.
AI_roles = list()

In [121]:
# Prompts modified from Gmyrek et al., and Hosseini and Lichtinger.
#
# NOTE: core_prompt is an f-string, so batch_jsonl is interpolated once, at the
# moment this cell runs. Re-run this cell after changing batch_jsonl, otherwise
# the prompt keeps carrying the previous batch.

# System message: sets the classifier persona and forbids commentary.
root_prompt = """Act as a precise classifier for LinkedIn job descriptions. Follow instructions exactly and do not add any commentary or reasoning."""

# User message: category definitions, scope exclusions, edge rules, the output
# format, and finally the batch of descriptions. The label spelled in the edge
# rule ("BOTH") matches the allowed output labels, and all quotes are plain
# ASCII so the model sees one consistent label vocabulary.
core_prompt = f"""
You will be given job descriptions in JSON format.
We distinguish three categories:
A) LLM INTEGRATOR = descriptions of positions that build/operate LLM-powered systems or embed LLMs into workflows.
Signals: RAG (retrieval-augmented generation), embeddings/vector DB (FAISS/Milvus/Pinecone), prompt engineering at system level,
orchestration/agents/LLMOps, LangChain/LlamaIndex, fine-tuning/adapters, model serving/inference,
evaluation/guardrails/red-teaming, API integration of LLMs into products or internal processes.
B) LLM USER = descriptions of positions that primarily use LLM tools (ChatGPT, Gemini, Copilot, etc.) to perform tasks
such as drafting, summarizing, coding assistance, customer responses — without building systems.
C) SIGNALING = descriptions of positions that vaguely mention LLMs or AI terminology without actually using or integrating it.

NOT in-scope for integrator unless integration is explicit:
– Foundation-model pretraining/research scientist roles at model labs (OpenAI/DeepMind/etc.).
– Generic ML/NLP with no explicit LLM signals.
– Pure labeling/annotation.

Edge rules:
– If both integration and user aspects appear, set the AI Role to "BOTH".
– If acronyms like "RAG" or "AI" appear, assume the LLM meaning unless context contradicts.

Output a string of AI Roles, one for each description in the order given, separated by commas,
where AI Role → "LLM INTEGRATOR", "LLM USER", "BOTH", or "SIGNALING". Only use the descriptions to make the classification.

Do not add extra keys or commentary outside the output.

Here are the job descriptions:

{batch_jsonl}

"""

In [82]:
# Make the API call to Hugging Face model. Define a function for this
def description_AIrole_labeling(root_prompt, core_prompt, temp):
    """Send the classification prompts to the Hugging Face router endpoint.

    Args:
        root_prompt: Text sent as the "system" message.
        core_prompt: Text sent as the "user" message.
        temp: Value forwarded as the request's ``temperature``.

    Returns:
        The message content of the first choice in the chat completion.
    """
    # Conversation payload: system instructions first, then the user content.
    chat = [
        dict(role="system", content=root_prompt),
        dict(role="user", content=core_prompt),
    ]

    # The OpenAI client is pointed at the Hugging Face router base URL.
    hf_client = OpenAI(
        api_key=descriptions_token,
        base_url="https://router.huggingface.co/v1",
    )

    completion = hf_client.chat.completions.create(
        model="mistralai/Mistral-7B-Instruct-v0.2:featherless-ai",
        messages=chat,
        temperature=temp,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [122]:
# Make the API call to AWS GPT model.
def description_AIrole_labeling_AWS(root_prompt, core_prompt):
    """Send the classification prompts to the gpt-oss model via Bedrock Runtime.

    Args:
        root_prompt: Text sent as the "system" message.
        core_prompt: Text sent as the "user" message.

    Returns:
        The first choice's message content with any
        ``<reasoning>...</reasoning>`` span removed and surrounding whitespace
        stripped.
    """
    # Request body; key order matches what is serialised on the wire.
    payload = {
        "messages": [
            {"role": "system", "content": root_prompt},
            {"role": "user", "content": core_prompt},
        ],
        "max_completion_tokens": 2132,
        "temperature": 0,
        "top_p": 0.5,
        "reasoning_effort": "low",
    }

    bedrock = boto3.client('bedrock-runtime')
    raw_reply = bedrock.invoke_model(
        modelId='openai.gpt-oss-120b-1:0',
        body=json.dumps(payload),
    )

    # The body is a byte stream: read it, decode as UTF-8, then parse the JSON.
    parsed = json.loads(raw_reply['body'].read().decode('utf-8'))
    content = parsed['choices'][0]['message']['content']

    # Remove reasoning spans (DOTALL lets them cover several lines), then trim.
    without_reasoning = re.sub(
        r"<reasoning>.*?</reasoning>", "", content, flags=re.DOTALL
    )
    return without_reasoning.strip()

In [123]:
# Classify the current batch and show the raw model reply.
result = description_AIrole_labeling_AWS(root_prompt, core_prompt)
print(result)
# Split on commas and trim every label: a reply such as "SIGNALING, LLM USER"
# would otherwise produce ' LLM USER', which does not match the label set.
roles = [label.strip() for label in result.split(",")]

SIGNALING,LLM INTEGRATOR,LLM INTEGRATOR


In [124]:
# Start a fresh collection whose only entry is the current batch's label list.
AI_roles = [roles]
AI_roles

[['SIGNALING', 'LLM INTEGRATOR', 'LLM INTEGRATOR']]

In [125]:
from itertools import chain
# Flatten the per-batch label lists into a single list of labels.
list(chain(*AI_roles))

['SIGNALING', 'LLM INTEGRATOR', 'LLM INTEGRATOR']