## First Load in the Data

In [13]:
import pandas as pd
import numpy as np
jobs = pd.read_csv("Descriptions_and_roles_AI_flagged.csv")
jobs_df = pd.DataFrame(jobs)
jobs_df_des = jobs_df[['description']]
# Convert to JSONL (newline-delimited JSON objects)
batch_jsonl = jobs_df_des.to_json(orient="records", lines=True)
print(batch_jsonl)

{"description":"\u2022\tIdentify and fix over 400 disruptive transportation shipments per day to increase trucking customer\u2019s revenue and Shiplify's efficiency\u2022\tHelped increase from a 93% success rate to 96% success rate over 3 months for the machine learning software which calculates over 300,000 shipments per day"}
{"description":"Digital Self-Service Launch: Successfully launched over 90% Straight-Through Processing (STP) menus for self-service, integrating Chat-bot technology across Thailand, Indonesia and Singapore.Operational Efficiency: Achieved a 15% reduction in agent inquiries through increased deflection rate, contributing to improved operational efficiency.Strategic Partnerships: Drove strategic partnerships with Fintechs like Avatech.AI,  Walkme, Bancassurance fintechs."}
{"description":"- Prototyped AI Chatbot and Agents ingested with vector-embedded knowledge base and production feature store using Anthropic Claude and AWS Bedrock.- Built Urgently\u2019s award

## Import Libraries needed for using AWS Bedrock and Fetch API

In [14]:
import os
import boto3
import json
import re
from botocore.exceptions import ClientError

AWS_API = os.getenv('AWS_BEARER_TOKEN_BEDROCK')

## Prompts

In [28]:
# Prompts modified from Gmyrek et al, and Hosseini and Lichtinger 

root_prompt = """Act as a precise classifier for LinkedIn job descriptions. Follow instructions exactly and do not add any commentary or reasoning."""

core_prompt = f"""
You will be given 100 job descriptions in JSON format, some of which will be in different languages other than English. 
We distinguish three categories: 
A) Generative AI INTEGRATOR = descriptions of positions that build/operate LLM/Gen AI-powered systems or embed LLMs into workflows. 
B) Generative AI USER = descriptions of positions that primarily use LLM/Gen AI tools (ChatGPT, Gemini, Copilot, etc.) to perform tasks 
such as drafting, summarizing, coding assistance, customer responses —without building systems. 
C) SIGNALING = descriptions of positions that vaguely mention LLMs or AI terminology without actually using or integrating it.


NOT in-scope for integrator unless integration is explicit: 
– Foundation-model pretraining/research scientist roles at model labs (OpenAI/DeepMind/etc.). 
– Generic ML/NLP with no explicit LLM signals. 
– Pure labeling/annotation. 

Edge rules: 
– If both integration and user aspects appear, set role type= "both”. 
– If acronyms like “RAG” or "AI" appear, assume the LLM meaning unless context contradicts. 

Return the results as a json array of exactly 100 strings for the AI role classification you give to each of the descriptions. 
where AI Role → "AI INTEGRATOR", "AI USER", "BOTH", or "SIGNALING". Only use the descriptions to make the classification. 

The array must have length 100. Do not add extra keys or commentary outside the output.

Here are the job descriptions:

{batch_jsonl}

"""

## Invoke the Model

In [29]:
# Make the API call to AWS GPT model.
def description_AIrole_labeling_AWS(root_prompt, core_prompt):
    
    # Initialize the Bedrock Runtime client
    client = boto3.client('bedrock-runtime', region_name = "us-east-1")
    
    # Model ID
    model_id = 'openai.gpt-oss-120b-1:0'
    
    # Create the request body
    request = {
      "messages": [
        {"role": "system", "content": root_prompt},
        { "role": "user","content": core_prompt}
      ],
      "max_completion_tokens": 8192,
      "temperature": 0,
      "top_p": 0.5,
      #"reasoning_effort": "low",
    }
    
    # Make the InvokeModel request
    response = client.invoke_model(
        modelId=model_id,
        body=json.dumps(request)
    )
    
    # Parse the message in the chat completion
    response_body = json.loads(response['body'].read().decode('utf-8'))
    response = response_body['choices'][0]['message']['content']
    # Remove the reasoning section of the response to get just the output
    #response = re.sub(r"<reasoning>.*?</reasoning>", "", response, flags=re.DOTALL).strip()
    
    return response

In [30]:
# Print the response
result = description_AIrole_labeling_AWS(root_prompt, core_prompt)
print(result)
roles = result.split(",")

<reasoning>We need to classify each of the 100 descriptions into AI INTEGRATOR, AI USER, BOTH, SIGNALING. Need to output array of 100 strings.

We must count 100 entries. Let's enumerate them in order given.

I'll list each description with index.

1: {"description":"\u2022\tIdentify and fix over 400 disruptive transportation shipments per day... machine learning software..."} This is about using ML software to improve shipping. No integration of LLM. It's using ML system, not LLM. It's a user of ML? It's building ML? It says "machine learning software which calculates over 300,000 shipments per day". They helped increase success rate. Likely they are using ML tool, not building LLM. Category: AI USER? It's generic ML, not LLM. But rule: Not integrator unless integration explicit. This is using ML software, not LLM. So AI USER? It's using ML, but not LLM. The categories are about generative AI LLM. This description doesn't mention LLM or generative AI. Might be SIGNALING? It mentions "

In [31]:
# Clean response to remove the reasoning and just show output
response = re.sub(r"<reasoning>.*?</reasoning>", "", result, flags=re.DOTALL).strip()
roles = json.loads(response)

In [36]:
# Create a data frame from the output
import pandas as pd
AI_roles_df = pd.DataFrame(roles, columns = ['AI_role'])
AI_roles_df

Unnamed: 0,AI_role
0,SIGNALING
1,AI INTEGRATOR
2,AI INTEGRATOR
3,SIGNALING
4,SIGNALING
...,...
95,SIGNALING
96,SIGNALING
97,SIGNALING
98,AI INTEGRATOR


In [38]:
# Join output df with original
jobs_AI_role = jobs_df.join(AI_roles_df)
# Save to a csv file
jobs_AI_role.to_csv("jobs_AI_role.csv", index = False)

In [21]:
# Add the output roles to a list
AI_roles = []
AI_roles.append(roles)
AI_roles

[['SIGNALING',
  'AI INTEGRATOR',
  'AI INTEGRATOR',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'AI INTEGRATOR',
  'SIGNALING',
  'AI USER',
  'SIGNALING',
  'SIGNALING',
  'AI USER',
  'AI USER',
  'AI INTEGRATOR',
  'SIGNALING',
  'AI USER',
  'AI USER',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'AI INTEGRATOR',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'AI USER',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'AI USER',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'BOTH',
  'AI INTEGRATOR',
  'AI USER',
  'AI INTEGRATOR',
  'AI INTEGRATOR',
  'SIGNALING',
  'AI INTEGRATOR',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'SIGNALING',
  'AI USER',
  'SIGNALING',
  'AI USER',
  'BOTH',
  'S

In [22]:
# Flatten out the list
from itertools import chain
AI_roles = list(chain.from_iterable(AI_roles))
AI_roles

['SIGNALING',
 'AI INTEGRATOR',
 'AI INTEGRATOR',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'AI INTEGRATOR',
 'SIGNALING',
 'AI USER',
 'SIGNALING',
 'SIGNALING',
 'AI USER',
 'AI USER',
 'AI INTEGRATOR',
 'SIGNALING',
 'AI USER',
 'AI USER',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'AI INTEGRATOR',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'AI USER',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'AI USER',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'BOTH',
 'AI INTEGRATOR',
 'AI USER',
 'AI INTEGRATOR',
 'AI INTEGRATOR',
 'SIGNALING',
 'AI INTEGRATOR',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'SIGNALING',
 'AI USER',
 'SIGNALING',
 'AI USER',
 'BOTH',
 'SIGNALING',
 'AI USER',
 'AI USER',
 'SIGNALING',
 'SIGNALING',
 'SI

In [27]:
#AI_roles[-1] = AI_roles[-1].replace(')', '')
AI_roles[59]

'SIGNALING'

In [20]:
AI_roles_array = np.array(AI_roles).reshape((len(AI_roles) ,1))

In [21]:
AI_roles_array.shape

(104, 1)

In [21]:
roles = pd.DataFrame(data = AI_roles, columns = {'AI_roles': np.arange(len(AI_roles))})
roles

ValueError: 1 columns passed, passed data had 110 columns