In [3]:
# install openai library
!pip install openai -qq

In [6]:
# dependencies
import os
import re
import time
import numpy as np
import pandas as pd
import asyncio
import nest_asyncio
import httpx
from openai import AsyncOpenAI
from google.colab import drive, userdata

# async fix for notebook
nest_asyncio.apply()

# file management
drive.mount('/content/drive')
WORK_DIR = '/content/drive/MyDrive/Projects/skillextraction'

# work dir shortcut function
def work_dir(*args):
    return os.path.join(WORK_DIR, *args)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# openai client for async
client = AsyncOpenAI(api_key=userdata.get('OpenAI'), http_client=httpx.AsyncClient())

In [10]:
# prompt to completion (api response)
async def get_completion(prompt):
    response = await client.chat.completions.create(
        model='gpt-4o-mini',

        messages=[{'role': 'user', 'content': prompt}]
    )
    return response

# prompts to completions (api responses)
async def get_completions(prompts):
    tasks = [get_completion(prompt) for prompt in prompts]
    return await asyncio.gather(*tasks)

In [11]:
# prompt template
prompt = '''
Your task is to generate 10 DIFFERENT and UNRELATED sentences from hypothetical but REALISTIC job ads in DANISH.

You are given one (1) MAIN skill/competence/knowledge and ten (10) ADDITIONAL skills/competences/knowledges.

MAIN
1. {label}

ADDITIONAL
{related_labels}

The MAIN should be requested in each of the ten (10) sentences, and the ADDITIONAL should be requested in one (1) sentence each, in the order they are given.

Each sentence should stand on its own using DIFFERENT SYNONYMS, ACRONYMS, PHRASINGS, CASINGS and GRAMMAR to precisely express the requested skills/competences/knowledges.
Separate the sentences by a single line break.
Output nothing else. There should be ten (10) lines with one (1) sentence each.
'''.strip()

In [None]:
# load ESCO skills and generated relations
skills = pd.read_csv(work_dir('ESCO', 'ESCO dataset - v1.1.2 - classification - da - csv', 'skills_da.csv'))
relations = pd.read_csv(work_dir('Data', 'related_skills.csv'), index_col=0)

# get labels and descriptions for skill in question
relations['conceptUri'] = relations.index
relations['label'] = relations.index.map(skills.set_index('conceptUri')['preferredLabel'])
relations['description'] = relations.index.map(skills.set_index('conceptUri')['description'])

# get labels for related skills
for n in range(1, 11):
    relations[f'label_{n}'] = relations[f'related_{n}'].map(skills.set_index('conceptUri')['preferredLabel'])

# drop any NA
relations.dropna(inplace=True)

# generate prompt for each skill
relations['prompt'] = relations.apply(lambda row: prompt.format(label=row['label'], description=row['description'], related_labels='\n'.join(str(n) + '. ' + row[f'label_{n}'] for n in range(1, 11))), axis=1)

# separate in chunks of s
s = 100
chunks = np.array_split(relations, np.arange(s, len(relations), s))

  return bound(*args, **kwds)


In [None]:
print(relations.iloc[0]['prompt'])

Your task is to generate 10 DIFFERENT and UNRELATED sentences from hypothetical but REALISTIC job ads in DANISH.

You are given one (1) MAIN skill/competence/knowledge and ten (10) ADDITIONAL skills/competences/knowledges.

MAIN
1. lede musikalsk personale

ADDITIONAL
1. planlægge musikoptræden
2. supervisere musikgrupper
3. positionere musikere
4. studere musikpartiturer
5. tilstræbe toppræstationer inden for musikoptræden
6. musikteori
7. evaluere musikidéer
8. vælge musikere
9. udvikle musikidéer
10. musikgenrer

The MAIN should be requested in each of the ten (10) sentences, and the ADDITIONAL should be requested in one (1) sentence each, in the order they are given.

Each sentence should stand on its own using DIFFERENT SYNONYMS, ACRONYMS, PHRASINGS, CASINGS and GRAMMAR to precisely express the requested skills/competences/knowledges.
Separate the sentences by a single line break.
Output nothing else. There should be ten (10) lines with one (1) sentence each.


In [None]:
# mapping additional conceptUri to sentence
def map_completion_to_related(row_idx, row, completion):
    if len(completion) != 10:
        return pd.DataFrame()
    return pd.DataFrame({
        'conceptUriPrimary': row['conceptUri'],
        'conceptUriSecondary': [row[f'related_{i+1}'] for i in range(len(completion))],
        'sentence': completion
    })

# get completions for each chunk
for i, chunk in enumerate(chunks):
    filename = os.path.join(WORK_DIR, 'Data', 'multi_sentences_{}.csv'.format(i))
    if os.path.exists(filename):
        continue
    completions = await get_completions(chunk['prompt'].values)
    completions = [[t for s in c.choices[0].message.content.split('\n') if (t := re.sub(r'^[\W0-9]+', '', s).strip()) != ''] for c in completions]
    completions = pd.concat([map_completion_to_related(idx, row, comp)
                             for (idx, row), comp in zip(chunk.iterrows(), completions)])
    completions.to_csv(filename, index=False)

In [8]:
pd.set_option('display.max_colwidth', None)
pd.read_csv(os.path.join(WORK_DIR, 'Data', 'multi_sentences_{}.csv'.format(0))).head(10)

Unnamed: 0,conceptUriPrimary,conceptUriSecondary,sentence
0,http://data.europa.eu/esco/skill/0005c151-5b5a-4a66-8aac-60e734beb1ab,http://data.europa.eu/esco/skill/4109c79f-0332-498d-a967-b6d22761c639,Vi søger en leder til at styre musikalsk personale og sikre en vellykket planlægning af musikoptræden.
1,http://data.europa.eu/esco/skill/0005c151-5b5a-4a66-8aac-60e734beb1ab,http://data.europa.eu/esco/skill/339f165c-0002-47d2-91a6-ca4722ec682f,Som koordinator skal du lede musikalsk personale og supervisere vores forskellige musikgrupper effektivt.
2,http://data.europa.eu/esco/skill/0005c151-5b5a-4a66-8aac-60e734beb1ab,http://data.europa.eu/esco/skill/aa755e1d-81cd-498b-bd0e-99d393b9aeb4,"Du skal lede musikalsk personale, mens du positionerer de relevante musikere for optimal lyd."
3,http://data.europa.eu/esco/skill/0005c151-5b5a-4a66-8aac-60e734beb1ab,http://data.europa.eu/esco/skill/3e2d3720-84e3-4b4f-84db-013fe1fea42f,I denne rolle vil du lede musikalsk personale og studere musikpartiturer grundigt for at forbedre optrædenerne.
4,http://data.europa.eu/esco/skill/0005c151-5b5a-4a66-8aac-60e734beb1ab,http://data.europa.eu/esco/skill/fbb9ceec-26c2-417c-b849-bf613a7b3e9f,Som musikchef er det vigtigt at lede musikalsk personale og tilstræbe toppræstationer inden for musikoptræden.
5,http://data.europa.eu/esco/skill/0005c151-5b5a-4a66-8aac-60e734beb1ab,http://data.europa.eu/esco/skill/d11e4683-d2fe-45fd-bb1a-e995f7877851,"Vi er på udkig efter en person med dybdegående kendskab til musikteori, som kan lede musikalsk personale."
6,http://data.europa.eu/esco/skill/0005c151-5b5a-4a66-8aac-60e734beb1ab,http://data.europa.eu/esco/skill/e3840f20-1928-4d07-944c-d2dd2ae5cbba,Din opgave vil være at evaluere musikidéer og lede musikalsk personale i udførelsen af projekter.
7,http://data.europa.eu/esco/skill/0005c151-5b5a-4a66-8aac-60e734beb1ab,http://data.europa.eu/esco/skill/d205b276-7f2b-4831-b22b-01427cb53f36,"Når du vælger musikere, skal du lede musikalsk personale med fokus på holdets samlede synergier."
8,http://data.europa.eu/esco/skill/0005c151-5b5a-4a66-8aac-60e734beb1ab,http://data.europa.eu/esco/skill/1e3e7f8b-6416-43ed-af07-a1fb71c5e292,"Du vil lede musikalsk personale og udvikle musikidéer, som engagerer både musikere og publikum."
9,http://data.europa.eu/esco/skill/0005c151-5b5a-4a66-8aac-60e734beb1ab,http://data.europa.eu/esco/skill/52eb7ab6-269f-4f44-a21d-b0c705eaf857,Som musikdirektør vil du lede musikalsk personale og have en bred forståelse af forskellige musikgenrer.
