In [None]:
# install openai library
!pip install openai -qq

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m335.9/335.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# dependencies
import os
import re
import time
import numpy as np
import pandas as pd
import asyncio
import nest_asyncio
from openai import AsyncOpenAI
from google.colab import drive, userdata

# let asyncio calls nest inside the notebook's already-running event loop
nest_asyncio.apply()

# project folder on the mounted Google Drive
WORK_DIR = '/content/drive/MyDrive/Projects/skillextraction'
drive.mount('/content/drive')

# shortcut for building paths inside the work dir
def work_dir(*args):
    """Join the given path components onto WORK_DIR and return the resulting path."""
    parts = (WORK_DIR,) + args
    return os.path.join(*parts)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# async OpenAI client; the API key is looked up through Colab's userdata under the name 'OpenAI'
client = AsyncOpenAI(
    api_key=userdata.get('OpenAI'),
)

In [None]:
# single prompt -> api response
async def get_completion(prompt):
    """Send `prompt` as one user message to the 'gpt-4o-mini' chat model.

    Returns the response object of the chat-completions call unchanged.
    """
    user_message = {'role': 'user', 'content': prompt}
    return await client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )

# many prompts -> api responses
async def get_completions(prompts):
    """Request a completion for every prompt concurrently.

    Returns the responses as a list in the same order as `prompts`.
    """
    pending = map(get_completion, prompts)
    return await asyncio.gather(*pending)

In [None]:
# instruction placed in front of every batch of sentences
prompt = (
    'Please translate the following 10 sentences into Danish, '
    'and output them in plain text in the same order as they are given to you, '
    'separated by linebreak:'
)

In [None]:
# load the english sentence dataset from the project's Data folder
dataset_path = work_dir('Data', 'dataset.csv')
df = pd.read_csv(dataset_path)
print(df.shape)
df.head(3)

(138260, 2)


Unnamed: 0,sentence,skill
0,the ideal candidate for this position should b...,advise customers on sewing patterns
1,we need an employee who is able to assist our ...,advise customers on sewing patterns
2,if you possess good communication skills and h...,advise customers on sewing patterns


In [None]:
# generate one prompt per skill: the skill's sentences joined by newlines,
# with the instruction text and a blank line in front
prompts = (
    df.groupby('skill')['sentence']
    .agg('\n'.join)
    .reset_index()
    .assign(sentence=lambda grouped: prompt + '\n\n' + grouped['sentence'])
)

# check example
# (the shape has to be printed: only the last expression of a cell is displayed)
print(prompts.shape)
prompts.head(3)

Unnamed: 0,skill,sentence
0,procurement legislation,Please translate the following 10 sentences in...
1,3D body scanning technologies,Please translate the following 10 sentences in...
2,3D lighting,Please translate the following 10 sentences in...


In [None]:
# show the complete text of the first prompt
first_prompt = prompts['sentence'].iloc[0]
print(first_prompt)

Please translate the following 10 sentences into Danish, and output them in plain text in the same order as they are given to you, separated by linebreak:

a comprehensive understanding of procurement legislation is essential for this position.
familiarity and experience with procurement legislation is a must-have for applicants to this position.
applicants with experience working in fields related to procurement legislation are strongly encouraged to apply.
to be considered for this role, you must have a strong understanding of procurement legislation and the role it plays in public procurement.
the ideal candidate for this job will have a deep understanding of procurement legislation and its implications for public procurement.
the ideal candidate will have a proven track record of working with procurement legislation and policies.
knowledge of procurement legislation at the national and European levels is crucial to the success of this role.
minimum qualifications for this role incl

In [None]:
# separate in chunks of 100
# Plain positional slicing replaces np.array_split, which emits a deprecation
# warning when applied to a DataFrame in recent pandas. Each chunk is copied so
# that assigning a column to it later does not operate on a slice of `prompts`.
chunk_size = 100
chunks = [
    prompts.iloc[start:start + chunk_size].copy()
    # max(..., 1) keeps a single (empty) chunk for an empty frame, as np.array_split did
    for start in range(0, max(len(prompts), 1), chunk_size)
]

  return bound(*args, **kwds)


Unnamed: 0,skill,sentence
100,German,Please translate the following 10 sentences in...
101,Global Maritime Distress and Safety System,Please translate the following 10 sentences in...
102,Greek,Please translate the following 10 sentences in...
103,Groovy,Please translate the following 10 sentences in...
104,Grovo,Please translate the following 10 sentences in...
...,...,...
195,MarkLogic,Please translate the following 10 sentences in...
196,Metasploit,Please translate the following 10 sentences in...
197,Microsoft Access,Please translate the following 10 sentences in...
198,Microsoft Visio,Please translate the following 10 sentences in...


In [None]:
# get completions for each chunk
for i, chunk in enumerate(chunks):
    filename = os.path.join(WORK_DIR, 'Translated_data', 'translated_sentences_{}.csv'.format(i))
    if os.path.exists(filename):
        continue
    completions = await get_completions(chunk['sentence'].values)
    completions = [[t for s in c.choices[0].message.content.split('\n') if (t := re.sub(r'^[\W0-9]+', '', s).strip()) != ''] for c in completions]
    chunk['sentence'] = completions
    chunk.explode('sentence')[['skill', 'sentence']].to_csv(filename, index=False)
    time.sleep(50) # avoiding rate limits

In [None]:
# sanity check: read back the file for chunk index `i` (the value left over from the loop)
last_name = 'translated_sentences_{}.csv'.format(i)
df = pd.read_csv(os.path.join(WORK_DIR, 'Translated_data', last_name))
df

Unnamed: 0,conceptUri,completion
0,http://data.europa.eu/esco/skill/fe77f9ca-7bd2...,Arbejder du inden for bygge- og anlægsindustri...
1,http://data.europa.eu/esco/skill/fe77f9ca-7bd2...,En solid forståelse af de mest anvendte produk...
2,http://data.europa.eu/esco/skill/fe77f9ca-7bd2...,"Kendskab til de førende produkter, populære mæ..."
3,http://data.europa.eu/esco/skill/fe77f9ca-7bd2...,Eksperter inden for bygge- og anlægsindustrien...
4,http://data.europa.eu/esco/skill/fe77f9ca-7bd2...,En omfattende kendskab til de mest efterspurgt...
...,...,...
1330,http://data.europa.eu/esco/skill/fff74a70-4f82...,erfaring med montering af gelændere og fodbrædder
1331,http://data.europa.eu/esco/skill/fff74a70-4f82...,evne til at arbejde med præcision og sikkerhed
1332,http://data.europa.eu/esco/skill/fff74a70-4f82...,kendskab til at fastgøre gelændere med kobling...
1333,http://data.europa.eu/esco/skill/fff74a70-4f82...,professionel tilgang til at forebygge fald af ...
