In [None]:
# install openai library
!pip install openai -qq

In [None]:
# dependencies
import os
import re
import time
import numpy as np
import pandas as pd
import asyncio
import nest_asyncio
import httpx
from openai import AsyncOpenAI
from google.colab import drive, userdata

# async fix for notebook: the notebook kernel already runs an event loop, and
# nest_asyncio patches asyncio so that loop can be re-entered
nest_asyncio.apply()

# file management: mount Google Drive into the Colab filesystem and point
# WORK_DIR at the project folder that every path in this notebook is built from
drive.mount('/content/drive')
WORK_DIR = '/content/drive/MyDrive/Projects/skillextraction'

# shortcut for building paths below the project folder
def work_dir(*args):
    """Return the path obtained by joining WORK_DIR with the given components.

    Called without arguments it returns WORK_DIR itself.
    """
    parts = (WORK_DIR,) + args
    return os.path.join(*parts)

Mounted at /content/drive


In [None]:
# async OpenAI client shared by every request in this notebook
client = AsyncOpenAI(
    # the key is read from the Colab secret named 'OpenAI', never hard-coded
    api_key=userdata.get('OpenAI'),
    # an explicitly constructed httpx client is handed to the SDK
    http_client=httpx.AsyncClient(),
)

In [None]:
# one prompt -> one chat completion (raw api response)
async def get_completion(prompt):
    """Send `prompt` as a single user message to 'gpt-4o-mini'.

    Returns the response object from the chat completions endpoint unchanged;
    callers extract the text themselves.
    """
    user_message = {'role': 'user', 'content': prompt}
    return await client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )

# many prompts -> many chat completions (raw api responses)
async def get_completions(prompts):
    """Request a completion for every prompt concurrently.

    Returns the responses as a list in the same order as `prompts`. An
    exception raised by any single request propagates to the caller.
    """
    pending = map(get_completion, prompts)
    return await asyncio.gather(*pending)

In [None]:
# prompt template: instruction text placed in front of every batch of sentences.
# NOTE: the "20" in the text is hard-coded and has to match the batch size used
# when the sentences are split into groups; a shorter final batch would still
# be announced as 20 sentences.
prompt = 'Please translate the following 20 sentences which are separated by line break into Danish, and output them in plain text in the exact same order as they are given to you. Mirror the symbols and composition. Do not use line break in any sentence, but separate the sentences by line break instead.'

In [None]:
# load the benchmark sentences (JSON lines, one record per row)
bench_path = work_dir('Data', 'bench.json')
df = pd.read_json(bench_path, orient='records', lines=True)

# sentences are later sent one per line, so embedded line breaks are
# replaced by spaces to keep each sentence on a single line
df = df.assign(sentence=df['sentence'].str.replace('\n', ' '))

print(df.shape)
df.head(3)

(1060, 6)


Unnamed: 0,conceptUri,group,sentence,tokens_labse,tokens_mpnet,tokens_l12v2
0,[http://data.europa.eu/esco/skill/60c78287-22e...,1,* Ability to work in large collaborative teams...,"[101, 115, 317975, 14986, 16751, 14981, 21142,...","[0, 661, 62, 83259, 47, 4488, 23, 21334, 57119...","[0, 661, 62, 83259, 47, 4488, 23, 21334, 57119..."
1,[http://data.europa.eu/esco/skill/f7e2eb04-3e5...,1,* Advanced knowledge of application data and i...,"[101, 115, 39467, 30283, 14997, 22208, 15695, ...","[0, 661, 127596, 51359, 111, 38415, 2053, 136,...","[0, 661, 127596, 51359, 111, 38415, 2053, 136,..."
2,[http://data.europa.eu/esco/skill/19a8293b-8e9...,1,* Java Spring Boot NoSQL Message Cloud CI/CD E...,"[101, 115, 27861, 25930, 61397, 15445, 388404,...","[0, 661, 41925, 38026, 58800, 438, 158897, 482...","[0, 661, 41925, 38026, 58800, 438, 158897, 482..."


In [None]:
# split the sentences into consecutive batches of 20 (split points at 20, 40, ...)
all_sentences = df['sentence'].tolist()
split_points = np.arange(20, len(all_sentences), 20)
groups = np.array_split(all_sentences, split_points)

# one prompt per batch: the template, a blank line, then one sentence per line
prompts = ['\n\n'.join((prompt, '\n'.join(chunk))) for chunk in groups]

# sanity output: number of batches, size of each batch, and the first prompt
print(len(groups))
print([len(chunk) for chunk in groups])
print(prompts[0])

53
[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
Please translate the following 20 sentences which are separated by line break into Danish, and output them in plain text in the exact same order as they are given to you. Mirror the symbols and composition. Do not use line break in any sentence, but separate the sentences by line break instead.

* Ability to work in large collaborative teams to achieve organizational goals
* Advanced knowledge of application data and infrastructure architecture disciplines
* Java Spring Boot NoSQL Message Cloud CI/CD Experience
* Knowledge of industry-wide technology trends and best practices
* Passionate about building an innovative culture
* Understanding of architecture and design across all systems
* Understanding of software skills such as business analysis development maintenance and 

In [None]:
# no chunking, not that much data
completions = await get_completions(prompts)
completions = [[t for s in c.choices[0].message.content.split('\n') if (t := re.sub(r'^[\W0-9]+', '', s).strip()) != ''] for c in completions]

In [None]:
# flatten the per-group lists into one list, keeping the original order
translated = [line for group in completions for line in group]

# spot check: print an arbitrary source sentence next to its translation
sample_idx = 1040
print(df.iloc[sample_idx]['sentence'])
print(translated[sample_idx])

You will be proficient with IT systems, have excellent communication skills, and a key eye for detail., Similar Job Titles:
Du vil være dygtig med IT-systemer, have fremragende kommunikationsevner og et skarpt øje for detaljer.


In [None]:
# copy of the dataset with the 'sentence' column replaced by the translations;
# raises if the number of translations differs from the number of rows.
# NOTE(review): every other column (including tokens_*) is carried over
# unchanged - presumably those tokens were computed from the original
# sentences; verify whether they should be dropped or recomputed.
translated_df = df.copy()
translated_df['sentence'] = translated
translated_df

Unnamed: 0,conceptUri,group,sentence
0,[http://data.europa.eu/esco/skill/60c78287-22e...,1,Evne til at arbejde i store samarbejdsteams fo...
1,[http://data.europa.eu/esco/skill/f7e2eb04-3e5...,1,Avanceret viden om anvendelsesdata og infrastr...
2,[http://data.europa.eu/esco/skill/19a8293b-8e9...,1,Java Spring Boot NoSQL Message Cloud CI/CD erf...
3,[http://data.europa.eu/esco/skill/7a17d7ce-01a...,1,Viden om branchens teknologi trends og bedste ...
4,[http://data.europa.eu/esco/skill/c2a0c52c-0b4...,1,Passioneret om at opbygge en innovativ kultur
...,...,...,...
1048,[http://data.europa.eu/esco/skill/cb668e89-6ef...,5,Dit første fokus vil være at styre implementer...
1049,[http://data.europa.eu/esco/skill/7ff2c668-0e8...,5,Din rolle vil være at arbejde med både test- o...
1050,[http://data.europa.eu/esco/skill/699e7c26-650...,5,hjælp i deres udvikling.
1051,[http://data.europa.eu/esco/skill/21c5790c-093...,5,og mindst 2 års erfaring på mellemniveau i for...


In [None]:
# save as JSON lines, one record per row, next to the source file.
# orient='records' never writes the index, so no `index` argument is passed:
# pandas releases before 2.1 raise a ValueError when index=False is combined
# with this orient, while the resulting file is the same on newer releases.
translated_df.to_json(work_dir('Data', 'translated_bench.json'), orient='records', lines=True)