In [25]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
from langchain.globals import set_llm_cache
from langchain.cache import SQLiteCache

set_llm_cache(SQLiteCache(database_path="data/.langchain.db"))

In [27]:
import json
import pandas as pd
from langchain_openai import ChatOpenAI

GPT_MODEL = "gpt-4o-mini"

llm = ChatOpenAI(model=GPT_MODEL)




In [28]:
import glob
from tqdm import tqdm

tqdm.pandas()

In [29]:

IDs = []
F = glob.glob("/home/kelu/projets/mm_jobs/data/jobs/*.json")
print(len(F))
for file in F:
    with open(file, "r") as f:
        t = f.read()
    if not t == "Unauthorized":
        D = json.loads(t)
        for x in D["hits"]["hits"]:
            IDs.append(x["_source"])

df = pd.DataFrame(IDs)[
    [
        "jobRef",
        "pageVersionId",
        "contentPageId",
        "title",
        "pageText",
        "publishedDate",
        "sector",
        "discipline",
        "jobSector",
    ]
]

df["Digital"] = df.sector.apply(lambda x: "Digital" in x)


1726


In [30]:
df.head(2)

Unnamed: 0,jobRef,pageVersionId,contentPageId,title,pageText,publishedDate,sector,discipline,jobSector,Digital
0,67558BR,5371358,80554,Principal Power Systems Studies Engineer,Principal Power Systems Studies Engineer About...,2023-11-06T00:00:00,[Energy],"[Electrical, Energy]",Energy,False
1,5305,5545110,85005,Communications Specialist,Communications Specialist Communications Speci...,2024-10-10T09:59:09,[],[International development],International Development,False


# Simplifying the dataset

In [31]:
skillSet = df[["jobRef","contentPageId","title","pageText","publishedDate"]]

In [32]:
def getSkills(txt):
    prompt = """## Instructions\n\nFrom the below text to analyse, you have to extract 'digital skills', 'capabilities', or names of digital solutions, that are unitary skills in the digital realm, and answer with the style of a comma-separated list of strings between square brackets. Do not include non-software or non-digital skills.  \n\n## Examples of answer\n\n["Data Management","BIM", "ISO19650"]\n\n["Digital strategy","information management"]\n\n## Text to analyse\n\n"""
    try:
        skills = eval(llm.invoke(prompt+txt).content)
    except:
        print("error")
        skills = ['ERROR']
    return skills

In [37]:
skillSet["SKILLS_List"] = "None"
skillSet["SKILLS_List"] = skillSet["pageText"].progress_apply(lambda x: getSkills(str(x)))

  0%|          | 0/1726 [00:00<?, ?it/s]

100%|██████████| 1726/1726 [00:01<00:00, 1074.78it/s]


### Saving skills

In [38]:
skillSet

Unnamed: 0,jobRef,contentPageId,title,pageText,publishedDate,SKILLS_List
0,67558BR,80554,Principal Power Systems Studies Engineer,Principal Power Systems Studies Engineer About...,2023-11-06T00:00:00,"[Power System Analysis, Power System modelling..."
1,5305,85005,Communications Specialist,Communications Specialist Communications Speci...,2024-10-10T09:59:09,"[Adobe PhotoShop, InDesign, Canva]"
2,67436BR,80484,Principal Mechanical Engineer,"About the roleOur Power, Process &amp; Nuclear...",2023-10-24T23:00:00,[]
3,65546BR,79294,Senior geospatial analyst,Senior geospatial analyst North America United...,2023-11-20T00:00:00,"[GIS, Esri suite, Survey123, ArcGIS online, Ar..."
4,66573BR,79819,Senior project information specialist,"Here at Mott MacDonald, we are experts in engi...",2023-09-11T23:00:00,"[Document Management, PowerApps, Power Automat..."
...,...,...,...,...,...,...
1721,67851BR,80744,Commercial Manager,"Commercial Manager Equality, diversity and inc...",2023-11-15T00:00:00,"[Commercial management software, eSourcing, eE..."
1722,3059,84096,Project Planning Director,Project Planning Director Project Planning Dir...,2024-07-19T13:43:56,"[Digital Consulting, data, digital delivery ca..."
1723,3298,83848,Active Travel Lead,Active Travel Lead Active Travel Lead Active T...,2024-06-28T11:39:25,"[Digital delivery, digital technology, Busines..."
1724,2103,83020,Senior Water/Wastewater Engineer,Senior Water/Wastewater Engineer Senior Water/...,2024-04-26T17:36:47,"[AutoCAD, Civil 3D, Microsoft Office Suite, hy..."


In [39]:
skillSet.to_parquet("outputs/skills.parquet.gzip",compression="gzip")

# Aggregating skills

In [40]:
skillSet = pd.read_parquet("outputs/skills.parquet.gzip")

skillSet = skillSet[["jobRef","contentPageId","title","pageText","publishedDate","SKILLS_List"]]

SKILLS = []
for ix, row in skillSet.iterrows():
    for skill in row["SKILLS_List"]:
        SKILLS.append(skill.strip().title())

skillSet.head()

Unnamed: 0,jobRef,contentPageId,title,pageText,publishedDate,SKILLS_List
0,67558BR,80554,Principal Power Systems Studies Engineer,Principal Power Systems Studies Engineer About...,2023-11-06T00:00:00,"[Power System Analysis, Power System modelling..."
1,5305,85005,Communications Specialist,Communications Specialist Communications Speci...,2024-10-10T09:59:09,"[Adobe PhotoShop, InDesign, Canva]"
2,67436BR,80484,Principal Mechanical Engineer,"About the roleOur Power, Process &amp; Nuclear...",2023-10-24T23:00:00,[]
3,65546BR,79294,Senior geospatial analyst,Senior geospatial analyst North America United...,2023-11-20T00:00:00,"[GIS, Esri suite, Survey123, ArcGIS online, Ar..."
4,66573BR,79819,Senior project information specialist,"Here at Mott MacDonald, we are experts in engi...",2023-09-11T23:00:00,"[Document Management, PowerApps, Power Automat..."


# Creating skills database

In [41]:
from collections import Counter
from random import sample

skillsdb = dict(Counter(SKILLS))
print(len(skillsdb))

3580


In [37]:
fams = []
for k in range(10):
    SAMPLE = sample(list(skillsdb.keys()),100)
    prompt = """## Instructions\n\nFrom the below list to analyse, you have to extract families of 'digital skills', or 'capabilities', grouped by family name and answer as a json. Do not include non-software or non-digital skills. Don't use decorators like '```json' \n\n## Examples of answer\n\n{'Data':['Data Governance', 'Data Manegement'], "Advisory":["Strategic Advisory","Digital Strategy"]} \n\n## List to analyse\n\n"""
    SK = "* "+"\n* ".join(SAMPLE)
    try:
        families = json.loads(llm.invoke(prompt+SK).content)
        with open("outputs/families/"+str(k)+".json","w") as f:
            f.write(json.dumps(families))
        [fams.append(x) for x in families.keys()]
    except:
        print("Error with ",k)
print(fams)

### Assembling skills

In [11]:
families = ["Project Management","Software Development","Capacity building","VR", "Emerging technologies","Data Management","Information Management","Automation","Collaboration Tools","AI","Office Tools","Cloud Technologies", "3D modelling", "Engineering design", "Geospatial analysis","Technical design"]

In [12]:
prompt = "# Instructions\n\nYou are a world-wide digital capabilities expert. For each item of the capabilities list you are given, you need to find its 'capability' parent group, to which it is most relevant"
prompt += "\n\nYour answer is only a list of bullet point, which is the initial list you are given, then add a ':' separating the item from its parent capability group. "
prompt += "# List of the parent capability group\n\n* "+"\n* ".join(families)+"\n\n#Example of answer:\n\n* Data processing : Data Management\nOpenAI : AI\n...\n\n# List to review:"

In [13]:
allClassified = ""
SKILLS_set = list(set(SKILLS))
k = len(SKILLS_set)//100
print(k,"iterations")
for i in range(k):
    skill_sample = SKILLS_set[i*100:(i+1)*100]
    allClassified = allClassified +"\n" + str(llm.invoke(prompt+"* "+"\n* ".join(skill_sample)).content)

with open("outputs/presorted.md","w") as f:
    f.write(allClassified)

35 iterations


In [42]:
tuples = [x.strip("*").strip() for x in allClassified.split("\n") if len(x)]
matches = {}
for tuple in tuples:
    A = [x.strip() for x in tuple.split(":")]
    a = A[0]
    b = A[1]
    matches[a] = b
with open("outputs/presorted.json","w") as f:
    f.write(json.dumps(matches))

# Classifying skills

In [70]:
def extractFams(x):
    all = []
    for y in x:
        try:
            all.append(matches[y.strip().title()])
        except:
            pass
    all = list(set(all))
    return all

In [72]:
with open("outputs/presorted.json","r") as f:
    matches = json.loads ( f.read() )

skillSet = pd.read_parquet("outputs/skills.parquet.gzip")
skillSet = skillSet[["jobRef","contentPageId","title","pageText","publishedDate","SKILLS_List"]]
skillSet["SKILLS"] = skillSet.SKILLS_List.apply(lambda x: extractFams(x))
skillSet

Unnamed: 0,jobRef,contentPageId,title,pageText,publishedDate,SKILLS_List,SKILLS
0,67558BR,80554,Principal Power Systems Studies Engineer,Principal Power Systems Studies Engineer About...,2023-11-06T00:00:00,"[Power System Analysis, Power System modelling...","[Project Management, Software Development, Dat..."
1,5305,85005,Communications Specialist,Communications Specialist Communications Speci...,2024-10-10T09:59:09,"[Adobe PhotoShop, InDesign, Canva]","[Collaboration Tools, Software Development]"
2,67436BR,80484,Principal Mechanical Engineer,"About the roleOur Power, Process &amp; Nuclear...",2023-10-24T23:00:00,[],[]
3,65546BR,79294,Senior geospatial analyst,Senior geospatial analyst North America United...,2023-11-20T00:00:00,"[GIS, Esri suite, Survey123, ArcGIS online, Ar...","[Geospatial analysis, Data Management, Automat..."
4,66573BR,79819,Senior project information specialist,"Here at Mott MacDonald, we are experts in engi...",2023-09-11T23:00:00,"[Document Management, PowerApps, Power Automat...","[Project Management, Software Development, Inf..."
...,...,...,...,...,...,...,...
1721,67851BR,80744,Commercial Manager,"Commercial Manager Equality, diversity and inc...",2023-11-15T00:00:00,"[Commercial management software, eSourcing, eE...","[Collaboration Tools, Project Management, Data..."
1722,3059,84096,Project Planning Director,Project Planning Director Project Planning Dir...,2024-07-19T13:43:56,"[Digital Consulting, data, digital delivery ca...","[Collaboration Tools, Data Management, AI, Eme..."
1723,3298,83848,Active Travel Lead,Active Travel Lead Active Travel Lead Active T...,2024-06-28T11:39:25,"[Digital delivery, digital technology, Busines...","[Project Management, Software Development, Eme..."
1724,2103,83020,Senior Water/Wastewater Engineer,Senior Water/Wastewater Engineer Senior Water/...,2024-04-26T17:36:47,"[AutoCAD, Civil 3D, Microsoft Office Suite, hy...","[Technical design, Collaboration Tools, Inform..."
