### Python notebook to load data from onet

author: Jan Jörg
date: 13.03.2024

In [8]:
# imports
import requests
from requests.auth import HTTPBasicAuth
import json
from pymongo import MongoClient

In [9]:
# import authentication data

with open('./infos.json') as f:
    infos = json.load(f)
    onet = infos['onet']
    onetUsername = onet['username']
    onetPassword = onet['password']
    mongodb = infos['mongodb']
    mongoUusername = mongodb['username']
    mongoPpassword = mongodb['password']
    mongoUrl = mongodb['connectionString']


In [27]:
# get all available jobs from onet
occupationsUrl = "https://services.onetcenter.org/ws/online/occupations/"
headers = {'Accept': 'application/json'}
initialRequest = requests.get(occupationsUrl, auth=HTTPBasicAuth(onetUsername, onetPassword), headers=headers)

if initialRequest.status_code == 200:
    data = initialRequest.json()
    # the initial request only returns 20 jobs, so we need to make another request to get all the jobs. 
    # the initial request includes the total of available jobs
    total = data['total']
else:
    print(f"Request failed with status code {initialRequest.status_code}")

getAllOccupations = requests.get(occupationsUrl, auth=HTTPBasicAuth(onetUsername, onetPassword), headers=headers, params={'start': 1, 'end': total})

if getAllOccupations.status_code == 200:
    data = getAllOccupations.json()
    occupations = data['occupation']
else:
    print(f"Request failed with status code {getAllOccupations.status_code}")

[{'href': 'https://services.onetcenter.org/ws/online/occupations/13-2011.00/', 'code': '13-2011.00', 'title': 'Accountants and Auditors', 'tags': {'bright_outlook': True, 'green': False}}, {'href': 'https://services.onetcenter.org/ws/online/occupations/27-2011.00/', 'code': '27-2011.00', 'title': 'Actors', 'tags': {'bright_outlook': False, 'green': False}}, {'href': 'https://services.onetcenter.org/ws/online/occupations/15-2011.00/', 'code': '15-2011.00', 'title': 'Actuaries', 'tags': {'bright_outlook': True, 'green': False}}, {'href': 'https://services.onetcenter.org/ws/online/occupations/29-1291.00/', 'code': '29-1291.00', 'title': 'Acupuncturists', 'tags': {'bright_outlook': True, 'green': False}}, {'href': 'https://services.onetcenter.org/ws/online/occupations/29-1141.01/', 'code': '29-1141.01', 'title': 'Acute Care Nurses', 'tags': {'bright_outlook': True, 'green': False}}, {'href': 'https://services.onetcenter.org/ws/online/occupations/25-2059.01/', 'code': '25-2059.01', 'title':

In [28]:
# Fetch summary for each occupation
summaries=[]

for occupation in occupations:
    summaryUrl = f"https://services.onetcenter.org/ws/online/occupations/{occupation['code']}/summary/"
    summaryRequest = requests.get(summaryUrl, auth=HTTPBasicAuth(onetUsername, onetPassword), headers=headers)
   
    if summaryRequest.status_code == 200:
        occupation_data = summaryRequest.json()

        summaries.append((occupation, occupation_data))
    else:
        print(f"Request for occupation {occupation['code']} failed with status code {summaryRequest.status_code}")

In [37]:

def get_database():
 
   # Provide the mongodb atlas url to connect python to mongodb using pymongo
   CONNECTION_STRING = mongoUrl
 
   # Create a connection using MongoClient
   client = MongoClient(CONNECTION_STRING)
 
   # Create the database 
   return client['bachelorarbeit']
  
# This is added so that many files can reuse the function get_database()
if __name__ == "__main__":   
  
   # Get the database

   dbname = get_database()

In [38]:
collection_name = dbname["onet"]

# Convert summaries to a list of dictionaries
summaries_dict = [{'code': code, 'summary': summary} for code, summary in summaries]

print(len(summaries_dict))

# Insert the dictionaries into the MongoDB collection
collection_name.insert_many(summaries_dict)

1016


InsertManyResult([ObjectId('65f1eafb105b54d267258a79'), ObjectId('65f1eafb105b54d267258a7a'), ObjectId('65f1eafb105b54d267258a7b'), ObjectId('65f1eafb105b54d267258a7c'), ObjectId('65f1eafb105b54d267258a7d'), ObjectId('65f1eafb105b54d267258a7e'), ObjectId('65f1eafb105b54d267258a7f'), ObjectId('65f1eafb105b54d267258a80'), ObjectId('65f1eafb105b54d267258a81'), ObjectId('65f1eafb105b54d267258a82'), ObjectId('65f1eafb105b54d267258a83'), ObjectId('65f1eafb105b54d267258a84'), ObjectId('65f1eafb105b54d267258a85'), ObjectId('65f1eafb105b54d267258a86'), ObjectId('65f1eafb105b54d267258a87'), ObjectId('65f1eafb105b54d267258a88'), ObjectId('65f1eafb105b54d267258a89'), ObjectId('65f1eafb105b54d267258a8a'), ObjectId('65f1eafb105b54d267258a8b'), ObjectId('65f1eafb105b54d267258a8c'), ObjectId('65f1eafb105b54d267258a8d'), ObjectId('65f1eafb105b54d267258a8e'), ObjectId('65f1eafb105b54d267258a8f'), ObjectId('65f1eafb105b54d267258a90'), ObjectId('65f1eafb105b54d267258a91'), ObjectId('65f1eafb105b54d267258a