### Jupyter notebook to clean up data from O*NET

author: Jan Jörg
date: 14.03.2024

In [193]:
import json
from db import get_database
import pandas as pd
import numpy as np

In [194]:
# Read the authentication data from the local JSON file.
# The file handle is only needed for parsing, so it is closed right away.
with open('./infos.json') as f:
    infos = json.load(f)

# Credentials stored under the 'onet' key
onet = infos['onet']
onetUsername = onet['username']
onetPassword = onet['password']

# Credentials and connection string stored under the 'mongodb' key.
# NOTE: the variable names (including their unusual spelling) are kept
# unchanged in case other cells refer to them.
mongodb = infos['mongodb']
mongoUusername = mongodb['username']
mongoPpassword = mongodb['password']
mongoUrl = mongodb['connectionString']

In [195]:
# Fetch every collection and materialise it as a DataFrame.
# Each group follows the same three steps:
#   collection handle -> cursor over all documents -> DataFrame
dbname = get_database()

# Joined occupation documents: this is the frame the following cells clean up
collection = dbname["joined"]
documents = collection.find()
df = pd.DataFrame(list(documents))

# Job zones
job_zone_col = dbname["job_zone"]
job_zone = job_zone_col.find()
job_zone_df = pd.DataFrame(list(job_zone))

# Abilities
abilities_col = dbname["abilities"]
abilities = abilities_col.find()
abilities_df = pd.DataFrame(list(abilities))

# Work context
work_context_col = dbname["work_context"]
work_context = work_context_col.find()
work_context_df = pd.DataFrame(list(work_context))

# Knowledge
knowledge_col = dbname["knowledge"]
knowledge = knowledge_col.find()
knowledge_df = pd.DataFrame(list(knowledge))

# Skills
skills_col = dbname["skills"]
skills = skills_col.find()
skills_df = pd.DataFrame(list(skills))

# Tasks
tasks_col = dbname["tasks"]
tasks = tasks_col.find()
tasks_df = pd.DataFrame(list(tasks))

# Work styles
work_styles_col = dbname["work_styles"]
work_styles = work_styles_col.find()
work_styles_df = pd.DataFrame(list(work_styles))

# Detailed work activities
detailed_work_activities_col = dbname["detailed_work_activities"]
detailed_work_activities = detailed_work_activities_col.find()
detailed_work_activities_df = pd.DataFrame(list(detailed_work_activities))

# Education
education_col = dbname["education"]
education = education_col.find()
education_df = pd.DataFrame(list(education))

# Work activities
work_activities_col = dbname["work_activities"]
work_activities = work_activities_col.find()
work_activities_df = pd.DataFrame(list(work_activities))

# Work values
work_values_col = dbname["work_values"]
work_values = work_values_col.find()
work_values_df = pd.DataFrame(list(work_values))

In [196]:
# Remove empty columns and rows
print("Total documents: ", len(df))

# 1) drop columns in which every value is null
# 2) drop rows in which every value is null
# 3) replace any remaining NaN values with 0
df = (
    df
    .dropna(how='all', axis=1)
    .dropna(how='all', axis=0)
    .fillna(0)
)

print("Total documents after dropping na: ", len(df))

Total documents:  846
Total documents after dropping na:  846


In [197]:
# Remove columns that are not useful for the cleaned data set
unused_columns = [
    'additional_information',
    "related_occupations",
    "display",
    "onetsoc19",
    "2010soc",
    "technology_skills",
    "tools_technology",
    "tools_used",
]
df = df.drop(columns=unused_columns)
df.head(2)

Unnamed: 0,_id,occupation,tasks,knowledge,skills,abilities,work_activities,detailed_work_activities,work_context,job_zone,interests,work_styles,work_values,education,isco08,Name_de
0,661d1a30d05f1cef398e0e97,"{'code': '27-2011.00', 'title': 'Actors', 'tag...","{'task': [{'id': 7646, 'green': False, 'relate...","{'element': [{'id': '2.C.7.c', 'related': 'htt...","{'element': [{'id': '2.A.1.a', 'related': 'htt...","{'element': [{'id': '1.A.1.a.3', 'related': 'h...","{'element': [{'id': '4.A.4.a.4', 'related': 'h...","{'activity': [{'id': '4.A.4.a.8.I01.D04', 'rel...","{'element': [{'id': '4.C.1.b.1.e', 'related': ...","{'value': 2, 'title': 'Job Zone Two: Some Prep...","{'high_point_code': 'ASE', 'element': [{'id': ...","{'element': [{'id': '1.C.3.a', 'related': 'htt...","{'element': [{'id': '1.B.2.d', 'related': 'htt...",{'level_required': {'category': [{'name': 'Les...,2655,Schauspieler
1,661d1a30d05f1cef398e0e98,"{'code': '15-2011.00', 'title': 'Actuaries', '...","{'task': [{'id': 3500, 'green': False, 'relate...","{'element': [{'id': '2.C.4.a', 'related': 'htt...","{'element': [{'id': '2.A.2.a', 'related': 'htt...","{'element': [{'id': '1.A.1.c.1', 'related': 'h...","{'element': [{'id': '4.A.2.a.4', 'related': 'h...","{'activity': [{'id': '4.A.4.b.4.I09.D06', 'rel...","{'element': [{'id': '4.C.1.a.2.h', 'related': ...","{'value': 4, 'title': 'Job Zone Four: Consider...","{'high_point_code': 'CIE', 'element': [{'id': ...","{'element': [{'id': '1.C.7.b', 'related': 'htt...","{'element': [{'id': '1.B.2.b', 'related': 'htt...","{'level_required': {'category': [{'name': ""Bac...",2120,"Mathematiker, Aktuare und Statistiker"


In [198]:
# Replace each nested occupation record by its title.
#
# The title is read directly from every record instead of expanding the
# records into a separate DataFrame and picking the first column that is
# left after dropping a fixed list of keys. This
#   * keeps the index of `df`, so the values cannot be misaligned when
#     rows were removed earlier (assigning a Series aligns on the index),
#   * does not depend on the key order inside the records or on every
#     listed key being present,
#   * lets the cell be re-run: values that are no longer dicts pass through.
def _occupation_title(entry):
    """Return the 'title' of an occupation record; return non-dict values unchanged."""
    if isinstance(entry, dict):
        # .get() yields None when a record has no title instead of raising
        return entry.get("title")
    return entry


occupation = df["occupation"].map(_occupation_title)
df["occupation"] = occupation

df["occupation"].head(3)

0               Actors
1            Actuaries
2    Acute Care Nurses
Name: occupation, dtype: object

In [199]:
def _task_ids(task_record):
    """Turn one occupation's raw task record into a flat array of looked-up values.

    The entries under 'task' are joined with `tasks_df` on 'id'; after the
    join key and the descriptive columns are dropped, whatever columns remain
    are flattened into a 1-D array (per the original note: the new task id).
    """
    frame = pd.DataFrame(task_record['task'])
    frame = frame.drop(columns=["green", "related", "name"])

    # Left join keeps every task of the occupation, matched or not
    frame = frame.merge(tasks_df, on='id', how='left')

    frame = frame.drop(columns=["_id", "name", "id"])
    return frame.to_numpy().flatten()


new_tasks = [_task_ids(task_record) for task_record in df["tasks"]]

df["tasks"] = new_tasks

df.head(2)

Unnamed: 0,_id,occupation,tasks,knowledge,skills,abilities,work_activities,detailed_work_activities,work_context,job_zone,interests,work_styles,work_values,education,isco08,Name_de
0,661d1a30d05f1cef398e0e97,Actors,"[1, 2, 3, 4, 5]","{'element': [{'id': '2.C.7.c', 'related': 'htt...","{'element': [{'id': '2.A.1.a', 'related': 'htt...","{'element': [{'id': '1.A.1.a.3', 'related': 'h...","{'element': [{'id': '4.A.4.a.4', 'related': 'h...","{'activity': [{'id': '4.A.4.a.8.I01.D04', 'rel...","{'element': [{'id': '4.C.1.b.1.e', 'related': ...","{'value': 2, 'title': 'Job Zone Two: Some Prep...","{'high_point_code': 'ASE', 'element': [{'id': ...","{'element': [{'id': '1.C.3.a', 'related': 'htt...","{'element': [{'id': '1.B.2.d', 'related': 'htt...",{'level_required': {'category': [{'name': 'Les...,2655,Schauspieler
1,661d1a30d05f1cef398e0e98,Actuaries,"[6, 7, 8, 9, 10]","{'element': [{'id': '2.C.4.a', 'related': 'htt...","{'element': [{'id': '2.A.2.a', 'related': 'htt...","{'element': [{'id': '1.A.1.c.1', 'related': 'h...","{'element': [{'id': '4.A.2.a.4', 'related': 'h...","{'activity': [{'id': '4.A.4.b.4.I09.D06', 'rel...","{'element': [{'id': '4.C.1.a.2.h', 'related': ...","{'value': 4, 'title': 'Job Zone Four: Consider...","{'high_point_code': 'CIE', 'element': [{'id': ...","{'element': [{'id': '1.C.7.b', 'related': 'htt...","{'element': [{'id': '1.B.2.b', 'related': 'htt...","{'level_required': {'category': [{'name': ""Bac...",2120,"Mathematiker, Aktuare und Statistiker"


In [200]:
def _skill_ids(skill_record):
    """Turn one occupation's raw skill record into a flat array of looked-up values.

    The entries under 'element' are joined with `skills_df` on 'id' (only when
    both frames have that column); descriptive columns and the join key are
    dropped and the remaining values are flattened into a 1-D array.
    """
    frame = pd.DataFrame(skill_record['element'])
    frame = frame.drop(columns=["name", "description", "related"], errors='ignore')

    # Only join when the key is present on both sides
    joinable = 'id' in frame.columns and 'id' in skills_df.columns
    if joinable:
        frame = frame.merge(skills_df, on='id', how='left')

    frame = frame.drop(
        columns=["id", "_id", "related", "name", "description"],
        errors='ignore',
    )
    return frame.to_numpy().flatten()


new_skills = [_skill_ids(skill_record) for skill_record in df["skills"]]

df["skills"] = new_skills

df.head(2)

Unnamed: 0,_id,occupation,tasks,knowledge,skills,abilities,work_activities,detailed_work_activities,work_context,job_zone,interests,work_styles,work_values,education,isco08,Name_de
0,661d1a30d05f1cef398e0e97,Actors,"[1, 2, 3, 4, 5]","{'element': [{'id': '2.C.7.c', 'related': 'htt...","[1, 2, 3, 4, 5]","{'element': [{'id': '1.A.1.a.3', 'related': 'h...","{'element': [{'id': '4.A.4.a.4', 'related': 'h...","{'activity': [{'id': '4.A.4.a.8.I01.D04', 'rel...","{'element': [{'id': '4.C.1.b.1.e', 'related': ...","{'value': 2, 'title': 'Job Zone Two: Some Prep...","{'high_point_code': 'ASE', 'element': [{'id': ...","{'element': [{'id': '1.C.3.a', 'related': 'htt...","{'element': [{'id': '1.B.2.d', 'related': 'htt...",{'level_required': {'category': [{'name': 'Les...,2655,Schauspieler
1,661d1a30d05f1cef398e0e98,Actuaries,"[6, 7, 8, 9, 10]","{'element': [{'id': '2.C.4.a', 'related': 'htt...","[5, 6, 7, 1, 3]","{'element': [{'id': '1.A.1.c.1', 'related': 'h...","{'element': [{'id': '4.A.2.a.4', 'related': 'h...","{'activity': [{'id': '4.A.4.b.4.I09.D06', 'rel...","{'element': [{'id': '4.C.1.a.2.h', 'related': ...","{'value': 4, 'title': 'Job Zone Four: Consider...","{'high_point_code': 'CIE', 'element': [{'id': ...","{'element': [{'id': '1.C.7.b', 'related': 'htt...","{'element': [{'id': '1.B.2.b', 'related': 'htt...","{'level_required': {'category': [{'name': ""Bac...",2120,"Mathematiker, Aktuare und Statistiker"


In [201]:
def _ability_ids(ability_record):
    """Turn one occupation's raw ability record into a flat array of looked-up values.

    The entries under 'element' are joined with `abilities_df` on 'id' (only
    when both frames have that column); descriptive columns and the join key
    are dropped and the remaining values are flattened into a 1-D array.
    """
    descriptive = ["name", "description", "related"]
    elements = pd.DataFrame(ability_record['element']).drop(
        columns=descriptive, errors='ignore'
    )

    # Only join when the key is present on both sides
    if 'id' in elements.columns and 'id' in abilities_df.columns:
        elements = elements.merge(abilities_df, on='id', how='left')

    leftovers = ["id", "_id", "related", "name", "description"]
    elements = elements.drop(columns=leftovers, errors='ignore')
    return elements.to_numpy().flatten()


new_abilities = [_ability_ids(ability_record) for ability_record in df["abilities"]]

df["abilities"] = new_abilities

df.head(2)

Unnamed: 0,_id,occupation,tasks,knowledge,skills,abilities,work_activities,detailed_work_activities,work_context,job_zone,interests,work_styles,work_values,education,isco08,Name_de
0,661d1a30d05f1cef398e0e97,Actors,"[1, 2, 3, 4, 5]","{'element': [{'id': '2.C.7.c', 'related': 'htt...","[1, 2, 3, 4, 5]","[1, 2, 3, 4, 5]","{'element': [{'id': '4.A.4.a.4', 'related': 'h...","{'activity': [{'id': '4.A.4.a.8.I01.D04', 'rel...","{'element': [{'id': '4.C.1.b.1.e', 'related': ...","{'value': 2, 'title': 'Job Zone Two: Some Prep...","{'high_point_code': 'ASE', 'element': [{'id': ...","{'element': [{'id': '1.C.3.a', 'related': 'htt...","{'element': [{'id': '1.B.2.d', 'related': 'htt...",{'level_required': {'category': [{'name': 'Les...,2655,Schauspieler
1,661d1a30d05f1cef398e0e98,Actuaries,"[6, 7, 8, 9, 10]","{'element': [{'id': '2.C.4.a', 'related': 'htt...","[5, 6, 7, 1, 3]","[6, 7, 8, 9, 10]","{'element': [{'id': '4.A.2.a.4', 'related': 'h...","{'activity': [{'id': '4.A.4.b.4.I09.D06', 'rel...","{'element': [{'id': '4.C.1.a.2.h', 'related': ...","{'value': 4, 'title': 'Job Zone Four: Consider...","{'high_point_code': 'CIE', 'element': [{'id': ...","{'element': [{'id': '1.C.7.b', 'related': 'htt...","{'element': [{'id': '1.B.2.b', 'related': 'htt...","{'level_required': {'category': [{'name': ""Bac...",2120,"Mathematiker, Aktuare und Statistiker"


In [202]:
def _education_level(education_record):
    """Reduce one occupation's education record to a single looked-up value.

    When the categories carry a 'score', only the category with the highest
    score value is kept. The category name is then joined with `education_df`
    and the first remaining value is returned.
    """
    categories = pd.DataFrame(education_record['level_required']['category'])

    if 'score' in categories.columns:
        # Each score is a dict; compare categories by its 'value' entry
        score_values = categories['score'].apply(lambda d: d['value'])
        best_label = score_values.idxmax()
        # Double brackets keep a one-row DataFrame instead of a Series
        categories = categories.loc[[best_label]]

    categories = categories.drop(columns=["description", "score"], errors='ignore')

    # Look up the category name in education_df
    categories = categories.merge(education_df, on='name', how='left')
    categories = categories.drop(columns=["id", "_id", "name"], errors='ignore')

    return categories.to_numpy().flatten()[0]


new_education = [_education_level(education_record) for education_record in df["education"]]

df["education"] = new_education

df.head(2)

Unnamed: 0,_id,occupation,tasks,knowledge,skills,abilities,work_activities,detailed_work_activities,work_context,job_zone,interests,work_styles,work_values,education,isco08,Name_de
0,661d1a30d05f1cef398e0e97,Actors,"[1, 2, 3, 4, 5]","{'element': [{'id': '2.C.7.c', 'related': 'htt...","[1, 2, 3, 4, 5]","[1, 2, 3, 4, 5]","{'element': [{'id': '4.A.4.a.4', 'related': 'h...","{'activity': [{'id': '4.A.4.a.8.I01.D04', 'rel...","{'element': [{'id': '4.C.1.b.1.e', 'related': ...","{'value': 2, 'title': 'Job Zone Two: Some Prep...","{'high_point_code': 'ASE', 'element': [{'id': ...","{'element': [{'id': '1.C.3.a', 'related': 'htt...","{'element': [{'id': '1.B.2.d', 'related': 'htt...",[1],2655,Schauspieler
1,661d1a30d05f1cef398e0e98,Actuaries,"[6, 7, 8, 9, 10]","{'element': [{'id': '2.C.4.a', 'related': 'htt...","[5, 6, 7, 1, 3]","[6, 7, 8, 9, 10]","{'element': [{'id': '4.A.2.a.4', 'related': 'h...","{'activity': [{'id': '4.A.4.b.4.I09.D06', 'rel...","{'element': [{'id': '4.C.1.a.2.h', 'related': ...","{'value': 4, 'title': 'Job Zone Four: Consider...","{'high_point_code': 'CIE', 'element': [{'id': ...","{'element': [{'id': '1.C.7.b', 'related': 'htt...","{'element': [{'id': '1.B.2.b', 'related': 'htt...",[2],2120,"Mathematiker, Aktuare und Statistiker"


In [203]:
def _job_zone_lookup(zone_record):
    """Return the first remaining value of the job_zone_df row matching the record's 'value'."""
    matches = job_zone_df.loc[job_zone_df["value"] == zone_record["value"]]

    # Strip the descriptive columns and the lookup key itself
    descriptive = [
        "id", "_id", "related_experience", "education",
        "title", "job_training", "job_zone_examples", "value",
    ]
    matches = matches.drop(columns=descriptive, errors='ignore')

    return matches.to_numpy().flatten()[0]


new_job_zones = [_job_zone_lookup(zone_record) for zone_record in df["job_zone"]]

df["job_zone"] = new_job_zones

df.head(2)

Unnamed: 0,_id,occupation,tasks,knowledge,skills,abilities,work_activities,detailed_work_activities,work_context,job_zone,interests,work_styles,work_values,education,isco08,Name_de
0,661d1a30d05f1cef398e0e97,Actors,"[1, 2, 3, 4, 5]","{'element': [{'id': '2.C.7.c', 'related': 'htt...","[1, 2, 3, 4, 5]","[1, 2, 3, 4, 5]","{'element': [{'id': '4.A.4.a.4', 'related': 'h...","{'activity': [{'id': '4.A.4.a.8.I01.D04', 'rel...","{'element': [{'id': '4.C.1.b.1.e', 'related': ...",[1],"{'high_point_code': 'ASE', 'element': [{'id': ...","{'element': [{'id': '1.C.3.a', 'related': 'htt...","{'element': [{'id': '1.B.2.d', 'related': 'htt...",[1],2655,Schauspieler
1,661d1a30d05f1cef398e0e98,Actuaries,"[6, 7, 8, 9, 10]","{'element': [{'id': '2.C.4.a', 'related': 'htt...","[5, 6, 7, 1, 3]","[6, 7, 8, 9, 10]","{'element': [{'id': '4.A.2.a.4', 'related': 'h...","{'activity': [{'id': '4.A.4.b.4.I09.D06', 'rel...","{'element': [{'id': '4.C.1.a.2.h', 'related': ...",[2],"{'high_point_code': 'CIE', 'element': [{'id': ...","{'element': [{'id': '1.C.7.b', 'related': 'htt...","{'element': [{'id': '1.B.2.b', 'related': 'htt...",[2],2120,"Mathematiker, Aktuare und Statistiker"


In [204]:
def convert_np_array_to_list(obj):
    """Return ``obj.tolist()`` when ``obj`` is a NumPy array, otherwise ``obj`` unchanged."""
    return obj.tolist() if isinstance(obj, np.ndarray) else obj

In [205]:
dbname = get_database()

# Convert every NumPy array cell to a plain list before building the records.
# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1),
# so use `map` when it exists and fall back to `applymap` on older versions.
elementwise = df.map if hasattr(df, "map") else df.applymap
insert_df = elementwise(convert_np_array_to_list)

collection_name = dbname["with_id"]

# Insert the dictionaries into the MongoDB collection
# NOTE(review): the records still carry the `_id` column loaded from the
# "joined" collection, so re-running this cell against an already populated
# "with_id" collection will presumably fail on duplicate keys; confirm.
collection_name.insert_many(insert_df.to_dict("records"))

  insert_df = df.applymap(convert_np_array_to_list)


InsertManyResult([ObjectId('661d1a30d05f1cef398e0e97'), ObjectId('661d1a30d05f1cef398e0e98'), ObjectId('661d1a30d05f1cef398e0e99'), ObjectId('661d1a30d05f1cef398e0e9a'), ObjectId('661d1a30d05f1cef398e0e9b'), ObjectId('661d1a30d05f1cef398e0e9c'), ObjectId('661d1a30d05f1cef398e0e9d'), ObjectId('661d1a30d05f1cef398e0e9e'), ObjectId('661d1a30d05f1cef398e0e9f'), ObjectId('661d1a30d05f1cef398e0ea0'), ObjectId('661d1a30d05f1cef398e0ea1'), ObjectId('661d1a30d05f1cef398e0ea2'), ObjectId('661d1a30d05f1cef398e0ea3'), ObjectId('661d1a30d05f1cef398e0ea4'), ObjectId('661d1a30d05f1cef398e0ea5'), ObjectId('661d1a30d05f1cef398e0ea6'), ObjectId('661d1a30d05f1cef398e0ea7'), ObjectId('661d1a30d05f1cef398e0ea8'), ObjectId('661d1a30d05f1cef398e0ea9'), ObjectId('661d1a30d05f1cef398e0eaa'), ObjectId('661d1a30d05f1cef398e0eab'), ObjectId('661d1a30d05f1cef398e0eac'), ObjectId('661d1a30d05f1cef398e0ead'), ObjectId('661d1a30d05f1cef398e0eae'), ObjectId('661d1a30d05f1cef398e0eaf'), ObjectId('661d1a30d05f1cef398e0e