### Jupyter notebook for feature engineering

author: Jan Jörg
date: 14.03.2024

In [14]:
import json
from db import get_database
import pandas as pd
import numpy as np

In [15]:
# Read the authentication data from the local JSON file.
# The file is parsed inside the `with` block; the individual fields are
# pulled out afterwards so the file handle is held only while reading.
with open('./infos.json') as f:
    infos = json.load(f)

# Credentials stored under the 'onet' key
onet = infos['onet']
onetUsername = onet['username']
onetPassword = onet['password']

# Credentials and connection string stored under the 'mongodb' key.
# NOTE: the doubled letters in `mongoUusername` / `mongoPpassword` are kept
# as-is because other cells may reference these exact names.
mongodb = infos['mongodb']
mongoUusername = mongodb['username']
mongoPpassword = mongodb['password']
mongoUrl = mongodb['connectionString']

In [16]:
# Open the database once, then materialise every collection as a DataFrame.
# Each collection is handled in one group: handle -> find() result -> DataFrame.
dbname = get_database()

# Joined occupation documents: the main working frame of this notebook
collection = dbname["joined"]
documents = collection.find()
df = pd.DataFrame(list(documents))

# Job zones
job_zone_col = dbname["job_zone"]
job_zone = job_zone_col.find()
job_zone_df = pd.DataFrame(list(job_zone))

# Abilities
abilities_col = dbname["abilities"]
abilities = abilities_col.find()
abilities_df = pd.DataFrame(list(abilities))

# Work context
work_context_col = dbname["work_context"]
work_context = work_context_col.find()
work_context_df = pd.DataFrame(list(work_context))

# Knowledge
knowledge_col = dbname["knowledge"]
knowledge = knowledge_col.find()
knowledge_df = pd.DataFrame(list(knowledge))

# Skills
skills_col = dbname["skills"]
skills = skills_col.find()
skills_df = pd.DataFrame(list(skills))

# Work styles
work_styles_col = dbname["work_styles"]
work_styles = work_styles_col.find()
work_styles_df = pd.DataFrame(list(work_styles))

# Detailed work activities
detailed_work_activities_col = dbname["detailed_work_activities"]
detailed_work_activities = detailed_work_activities_col.find()
detailed_work_activities_df = pd.DataFrame(list(detailed_work_activities))

# Education
education_col = dbname["education"]
education = education_col.find()
education_df = pd.DataFrame(list(education))

# Work activities
work_activities_col = dbname["work_activities"]
work_activities = work_activities_col.find()
work_activities_df = pd.DataFrame(list(work_activities))

# Work values
work_values_col = dbname["work_values"]
work_values = work_values_col.find()
work_values_df = pd.DataFrame(list(work_values))

In [17]:
# Report the number of documents before cleaning
print("Total documents: ", len(df))

# Clean in a single chain:
#   1. drop columns in which every value is null
#   2. drop rows in which every value is null
#   3. replace any remaining NaN values with 0
df = (
    df
    .dropna(how='all', axis=1)
    .dropna(how='all', axis=0)
    .fillna(0)
)

print("Total documents after dropping na: ", len(df))

Total documents:  846
Total documents after dropping na:  846


In [18]:
# Discard the columns that are not turned into features further down
df = df.drop(
    columns=[
        "additional_information",
        "tasks",
        "related_occupations",
        "display",
        "onetsoc19",
        "2010soc",
        "technology_skills",
        "tools_technology",
        "_id",
        "tools_used",
        "work_context",
        "interests",
        "work_values",
        "knowledge",
        "detailed_work_activities",
        "work_activities",
        "work_styles",
    ]
)
df.head(2)

Unnamed: 0,occupation,skills,abilities,job_zone,education,isco08,Name_de
0,"{'code': '27-2011.00', 'title': 'Actors', 'tag...","{'element': [{'id': '2.A.1.a', 'related': 'htt...","{'element': [{'id': '1.A.1.a.3', 'related': 'h...","{'value': 2, 'title': 'Job Zone Two: Some Prep...",{'level_required': {'category': [{'name': 'Les...,2655,Schauspieler
1,"{'code': '15-2011.00', 'title': 'Actuaries', '...","{'element': [{'id': '2.A.2.a', 'related': 'htt...","{'element': [{'id': '1.A.1.c.1', 'related': 'h...","{'value': 4, 'title': 'Job Zone Four: Consider...","{'level_required': {'category': [{'name': ""Bac...",2120,"Mathematiker, Aktuare und Statistiker"


In [19]:
# Expand the nested occupation dicts into one column per key.
# The expanded frame reuses df's index: assigning a Series back to df aligns
# on index labels, so a fresh 0..n-1 index would silently produce NaN for
# every row of df whose label falls outside that range (e.g. after rows
# have been dropped from df).
occupation = pd.DataFrame(list(df["occupation"]), index=df.index)

# List of columns to drop
cols_to_drop = ["code","updated","sample_of_reported_job_titles","summary_resources","details_resources","custom_resources", "tags"]

# Drop the columns
occupation = occupation.drop(cols_to_drop, axis=1)

# Keep only the first remaining column
# (presumably the occupation title, judging by the displayed output -- verify)
occupation = occupation.iloc[:, 0]
df["occupation"] = occupation

df["occupation"].head(3)

0               Actors
1            Actuaries
2    Acute Care Nurses
Name: occupation, dtype: object

In [20]:
# Create one zero-filled feature column ("s<skill_id>") per entry in skills_df
for skill_id in skills_df["skill_id"]:
    df["s" + str(skill_id)] = 0

# Write each occupation's skill scores into the matching "s<skill_id>" column
for occ_index, skill_payload in df["skills"].items():
    # Join the occupation's skill elements with skills_df on 'id' so that
    # every element carries the skill_id used in the column name
    elements = pd.DataFrame(skill_payload['element']).merge(
        skills_df, on='id', how='left'
    )

    for skill_id, score in zip(elements["skill_id"], elements["score"]):
        df.loc[occ_index, "s" + str(skill_id)] = score['value']

df.head(2)

Unnamed: 0,occupation,skills,abilities,job_zone,education,isco08,Name_de,s1,s2,s3,...,s26,s27,s28,s29,s30,s31,s32,s33,s34,s35
0,Actors,"{'element': [{'id': '2.A.1.a', 'related': 'htt...","{'element': [{'id': '1.A.1.a.3', 'related': 'h...","{'value': 2, 'title': 'Job Zone Two: Some Prep...",{'level_required': {'category': [{'name': 'Les...,2655,Schauspieler,72,72,69,...,0,0,0,0,0,0,0,0,0,0
1,Actuaries,"{'element': [{'id': '2.A.2.a', 'related': 'htt...","{'element': [{'id': '1.A.1.c.1', 'related': 'h...","{'value': 4, 'title': 'Job Zone Four: Consider...","{'level_required': {'category': [{'name': ""Bac...",2120,"Mathematiker, Aktuare und Statistiker",81,72,75,...,0,0,41,25,81,0,28,22,0,0


In [21]:
# Create one zero-filled feature column ("a<ability_id>") per entry in abilities_df
for ability_id in abilities_df["ability_id"]:
    df["a" + str(ability_id)] = 0

# Write each occupation's ability scores into the matching "a<ability_id>" column
for occ_index, ability_payload in df["abilities"].items():
    # Join the occupation's ability elements with abilities_df on 'id' so that
    # every element carries the ability_id used in the column name
    elements = pd.DataFrame(ability_payload['element']).merge(
        abilities_df, on='id', how='left'
    )

    for ability_id, score in zip(elements["ability_id"], elements["score"]):
        df.loc[occ_index, "a" + str(ability_id)] = score['value']

df.head(2)

Unnamed: 0,occupation,skills,abilities,job_zone,education,isco08,Name_de,s1,s2,s3,...,a43,a44,a45,a46,a47,a48,a49,a50,a51,a52
0,Actors,"{'element': [{'id': '2.A.1.a', 'related': 'htt...","{'element': [{'id': '1.A.1.a.3', 'related': 'h...","{'value': 2, 'title': 'Job Zone Two: Some Prep...",{'level_required': {'category': [{'name': 'Les...,2655,Schauspieler,72,72,69,...,0,0,0,0,0,0,0,0,0,0
1,Actuaries,"{'element': [{'id': '2.A.2.a', 'related': 'htt...","{'element': [{'id': '1.A.1.c.1', 'related': 'h...","{'value': 4, 'title': 'Job Zone Four: Consider...","{'level_required': {'category': [{'name': ""Bac...",2120,"Mathematiker, Aktuare und Statistiker",81,72,75,...,0,91,0,0,0,0,0,0,0,0


In [22]:
# Replace each occupation's nested education dict by a single value taken
# from education_df for the highest-scoring required education category.
new_education = []

for _, education_payload in df["education"].items():
    categories = pd.DataFrame(education_payload['level_required']['category'])

    # When scores are present, keep only the category with the highest score
    if 'score' in categories.columns:
        score_values = categories['score'].apply(lambda d: d['value'])
        categories = categories.loc[[score_values.idxmax()]]

    # Strip the descriptive columns, look the category name up in
    # education_df, then strip the identifier columns again so that only
    # the encoded value column(s) of education_df remain
    encoded = (
        categories
        .drop(["description", "score"], axis=1, errors='ignore')
        .merge(education_df, on='name', how='left')
        .drop(["id", "_id", "name"], axis=1, errors='ignore')
    )

    # First cell of what is left is the value stored for this occupation
    new_education.append(encoded.to_numpy().flatten()[0])

df["education"] = new_education

df.head(2)

Unnamed: 0,occupation,skills,abilities,job_zone,education,isco08,Name_de,s1,s2,s3,...,a43,a44,a45,a46,a47,a48,a49,a50,a51,a52
0,Actors,"{'element': [{'id': '2.A.1.a', 'related': 'htt...","{'element': [{'id': '1.A.1.a.3', 'related': 'h...","{'value': 2, 'title': 'Job Zone Two: Some Prep...",1,2655,Schauspieler,72,72,69,...,0,0,0,0,0,0,0,0,0,0
1,Actuaries,"{'element': [{'id': '2.A.2.a', 'related': 'htt...","{'element': [{'id': '1.A.1.c.1', 'related': 'h...","{'value': 4, 'title': 'Job Zone Four: Consider...",2,2120,"Mathematiker, Aktuare und Statistiker",81,72,75,...,0,91,0,0,0,0,0,0,0,0


In [23]:
# Replace each occupation's nested job zone dict by a single value looked up
# in job_zone_df via the job zone's "value" field.
new_job_zones = []

# Columns of job_zone_df that are not part of the encoded result
job_zone_cols_to_drop = [
    "id",
    "_id",
    "related_experience",
    "education",
    "title",
    "job_training",
    "job_zone_examples",
    "value",
]

for _, zone in df["job_zone"].items():
    # Rows of job_zone_df whose "value" equals this occupation's job zone value
    matching = job_zone_df.loc[job_zone_df["value"] == zone["value"]]
    remaining = matching.drop(job_zone_cols_to_drop, axis=1, errors='ignore')

    # First cell of what is left is the value stored for this occupation
    new_job_zones.append(remaining.to_numpy().flatten()[0])

df["job_zone"] = new_job_zones

df.head(2)

Unnamed: 0,occupation,skills,abilities,job_zone,education,isco08,Name_de,s1,s2,s3,...,a43,a44,a45,a46,a47,a48,a49,a50,a51,a52
0,Actors,"{'element': [{'id': '2.A.1.a', 'related': 'htt...","{'element': [{'id': '1.A.1.a.3', 'related': 'h...",1,1,2655,Schauspieler,72,72,69,...,0,0,0,0,0,0,0,0,0,0
1,Actuaries,"{'element': [{'id': '2.A.2.a', 'related': 'htt...","{'element': [{'id': '1.A.1.c.1', 'related': 'h...",2,2,2120,"Mathematiker, Aktuare und Statistiker",81,72,75,...,0,91,0,0,0,0,0,0,0,0


In [24]:
def convert_np_array_to_list(obj):
    """Return ``obj.tolist()`` for a NumPy array; return anything else unchanged.

    Parameters
    ----------
    obj : any
        Value to convert.

    Returns
    -------
    list or any
        A (possibly nested) Python list when ``obj`` is an ``np.ndarray``,
        otherwise the very same object that was passed in.
    """
    return obj.tolist() if isinstance(obj, np.ndarray) else obj

In [25]:
# The nested source columns have been expanded into features above,
# so they (and the occupation column) are dropped here
df = df.drop(columns=["skills", "abilities", "occupation"])
df.head(2)

Unnamed: 0,job_zone,education,isco08,Name_de,s1,s2,s3,s4,s5,s6,...,a43,a44,a45,a46,a47,a48,a49,a50,a51,a52
0,1,1,2655,Schauspieler,72,72,69,69,50,50,...,0,0,0,0,0,0,0,0,0,0
1,2,2,2120,"Mathematiker, Aktuare und Statistiker",81,72,75,50,81,53,...,0,91,0,0,0,0,0,0,0,0


In [26]:
dbname = get_database()

# Convert any NumPy array cell into a plain list before building the records.
# DataFrame.applymap is deprecated (pandas 2.1+) in favour of DataFrame.map and
# emits a FutureWarning there; use the new method when this pandas version has
# it and fall back to applymap otherwise. The check is done on the class, not
# on `df`, so that a column that happens to be named "map" cannot be mistaken
# for the method.
if hasattr(pd.DataFrame, "map"):
    insert_df = df.map(convert_np_array_to_list)
else:
    insert_df = df.applymap(convert_np_array_to_list)

collection_name = dbname["with_id"]

# Insert one document per DataFrame row into the MongoDB collection.
# NOTE: every execution of this cell inserts the rows again.
collection_name.insert_many(insert_df.to_dict("records"))

  insert_df = df.applymap(convert_np_array_to_list)


InsertManyResult([ObjectId('662f495f6ad0038ed71eb00e'), ObjectId('662f495f6ad0038ed71eb00f'), ObjectId('662f495f6ad0038ed71eb010'), ObjectId('662f495f6ad0038ed71eb011'), ObjectId('662f495f6ad0038ed71eb012'), ObjectId('662f495f6ad0038ed71eb013'), ObjectId('662f495f6ad0038ed71eb014'), ObjectId('662f495f6ad0038ed71eb015'), ObjectId('662f495f6ad0038ed71eb016'), ObjectId('662f495f6ad0038ed71eb017'), ObjectId('662f495f6ad0038ed71eb018'), ObjectId('662f495f6ad0038ed71eb019'), ObjectId('662f495f6ad0038ed71eb01a'), ObjectId('662f495f6ad0038ed71eb01b'), ObjectId('662f495f6ad0038ed71eb01c'), ObjectId('662f495f6ad0038ed71eb01d'), ObjectId('662f495f6ad0038ed71eb01e'), ObjectId('662f495f6ad0038ed71eb01f'), ObjectId('662f495f6ad0038ed71eb020'), ObjectId('662f495f6ad0038ed71eb021'), ObjectId('662f495f6ad0038ed71eb022'), ObjectId('662f495f6ad0038ed71eb023'), ObjectId('662f495f6ad0038ed71eb024'), ObjectId('662f495f6ad0038ed71eb025'), ObjectId('662f495f6ad0038ed71eb026'), ObjectId('662f495f6ad0038ed71eb0