# Feature Engineering

In [70]:
import json
from db import get_database
import pandas as pd
import numpy as np

In [71]:
# Load the O*NET and MongoDB credentials from the local JSON settings file.
# The file handle is only needed for parsing, so it is closed before the
# individual values are unpacked.
with open('./infos.json') as f:
    infos = json.load(f)

# O*NET web-service credentials
onet = infos['onet']
onetUsername, onetPassword = onet['username'], onet['password']

# MongoDB credentials and connection string
# NOTE(review): "mongoUusername" / "mongoPpassword" look like typos; the names
# are kept unchanged in case other cells or modules reference them.
mongodb = infos['mongodb']
mongoUusername, mongoPpassword = mongodb['username'], mongodb['password']
mongoUrl = mongodb['connectionString']

In [72]:
# Despite its name, `dbname` holds the object returned by get_database(),
# which is indexed by collection name below.
dbname = get_database()

# Handles for the two collections used in this notebook
collection, fo_col = dbname["joined"], dbname["fo_joined"]

# Query every document of each collection
documents, fo = collection.find(), fo_col.find()

# Materialise the query results as DataFrames
df, fo_df = pd.DataFrame(list(documents)), pd.DataFrame(list(fo))

# Lookup tables read from local CSV files
skills_df = pd.read_csv('files/skills.csv')
abilities_df = pd.read_csv('files/abilities.csv')

In [73]:
# Report the number of documents before cleaning
print("Total documents: ", len(df))

# Cleaning, in order:
#   1. drop columns in which *every* value is null
#   2. drop rows in which *every* value is null
#   3. replace any remaining NaN values with 0
df = (
    df.dropna(axis=1, how='all')
      .dropna(axis=0, how='all')
      .fillna(0)
)

# Report the number of documents after cleaning
print("Total documents after dropping na: ", len(df))

Total documents:  846
Total documents after dropping na:  846


In [74]:
# Remove columns that are not useful
df = df.drop(
    columns=[
        'additional_information',
        "tasks",
        "related_occupations",
        "display",
        "technology_skills",
        "tools_technology",
        "_id",
        "tools_used",
        "work_context",
        "interests",
        "work_values",
        "knowledge",
        "detailed_work_activities",
        "work_activities",
        "work_styles",
    ]
)

# Add the field "Berufshauptgruppe": the first character of the isco08 code
df["Berufshauptgruppe"] = df["isco08"].str.get(0)

df.head(2)

Unnamed: 0,occupation,skills,abilities,job_zone,education,isco08,Name_de,Berufshauptgruppe
0,"{'code': '27-2011.00', 'title': 'Actors', 'tag...","{'element': [{'id': '2.A.1.a', 'related': 'htt...","{'element': [{'id': '1.A.1.a.3', 'related': 'h...","{'value': 2, 'title': 'Job Zone Two: Some Prep...",{'level_required': {'category': [{'name': 'Les...,2655,Schauspieler,2
1,"{'code': '23-1021.00', 'title': 'Administrativ...","{'element': [{'id': '2.A.1.b', 'related': 'htt...","{'element': [{'id': '1.A.1.b.5', 'related': 'h...","{'value': 5, 'title': 'Job Zone Five: Extensiv...",{'level_required': {'category': [{'name': 'Doc...,2612,Richter,2


In [75]:
# Expand the nested occupation dicts into one column per key.
# The expanded frame reuses df's index: assigning a Series back into df aligns
# on index labels, so a fresh 0..n-1 index would silently misalign the titles
# (or produce NaN) whenever earlier cleaning left gaps in df's index.
occupation = pd.DataFrame(list(df["occupation"]), index=df.index)

# List of columns to drop
cols_to_drop = ["code","updated","sample_of_reported_job_titles","summary_resources","details_resources","custom_resources", "tags"]

# Drop the columns
occupation = occupation.drop(cols_to_drop, axis=1)

# Keep only the first remaining column (the occupation title in the output
# shown for this cell) and store it back as a plain column.
occupation = occupation.iloc[:, 0]
df["occupation"] = occupation

df["occupation"].head(3)

0                                               Actors
1    Administrative Law Judges, Adjudicators, and H...
2    Aerospace Engineering and Operations Technolog...
Name: occupation, dtype: object

In [76]:
# One feature column per known skill, named "s<skill_id>", in skills_df order.
skill_columns = ["s" + str(skill_id) for skill_id in skills_df["skill_id"]]

# Initialize all skill columns to 0
for skill_column in skill_columns:
    df[skill_column] = 0

# Lookup from the element id used in the documents to the feature column name.
# Built once, instead of constructing and merging a DataFrame for every row.
skill_column_by_id = dict(zip(skills_df["id"], skill_columns))

# Add the score value of each skill to its column
for index, entry in df["skills"].items():
    for element in entry['element']:
        skill_column = skill_column_by_id.get(element['id'])
        if skill_column is None:
            # Element id not present in skills_df: skip it. The previous
            # left-merge produced a NaN skill_id here, which created stray
            # columns such as "snan" / "s1.0" instead of filling "s1".
            continue
        df.loc[index, skill_column] = element["score"]['value']

df.head(2)

Unnamed: 0,occupation,skills,abilities,job_zone,education,isco08,Name_de,Berufshauptgruppe,s1,s2,...,s26,s27,s28,s29,s30,s31,s32,s33,s34,s35
0,Actors,"{'element': [{'id': '2.A.1.a', 'related': 'htt...","{'element': [{'id': '1.A.1.a.3', 'related': 'h...","{'value': 2, 'title': 'Job Zone Two: Some Prep...",{'level_required': {'category': [{'name': 'Les...,2655,Schauspieler,2,72,72,...,0,0,0,0,0,0,0,0,0,0
1,"Administrative Law Judges, Adjudicators, and H...","{'element': [{'id': '2.A.1.b', 'related': 'htt...","{'element': [{'id': '1.A.1.b.5', 'related': 'h...","{'value': 5, 'title': 'Job Zone Five: Extensiv...",{'level_required': {'category': [{'name': 'Doc...,2612,Richter,2,81,75,...,0,0,19,16,28,0,13,3,0,0


In [77]:
# One feature column per known ability, named "a<ability_id>", in abilities_df order.
ability_columns = ["a" + str(ability_id) for ability_id in abilities_df["ability_id"]]

# Initialize all ability columns to 0
for ability_column in ability_columns:
    df[ability_column] = 0

# Lookup from the element id used in the documents to the feature column name.
# Built once, instead of constructing and merging a DataFrame for every row.
ability_column_by_id = dict(zip(abilities_df["id"], ability_columns))

# Add the score value of each ability to its column
for index, entry in df["abilities"].items():
    for element in entry['element']:
        ability_column = ability_column_by_id.get(element['id'])
        if ability_column is None:
            # Element id not present in abilities_df: skip it. The previous
            # left-merge produced a NaN ability_id here, which created stray
            # columns such as "anan" / "a1.0" instead of filling "a1".
            continue
        df.loc[index, ability_column] = element["score"]['value']

df.head(2)

Unnamed: 0,occupation,skills,abilities,job_zone,education,isco08,Name_de,Berufshauptgruppe,s1,s2,...,a43,a44,a45,a46,a47,a48,a49,a50,a51,a52
0,Actors,"{'element': [{'id': '2.A.1.a', 'related': 'htt...","{'element': [{'id': '1.A.1.a.3', 'related': 'h...","{'value': 2, 'title': 'Job Zone Two: Some Prep...",{'level_required': {'category': [{'name': 'Les...,2655,Schauspieler,2,72,72,...,0,0,0,0,0,0,0,0,0,0
1,"Administrative Law Judges, Adjudicators, and H...","{'element': [{'id': '2.A.1.b', 'related': 'htt...","{'element': [{'id': '1.A.1.b.5', 'related': 'h...","{'value': 5, 'title': 'Job Zone Five: Extensiv...",{'level_required': {'category': [{'name': 'Doc...,2612,Richter,2,81,75,...,0,31,0,0,0,0,0,0,0,0


In [78]:
# Replace the nested "education" dict of every row with a single scalar taken
# from the education_df lookup table.
# NOTE(review): education_df is not defined in any cell visible in this
# notebook; it has to be loaded before this cell, otherwise a fresh
# "Restart & Run All" fails here with a NameError.
new_education = list()

for index, row in df["education"].items():
    # One DataFrame row per entry of level_required -> category
    row = pd.DataFrame(row['level_required']['category'])

    # Check if 'score' exists
    if 'score' in row.columns:
        # Pull the numeric 'value' out of every score dict (yields a Series)
        max_value = row['score'].apply(lambda d: d['value'])

        # Find the index of the row with the highest score
        max_score_index = max_value.idxmax()

        # Select only the row with the highest score
        row = row.loc[[max_score_index]]

    # Without a 'score' column all category rows are kept; only the first row
    # of the merged result is used further down.
    cols_to_drop = ["description", "score"]
    row = row.drop(cols_to_drop, axis=1, errors='ignore')

   
    # Look up the category name in education_df to get the new education level
    row = row.merge(education_df, left_on='name', right_on='name', how='left')

    cols_to_drop = ["id", "_id", "name"]
    row = row.drop(cols_to_drop, axis=1, errors='ignore')

    # Take the first remaining cell as the scalar result.
    # NOTE(review): which value this is depends on the column order of
    # education_df, which is not visible here - confirm.
    row = row.to_numpy().flatten()[0]

    new_education.append(row)


# Positional assignment: one value per row of df, in iteration order
df["education"] = new_education

df.head(2)

Unnamed: 0,occupation,skills,abilities,job_zone,education,isco08,Name_de,Berufshauptgruppe,s1,s2,...,a43,a44,a45,a46,a47,a48,a49,a50,a51,a52
0,Actors,"{'element': [{'id': '2.A.1.a', 'related': 'htt...","{'element': [{'id': '1.A.1.a.3', 'related': 'h...","{'value': 2, 'title': 'Job Zone Two: Some Prep...",1,2655,Schauspieler,2,72,72,...,0,0,0,0,0,0,0,0,0,0
1,"Administrative Law Judges, Adjudicators, and H...","{'element': [{'id': '2.A.1.b', 'related': 'htt...","{'element': [{'id': '1.A.1.b.5', 'related': 'h...","{'value': 5, 'title': 'Job Zone Five: Extensiv...",9,2612,Richter,2,81,75,...,0,31,0,0,0,0,0,0,0,0


In [79]:
# Replace the nested "job_zone" dict of every row with a single scalar taken
# from the job_zone_df lookup table.
# NOTE(review): job_zone_df is not defined in any cell visible in this
# notebook; it has to be loaded before this cell, otherwise a fresh
# "Restart & Run All" fails here with a NameError.
new_job_zones = list()

for index, row in df["job_zone"].items():
    # Job zone number stored in the document
    value = row["value"]
    
    # Rows of the lookup table with the same job zone value
    row = job_zone_df[job_zone_df["value"] == value]
  
    # Drop the listed columns; whatever column is left supplies the result
    cols_to_drop = ["id", "_id", "related_experience", "education", "title", "job_training","job_zone_examples","value"]
    row = row.drop(cols_to_drop, axis=1, errors='ignore')

    # Take the first remaining cell as the scalar result
    # (raises IndexError when no lookup row matched `value`)
    row = row.to_numpy().flatten()[0]
    
    new_job_zones.append(row)

# Positional assignment: one value per row of df, in iteration order
df["job_zone"] = new_job_zones

df.head(2)

Unnamed: 0,occupation,skills,abilities,job_zone,education,isco08,Name_de,Berufshauptgruppe,s1,s2,...,a43,a44,a45,a46,a47,a48,a49,a50,a51,a52
0,Actors,"{'element': [{'id': '2.A.1.a', 'related': 'htt...","{'element': [{'id': '1.A.1.a.3', 'related': 'h...",1,1,2655,Schauspieler,2,72,72,...,0,0,0,0,0,0,0,0,0,0
1,"Administrative Law Judges, Adjudicators, and H...","{'element': [{'id': '2.A.1.b', 'related': 'htt...","{'element': [{'id': '1.A.1.b.5', 'related': 'h...",4,9,2612,Richter,2,81,75,...,0,31,0,0,0,0,0,0,0,0


In [80]:
def convert_np_array_to_list(obj):
    """Return ``obj.tolist()`` for a NumPy array; return anything else unchanged.

    Args:
        obj: Any object.

    Returns:
        A (possibly nested) Python list when ``obj`` is an ``np.ndarray``,
        otherwise ``obj`` itself.
    """
    return obj.tolist() if isinstance(obj, np.ndarray) else obj

In [81]:
# The nested skills/abilities columns have been expanded into s*/a* columns
# above; drop them together with the occupation title column.
df = df.drop(columns=["skills", "abilities", "occupation"])
df.head(2)

Unnamed: 0,job_zone,education,isco08,Name_de,Berufshauptgruppe,s1,s2,s3,s4,s5,...,a43,a44,a45,a46,a47,a48,a49,a50,a51,a52
0,1,1,2655,Schauspieler,2,72,72,69,69,50,...,0,0,0,0,0,0,0,0,0,0
1,4,9,2612,Richter,2,81,75,81,72,81,...,0,31,0,0,0,0,0,0,0,0


In [82]:
# Select exactly the generated feature columns: "s<number>" and "a<number>".
# An anchored pattern is used instead of a bare prefix test so that any other
# column whose name merely begins with "a" or "s" is never rescaled, and so the
# selection does not rely on tuple support in `.str.startswith`.
cols = df.columns[df.columns.str.match(r'^[as]\d+$')]

# Divide these columns by 100
# NOTE: not idempotent - re-running this cell divides the values again.
df[cols] = df[cols] / 100

In [83]:
# Attach the fo_joined data to the engineered features via the isco08 code,
# keep the first row per isco08 code, remove the "_id" column, and drop every
# row that still contains a missing value.
fo_df = (
    df.merge(fo_df, left_on='isco08', right_on="isco08", how='left')
      .drop_duplicates(subset=['isco08'])
      .drop(columns=["_id"])
      .dropna()
)

In [84]:
# Write both result tables to CSV, without the index column.
for frame, target in ((fo_df, "files/fo_swiss.csv"), (df, "files/occupations_swiss.csv")):
    frame.to_csv(target, index=False)