# Import libraries

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functions.vectorization import load_vectorized_dataframe

# Import RAW data

In [19]:

df_train_path = '../data/train/TRAIN.json'
df_train = pd.read_json(df_train_path)
# Map columns names to upper
df_train.columns = map(str.upper, df_train.columns)
df_train.head(2)

Unnamed: 0,ID_OFERTA,SUBCATEGORIA,CATEGORIA,PALABRAS_EMPLEO_TEXTO,ID_PUESTO_ESCO,ID_PUESTO_ESCO_ULL
0,ef5a8ae0a743018628df9bd53893bb,Administración,Administración de empresas,ADMINISTRATIVO INGLES NAVISION EMPRESAS CARACT...,1634,1634
1,47137c06a640348ca4cb7dcbf938b1,Medicina general,Sanidad y salud,MEDICO MEDICINA DIAGNOSTICO TRATAMIENTO LICENC...,611,611


# Visualize type of data

In [20]:
print('df_train.shape: ', df_train.shape)
print('df_train.columns: ', df_train.columns)


df_train.shape:  (48893, 6)
df_train.columns:  Index(['ID_OFERTA', 'SUBCATEGORIA', 'CATEGORIA', 'PALABRAS_EMPLEO_TEXTO',
       'ID_PUESTO_ESCO', 'ID_PUESTO_ESCO_ULL'],
      dtype='object')


# Get all the WORDS and COUNT for each Ocupación

In [21]:
def create_diccionario_ocupaciones(df):
    # Get all possible ID_PUESTO_ESCO_ULL
    occupations = df['ID_PUESTO_ESCO_ULL'].unique()
    # Diccionario de ocupaciones
    diccionario_ocupaciones = {}
    # Iterate over all occupations
    for occupation_id in occupations:
        # Get all rows with the current occupation
        occupation_df = df[df['ID_PUESTO_ESCO_ULL'] == occupation_id]
        # Get all words in the current occupation
        diccionario_ocupacion = {}
        # Iterate over all offers
        for oferta in occupation_df['PALABRAS_EMPLEO_TEXTO']:
            # Get all words in the current offer deleting the last space
            for palabra in oferta.split(" ")[:-1]:
                # If the word is not in the dictionary, add it
                if palabra not in diccionario_ocupacion:
                    diccionario_ocupacion[palabra] = 1
                # If the word is in the dictionary, add 1 to the counter
                else:
                    diccionario_ocupacion[palabra] += 1
        # Add the occupation dictionary to the occupations dictionary
        diccionario_ocupaciones[occupation_id] = diccionario_ocupacion
    return diccionario_ocupaciones

diccionario_ocupacion = create_diccionario_ocupaciones(df_train)
# print('diccionario_ocupacion: ', diccionario_ocupacion)
def get_offers_signature_relative(df, diccionario_ocupacion):
    # Get all possible ID_PUESTO_ESCO_ULL
    occupations = df['ID_PUESTO_ESCO_ULL'].unique()
    # Create a new column in the dataframe
    df['RELATIVE_SIGNATURE'] = 0
    # Iterate over all occupations
    for occupation_id in occupations:
        # Get all rows with the current occupation
        occupation_df = df[df['ID_PUESTO_ESCO_ULL'] == occupation_id]
        # Iterate over all offers
        signatures = []
        for oferta in occupation_df['PALABRAS_EMPLEO_TEXTO']:
            # Start the signature in 0
            signature = 0
            # Iterate over all words in the current offer
            for word in oferta.split(" ")[:-1]:
                signature += diccionario_ocupacion[occupation_id][word]
            # Add the signature to the list of signatures
            signatures.append(signature)
        # Now for the same occupation divide all signatures by the max signature
        max_signature = max(signatures)
        # Iterate over all signatures and round them to 2 decimals
        for i in range(len(signatures)):
            signatures[i] = round(signatures[i] / max_signature, 2)
        # Add the signatures to the dataframe
        df.loc[df['ID_PUESTO_ESCO_ULL'] == occupation_id, 'RELATIVE_SIGNATURE'] = signatures
    return df['RELATIVE_SIGNATURE']
df_train['RELATIVE_SIGNATURE'] = get_offers_signature_relative(df_train, diccionario_ocupacion)
            


Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[0.25, 0.59, 0.34, 0.27, 0.5, 0.4, 0.37, 0.55, 0.23, 0.66, 0.55, 0.7, 0.47, 0.61, 0.3, 0.64, 0.33, 0.28, 0.59, 0.54, 0.6, 0.78, 0.55, 0.2, 0.6, 0.45, 0.43, 0.5, 0.57, 0.59, 0.26, 0.11, 0.42, 0.54, 0.49, 0.49, 0.53, 0.37, 0.2, 0.38, 0.52, 0.5, 0.46, 0.22, 0.69, 0.7, 0.37, 0.65, 0.41, 0.39, 0.31, 0.53, 0.66, 0.66, 0.49, 0.46, 0.49, 0.29, 0.65, 0.89, 0.41, 0.73, 0.65, 0.39, 0.72, 0.75, 0.65, 0.59, 0.37, 0.2, 0.76, 0.65, 0.2, 0.36, 0.76, 0.65, 0.39, 0.5, 0.23, 0.58, 0.2, 0.32, 0.25, 0.57, 0.34, 0.62, 0.44, 0.54, 0.38, 0.35, 0.49, 0.57, 0.74, 0.3, 0.23, 0.4, 0.55, 0.43, 0.83, 0.62, 0.39, 0.49, 0.26, 0.4, 0.7, 0.38, 0.37, 0.61, 0.54, 0.7, 0.72, 0.94, 0.27, 0.46, 0.2, 0.49, 0.36, 0.59, 0.46, 0.28, 0.43, 0.1, 0.42, 0.74, 0.44, 0.53, 0.41, 0.73, 0.7, 0.45, 0.74, 0.57, 0.58, 0.37, 0.55, 0.63, 0.52, 0.46, 0.23, 0.35, 0.47, 0.66, 0.42, 0.24, 0.28, 0.41, 0.65, 0.55, 0.77, 0.37, 0.52, 0.77, 0.44, 

In [24]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.io import output_notebook
import pandas as pd

# Filter and sort the data
filtered_df = df_train[df_train['ID_PUESTO_ESCO_ULL'] == 1607]
sorted_df = filtered_df.sort_values(by='RELATIVE_SIGNATURE', ascending=False).reset_index(drop=True)

# Initialize Bokeh
output_notebook()

# Create a ColumnDataSource to store data
source = ColumnDataSource(sorted_df)

# Create a figure
p = figure(
    title='Relative Signature for Occupation 1508',
    x_axis_label='Data Point Index',
    y_axis_label='Relative Signature',
    tools="pan,box_zoom,wheel_zoom,reset,save",
)

# Add scatter plot markers
p.circle(x='index', y='RELATIVE_SIGNATURE', size=8, source=source)

# Add hover tool to display values on hover
hover = HoverTool()
hover.tooltips = [("Value", "@RELATIVE_SIGNATURE"), ("ID_OFERTA", "@ID_OFERTA")]
p.add_tools(hover)

# Show the plot
show(p, notebook_handle=True)

In [26]:
def get_offers_by_relative_signature_and_occupation(df, occupation_id, relative_signature):
    # Get all rows with the current occupation
    occupation_df = df[df['ID_PUESTO_ESCO_ULL'] == occupation_id]
    # Get all rows with the current relative signature
    relative_signature_df = occupation_df[occupation_df['RELATIVE_SIGNATURE'] == relative_signature]
    # Return the offers
    return relative_signature_df['ID_OFERTA'].tolist()
lista = get_offers_by_relative_signature_and_occupation(df_train, 1607, 0.94)

lista:  ['c169a8e84041bbba6528c43a2eda94', '0f6417e406456e9b6224b8d34b6d8f', '515586c9b540cabeb7f7053e64aa9b', '6033b99aad47739721f59fa1f3a981', '017278ca1041e9a917878aae26b584', 'fe64089f6541c79db998737b1ad59c', 'e9d6c814e0453da8d8c13042f1db97', '6d574be3eb4eeea2a44a684896af17', '8b271ebc0e4227aea153c82e97ed59', '0e2a69278e4ed0b7e7bf8d28ecd49f', 'c23e87405d449f84d268ef0448e6cf', '50dc35b2c3449ea15a1dac8a65401d', '0d40f5f63241bdab295da24395ac98', '9db78cabb7413baf4553b725aa2823', '3c4f7c2e2a47d0abb97248151ab972', '2d429bc18d4770ab91d10b62eaff71', 'd58e6be7a64da2a8a2e18e551a6770', '4c30e4b3ee4c4ab68c61c4e5c10c48', '78dd647961480a8e0ff3576fa0036b', '9290c3986a4f4786008b05904bd7b3', 'c6957ab67b45b4a34a142d5b3e904c', 'b27e32490b474c88458c2a5be28251', 'e0e5a7a4334af78888603f17807769', '40b0846efd417f84dad26d4ef63efb', 'c257d50f13445da3f485793b14b864', 'c59d5272f243479022c32afd12fe18', 'c347f5581847cfb9f3b840177f7409', 'f6e7cbcfd44867b304793d034ec75e', '1984d7689b47d284ef2d6741b33f23', 'e44b