# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functions.filter import *
from functions.signature import *
from functions.tools import count_words

# Import data

In [2]:
# Fixed seed for the shuffle at the end of this cell, so that
# "Restart Kernel -> Run All" always yields the same row order.
RANDOM_SEED = 42

# Load the matched offers (JSON Lines file: one record per line).
df_train = pd.read_json("./data/offers_matching.json", orient='records', lines=True)
# Convert the column names to uppercase
df_train.columns = [column.upper() for column in df_train.columns]
# Drop the rows where PALABRAS_EMPLEO_TEXTO, CATEGORIA or SUBCATEGORIA is null
print("Before:" + str(df_train.shape))
df_train = df_train.dropna(subset=['PALABRAS_EMPLEO_TEXTO', 'CATEGORIA', 'SUBCATEGORIA'])
# df_train = df_train.dropna(subset=['PALABRAS_EMPLEO_TEXTO_NUEVAS', 'CATEGORIA', 'SUBCATEGORIA'])
# Add a NUM_WORDS column computed by the project helper `count_words`
# (presumably the per-offer word count -- confirm which text column it reads).
df_train["NUM_WORDS"] = count_words(df_train)
print("After:" + str(df_train.shape))
# Shuffle the dataframe deterministically and rebuild a clean 0..n-1 index
df_train = df_train.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
df_train.head(2)

Before:(30283, 23)
After:(29667, 24)


Unnamed: 0,TITLE,DESCRIPTION,COMPANY,LOCATION,CATEGORY,DATE,SITE,ID,URL,ID_OFERTA,...,ID_TITULOS,TITULOS_RAW,ASUNTO,CATEGORIA,SUBCATEGORIA,DESCRIPCION_OFERTA,PALABRAS_EMPLEO_TEXTO,ID_PUESTO_ESCO_ULL,MATCHING,NUM_WORDS
0,Comercial de Seguros La Laguna (fijo+variable),Agencias Exclusivas de Santalucía Seguros sele...,"Santalucía, S.A. Compañía de Seguros y Reaseguros",La Laguna,Comercial - Ventas,2022-11-15,infojobs.net,3670115453,https://www.infojobs.net/san-cristobal-de-la-l...,1f387c54bd4324adb031b2d8ea2f39,...,,,Comercial de Seguros La Laguna (fijo+variable),Comercial y ventas,Comercial,Agencias Exclusivas de Santalucía Seguros sele...,COMERCIAL SEGUROS AUTONOMO PERSONA ASESORAR CL...,1504,yes,29
1,Atención al Cliente,Nos encontramos trabajando con una de las Enti...,Fundación Human Age Institute (ManpowerGroup ),Santa Cruz de Tenerife,Atención al Cliente,2022-09-14,infojobs.net,3573167131,https://www.infojobs.net/santa-cruz-de-tenerif...,9f7e1f118a447aa1e7a21bcd661f32,...,,,Atención al cliente (H/M) (Certificado de disc...,Atención a clientes,Atención al cliente,Nos encontramos trabajando con una de las Enti...,ATENCIONES CLIENTES CERTIFICADA DISCAPACIDAD F...,1835,yes,19


# Filter by random undersampling

In [3]:
print(f"Before:{df_train.shape}")
# Randomly drop offers from the most frequent occupations (project helper).
# NOTE(review): the previous comment said occupations with more than 1000 offers
# are "set to 1000", but totalOffers=800 is passed -- confirm the intended cap.
df_undersampling = drop_randomly_most_offers(
    df_train,
    maxOffers=1000,
    totalOffers=800,
)
print(f"After:{df_undersampling.shape}")
# Optional export of the undersampled set (disabled):
# df_undersampling.to_json("../data/train/TRAIN_filter_1000_offers.json", orient='records', force_ascii=False)

Before:(29667, 24)


After:(26408, 24)


# Filter by minimum number of words

In [4]:
# Row count before filtering (two spaces after the colon match the original output).
print(f"Before:  {len(df_train)}")
# Remove offers that fall below the 5-word threshold (project helper;
# presumably based on the NUM_WORDS column -- confirm).
df_words = delete_offers_with_less_words(df_train, 5)
print(f"After:  {len(df_words)}")
# Optional export of the filtered set (disabled):
# df_words.to_json("../data/train/TRAIN_filter_5_words.json", orient='records', force_ascii=False)

Before:  29667
After:  28562


# Filter by Signature

In [6]:
print(f"Before:  {len(df_train)}")

# Build the occupation dictionary and attach a RELATIVE_SIGNATURE column to df_train.
# NOTE: this adds the column to df_train in place, so later cells see it too.
diccionario_ocupaciones = create_diccionario_ocupaciones(df_train)
df_train['RELATIVE_SIGNATURE'] = get_offers_signature_relative(
    df_train,
    diccionario_ocupaciones,
    precision=2,
)

# Inspect occupation 1607 before filtering.
show_signature_by_occupation(df_train, 1607)

# First filter offers per occupation by signature (caps of 100), then apply the
# 0.05 relative-signature threshold to the result.
df_signature = delete_offers_same_occupation_by_signature(
    df_train,
    maxOffers=100,
    totalOffers=100,
    precision=2,
)
df_signature = get_offers_by_relative_min_signature_value(df_signature, 0.05)

# Inspect the same occupation after filtering.
show_signature_by_occupation(df_signature, 1607)
print(f"After:  {len(df_signature)}")

# Persist the filtered training set as a JSON array of records.
df_signature.to_json("./data/TRAIN_filter.json", orient='records', force_ascii=False)

Before:  29667


Procesando ocupaciones: 100%|██████████| 930/930 [00:48<00:00, 19.10it/s]


After:  29614


# Delete by signature and undersampling

In [None]:
print(f"Before:  {len(df_train)}")

# Recompute the occupation dictionary and the RELATIVE_SIGNATURE column
# (written onto df_train in place).
diccionario_ocupaciones = create_diccionario_ocupaciones(df_train)
df_train['RELATIVE_SIGNATURE'] = get_offers_signature_relative(
    df_train,
    diccionario_ocupaciones,
    precision=2,
)

# Signature-based filtering per occupation, this time with caps of 50.
df_signature = delete_offers_same_occupation_by_signature(
    df_train,
    maxOffers=50,
    totalOffers=50,
    precision=2,
)
# Random undersampling step is currently disabled:
# df_signature_undersampling = drop_randomly_most_offers(df_signature, maxOffers=100, totalOffers=100)
print(f"After:  {len(df_signature)}")

Before:  50099


Procesando ocupaciones: 100%|██████████| 1027/1027 [00:42<00:00, 24.27it/s]

After:  45346



