# Import libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functions.filter import *
from functions.signature import *
from functions.tools import count_words

# Import data

In [4]:
# Load the training offers (file is read as UTF-8).
df_train = pd.read_json("../data/train/train_palabras_empleo_texto.json", encoding='utf-8')
# Upper-case every column name so the rest of the notebook can use one spelling.
# An explicit list is assigned instead of a lazy `map` object.
df_train.columns = [column.upper() for column in df_train.columns]
# Drop rows where the offer text, the category or the subcategory is missing.
print("Before:" + str(df_train.shape))
df_train = df_train.dropna(subset=['PALABRAS_EMPLEO_TEXTO', 'CATEGORIA', 'SUBCATEGORIA'])
# NOTE(review): count_words presumably returns one word count per row — confirm in functions.tools.
df_train["NUM_WORDS"] = count_words(df_train)
print("After:" + str(df_train.shape))
# Shuffle the rows with a fixed seed so that "Restart Kernel -> Run All" reproduces
# the same order (an unseeded sample() gives a different permutation on every run).
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
df_train.head(2)

Before:(50099, 5)
After:(50099, 6)


Unnamed: 0,ID_OFERTA,ID_PUESTO_ESCO_ULL,CATEGORIA,SUBCATEGORIA,PALABRAS_EMPLEO_TEXTO,NUM_WORDS
0,82f00dcc7e45c990a933728af212e5,1902,Turismo y restauracion,Hosteleria,COCINERO DEMOSTRABLE GASTRONOMICO INTEGRARSE E...,11
1,01167798924b269a012f754cdafbe3,1838,Atencion a clientes,Atencion al cliente,ESC GENERALES PROSEGUR RECEPCIONISTAS ALEMAN U...,15


# Filter by random undersampling

In [5]:
# Show the shape before and after so the effect of the undersampling is visible.
print(f"Before:{df_train.shape}")
# Original author's note: "Delete occupations with more than 1000 offers and set to 1000".
# NOTE(review): totalOffers is 800 here, not 1000 — confirm the intended cap against
# drop_randomly_most_offers.
df_undersampling = drop_randomly_most_offers(
    df_train,
    maxOffers=1000,
    totalOffers=800,
)
print(f"After:{df_undersampling.shape}")
# Export is currently disabled; uncomment to persist the undersampled set.
# df_undersampling.to_json("../data/train/TRAIN_filter_1000_offers.json", orient='records', force_ascii=False)

Before:(50099, 6)
After:(38744, 6)


# Filter by Min Num of Words

In [6]:
# Row count before filtering.
print("Before: ", len(df_train))
# Original author's note: delete all rows with less than 5 words.
# NOTE(review): presumably the second argument is that minimum word count — confirm
# against delete_offers_with_less_words.
df_words = delete_offers_with_less_words(df_train, 5)
# Row count after filtering.
print("After: ", len(df_words))
# Export is currently disabled; uncomment to persist the filtered set.
# df_words.to_json("../data/train/TRAIN_filter_5_words.json", orient='records', force_ascii=False)

Before:  50099
After:  50099


# Filter by Signature

In [7]:
# Row count before filtering.
print("Before: ", len(df_train))
# Build the occupation dictionary from the full training set, then store each offer's
# relative signature as a new RELATIVE_SIGNATURE column (this modifies df_train itself).
diccionario_ocupaciones = create_diccionario_ocupaciones(df_train)
df_train["RELATIVE_SIGNATURE"] = get_offers_signature_relative(
    df_train,
    diccionario_ocupaciones,
    precision=2,
)
# Display the signatures for 1607 before filtering.
# NOTE(review): 1607 is presumably an occupation id — confirm.
show_signature_by_occupation(df_train, 1607)
# First filter on df_train, then a second filter on that result.
# NOTE(review): 0.05 looks like a minimum relative-signature threshold — confirm in functions.signature.
df_signature = delete_offers_same_occupation_by_signature(
    df_train,
    maxOffers=100,
    totalOffers=100,
    precision=2,
)
df_signature = get_offers_by_relative_min_signature_value(df_signature, 0.05)
# Same display as above, now on the filtered frame, for a before/after comparison.
show_signature_by_occupation(df_signature, 1607)
print("After: ", len(df_signature))
# Persist the filtered training set as one JSON record per offer, keeping non-ASCII characters.
df_signature.to_json("../data/train/TRAIN_filter.json", orient="records", force_ascii=False)

Before:  50099


Procesando ocupaciones: 100%|██████████| 1027/1027 [00:41<00:00, 24.96it/s]


After:  43642


# Delete by signature and undersampling

In [8]:
# Row count before filtering.
print("Before: ", len(df_train))
# Rebuild the occupation dictionary and the RELATIVE_SIGNATURE column inside this cell
# (the column is written onto df_train itself).
diccionario_ocupaciones = create_diccionario_ocupaciones(df_train)
df_train["RELATIVE_SIGNATURE"] = get_offers_signature_relative(
    df_train,
    diccionario_ocupaciones,
    precision=2,
)
# Signature-based filter with maxOffers=50 / totalOffers=50; the result replaces any
# earlier df_signature.
df_signature = delete_offers_same_occupation_by_signature(
    df_train,
    maxOffers=50,
    totalOffers=50,
    precision=2,
)
# NOTE(review): the random undersampling step is commented out, so only the signature
# filter is applied here — confirm whether it should be re-enabled.
# df_signature_undersampling = drop_randomly_most_offers(df_signature, maxOffers=100, totalOffers=100)
print("After: ", len(df_signature))

Before:  50099


Procesando ocupaciones: 100%|██████████| 1027/1027 [00:47<00:00, 21.48it/s]


After:  45129
