In [1]:
from functions import *
import pandas as pd
import numpy as np

# Import training data

In [2]:
df_train = pd.read_json('data/train/TRAIN.json')
# Transform columns to uppercase
df_train.columns = map(str.upper, df_train.columns)
df_train.head()

Unnamed: 0,ID_OFERTA,SUBCATEGORIA,CATEGORIA,PALABRAS_EMPLEO_TEXTO,ID_PUESTO_ESCO,ID_PUESTO_ESCO_ULL
0,ef5a8ae0a743018628df9bd53893bb,Administración,Administración de empresas,ADMINISTRATIVO INGLES NAVISION EMPRESAS CARACT...,1634,1634
1,47137c06a640348ca4cb7dcbf938b1,Medicina general,Sanidad y salud,MEDICO MEDICINA DIAGNOSTICO TRATAMIENTO LICENC...,611,611
2,cdfb72672340819a6721db72eee187,Comercial,Comercial y ventas,REPARTIDOR PERSONA CHOFER VENTAS CLIENTES SEGU...,2825,2825
3,5746210e854264aaca9452f4d377a4,Venta al detalle,Ventas al detalle,DEPENDIENTE PROCESOS SELECCION COMERCIAL TIEND...,1984,1984
4,8a1bda0c91438bb4133f32e392a1ce,Venta al detalle,Ventas al detalle,DEPENDIENTE PALMERAS SEMANALES ATENCIONES VENT...,1984,1984


# Create Vector and Vocabulary

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

def custom_tokenizer_category_subcategory(text):
    # this functions returns a list with the text with underscores instead of spaces
    text_with_underscores = text.replace(' ', '_')
    return [text_with_underscores]

def custom_tokenizer_palabras_empleo_texto(text):
    # This function returns a list with the text splitted by spaces
    elements = text.strip().split(" ")
    return elements

def get_vocabulary_dimension(vocabulary):
    # This function returns the total size of the vocabulary (the dimension)
    return len(vocabulary['CATEGORIA']) + len(vocabulary['SUBCATEGORIA']) + len(vocabulary['PALABRAS_EMPLEO_TEXTO'])

def create_vocabulary(df_train):
    # This function returns a dictionary with the vocabulary of each column
    # The keys are the columns and the values are the vocabulary of each column
    columns = ['CATEGORIA', 'SUBCATEGORIA', 'PALABRAS_EMPLEO_TEXTO']
    vocabularies = {}
    for column in columns:
        df_train[column] = df_train[column].astype(str)
        if column == 'CATEGORIA' or column == 'SUBCATEGORIA':
            tfidf = TfidfVectorizer(tokenizer=custom_tokenizer_category_subcategory)
        else:
            tfidf = TfidfVectorizer(tokenizer=custom_tokenizer_palabras_empleo_texto)
        
        tfidf.fit_transform(df_train[column]).todense()
        vocabularies[column] = list(map(str, tfidf.vocabulary_.keys()))
    return vocabularies

def get_vertical_index(vocabularies, column, text):
    if column == 'CATEGORIA':
        return vocabularies[column].index(text)
    elif column == 'SUBCATEGORIA':
        return vocabularies[column].index(text) + len(vocabularies['CATEGORIA'])
    elif column == 'PALABRAS_EMPLEO_TEXTO':
        return vocabularies[column].index(text) + len(vocabularies['CATEGORIA']) + len(vocabularies['SUBCATEGORIA'])

def one_on_feature(text, column, index, offset, vocabularies, matrix):
    # This function returns the modified matrix in the position of the text
    # If the text is in the vocabulary, it will return a 1 in the position of the text
    if text in vocabularies[column]:
        horizontal_index = index - offset
        vertical_index = get_vertical_index(vocabularies, column, text)
        matrix[horizontal_index][vertical_index] = 1
    return matrix

def create_vectorize_dataframe(df_train_subset, vocabularies):
    columns = ['CATEGORIA', 'SUBCATEGORIA', 'PALABRAS_EMPLEO_TEXTO']
    # Get the total size of vocabulary
    total_size = get_vocabulary_dimension(vocabularies)
    # Create a matrix of zeros with the shape of the length of the subset and the total size of the vocabulary
    matrix = np.zeros((len(df_train_subset), total_size))
    # Get the offset of the index if the dataframe is a subset
    offset = df_train_subset.index[0]
    # Iterate over the subset (the offers)
    for index, row in df_train_subset.iterrows():
        # Iterate over the columns
        for column in columns:
            if column == 'CATEGORIA':
                text = custom_tokenizer_category_subcategory(row[column].lower())[0]
                matrix = one_on_feature(text, column, index, offset, vocabularies, matrix)
            elif column == 'SUBCATEGORIA':
                text = custom_tokenizer_category_subcategory(row[column].lower())[0]
                matrix = one_on_feature(text, column, index, offset, vocabularies, matrix)
            elif column == 'PALABRAS_EMPLEO_TEXTO':
                # Split the text by spaces
                texts = custom_tokenizer_palabras_empleo_texto(row[column].lower())
                # Iterate over the splitted text
                for text in texts:
                    matrix = one_on_feature(text, column, index, offset, vocabularies, matrix)
    return matrix



vocabularies = create_vocabulary(df_train[0:2])
vectorized_dataframe = create_vectorize_dataframe(df_train[0:2], vocabularies)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[column] = df_train[column].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[column] = df_train[column].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train[column] = df_train[column].astype(str)


In [35]:
print(get_vocabulary_dimension(vocabularies))
print(vocabularies['CATEGORIA'])
print(vocabularies['SUBCATEGORIA'])
print(vocabularies['PALABRAS_EMPLEO_TEXTO'])

27
['administración_de_empresas', 'sanidad_y_salud']
['administración', 'medicina_general']
['administrativo', 'ingles', 'navision', 'empresas', 'caracter', 'internacional', 'logistico', 'dominio', 'escrito', 'microsoft', 'dynamics', 'office', 'excel', 'medico', 'medicina', 'diagnostico', 'tratamiento', 'licenciado', 'grado', 'homologado', 'colegiado', 'vehiculo', 'carne']


In [36]:
print(vectorized_dataframe[1])

[0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1.]


In [32]:
print(vocabularies)

{'CATEGORIA': ['administración_de_empresas', 'sanidad_y_salud'], 'SUBCATEGORIA': ['administración', 'medicina_general'], 'PALABRAS_EMPLEO_TEXTO': ['administrativo', 'ingles', 'navision', 'empresas', 'caracter', 'internacional', 'logistico', 'dominio', 'escrito', 'microsoft', 'dynamics', 'office', 'excel', 'medico', 'medicina', 'diagnostico', 'tratamiento', 'licenciado', 'grado', 'homologado', 'colegiado', 'vehiculo', 'carne']}
