In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
from scipy import spatial
from scipy import sparse
from scipy import linalg, mat
from tqdm import tqdm, notebook
import math
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from random import randint
from src.utils import split_in, extended_describe, cos_cdist
import lightfm as lfm
from lightfm import data
from lightfm import cross_validation
from lightfm import evaluation
import lightgbm as lgb

In [19]:
df_applicants_to_predict = pd.read_csv("../data/02_intermediate/postulaciones/postulaciones_test.csv")
df_notice = pd.read_csv("../data/02_intermediate/avisos/avisos_extended.csv")
dtypes = {"idaviso": "int64", "idpostulante": "string"}
mydateparser = lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
df_applicants_with_rank = pd.read_csv("../data/02_intermediate/postulaciones/postulaciones_train_rank.csv", parse_dates=["fechapostulacion"], date_parser=mydateparser, dtype=dtypes)
df_applicants_test = pd.read_csv("../data/02_intermediate/postulaciones/postulaciones_test.csv")

In [87]:
columns_dict = pd.read_csv('../data/02_intermediate/postulantes/atributos_diccionario.csv')
matrix_train = pd.read_csv('../data/02_intermediate/postulantes/postulantes_matrix_train.csv', index_col='idpostulante')
matrix_test = pd.read_csv('../data/02_intermediate/postulantes/postulantes_matrix_test.csv', index_col='idpostulante')
numpy_matrix_test = matrix_test.to_numpy()
numpy_matrix_train = matrix_train.to_numpy()
normalized_test = normalize(numpy_matrix_test, norm='l2')
normalized_train = normalize(numpy_matrix_train, norm='l2')

In [37]:
ranking_notice = df_applicants_with_rank.groupby(['idaviso', 'sexo']).agg({'idpostulante': 'count'}).reset_index().rename(columns={'idpostulante': 'cantidad'}).sort_values(by='cantidad', ascending=False)
top_ten_by_sex = {}
top_ten_by_sex['FEM'] = ranking_notice[ranking_notice.sexo=='FEM'].head(10).idaviso.values
top_ten_by_sex['MASC'] = ranking_notice[ranking_notice.sexo=='MASC'].head(10).idaviso.values
top_ten_by_sex['NO_DECLARA'] = ranking_notice[ranking_notice.sexo=='NO_DECLARA'].head(10).idaviso.values

In [153]:
ids_to_predict = set(df_applicants_to_predict.idpostulante)
ids_train = set(df_applicants_with_rank.idpostulante)
intersect = ids_to_predict.intersection(ids_train)
only_in_prediction = ids_to_predict - ids_train
only_in_train = ids_train - ids_to_predict

In [45]:
print(f"Ids que estan solo en test: {len(only_in_prediction)}. Usar algo random.")

Ids que estan solo en test: 41204. Usar algo random.


In [46]:
print(f"Ids que estan en train y test: {len(intersect)}.")

Ids que estan en train y test: 115028.


In [154]:
df_intersect = df_applicants_with_rank[df_applicants_with_rank.idpostulante.isin(intersect)]

In [160]:
ranking_by_applicant = (
    df_applicants_with_rank[
        df_applicants_with_rank.idpostulante.isin(intersect)
    ]
    .groupby("idpostulante")
    .agg({"idaviso": "count"})
    .reset_index()
    .rename(columns={"idaviso": "cantidad"})
    .sort_values(by="cantidad", ascending=False)
)

In [171]:
ids_hard_users = ranking_by_applicant[
    ranking_by_applicant.cantidad > 100
].idpostulante.values
len(ids_hard_users)

8528

In [174]:
df_hard_users = df_applicants_with_rank[df_applicants_with_rank.idpostulante.isin(ids_hard_users)]
df_test_hard_users = df_applicants_to_predict[df_applicants_to_predict.idpostulante.isin(ids_hard_users)]

In [187]:
df_hard_users.head()

Unnamed: 0,idaviso,idpostulante,tipo_de_trabajo,nivel_laboral,nombre_area,sexo,anios_al_postularse,fechapostulacion,rank
166,1112433297,0zB0Xwd,Full-time,Otro,Ventas,FEM,28.0,2018-03-30 21:14:48,0
167,1112399864,0zB0Xwd,Full-time,Senior / Semi-Senior,Administración,FEM,28.0,2018-03-30 16:38:53,1
168,1112401437,0zB0Xwd,Full-time,Senior / Semi-Senior,Administración,FEM,28.0,2018-03-30 16:35:54,2
169,1112219347,0zB0Xwd,Full-time,Senior / Semi-Senior,Administración de Personal,FEM,28.0,2018-03-30 16:34:14,3
170,1112423908,0zB0Xwd,Full-time,Junior,Administración de Personal,FEM,28.0,2018-03-30 16:31:03,4


In [188]:
df_test_hard_users.head()

Unnamed: 0,idpostulante,fechanacimiento,sexo
7,ZPWlE,1962-05-18,FEM
9,N0V2D,1978-09-29,FEM
21,EppJmb,1960-06-20,MASC
60,5M8apw,1974-10-09,FEM
71,YNlRzP,1980-05-01,MASC


In [198]:
rank_notices = df_hard_users.groupby('idaviso').agg({'rank': 'count'}).reset_index().rename(columns={'rank': 'cantidad'})

In [202]:
notice = df_notice.merge(rank_notices, on='idaviso', how='left').fillna(0).drop(columns=['online_desde', 'online_hasta'])

In [203]:
notice

Unnamed: 0,idaviso,tipo_de_trabajo,nivel_laboral,nombre_area,cantidad
0,8725750,Full-time,Senior / Semi-Senior,Comercial,16.0
1,12543760,Full-time,Senior / Semi-Senior,Diseño Gráfico,77.0
2,12812680,Full-time,Senior / Semi-Senior,Otros,23.0
3,17903700,Full-time,Senior / Semi-Senior,Salud,2.0
4,1000132160,Full-time,Senior / Semi-Senior,Programación,6.0
...,...,...,...,...,...
18357,1112470166,Full-time,Junior,Mantenimiento y Limpieza,0.0
18358,1112470172,Full-time,Senior / Semi-Senior,Ventas,0.0
18359,1112470230,Full-time,Otro,Transporte,0.0
18360,1112470244,Full-time,Senior / Semi-Senior,Ingeniería Automotriz,0.0


In [190]:
ds = lfm.data.Dataset()
ds.fit(users=df_hard_users.idpostulante.unique(), items=notice.idaviso.unique())
ds.interactions_shape()

(8528, 18362)

In [192]:
# transformar  'fechapostulacion' en un int
(interactions, weights) = ds.build_interactions(df_hard_users[['idpostulante', 'idaviso', 'rank']].itertuples(index=False))
interactions

<8528x18362 sparse matrix of type '<class 'numpy.int32'>'
	with 1736724 stored elements in COOrdinate format>

In [182]:
train, test = lfm.cross_validation.random_train_test_split(interactions, test_percentage=0.2)
model = lfm.LightFM(no_components=20)
model.fit(train, epochs=50, num_threads=8)

<lightfm.lightfm.LightFM at 0x7f5c33b2a820>

In [183]:
train_precision = lfm.evaluation.precision_at_k(model, train, k=10, num_threads=8).mean()
test_precision  = lfm.evaluation.precision_at_k(model, test,  k=10, num_threads=8).mean()
print(f'Precision en train: {train_precision}')
print(f'Precision en test: {test_precision}')

Precision en train: 0.17938555777072906
Precision en test: 0.06287524104118347
