In [129]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
from scipy import spatial
from scipy import sparse
from scipy import linalg, mat
from tqdm import tqdm, notebook
import math
import datetime

import lightfm as lfm
from lightfm.data import Dataset
from scipy.sparse import csr_matrix

In [2]:
# Load the intermediate CSVs used by the rest of the notebook.
# Applications: the training set (with rank) and the set we must predict for.
df_applicants_with_rank = pd.read_csv("../data/02_intermediate/postulaciones/postulaciones_train_rank.csv")
df_applicants_test = pd.read_csv("../data/02_intermediate/postulaciones/postulaciones_test.csv")

# Job notices are read exactly once, with explicit dtypes and parsed dates.
# (The untyped read of the same file that used to precede this one was
# immediately overwritten, so it only cost I/O time.)
dtypes = {
    "idaviso": "int64",
    "tipo_de_trabajo": "string",
    "nivel_laboral": "string",
    "nombre_area": "string",
}


def mydateparser(x):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.datetime``."""
    return datetime.datetime.strptime(x, "%Y-%m-%d")


# NOTE(review): `date_parser` is deprecated in recent pandas releases in favour
# of `date_format`; it is kept here so the cell still runs on older versions.
df_notice = pd.read_csv(
    "../data/02_intermediate/avisos/avisos_detalle.csv",
    parse_dates=["online_desde", "online_hasta"],
    date_parser=mydateparser,
    dtype=dtypes,
)

# Applicant attributes: gender/birth date and education records.
df_applicants_genre = pd.read_csv("../data/02_intermediate/postulantes/postulantes_genero_edad.csv")
df_applicants_education = pd.read_csv("../data/02_intermediate/postulantes/postulantes_educacion.csv")

In [263]:
# Cutoff date: only notices whose `online_hasta` is on or after it are kept
# as candidates for recommendation.
live_until = datetime.datetime(2018, 4, 1)
available_notices = set(
    df_notice.loc[df_notice["online_hasta"] >= live_until, "idaviso"]
)

In [3]:
def _l2_normalize_rows(matrix):
    """Scale every row of a 2-D array to unit Euclidean (L2) length.

    Rows whose norm is zero are returned unchanged instead of producing NaNs.
    Intended as a NumPy-only stand-in for
    ``sklearn.preprocessing.normalize(matrix, norm='l2')``, which this cell
    called without `normalize` ever being imported in the notebook.

    Parameters
    ----------
    matrix : array-like of shape (n_rows, n_cols)

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as `matrix`.
    """
    matrix = np.asarray(matrix, dtype=float)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid division by zero for all-zero rows
    return matrix / norms


# Applicant attribute matrices (one row per `idpostulante`) and their dictionary.
columns_dict = pd.read_csv('../data/02_intermediate/postulantes/atributos_diccionario.csv')
matrix_train = pd.read_csv('../data/02_intermediate/postulantes/postulantes_matrix_train.csv', index_col='idpostulante')
matrix_test = pd.read_csv('../data/02_intermediate/postulantes/postulantes_matrix_test.csv', index_col='idpostulante')
numpy_matrix_test = matrix_test.to_numpy()
numpy_matrix_train = matrix_train.to_numpy()
normalized_test = _l2_normalize_rows(numpy_matrix_test)
normalized_train = _l2_normalize_rows(numpy_matrix_train)

In [4]:
# Partition applicant ids by where they appear: in both sets, only in the
# set to predict, or only in the training set.
ids_to_predict = set(df_applicants_test["idpostulante"])
ids_train = set(df_applicants_with_rank["idpostulante"])
intersect = ids_to_predict & ids_train
only_in_prediction = ids_to_predict.difference(ids_train)
only_in_train = ids_train.difference(ids_to_predict)

In [5]:
# Report how many applicant ids appear only in the test set; the (Spanish)
# message also notes the plan for them: "use something random".
print(f"Ids que estan solo en test: {len(only_in_prediction)}. Usar algo random.")

Ids que estan solo en test: 41204. Usar algo random.


In [6]:
# Report how many applicant ids appear in both the training and the test set.
print(f"Ids que estan en train y test: {len(intersect)}.")

Ids que estan en train y test: 115028.


In [7]:
# Training applications made by applicants that also appear in the test set.
df_intersect = df_applicants_with_rank.loc[
    df_applicants_with_rank["idpostulante"].isin(intersect)
]

In [8]:
# Number of training applications per applicant (restricted to applicants that
# are also in the test set), most active applicants first.
ranking_by_applicant = (
    df_applicants_with_rank
    .loc[df_applicants_with_rank["idpostulante"].isin(intersect)]
    .groupby("idpostulante", as_index=False)["idaviso"]
    .count()
    .rename(columns={"idaviso": "cantidad"})
    .sort_values("cantidad", ascending=False)
)

In [9]:
# "Hard users": applicants with more than 100 applications in the training set.
ids_hard_users = ranking_by_applicant.loc[
    ranking_by_applicant["cantidad"] > 100, "idpostulante"
].values
print(f"Ids de los hardusers: {len(ids_hard_users)}.")

Ids de los hardusers: 4265.


In [10]:
# Restrict both the training and the test applications to the hard users.
df_hard_users = df_applicants_with_rank.loc[
    df_applicants_with_rank["idpostulante"].isin(ids_hard_users)
]
df_test_hard_users = df_applicants_test.loc[
    df_applicants_test["idpostulante"].isin(ids_hard_users)
]

In [111]:
# One row per (applicant, education record): gender data left-joined with
# education, with the education name and status fused into a single
# "<nombre>-<estado>" column and the source columns removed.
user_feature = (
    df_applicants_genre
    .merge(df_applicants_education, on='idpostulante', how='left')
    .assign(estudio=lambda merged: merged.nombre + '-' + merged.estado)
    .drop(columns=['nombre', 'estado', 'fechanacimiento'])
)

In [109]:
# Feature rows belonging to the hard users only.
user_feature_hard_user = user_feature.loc[
    user_feature["idpostulante"].isin(ids_hard_users)
]

In [138]:
# Enumerate every "<column>:<value>" feature name occurring in `user_feature`
# (every column except the applicant id). `col` and `value` are parallel lists:
# the column name and one of its distinct values.
col = []
value = []
for a_column in user_feature.columns.values:
    if a_column == 'idpostulante':
        continue
    distinct_values = list(user_feature[a_column].unique())
    col.extend([a_column] * len(distinct_values))
    value.extend(distinct_values)

uf = [":".join((str(name), str(val))) for name, val in zip(col, value)]

In [142]:
# Register the hard users, the notices they applied to and the
# "<column>:<value>" feature vocabulary (`uf`) with a LightFM Dataset, which
# assigns internal indices to each of them.
dataset1 = Dataset()
dataset1.fit(
        df_hard_users.idpostulante.unique(), # all the users
        df_hard_users.idaviso.unique(), # all the items
        user_features = uf # additional user features
)
# plugging in the interactions and their weights
# Rows are read positionally: x[1] is passed as the user id, x[0] as the item
# id and x[3] as the weight.
# NOTE(review): presumably columns 0/1/3 are idaviso/idpostulante/rank — this
# depends on the CSV column order, which is not visible here; confirm.
(interactions, weights) = dataset1.build_interactions([(x[1], x[0], x[3]) for x in df_hard_users.values ])

In [204]:
def generate_user_feature(features, features_names):
    """Turn rows of raw feature values into lists of "<name>:<value>" labels.

    Parameters
    ----------
    features : iterable of indexable rows
        Each row holds one value per entry of `features_names`, in the same
        order. Values are converted with ``str``.
    features_names : sequence of str
        Feature name used as the prefix for the value at the same position.

    Returns
    -------
    list of list of str
        One inner list per input row, e.g. ``["sexo:FEM", "estudio:X-Y"]``.
    """
    return [
        [name + ":" + str(row[position]) for position, name in enumerate(features_names)]
        for row in features
    ]

In [205]:
# For every hard-user feature row, build its ["sexo:<value>", "estudio:<value>"]
# label list (same "<column>:<value>" format as the vocabulary in `uf`).
feature_list = generate_user_feature(user_feature_hard_user[['sexo', 'estudio']].values, ['sexo', 'estudio'])

In [206]:
# Pair each applicant id with the feature labels of the same row.
user_tuple = [
    (applicant_id, labels)
    for applicant_id, labels in zip(user_feature_hard_user.idpostulante, feature_list)
]

In [208]:
# Build the user-feature matrix from the (applicant id, feature labels) pairs,
# with row normalisation disabled.
user_features = dataset1.build_user_features(user_tuple, normalize= False)

In [213]:
# Retrieve the Dataset's mappings; `user_id_map` and `item_id_map` are used
# below to translate between external ids and internal indices.
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset1.mapping()

In [261]:
# Reverse lookup: internal item index -> original notice id.
inv_item_id_map = {
    internal_index: notice_id for notice_id, internal_index in item_id_map.items()
}

In [272]:
# Train a LightFM model with the WARP loss on the hard-user interactions,
# using the user-feature matrix and the per-interaction weights.
# NOTE(review): no random seed is set and training uses 8 threads, so results
# are presumably not reproducible run-to-run; 1000 epochs makes this the most
# expensive cell of the notebook.
model = lfm.LightFM(loss='warp')
model.fit(interactions,
      user_features= user_features,
      sample_weight= weights,
      epochs=1000,
      num_threads=8)

<lightfm.lightfm.LightFM at 0x7f81974b0e80>

In [266]:
# For every hard user in the test set, score all known notices and keep the
# ten best-scored ones that are still available (see `available_notices`).
n_users, n_items = interactions.shape  # loop-invariant, so computed once
all_item_indices = np.arange(n_items)

final_predictions = {}
for a_user in notebook.tqdm(df_test_hard_users.idpostulante.unique()):
    user_x = user_id_map[a_user]
    # The model was fitted with `user_features`, so the same matrix must be
    # supplied when predicting; without it LightFM falls back to identity
    # features and the learned sexo/estudio embeddings are ignored.
    scores = model.predict(user_x, all_item_indices, user_features=user_features)
    # Internal item indices ordered from highest to lowest score.
    prediction = np.argsort(scores)[::-1]
    prediction_for_user = []
    for pred in prediction:
        notice = inv_item_id_map[pred]
        if notice in available_notices:
            prediction_for_user.append(notice)
            if len(prediction_for_user) == 10:
                break
    final_predictions[a_user] = prediction_for_user
            

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4265.0), HTML(value='')))




In [248]:
# Debug inspection: indices of `prediction` from largest to smallest value.
# NOTE(review): the recommendation loop leaves `prediction` holding already
# sorted item indices, and this cell's execution count (248) predates that
# loop's (266); the displayed output was presumably produced while
# `prediction` still held raw scores, so it will not reproduce on Run All.
np.argsort(prediction)[::-1]

array([ 6639,  6825,  2444, ..., 12242, 12554, 14347])

In [249]:
# Debug inspection: values of `prediction` in descending order.
# NOTE(review): the float32 output shown below indicates `prediction` held raw
# model scores when this ran; the recommendation loop now stores item indices
# in that name, so a fresh run will display something different.
np.sort(prediction)[::-1]

array([-17.649326, -17.671553, -17.749048, ..., -27.943615, -27.950895,
       -28.698471], dtype=float32)

In [262]:
# Debug inspection: original notice id behind internal item index 6639 (the
# first index in the argsort output displayed above).
inv_item_id_map[6639]

1112352308

In [274]:
# Load the submission file. The path is relative to the project's data
# directory, like every other read in this notebook, instead of an absolute
# path inside one user's home directory.
# NOTE(review): assumes the notebook runs from a sibling directory of `data/`,
# as the other `../data/...` reads already require.
sub = pd.read_csv('../data/07_model_output/all.csv')

In [278]:
# Write the submission back keeping only the two id columns. The path is
# project-relative (same layout as the `../data/...` reads above) rather than
# an absolute path inside one user's home directory.
# NOTE(review): this overwrites the file `sub` was read from, and
# `final_predictions` is not written anywhere in the visible cells — confirm
# how the model output reaches this file.
sub[['idaviso','idpostulante']].to_csv('../data/07_model_output/all.csv', index=False)