In [9]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
from scipy import spatial
from scipy import sparse
from scipy import linalg, mat
from tqdm import tqdm, notebook
import math
import datetime

import lightfm as lfm
from lightfm.evaluation import precision_at_k
from lightfm.data import Dataset
from scipy.sparse import csr_matrix

In [15]:
# Applications: the training file (with rank) and the test file.
df_applicants_with_rank = pd.read_csv("../data/02_intermediate/postulaciones/postulaciones_train_rank.csv")
df_applicants_test = pd.read_csv("../data/02_intermediate/postulaciones/postulaciones_test.csv")

# Explicit dtypes for the notice-detail columns listed here; the rest are inferred.
dtypes = {
    "idaviso": "int64",
    "tipo_de_trabajo": "string",
    "nivel_laboral": "string",
    "nombre_area": "string",
}
DATE_FORMAT = "%Y-%m-%d"


def mydateparser(x):
    """Parse one ``YYYY-MM-DD`` string into a ``datetime.datetime``.

    No longer used by the load below; kept so that any other cell calling
    ``mydateparser`` keeps working.
    """
    return datetime.datetime.strptime(x, DATE_FORMAT)


# `read_csv(date_parser=...)` is deprecated since pandas 2.0, so the two date
# columns are converted right after loading with an explicit format instead.
df_notice = pd.read_csv(
    "../data/02_intermediate/avisos/avisos_detalle.csv",
    dtype=dtypes,
).assign(
    online_desde=lambda d: pd.to_datetime(d["online_desde"], format=DATE_FORMAT),
    online_hasta=lambda d: pd.to_datetime(d["online_hasta"], format=DATE_FORMAT),
)
df_applicants_genre = pd.read_csv("../data/02_intermediate/postulantes/postulantes_genero_edad.csv")
df_applicants_education = pd.read_csv("../data/02_intermediate/postulantes/postulantes_educacion.csv")

In [16]:
# Cut-off date: a notice is kept when its `online_hasta` is on or after it.
live_until = datetime.datetime(year=2018, month=4, day=1)
still_online = df_notice["online_hasta"] >= live_until
available_notices = set(df_notice.loc[still_online, "idaviso"])

In [17]:
# Applicant ids in each file, and how the two sets overlap.
ids_to_predict = set(df_applicants_test["idpostulante"])
ids_train = set(df_applicants_with_rank["idpostulante"])
intersect = ids_to_predict & ids_train
only_in_prediction = ids_to_predict.difference(ids_train)
only_in_train = ids_train.difference(ids_to_predict)

In [18]:
# Report how many applicant ids appear in the test file but not in the training
# file. The message is Spanish for "Ids that are only in test: N. Use something random."
print(f"Ids que estan solo en test: {len(only_in_prediction)}. Usar algo random.")

Ids que estan solo en test: 41204. Usar algo random.


In [19]:
# Report how many applicant ids appear in both files.
# The message is Spanish for "Ids that are in train and test: N."
print(f"Ids que estan en train y test: {len(intersect)}.")

Ids que estan en train y test: 115028.


In [20]:
# Training rows whose applicant also appears in the test file.
in_both_files = df_applicants_with_rank["idpostulante"].isin(intersect)
df_intersect = df_applicants_with_rank[in_both_files]

In [21]:
# Number of applications (non-null `idaviso`) per applicant, largest first.
# Reuses `df_intersect`, which already holds the `isin(intersect)` filter,
# instead of recomputing the same mask here.
ranking_by_applicant = (
    df_intersect
    .groupby("idpostulante", as_index=False)
    .agg(cantidad=("idaviso", "count"))
    .sort_values(by="cantidad", ascending=False)
)

In [22]:
# "Hard users": applicants with more than 100 counted applications.
is_hard_user = ranking_by_applicant["cantidad"] > 100
ids_hard_users = ranking_by_applicant.loc[is_hard_user, "idpostulante"].values
print(f"Ids de los hardusers: {len(ids_hard_users)}.")

Ids de los hardusers: 4265.


In [23]:
# Rows of the training and test files that belong to the hard users.
train_is_hard = df_applicants_with_rank["idpostulante"].isin(ids_hard_users)
test_is_hard = df_applicants_test["idpostulante"].isin(ids_hard_users)
df_hard_users = df_applicants_with_rank[train_is_hard]
df_test_hard_users = df_applicants_test[test_is_hard]

In [24]:
# Applicant table: left-join education onto the gender/age frame, combine
# `nombre` and `estado` into a single "nombre-estado" column named `estudio`,
# then drop the source columns and `fechanacimiento`.
user_feature = (
    df_applicants_genre
    .merge(df_applicants_education, on='idpostulante', how='left')
    .assign(estudio=lambda merged: merged.nombre + '-' + merged.estado)
    .drop(columns=['nombre', 'estado', 'fechanacimiento'])
)

In [25]:
def generate_features(df_features):
    """Build a ``"column:value"`` label for every distinct value of every column.

    Args:
        df_features: DataFrame whose column labels are used as feature names.

    Returns:
        list[str]: one ``"<column>:<value>"`` string per (column, distinct
        value) pair. Columns come in frame order and values in the order
        returned by ``Series.unique()``. Values are converted with ``str``,
        so a missing value yields a label too (e.g. ``"col:nan"``).
    """
    # `unique()` is evaluated once per column and the labels are built directly,
    # rather than filling two parallel lists and zipping them afterwards.
    return [
        str(column) + ":" + str(value)
        for column in df_features.columns.values
        for value in df_features[column].unique()
    ]

In [26]:
def generate_user_feature(features, features_names):
    """Turn rows of raw values into rows of ``"name:value"`` labels.

    Args:
        features: iterable of indexable rows; position ``i`` of a row holds the
            value for ``features_names[i]``.
        features_names: feature names (strings), one per row position.

    Returns:
        list[list[str]]: for each input row, the labels
        ``features_names[i] + ":" + str(row[i])`` in name order.
    """
    return [
        [name + ":" + str(row[position]) for position, name in enumerate(features_names)]
        for row in features
    ]

In [27]:
# Feature rows restricted to the hard users.
has_hard_user_id = user_feature["idpostulante"].isin(ids_hard_users)
user_feature_hard_user = user_feature[has_hard_user_id]

In [28]:
# Every "column:value" label found in the `sexo` and `estudio` columns of the
# hard users' feature rows.
uf = generate_features(user_feature_hard_user[['sexo', 'estudio']])

In [29]:
# Every "column:value" label found in four descriptive columns of the notices.
# NOTE(review): in the cells visible here `itf` is only passed to `Dataset.fit`;
# no item-feature matrix is built or given to the model — confirm whether that
# is intended.
itf = generate_features(df_notice[['nombre_zona', 'tipo_de_trabajo', 'nivel_laboral', 'nombre_area']])

In [30]:
# Register every user id, item id and feature label with a LightFM Dataset.
dataset1 = Dataset()
dataset1.fit(
    users=df_hard_users.idpostulante.unique(),
    items=df_hard_users.idaviso.unique(),
    user_features=uf,
    item_features=itf,
)
# Interaction triples are taken by column position (1, 0, 3) from the raw rows.
# NOTE(review): presumably position 1 is idpostulante, 0 is idaviso and 3 is the
# rank used as weight — confirm against the CSV column order.
interaction_triples = [(row[1], row[0], row[3]) for row in df_hard_users.values]
(interactions, weights) = dataset1.build_interactions(interaction_triples)

In [31]:
# One list of "name:value" labels per row of `user_feature_hard_user`.
feature_columns = ['sexo', 'estudio']
feature_list = generate_user_feature(
    user_feature_hard_user[feature_columns].values,
    feature_columns,
)

In [32]:
# Pair each row's applicant id with that row's list of feature labels.
user_tuple = list(zip(user_feature_hard_user.idpostulante, feature_list))

In [33]:
# Build the user-feature matrix from the (id, labels) pairs, with normalization
# switched off.
user_features = dataset1.build_user_features(user_tuple, normalize= False)

In [34]:
# The four lookup tables kept by the Dataset; `user_id_map` and `item_id_map`
# are used below to translate between original ids and internal indices.
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset1.mapping()

In [35]:
# Reverse lookup: value of `item_id_map` -> its key.
inv_item_id_map = {
    internal_index: notice_id
    for notice_id, internal_index in item_id_map.items()
}

In [36]:
# LightFM model with the WARP loss and a fixed seed.
model = lfm.LightFM(loss='warp', random_state=42)
# Fit on the weighted interactions together with the user-feature matrix.
# NOTE(review): only a single epoch is run here — confirm this is intended
# rather than a leftover from a quick experiment.
model.fit(interactions,
      user_features= user_features,
      sample_weight= weights,
      epochs=1,
      num_threads=8)

<lightfm.lightfm.LightFM at 0x7f6e21a59ca0>

In [37]:
# Mean precision@10 over users.
# NOTE(review): `interactions` is the same matrix the model was fitted on, so
# despite its name this value measures precision on the training data, not on
# a held-out set.
test_precision  = precision_at_k(model, interactions, user_features=user_features, k=10, num_threads=8).mean()
test_precision

0.33662367

In [None]:
# For every hard user in the test file, keep the 10 best-scored notices that
# are in `available_notices`.
final_predictions = {}
# Loop-invariant: the item index range is the same for every user.
n_users, n_items = interactions.shape
all_item_indices = np.arange(n_items)
for a_user in notebook.tqdm(df_test_hard_users.idpostulante.unique()):
    user_x = user_id_map[a_user]
    # The model was fitted (and evaluated) with `user_features`; the same matrix
    # has to be passed here, otherwise the scores ignore those features.
    scores = model.predict(user_x, all_item_indices, user_features=user_features)
    prediction = np.argsort(scores)[::-1]  # item indices, best score first
    prediction_for_user = []
    for pred in prediction:
        notice = inv_item_id_map[pred]
        if notice in available_notices:
            prediction_for_user.append(notice)
            if len(prediction_for_user) == 10:
                break
    final_predictions[a_user] = prediction_for_user
            

In [None]:
# Map each applicant id to the set of `idaviso` values found in that
# applicant's rows of the training frame.
my_dict = {}
for applicant, group in notebook.tqdm(df_applicants_with_rank.groupby('idpostulante')):
    my_dict[applicant] = set(group.idaviso.values)