In [1]:
import os
import sys
# Put the notebook's parent directory on the import path (once) so that
# modules living one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
from tqdm import tqdm, notebook
import math
import datetime
import whoosh as wh
from whoosh import fields
from whoosh import index
from whoosh import query

In [2]:
# Applications: the ranked training interactions and the test set to predict for.
df_applicants_with_rank = pd.read_csv("../data/02_intermediate/postulaciones/postulaciones_train_rank.csv")
df_applicants_test = pd.read_csv("../data/02_intermediate/postulaciones/postulaciones_test.csv")

# Explicit dtypes for the notice-detail columns read below.
dtypes = {
    "idaviso": "int64",
    "tipo_de_trabajo": "string",
    "nivel_laboral": "string",
    "nombre_area": "string",
}

# Format used to parse the two "online" date columns of the notices file.
DATE_FORMAT = "%Y-%m-%d"

# Single-value parser, kept only so that any other cell referencing it keeps
# working; it is no longer handed to ``read_csv``.
mydateparser = lambda x: datetime.datetime.strptime(x, DATE_FORMAT)

# ``read_csv(date_parser=...)`` is deprecated since pandas 2.0 and invokes the
# Python-level parser value by value. Loading the columns as-is and converting
# them with ``pd.to_datetime`` and an explicit format works on both old and new
# pandas versions and is vectorised.
df_notice = pd.read_csv(
    "../data/02_intermediate/avisos/avisos_detalle.csv",
    dtype=dtypes,
).assign(
    online_desde=lambda notices: pd.to_datetime(notices["online_desde"], format=DATE_FORMAT),
    online_hasta=lambda notices: pd.to_datetime(notices["online_hasta"], format=DATE_FORMAT),
)

# Applicant attribute tables (file names: gender/age and education).
df_applicants_genre = pd.read_csv("../data/02_intermediate/postulantes/postulantes_genero_edad.csv")
df_applicants_education = pd.read_csv("../data/02_intermediate/postulantes/postulantes_educacion.csv")

In [3]:
# Cutoff date: a notice counts as available when its ``online_hasta`` value
# is on or after this date.
live_until = datetime.datetime(year=2018, month=4, day=1)
available_notices = set(
    df_notice.loc[df_notice["online_hasta"].ge(live_until), "idaviso"]
)

In [4]:
# Distinct applicant ids in each split.
ids_to_predict = set(df_applicants_test["idpostulante"])
ids_train = set(df_applicants_with_rank["idpostulante"])

# Applicants present in both splits, and those exclusive to one of them.
intersect = ids_to_predict & ids_train
only_in_prediction = ids_to_predict.difference(ids_train)
only_in_train = ids_train.difference(ids_to_predict)

In [5]:
# Report how many applicant ids appear in the test set but not in train.
only_test_report = f"Ids que estan solo en test: {len(only_in_prediction)}. Usar algo random."
print(only_test_report)

Ids que estan solo en test: 41204. Usar algo random.


In [6]:
# Report how many applicant ids appear in both the train and the test set.
overlap_report = f"Ids que estan en train y test: {len(intersect)}."
print(overlap_report)

Ids que estan en train y test: 115028.


In [7]:
# Training rows belonging to applicants that also appear in the test set.
df_intersect = df_applicants_with_rank.loc[
    df_applicants_with_rank["idpostulante"].isin(intersect)
]

In [8]:
# For every applicant present in both splits, count their non-null ``idaviso``
# entries in the training data and order applicants from most to least.
ranking_by_applicant = (
    df_applicants_with_rank
    .loc[lambda rows: rows["idpostulante"].isin(intersect)]
    .groupby("idpostulante")["idaviso"]
    .count()
    .rename("cantidad")
    .reset_index()
    .sort_values(by="cantidad", ascending=False)
)

In [9]:
# "Hard users": applicants whose ``cantidad`` is strictly greater than 100.
ids_hard_users = ranking_by_applicant.loc[
    ranking_by_applicant["cantidad"].gt(100), "idpostulante"
].values
print(f"Ids de los hardusers: {len(ids_hard_users)}.")

Ids de los hardusers: 4265.


In [10]:
# Restrict both the training and the test applications to the hard users.
df_hard_users = df_applicants_with_rank.loc[
    df_applicants_with_rank["idpostulante"].isin(ids_hard_users)
]
df_test_hard_users = df_applicants_test.loc[
    df_applicants_test["idpostulante"].isin(ids_hard_users)
]

In [11]:
# Whoosh schema with two stored fields: an ``id`` identifier and a ``genres``
# keyword field whose comma-separated values are lowercased.
schema = fields.Schema(
    id=fields.ID(stored=True),
    genres=fields.KEYWORD(commas=True, lowercase=True, stored=True),
)

In [None]:
# NOTE(review): ``df_movies_genres`` is not defined in any cell visible here, so
# this cell raises NameError on a fresh kernel. Confirm which DataFrame (with
# ``movieID`` and ``genre`` columns) is meant to be indexed before running it.

# Make sure the index directory exists before handing it to ``create_in``.
index_dir = "../index_dir"
os.makedirs(index_dir, exist_ok=True)
ix = wh.index.create_in(index_dir, schema)

# Used as a context manager the writer commits when the block finishes and is
# cancelled if an exception escapes, so a failed run does not leave the index
# write lock held.
with ix.writer() as writer:
    # One pass over the frame: each group holds all ``genre`` values of one
    # ``movieID``. ``sort=False`` keeps ids in order of first appearance, the
    # same order ``unique()`` produced, without re-filtering the whole frame
    # for every id.
    for mid, movie_genres in df_movies_genres.groupby("movieID", sort=False)["genre"]:
        writer.add_document(id=str(mid), genres=u",".join(movie_genres.tolist()))