## Task2 
- train #0-19999
- valid #20000-22499
- test #22500-24999

uid 20000~22499（valid）の特徴量データフレームを作成し、列ごとにparquetで保存

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
# import datetime
import os
import gc
# from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb

import geobleu
from joblib import Parallel, delayed
from tqdm import tqdm

#　擬似スコア
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

from collections import Counter, defaultdict

import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append("../src")

import importlib
from util import *
import util

from calc_metrices import *
import calc_metrices

In [2]:
class Config:
    """Static settings for this notebook run (task2, 'valid' stage)."""

    # --- run identification ---
    exp = 'ens104'
    task = 'task2'
    data = 'CB'
    stage = "valid"

    # --- candidate hyperparameters: group window size and day range ---
    window_size = 3
    co_start_day, co_end_day = 0, 59

    # --- co-visitation matrix ---
    n_ago = 3

    # --- train/test split (must be changed per task) ---
    start_uid, end_uid = 20000, 22499
    train_start_day, train_end_day = 0, 59
    test_start_day, test_end_day = 60, 74

    # --- model train/valid split (must be changed per task) ---
    model_train_start_day, model_train_end_day = 0, 40
    model_valid_start_day, model_valid_end_day = 41, 59

    # --- misc ---
    seed = 123
    debug = False


In [3]:
# Location of the input pickles; change this to wherever the data lives on your machine.
INPUT_DIR = '/root/humob/input/pkl/'
# Output root for this task/dataset; the trailing slash is kept because later
# code concatenates onto it.
OUTPUT_DIR = f'/root/humob/pub/data/{Config.task}/{Config.data}/'
# makedirs creates intermediate directories, so one call covers both OUTPUT_DIR
# and its per-stage sub-directory; exist_ok=True makes re-running the cell safe
# and removes the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(OUTPUT_DIR + Config.stage, exist_ok=True)

In [4]:
# Load the raw logs, derive helper columns, and split into train/test periods.
print(f'Processing : Dataset_load...')
print('')

# Load dataset (dataset_load comes from util; its exact behaviour is not visible here).
df = dataset_load(Config.task, Config.debug, INPUT_DIR)

# Hard-coded day indices handled separately from ordinary weekdays.
# NOTE(review): presumably weekends/holidays of the train period and irregular
# days of the test period -- confirm against the data description.
train_weekend_list = [0, 1, 6, 7, 8, 13, 14, 20, 21, 27, 28, 29, 34, 35, 37, 41, 42, 48, 49, 50, 55, 56,]
test_not_normal_day = [65, 66, 67, 72, 73]

# 0 .. (number of distinct days - 1).
# NOTE(review): assumes day indices in df are contiguous and start at 0 -- confirm.
all_days = np.arange(df['d'].nunique())
weekend_list = train_weekend_list + test_not_normal_day
# Every day that is not in weekend_list.
weekday_list = [day for day in all_days if day not in weekend_list]

# preprocess
# extract_uid is a util helper; presumably keeps uids in [start_uid, end_uid] -- TODO confirm.
df = extract_uid(df, Config.start_uid, Config.end_uid)
# Day index modulo 7, used as a day-of-week key.
df["wd"] = df["d"] % 7
# Pack (x, y) into one integer code: zero-pad both to 3 digits, concatenate, cast to int
# (e.g. x=12, y=5 -> '012005' -> 12005).
df['xy'] = df['x'].astype(str).str.zfill(3) + df['y'].astype(str).str.zfill(3)
df['xy'] = df['xy'].astype(int)

# util helpers that add derived columns; their outputs are not visible from this notebook.
df = add_cumcount(df)
df = add_group_xx(df, window_size=Config.window_size)
# add_lag_mesh also returns a second value, stored here as drop_cols.
df, drop_cols = add_lag_mesh(df, Config.n_ago)

# Split by the configured day ranges (train: train_start_day..train_end_day,
# test: test_start_day..test_end_day) -- presumably inclusive; verify in util.
train, test = train_test_split_func(df, Config.train_start_day, Config.train_end_day, Config.test_start_day, Config.test_end_day)

Processing : Dataset_load...



In [5]:
# Key table with one row per (uid, d): take the first log of every user-day in
# `test` and keep only the identifying columns.
train_df = (
    test
    .groupby(["uid", "d"])
    .head(1)[["uid", "d", "wd"]]
    .reset_index(drop=True)
)

In [6]:
def feature_engineering(train, test, train_df, weekday_days=None, test_start_day=None):
    """Attach per-user and per-(user, day) log statistics to ``train_df``.

    Parameters
    ----------
    train : pandas.DataFrame
        Logs of the training period; columns ``uid``, ``d``, ``wd`` and ``xy`` are read.
    test : pandas.DataFrame
        Logs of the prediction period; columns ``uid``, ``d`` and ``t`` are read.
        Only record counts are used, never the locations.
    train_df : pandas.DataFrame
        One row per (uid, d) with at least ``uid``, ``d`` and ``wd``. It is not
        modified; a copy is extended and returned.
    weekday_days : iterable of int, optional
        Days flagged with ``wd_flag == 1``. Defaults to the module-level
        ``weekday_list``.
    test_start_day : int, optional
        Day subtracted from ``d`` to build ``over_d``. Defaults to
        ``Config.test_start_day``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``train_df`` with the feature columns appended. Keys without a
        value and NaN statistics are both encoded as -99999.
    """
    missing = -99999  # sentinel for "no value"

    # Fall back to the notebook-level settings only when not given explicitly.
    if weekday_days is None:
        weekday_days = weekday_list
    if test_start_day is None:
        test_start_day = Config.test_start_day

    # Work on a copy so the caller's frame is not mutated as a side effect.
    train_df = train_df.copy()

    ### Features from the train period
    # Number of logs per (uid, d), grouped per uid; computed once and reused.
    logs_per_day = train.groupby(["uid", "d"]).size().groupby("uid")
    uid2log_len_median = logs_per_day.median().to_dict()
    uid2log_len_mean = logs_per_day.mean().to_dict()
    uid2log_len_std = logs_per_day.std().to_dict()
    uid2log_len_skew = logs_per_day.skew().to_dict()

    # Same statistics, but per (uid, wd).
    logs_per_wd_day = train.groupby(["uid", "wd", "d"]).size().groupby(["uid", "wd"])
    uid_wd2wd_log_len_median = logs_per_wd_day.median().to_dict()
    uid_wd2wd_log_len_mean = logs_per_wd_day.mean().to_dict()
    uid_wd2wd_log_len_std = logs_per_wd_day.std().to_dict()
    uid_wd2wd_log_len_skew = logs_per_wd_day.skew().to_dict()

    # Number of distinct xy cells per uid.
    uid2xy_nunique = train.groupby(["uid"])["xy"].nunique().to_dict()

    # Share of a user's logs spent in their most frequent cell. Computed from
    # plain counts so it does not depend on how value_counts() names its result,
    # which differs between pandas versions.
    xy_counts = train.groupby(["uid", "xy"]).size()
    uid2top1_ratio = (xy_counts.groupby("uid").max() / xy_counts.groupby("uid").sum()).to_dict()

    ### Features from the test period
    # (Specific to this competition: the t sequence of the test period is
    # available, so using it is not a leak.)
    uid_d2log_len = test.groupby(["uid", "d"]).size().to_dict()

    def count_in_window(t_lo, t_hi):
        # {(uid, d): number of logs with t_lo <= t <= t_hi}
        in_window = test.loc[(test["t"] >= t_lo) & (test["t"] <= t_hi)]
        return in_window.groupby(["uid", "d"]).size().to_dict()

    def lookup(key_col, dic):
        # Plain dict lookup per row. np.vectorize is avoided on purpose: without
        # otypes it takes the output dtype from the first row only, so a missing
        # first key (int sentinel) would truncate every later float value, and
        # it raises on an empty frame.
        return [dic.get(key, missing) for key in zip(train_df["uid"], train_df[key_col])]

    ## Map the dictionaries onto the key table (column order is part of the output)
    train_df["wd_flag"] = train_df["d"].isin(weekday_days) * 1
    train_df["over_d"] = train_df["d"] - test_start_day

    train_df["log_len_median"] = train_df["uid"].map(uid2log_len_median)
    train_df["log_len_mean"] = train_df["uid"].map(uid2log_len_mean)
    train_df["log_len_std"] = train_df["uid"].map(uid2log_len_std)
    train_df["log_len_skew"] = train_df["uid"].map(uid2log_len_skew)
    train_df["xy_nunique"] = train_df["uid"].map(uid2xy_nunique)

    train_df["wd_log_len_median"] = lookup("wd", uid_wd2wd_log_len_median)
    train_df["wd_log_len_mean"] = lookup("wd", uid_wd2wd_log_len_mean)
    train_df["wd_log_len_std"] = lookup("wd", uid_wd2wd_log_len_std)
    train_df["wd_log_len_skew"] = lookup("wd", uid_wd2wd_log_len_skew)
    train_df["top1_ratio"] = train_df["uid"].map(uid2top1_ratio)
    train_df["log_len"] = lookup("d", uid_d2log_len)
    train_df["t0_11_cnt"] = lookup("d", count_in_window(0, 11))
    train_df["t12_23_cnt"] = lookup("d", count_in_window(12, 23))
    train_df["t24_35_cnt"] = lookup("d", count_in_window(24, 35))
    train_df["t36_47_cnt"] = lookup("d", count_in_window(36, 47))

    # NaN statistics (e.g. std/skew of too few days, users absent from train)
    # share the same sentinel as missing keys.
    train_df = train_df.fillna(missing)

    return train_df

In [7]:
# Build the feature table; the returned frame replaces the key-only train_df.
train_df = feature_engineering(train, test, train_df)

答えの算出

In [8]:
# Build the two baseline predictions separately for each group of days
# (weekday_list first, then weekend_list), using only train logs from that group.
for days in (weekday_list, weekend_list):
    part_df = train.loc[train.d.isin(days)]
    # Rows of test that fall on this group of days; computed once and reused.
    day_mask = test.d.isin(days)

    # t granularity
    t_func = [td002]
    uid_t_xy2wgt = calc_wgt_func(part_df, t_func, Config.co_start_day, Config.co_end_day, Config.test_end_day)
    test.loc[day_mask, "pred_t"] = map_dict_to_df_t(uid_t_xy2wgt, test.loc[day_mask])

    # group_t granularity
    group_t_func = [td105]
    uid_group_t_xy2wgt = calc_wgt_func(part_df, group_t_func, Config.co_start_day, Config.co_end_day, Config.test_end_day)
    test.loc[day_mask, "pred_group_t"] = map_dict_to_df_group_t(uid_group_t_xy2wgt, test.loc[day_mask])

# Rows where pred_t is null fall back to the group_t prediction.
pred_t_missing = test.pred_t.isnull()
test.loc[pred_t_missing, "pred_t"] = test.loc[pred_t_missing, "pred_group_t"]

# Remaining gaps: propagate the neighbouring prediction within the same
# (uid, d), forward first and then backward.
for pred_col in ("pred_t", "pred_group_t"):
    test[pred_col] = test.groupby(["uid", "d"])[pred_col].ffill()
    test[pred_col] = test.groupby(["uid", "d"])[pred_col].bfill()

# Anything still unpredicted is filled with the value get_uid2most_xy gives for that uid
# (presumably the user's most frequent xy in train -- helper lives in util).
uid2most_xy = get_uid2most_xy(train)
for pred_col in ("pred_t", "pred_group_t"):
    still_missing = test[pred_col].isnull()
    test.loc[still_missing, pred_col] = test.loc[still_missing, "uid"].map(uid2most_xy)

# Post-processing: predictions are xy codes, so store them as integers.
# astype(int) raises if any NaN is left (e.g. a uid absent from uid2most_xy).
test["pred_t"] = test.pred_t.astype(int)
test["pred_group_t"] = test.pred_group_t.astype(int)

In [9]:
# Unpack the integer location code (x * 1000 + y) into grid coordinates.
# Integer arithmetic replaces the string slicing: `.str[:-3]` yields '' and
# astype(int) raises whenever the code has three or fewer digits (x == 0),
# and the round-trip through str is needlessly slow on a large frame.
test['pred_t_x'] = test['pred_t'].astype(int) // 1000
test['pred_t_y'] = test['pred_t'].astype(int) % 1000
test['pred_group_t_x'] = test['pred_group_t'].astype(int) // 1000
test['pred_group_t_y'] = test['pred_group_t'].astype(int) % 1000

In [10]:
def _geobleu_for_columns(df, x_col, y_col):
    """Score the trajectory predicted in (x_col, y_col) against the observed (x, y).

    Both trajectories are passed to geobleu.calc_geobleu_single_list as lists of
    [d, t, x, y] rows, generated first and reference second; its return value is
    passed through unchanged.
    """
    reference = df[["d", "t", "x", "y"]].to_numpy().tolist()
    generated = df[["d", "t", x_col, y_col]].to_numpy().tolist()
    return geobleu.calc_geobleu_single_list(generated, reference)


def calc_geobleu_t(df):
    """GEO-BLEU of the t-granularity prediction (pred_t_x, pred_t_y) for one frame.

    Meant to be called once per user: there is no need to parallelise across
    days, parallelise across users instead.
    """
    return _geobleu_for_columns(df, "pred_t_x", "pred_t_y")


def calc_geobleu_group_t(df):
    """GEO-BLEU of the group_t-granularity prediction (pred_group_t_x, pred_group_t_y).

    Meant to be called once per user: there is no need to parallelise across
    days, parallelise across users instead.
    """
    return _geobleu_for_columns(df, "pred_group_t_x", "pred_group_t_y")

In [11]:
# Score both prediction columns per user in parallel and store the scores as
# target columns of train_df.
# NOTE(review): the assignment is positional -- it assumes the flattened
# per-user results come out in the same (uid, d) order as the rows of
# train_df; confirm against calc_geobleu_single_list and the sort order of test.
for score_col, pred_cols, score_func in (
    ("pred_t_geobleu", ["pred_t_x", "pred_t_y"], calc_geobleu_t),
    ("pred_group_t_geobleu", ["pred_group_t_x", "pred_group_t_y"], calc_geobleu_group_t),
):
    grouped = test[["uid", "d", "t", "x", "y"] + pred_cols].groupby("uid")
    results = Parallel(n_jobs=-1)(delayed(score_func)(df) for _, df in tqdm(grouped))
    # Flatten in a single pass; sum(results, []) re-copies the accumulated list
    # for every user, which is quadratic in the number of users.
    train_df[score_col] = [score for user_scores in results for score in user_scores]

  0%|          | 0/2500 [00:00<?, ?it/s]

100%|██████████| 2500/2500 [00:12<00:00, 199.06it/s]
100%|██████████| 2500/2500 [00:10<00:00, 235.89it/s]


In [12]:
print(f"{Config.task}, data {Config.data}")

# Write every column of train_df to its own single-column parquet file under the
# stage directory. Columns whose file is already there are skipped, not overwritten.
stage_dir = f"{OUTPUT_DIR}/{Config.stage}"
for col in train_df.columns:
    out_path = f"{stage_dir}/{col}.pqt"
    if check_exist_file(out_path):
        continue
    print(f"{col} does not exist, saving...")
    train_df[[col]].to_parquet(out_path)

task2, data CB
uid does not exist, saving...
d does not exist, saving...
wd does not exist, saving...
wd_flag does not exist, saving...
over_d does not exist, saving...
log_len_median does not exist, saving...
log_len_mean does not exist, saving...
log_len_std does not exist, saving...
log_len_skew does not exist, saving...
xy_nunique does not exist, saving...
wd_log_len_median does not exist, saving...
wd_log_len_mean does not exist, saving...
wd_log_len_std does not exist, saving...
wd_log_len_skew does not exist, saving...
top1_ratio does not exist, saving...
log_len does not exist, saving...
t0_11_cnt does not exist, saving...
t12_23_cnt does not exist, saving...
t24_35_cnt does not exist, saving...
t36_47_cnt does not exist, saving...
pred_t_geobleu does not exist, saving...
pred_group_t_geobleu does not exist, saving...


EOF