# 特徴量エンジニアリング時の手法による処理時間の比較

## 比較方法

session_idを10人分サンプリングして(本ノートブックでは出現順に先頭の10件を使用)、手法別に処理時間を比較する。

また、提出環境ではデータがpandasのDataFrameとして提供されるため、入力形式はそれに合わせてpandasとする。


## 比較手法

- polars
- pandas
- joblib


In [1]:
import sys

sys.path.append("../src/")

import os
import pathlib
import itertools
from collections import defaultdict
from typing import Any, Dict, Optional, Union, List, Tuple

import gensim
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
import xgboost
import lightgbm
import matplotlib.pyplot as plt
from annoy import AnnoyIndex
from xgboost import XGBClassifier
from rich.progress import track
from gensim.models import KeyedVectors
from gensim.test.utils import common_texts
from gensim.similarities.annoy import AnnoyIndexer
from gensim.models import Word2Vec

from utils import timer
from utils.io import load_pickle, save_pickle, save_txt
from common import create_features

%matplotlib inline


# Use the seaborn dark-grid look for all figures.
sns.set_style("darkgrid")
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# 3.8 removed the old names, so pick whichever name this install provides.
_darkgrid_style = (
    "seaborn-v0_8-darkgrid"
    if "seaborn-v0_8-darkgrid" in plt.style.available
    else "seaborn-darkgrid"
)
plt.style.use(_darkgrid_style)

# Widen the pandas display limits so wide feature tables are shown in full.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 500

  plt.style.use("seaborn-darkgrid")


In [2]:
# Read the preprocessed training parquet with polars, then convert it to a pandas DataFrame.
train = pl.read_parquet("../data/preprocessing/train.parquet").to_pandas()

In [3]:
# Number of sessions kept for the timing comparison.
num_sample = 10
# First `num_sample` distinct session ids (not a random draw).
session_ids = train["session_id"].unique()[:num_sample]

# Rows belonging to those sessions, ordered by session and then by elapsed time.
smpl_train = (
    train.loc[train["session_id"].isin(session_ids)]
    .copy()
    .sort_values(by=["session_id", "elapsed_time"])
)

In [4]:
# Preview the first rows of the sampled frame (shown as the cell output).
smpl_train.head()

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991405,-159.314686,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4


In [5]:
class TrainTimeSeriesIterator:
    """Iterate over a training frame one (session_id, level_group) chunk at a time.

    The input is sorted by ``session_id`` and ``elapsed_time`` once up front.
    Chunks are produced level group by level group ("0-4", then "5-12", then
    "13-22"); within a level group the sessions follow their order in the
    sorted frame. A session with no rows for a level group produces an empty
    DataFrame. The iterator is single-pass.

    Attributes:
        train: Sorted copy of the input frame.
        items: Every ``(session_id, level_group)`` pair in iteration order.
            The list is left intact; progress is tracked by a private cursor.
    """

    def __init__(self, train: pd.DataFrame) -> None:
        """Sort ``train`` and enumerate the chunks to visit.

        Args:
            train: Frame with ``session_id``, ``elapsed_time`` and
                ``level_group`` columns.
        """
        self.train = train.sort_values(["session_id", "elapsed_time"])

        session_ids = self.train["session_id"].unique()
        level_group = ["0-4", "5-12", "13-22"]
        self.items = [(s, lg) for lg in level_group for s in session_ids]
        # Index of the next pair to serve; avoids the O(n) cost of list.pop(0).
        self._cursor = 0

    def __iter__(self) -> "TrainTimeSeriesIterator":
        return self

    def __next__(self) -> pd.DataFrame:
        """Return the rows of the next (session_id, level_group) pair.

        Raises:
            StopIteration: When every pair has been served.
        """
        if self._cursor >= len(self.items):
            raise StopIteration

        session_id, level_group = self.items[self._cursor]
        self._cursor += 1

        # A boolean mask works for any id type and avoids re-parsing a query
        # string on every step (a formatted query breaks on non-numeric ids).
        mask = (self.train["session_id"] == session_id) & (
            self.train["level_group"] == level_group
        )
        return self.train[mask]

In [6]:
train_ts_iter = TrainTimeSeriesIterator(smpl_train)
# Build the input directory once instead of on every iteration.
input_dir = pathlib.Path("../data/preprocessing/")

for train_iter in train_ts_iter:
    if train_iter.empty:
        # The iterator yields an empty frame when a session has no rows for a
        # level group; there is nothing to featurise in that case.
        continue
    # Pass the chunk's own level group. "0-4" used to be hard-coded for every
    # chunk even though the iterator also yields "5-12" and "13-22".
    # NOTE(review): assumes create_features expects the group of the rows it
    # receives — confirm against common.create_features.
    feat = create_features(
        pl.from_pandas(train_iter),
        input_dir=input_dir,
        level_group=train_iter["level_group"].iloc[0],
    )

In [7]:
# Display the result of the last loop iteration only (`feat` is overwritten on every pass).
feat

session_id,word_that,word_this,word_it,word_you,word_find,word_found,word_Found,word_notebook,word_Wells,word_wells,word_help,word_need,word_Oh,word_Ooh,word_Jo,word_flag,word_can,word_and,word_is,word_the,word_to,word_mean_that,word_mean_this,word_mean_it,word_mean_you,word_mean_find,word_mean_found,word_mean_Found,word_mean_notebook,word_mean_Wells,word_mean_wells,word_mean_help,word_mean_need,word_mean_Oh,word_mean_Ooh,word_mean_Jo,…,screen_coor_y_mean,elapsed_time_diff_std,hover_duration_std,room_coor_x_std,room_coor_y_std,screen_coor_x_std,screen_coor_y_std,elapsed_time_diff_sum,hover_duration_sum,room_coor_x_sum,room_coor_y_sum,screen_coor_x_sum,screen_coor_y_sum,elapsed_time_diff_qtile_0.1,hover_duration_qtile_0.1,room_coor_x_qtile_0.1,room_coor_y_qtile_0.1,screen_coor_x_qtile_0.1,screen_coor_y_qtile_0.1,elapsed_time_diff_qtile_0.2,hover_duration_qtile_0.2,room_coor_x_qtile_0.2,room_coor_y_qtile_0.2,screen_coor_x_qtile_0.2,screen_coor_y_qtile_0.2,elapsed_time_diff_qtile_0.5,hover_duration_qtile_0.5,room_coor_x_qtile_0.5,room_coor_y_qtile_0.5,screen_coor_x_qtile_0.5,screen_coor_y_qtile_0.5,elapsed_time_diff_qtile_0.75,hover_duration_qtile_0.75,room_coor_x_qtile_0.75,room_coor_y_qtile_0.75,screen_coor_x_qtile_0.75,screen_coor_y_qtile_0.75
i64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
20090314441803444,5,7,27,33,9,5,0,0,4,0,12,9,5,0,8,20,8,7,26,45,38,1044.2,1073.857143,1209.703704,1344.545455,1410.777778,1563.4,,,925.25,,1248.333333,921.0,1412.0,,1214.25,…,380.140957,892.66841,1287.3612,608.120991,218.18361,214.627408,113.239045,516098,46174.0,-65311.142782,-50225.641058,171197.0,142933.0,382.0,51.0,-1063.502243,-465.038037,166.0,242.0,616.0,84.0,-779.628595,-271.24442,234.0,299.0,983.0,451.0,-40.211227,-112.375501,449.0,374.0,1583.0,1346.0,209.801652,20.911696,630.0,464.0


In [25]:

# for idx, gdf in smpl_train.groupby(by=["session_id", "level_group"], sort=False):
#     print(idx[1])
#     elapsed_time_aggs = gdf["elapsed_time"].agg(["sum", "mean", "max", "min"]).to_dict()
#     prefix = f"elapsed_time_{idx[1]}_"
#     elapsed_time_aggs = {str(prefix + key): value for key, value in elapsed_time_aggs.items()}
#     features[idx[0]] = {**features[idx[0]], **elapsed_time_aggs}


def create_features(data: pd.DataFrame) -> List[float]:
    """Summarise the ``elapsed_time`` column of one chunk.

    NOTE: this definition shadows the ``create_features`` imported from
    ``common`` earlier in the file.

    Args:
        data: Frame containing an ``elapsed_time`` column.

    Returns:
        The sum, mean, max and min of ``elapsed_time``, in that order.
    """
    stat_names = ["sum", "mean", "max", "min"]
    elapsed_stats = data["elapsed_time"].agg(stat_names)
    return elapsed_stats.tolist()



# One mapping per level group: session_id -> flat list of feature values.
features = {level_group: defaultdict(list) for level_group in ["0-4", "5-12", "13-22"]}

train_ts_iter = TrainTimeSeriesIterator(smpl_train)
for train_iter in train_ts_iter:
    if train_iter.empty:
        # The iterator yields an empty frame when a session has no rows for a
        # level group; the iloc[0] lookups below would raise IndexError on it.
        continue

    feat = create_features(train_iter)

    # Column-first access reads the scalar directly instead of materialising
    # a whole row first.
    level_group = train_iter["level_group"].iloc[0]
    session_id = train_iter["session_id"].iloc[0]
    features[level_group][session_id].extend(feat)

In [34]:
# Build the feature rows per level_group, one row per sampled session.

feat_level_group = defaultdict(list)
for level_group in ("0-4", "5-12", "13-22"):
    # Look the per-session mapping up once per level group.
    per_session = features[level_group]
    for sess in session_ids:
        feat_level_group[level_group].append(per_session[sess])

In [40]:
# Join the "0-4" and "5-12" feature rows side by side (column-wise).
np.concatenate([feat_level_group[lg] for lg in ("0-4", "5-12")], axis=1)

(10, 8)