In [2]:
import sys

sys.path.append("../src/")

import os
import pathlib
import itertools
from collections import defaultdict
from typing import Any, Dict, Optional, Union, List

import gensim
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
import xgboost
import lightgbm
import matplotlib.pyplot as plt
from annoy import AnnoyIndex
from xgboost import XGBClassifier
from rich.progress import track
from gensim.models import KeyedVectors
from gensim.test.utils import common_texts
from gensim.similarities.annoy import AnnoyIndexer
from gensim.models import Word2Vec

from utils import timer
from utils.io import load_pickle, save_pickle, save_txt

%matplotlib inline


sns.set_style("darkgrid")
plt.style.use("seaborn-darkgrid")

pd.options.display.max_rows = 100
pd.options.display.max_columns = 500 

  plt.style.use("seaborn-darkgrid")


### やりたいこと
- groupごとに埋め込みたいidのリストを作成する。これをsentencesと呼ぶ
- sentencesをgensim.models.Word2Vecを使ってEmbeddingを作成する
- どうやってEmbedding評価する？
- 利用したい方法でEmbeddingを使えるか確かめる
  - 今回はsession_id, level_groupごとにfqidやtextなどの系列を最新を重視するように時系列に重みをつけて平均化する

In [3]:
train  = pl.read_parquet("../data/preprocessing/train.parquet")

In [4]:
train.head()

session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
i64,i64,i64,str,str,str,str,f64,f64,f64,f64,f64,str,str,str,str,i64,i64,i64,str
20090312431273200,0,0,"""cutscene_click…","""basic""","""0""",,-413.991405,-159.314686,380.0,494.0,,"""undefined""","""intro""","""tunic.historic…","""tunic.historic…",0,0,1,"""0-4"""
20090312431273200,1,1323,"""person_click""","""basic""","""0""",,-413.991405,-159.314686,380.0,494.0,,"""Whatcha doing …","""gramps""","""tunic.historic…","""tunic.historic…",0,0,1,"""0-4"""
20090312431273200,2,831,"""person_click""","""basic""","""0""",,-413.991405,-159.314686,380.0,494.0,,"""Just talking t…","""gramps""","""tunic.historic…","""tunic.historic…",0,0,1,"""0-4"""
20090312431273200,3,1147,"""person_click""","""basic""","""0""",,-413.991405,-159.314686,380.0,494.0,,"""I gotta run to…","""gramps""","""tunic.historic…","""tunic.historic…",0,0,1,"""0-4"""
20090312431273200,4,1863,"""person_click""","""basic""","""0""",,-412.991405,-159.314686,381.0,494.0,,"""Can I come, Gr…","""gramps""","""tunic.historic…","""tunic.historic…",0,0,1,"""0-4"""


In [5]:
target_cols = ["text", "fqid", "room_fqid", "text_fqid"]


for col in target_cols:
    num_unique = train[col].n_unique()
    print(f"n_unique of {col}: {num_unique}")
    print(train[col].is_null().mean())

n_unique of text: 598
0.6342868483663464
n_unique of fqid: 129
0.31465307796578357
n_unique of room_fqid: 19
0.0
n_unique of text_fqid: 127
0.6342828555072517


現状textのみ、特徴量としてうまく使えていない状況なのでtextに限定して試してみる

In [10]:
%%time

target_col = "text"
sentences = train.groupby("session_id").agg(pl.col(target_col))[target_col].to_list()

model = Word2Vec(sentences=sentences, vector_size=16, min_count=1, workers=4)

CPU times: user 53.8 s, sys: 3.09 s, total: 56.8 s
Wall time: 27.2 s


In [None]:
uniques = list(itertools.chain.from_iterable(sentences))

In [1]:
# save model
model.wv.save(f"./wv_{target_col}.wv")

# load model
wv = KeyedVectors.load(f"./wv_{target_col}.wv", mmap="r")

!rm ./wv_text.wv

NameError: name 'model' is not defined

In [21]:
path = "../data/preprocessing/wv_text.wv"
wv = KeyedVectors.load(path, mmap="r")

In [22]:
for u in uniques[:5]:
    print("Query item:", u, )
    for s in wv.most_similar(u, topn=5):
        print("\t", s)
    print()

NameError: name 'uniques' is not defined

In [24]:
wv.index_to_key[:10]

[None,
 'undefined',
 '\\u00f0\\u0178\\u02dc\\u0090',
 'Hey!',
 'I should go to the Capitol and tell Mrs. M!',
 'This is perfect for the exhibit.',
 'Look at all those activists!',
 "If I were you, I'd go to the library and do some digging.",
 'Ugh. Fine.',
 '\\u00f0\\u0178\\u02dc\\u00b4']

In [25]:
def pad_list(l: list, max_len: int, pad_value: Any) -> list:
    return l + [pad_value] * (max_len - len(l))


def empty_to_list(s: List) -> List:
    return s if s else []

In [26]:
embedding = train.filter(pl.col("level_group")=="0-4").groupby("session_id").agg(
    pl.col(target_col).tail(10).apply(
        lambda x: wv.get_mean_vector(empty_to_list(x.to_list()))
    ).alias("embedding")
)["embedding"].to_list()

pl.DataFrame(embedding, schema=[f"{target_col}_embedding_{i}" for i in range(wv.vector_size)])

text_embedding_0,text_embedding_1,text_embedding_2,text_embedding_3,text_embedding_4,text_embedding_5,text_embedding_6,text_embedding_7
f64,f64,f64,f64,f64,f64,f64,f64
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006


In [27]:
num_tail = 50
weights = [float(i / num_tail) for i in range(1, num_tail + 1)]

train.filter(pl.col("level_group")=="0-4").groupby("session_id").agg(
    pl.col(target_col).tail(num_tail).apply(
        lambda x: wv.get_mean_vector(empty_to_list(x.to_list()), weights)
    ).alias("embedding")
)["embedding"].to_list()

pl.DataFrame(embedding, schema=[f"{target_col}_embedding_{i}" for i in range(wv.vector_size)])

text_embedding_0,text_embedding_1,text_embedding_2,text_embedding_3,text_embedding_4,text_embedding_5,text_embedding_6,text_embedding_7
f64,f64,f64,f64,f64,f64,f64,f64
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006
0.179691,0.017425,-0.256541,0.55113,0.554639,0.154368,-0.478286,-0.194006


In [28]:
df = pl.DataFrame(embedding, schema=[f"{target_col}_embedding_{i}" for i in range(wv.vector_size)])

In [33]:
df["text_embedding_0"].to_pandas().value_counts()

 0.179691    22051
 0.181459      598
 0.127148      348
 0.183757      305
 0.074605       55
             ...  
-0.249423        1
 0.001743        1
 0.008740        1
 0.300243        1
-0.196012        1
Name: text_embedding_0, Length: 146, dtype: int64