# Daily Analysis Notebook

오늘 분석 내용을 이곳에 작성하세요.

In [1]:
# Notebook initialized

import sys
sys.path.append("..")
sys.path.append("../..")


import pandas as pd
import numpy as np

# Day4 raw eventd load
df_raw = pd.read_csv('../data/raw/raw_events.csv')
df_raw["event_time"] = pd.to_datetime(df_raw["event_time"])

df_raw.head()

Unnamed: 0,user_id,event_time,event_text
0,1,2025-01-01 09:00:00,search hybrid suv deals
1,1,2025-01-01 09:02:30,view kona hybrid 2024 specs
2,1,2025-01-01 09:05:10,click suv comparison page
3,1,2025-01-01 10:20:00,search ev charging station near home
4,1,2025-01-01 10:22:10,view ioniq 6 battery info


In [2]:
from feature_store.sessionizer import Sessionizer

sessionizer = Sessionizer(inactivity_minutes = 10)

df_sessions = sessionizer.assign_sessions(df_raw)

df_sessions.head()

Unnamed: 0,user_id,event_time,event_text,prev_time,time_gap,new_session,session_id
0,1,2025-01-01 09:00:00,search hybrid suv deals,NaT,,True,1
1,1,2025-01-01 09:02:30,view kona hybrid 2024 specs,2025-01-01 09:00:00,150.0,False,1
2,1,2025-01-01 09:05:10,click suv comparison page,2025-01-01 09:02:30,160.0,False,1
3,1,2025-01-01 10:20:00,search ev charging station near home,2025-01-01 09:05:10,4490.0,True,2
4,1,2025-01-01 10:22:10,view ioniq 6 battery info,2025-01-01 10:20:00,130.0,False,2


In [3]:
df_labels = pd.read_csv("../data/raw/session_labels.csv")
df_merged = df_sessions.merge(df_labels, on = "session_id", how = "left")

df_merged.head()

Unnamed: 0,user_id,event_time,event_text,prev_time,time_gap,new_session,session_id,intent
0,1,2025-01-01 09:00:00,search hybrid suv deals,NaT,,True,1,browse
1,1,2025-01-01 09:02:30,view kona hybrid 2024 specs,2025-01-01 09:00:00,150.0,False,1,browse
2,1,2025-01-01 09:05:10,click suv comparison page,2025-01-01 09:02:30,160.0,False,1,browse
3,1,2025-01-01 10:20:00,search ev charging station near home,2025-01-01 09:05:10,4490.0,True,2,browse
4,1,2025-01-01 10:22:10,view ioniq 6 battery info,2025-01-01 10:20:00,130.0,False,2,browse


In [4]:
from feature_store.feat_eng import FeatureEngineering

# Session-level text aggregation
fe = FeatureEngineering()
fe_out = fe.aggregate_session_texts(df_sessions)

df_sessions = sessionizer.assign_sessions(df_raw)   # Session_labels.csv merge output 재사용

fe_out.head()


Unnamed: 0,session_id,raw_text,cleaned_text
0,1,search hybrid suv deals view kona hybrid 2024 ...,search hybrid suv deals view kona hybrid 2024 ...
1,2,search ev charging station near home view ioni...,search ev charging station near home view ioni...
2,3,click charger map details search protein powde...,click charger map details search protein powde...
3,4,search tire replacement cost view tire brand c...,search tire replacement cost view tire brand c...


In [5]:
fe_out["session_id"].nunique()
fe_out.head()


Unnamed: 0,session_id,raw_text,cleaned_text
0,1,search hybrid suv deals view kona hybrid 2024 ...,search hybrid suv deals view kona hybrid 2024 ...
1,2,search ev charging station near home view ioni...,search ev charging station near home view ioni...
2,3,click charger map details search protein powde...,click charger map details search protein powde...
3,4,search tire replacement cost view tire brand c...,search tire replacement cost view tire brand c...


In [None]:
pip install scikit-learn

In [6]:
df_sessions.head(20)
df_sessions.shape
df_sessions["session_id"].nunique()
df_sessions["user_id"].nunique()


5

In [7]:
from feature_store.vectorizer import TextVectorizer

tv = TextVectorizer(max_features = 300)

X, vec = tv.fit_transform(fe_out)

print("Embedding shape: ", X.shape)
print("Example vector: ", X[0][:10])  # 첫 10개 값 출력

Embedding shape:  (4, 88)
Example vector:  [0.11151709 0.11151709 0.11151709 0.11151709 0.         0.
 0.         0.         0.         0.        ]


In [None]:
df_sessions["session_id"].nunique()
fe_out["session_id"].nunique()
X.shape

(39, 3)

In [12]:
df_raw.head(10)

Unnamed: 0,user_id,event_time,event_text
0,1,2025-01-01 09:00:00,search hybrid suv deals
1,1,2025-01-01 09:02:30,view kona hybrid 2024 specs
2,1,2025-01-01 09:05:10,click suv comparison page
3,1,2025-01-01 10:20:00,search ev charging station near home
4,1,2025-01-01 10:22:10,view ioniq 6 battery info
5,1,2025-01-01 10:45:00,click charger map details
6,1,2025-01-01 13:00:00,search tire replacement cost
7,1,2025-01-01 13:01:30,view tire brand comparison
8,1,2025-01-01 13:05:40,click tire shop details
9,2,2025-01-02 08:10:00,search running shoes discount


In [13]:
df_raw.shape

(39, 3)

In [14]:
df_raw["event_text"].min(), df_raw["event_text"].max()

('add earbuds to cart', 'view whey protein isolate')

In [15]:
df_raw.groupby("user_id")["event_time"].count()

user_id
1    9
2    9
3    9
4    6
5    6
Name: event_time, dtype: int64

In [16]:
df_sessions = sessionizer.assign_sessions(df_raw)

In [8]:
from feature_store.clusterer import IntentClusterer

clusterer = IntentClusterer(n_clusters=3)
labels = clusterer.fit_predict(X)

fe_out["intent_cluster"] = labels
fe_out.head()


Unnamed: 0,session_id,raw_text,cleaned_text,intent_cluster
0,1,search hybrid suv deals view kona hybrid 2024 ...,search hybrid suv deals view kona hybrid 2024 ...,2
1,2,search ev charging station near home view ioni...,search ev charging station near home view ioni...,0
2,3,click charger map details search protein powde...,click charger map details search protein powde...,2
3,4,search tire replacement cost view tire brand c...,search tire replacement cost view tire brand c...,1
