### 01 – EDA on YOOCHOOSE

Goal:
- Load the yoochoose-clicks file.
- Inspect schema (columns, types).
- Check basic stats: #sessions, #items, #rows.
- Map to the unified schema: (dataset, user_id, session_id, item_id, timestamp, interaction_type).


In [1]:
from pathlib import Path

import pandas as pd

# Download the dataset from https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z
# and extract it so that the clicks file sits at the location below (relative to the
# working directory). Forward slashes are used because they resolve on Windows as well
# as on Linux/macOS, and they match the style of the output path used when saving.
path = "../data/raw/yoochoose/yoochoose-data/yoochoose-clicks.dat"

# The file has no header row, so the column names are supplied explicitly.
# `category` is read as text: without an explicit dtype pandas infers the type chunk by
# chunk and ends up with a mixed-type object column (the source of the DtypeWarning this
# cell otherwise emits). NOTE(review): the column appears to mix numeric codes with
# non-numeric markers -- confirm against the dataset README.
df = pd.read_csv(
    path,
    sep=",",
    header=None,
    names=["session_id", "timestamp", "item_id", "category"],
    dtype={"category": str},
)

df.head()


  df = pd.read_csv(


Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,214536502,0
1,1,2014-04-07T10:54:09.868Z,214536500,0
2,1,2014-04-07T10:54:46.998Z,214536506,0
3,1,2014-04-07T10:57:00.306Z,214577561,0
4,2,2014-04-07T13:56:37.614Z,214662742,0


In [2]:
# Basic size statistics: overall shape, then distinct counts for the two id columns,
# and finally the dtype pandas inferred for every column.
print("Shape:", df.shape)
for label, column in (("Unique sessions:", "session_id"), ("Unique items:", "item_id")):
    print(label, df[column].nunique())
print(df.dtypes)


Shape: (33003944, 4)
Unique sessions: 9249729
Unique items: 52739
session_id     int64
timestamp     object
item_id        int64
category      object
dtype: object


In [3]:
# Convert the raw timestamp strings to datetimes, then order the events
# chronologically within each session.
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df.sort_values(by=["session_id", "timestamp"])
df.head()


Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07 10:51:09.277000+00:00,214536502,0
1,1,2014-04-07 10:54:09.868000+00:00,214536500,0
2,1,2014-04-07 10:54:46.998000+00:00,214536506,0
3,1,2014-04-07 10:57:00.306000+00:00,214577561,0
4,2,2014-04-07 13:56:37.614000+00:00,214662742,0


In [4]:
# Project the clicks onto the unified interaction schema.
# The source data carries no user identifier, so the session id doubles as the user id.
# The two scalar values ("yoochoose", "click") are broadcast to every row.
df_uni = pd.DataFrame(
    dict(
        dataset="yoochoose",
        user_id=df["session_id"],
        session_id=df["session_id"],
        item_id=df["item_id"],
        timestamp=df["timestamp"],
        interaction_type="click",
    )
)

df_uni.head()

Unnamed: 0,dataset,user_id,session_id,item_id,timestamp,interaction_type
0,yoochoose,1,1,214536502,2014-04-07 10:51:09.277000+00:00,click
1,yoochoose,1,1,214536500,2014-04-07 10:54:09.868000+00:00,click
2,yoochoose,1,1,214536506,2014-04-07 10:54:46.998000+00:00,click
3,yoochoose,1,1,214577561,2014-04-07 10:57:00.306000+00:00,click
4,yoochoose,2,2,214662742,2014-04-07 13:56:37.614000+00:00,click


In [5]:
# Count the interactions in each session and keep only the sessions that have
# at least two of them.
session_lengths = df_uni.groupby("session_id")["item_id"].size()
valid_sessions = session_lengths.loc[session_lengths.ge(2)].index

# Row-level mask: True where the row belongs to one of the retained sessions.
in_valid_session = df_uni["session_id"].isin(valid_sessions)
df_uni = df_uni.loc[in_valid_session].copy()
print("After filtering, rows:", len(df_uni))

After filtering, rows: 31744233


In [7]:
# Persist the filtered, unified interactions as Parquet (the DataFrame index is not written).
out_path = "../data/processed/yoochoose_interactions.parquet"
# Create the output directory first: writing fails if the parent directory is missing,
# which is the usual state on a fresh checkout.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
df_uni.to_parquet(out_path, index=False)
print("Saved to", out_path)


Saved to ../data/processed/yoochoose_interactions.parquet
