### 02 â€“ EDA on Amazon Books Dataset

Goal:
- Load an Amazon Books subset.
- Confirm presence of user_id, item_id, rating, timestamp.
- Map to unified schema.


In [1]:
import pandas as pd
from pathlib import Path


In [2]:
# download dataset from https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/Books.jsonl.gz
path = "..\\data\\raw\\amazon_books\\Books.jsonl.gz"
sample_df = pd.read_json(
    path,
    lines=True,
    compression="gzip",
    nrows=100_000  # just to inspect columns
)

print(sample_df.shape)
print(sample_df.columns)
sample_df.head()


(100000, 10)
Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase'],
      dtype='object')


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,1,Not a watercolor book! Seems like copies imo.,It is definitely not a watercolor book. The p...,[{'small_image_url': 'https://m.media-amazon.c...,B09BGPFTDB,B09BGPFTDB,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2022-01-17 06:06:38.485,0,True
1,5,Updated: after 1st arrived damaged this one is...,Updated: after first book arrived very damaged...,[],0593235657,0593235657,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2021-12-27 18:26:44.904,1,True
2,5,Excellent! I love it!,I bought it for the bag on the front so it pai...,[],1782490671,1782490671,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2021-12-24 22:04:55.102,0,True
3,5,Updated after 1st arrived damaged. Excellent,Updated: after 1st arrived damaged the replace...,[],0593138228,0593138228,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2021-12-24 16:55:06.602,0,False
4,5,Beautiful patterns!,I love this book! The patterns are lovely. I ...,[{'small_image_url': 'https://m.media-amazon.c...,0823098079,0823098079,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,2021-11-19 08:57:33.230,0,True


In [3]:
#df_full = pd.read_json(path, lines=True, compression="gzip")

In [4]:
user_col = "user_id"
item_col = "parent_asin"
rating_col = "rating"
time_col = "timestamp"

df = sample_df[[user_col, item_col, rating_col, time_col]].copy()

# Keep only positive interactions (e.g., rating >= 3)
df = df[df[rating_col] >= 3].copy()

# Parse timestamp
df["timestamp"] = pd.to_datetime(df[time_col], errors="coerce")
df = df.dropna(subset=["timestamp"])
df = df.sort_values([user_col, "timestamp"])

df_uni = pd.DataFrame({
    "dataset": "amazon_books_2023",
    "user_id": df[user_col].astype(str),
    "session_id": df[user_col].astype(str),   # placeholder; sessions later
    "item_id": df[item_col].astype(str),
    "timestamp": df["timestamp"],
    "interaction_type": "implicit_pos"
})

df_uni.head()


Unnamed: 0,dataset,user_id,session_id,item_id,timestamp,interaction_type
94249,amazon_books_2023,AE223GHNZEI5MRMBVVRGJONDNWRQ,AE223GHNZEI5MRMBVVRGJONDNWRQ,1501191969,2019-12-28 06:31:46.943,implicit_pos
94248,amazon_books_2023,AE223GHNZEI5MRMBVVRGJONDNWRQ,AE223GHNZEI5MRMBVVRGJONDNWRQ,1631065653,2020-02-02 03:21:02.540,implicit_pos
94247,amazon_books_2023,AE223GHNZEI5MRMBVVRGJONDNWRQ,AE223GHNZEI5MRMBVVRGJONDNWRQ,0671023934,2020-02-02 03:32:37.439,implicit_pos
94246,amazon_books_2023,AE223GHNZEI5MRMBVVRGJONDNWRQ,AE223GHNZEI5MRMBVVRGJONDNWRQ,1250847958,2022-12-02 07:32:10.679,implicit_pos
82900,amazon_books_2023,AE224GVO7OHTYF26U6ER6BEVIUAQ,AE224GVO7OHTYF26U6ER6BEVIUAQ,B001EE4RLY,2009-07-18 03:23:25.000,implicit_pos


In [5]:
path = Path("../data/raw/amazon_books/Books.jsonl.gz")
out_path = Path("../data/processed/amazon_books_2023_interactions.parquet")

# If rerunning, delete old file
if out_path.exists():
    out_path.unlink()

chunks = pd.read_json(
    path,
    lines=True,
    compression="gzip",
    chunksize=500_000  # adjust to your RAM
)

dfs = []
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i}...")
    user_col = "user_id"       # adjust if needed
    item_col = "parent_asin"
    rating_col = "rating"
    time_col = "timestamp"

    df = chunk[[user_col, item_col, rating_col, time_col]].copy()
    df = df[df[rating_col] >= 3].copy()
    df["timestamp"] = pd.to_datetime(df[time_col], errors="coerce")
    df = df.dropna(subset=["timestamp"])
    df = df.sort_values([user_col, "timestamp"])

    df_uni = pd.DataFrame({
        "dataset": "amazon_books_2023",
        "user_id": df[user_col].astype(str),
        "session_id": df[user_col].astype(str),
        "item_id": df[item_col].astype(str),
        "timestamp": df["timestamp"],
        "interaction_type": "implicit_pos"
    })

    dfs.append(df_uni)

df_all = pd.concat(dfs, ignore_index=True)
df_all.to_parquet(out_path, index=False)
print("Saved unified interactions to:", out_path)


Processing chunk 0...
Processing chunk 1...
Processing chunk 2...
Processing chunk 3...
Processing chunk 4...
Processing chunk 5...
Processing chunk 6...
Processing chunk 7...
Processing chunk 8...
Processing chunk 9...
Processing chunk 10...
Processing chunk 11...
Processing chunk 12...
Processing chunk 13...
Processing chunk 14...
Processing chunk 15...
Processing chunk 16...
Processing chunk 17...
Processing chunk 18...
Processing chunk 19...
Processing chunk 20...
Processing chunk 21...
Processing chunk 22...
Processing chunk 23...
Processing chunk 24...
Processing chunk 25...
Processing chunk 26...
Processing chunk 27...
Processing chunk 28...
Processing chunk 29...
Processing chunk 30...
Processing chunk 31...
Processing chunk 32...
Processing chunk 33...
Processing chunk 34...
Processing chunk 35...
Processing chunk 36...
Processing chunk 37...
Processing chunk 38...
Processing chunk 39...
Processing chunk 40...
Processing chunk 41...
Processing chunk 42...
Processing chunk 43..

In [6]:
df_all.head()

Unnamed: 0,dataset,user_id,session_id,item_id,timestamp,interaction_type
0,amazon_books_2023,AE222ABE7SFNVT34H5XDXASHAP5A,AE222ABE7SFNVT34H5XDXASHAP5A,1419098551,2014-01-14 15:31:55.000,implicit_pos
1,amazon_books_2023,AE222ABE7SFNVT34H5XDXASHAP5A,AE222ABE7SFNVT34H5XDXASHAP5A,719840392,2022-09-09 02:49:55.106,implicit_pos
2,amazon_books_2023,AE223CEMC6KPVKTNZL53G2X3PLPA,AE223CEMC6KPVKTNZL53G2X3PLPA,1101967706,2018-11-02 18:40:40.733,implicit_pos
3,amazon_books_2023,AE223GHNZEI5MRMBVVRGJONDNWRQ,AE223GHNZEI5MRMBVVRGJONDNWRQ,1501191969,2019-12-28 06:31:46.943,implicit_pos
4,amazon_books_2023,AE223GHNZEI5MRMBVVRGJONDNWRQ,AE223GHNZEI5MRMBVVRGJONDNWRQ,1631065653,2020-02-02 03:21:02.540,implicit_pos


In [7]:
sample_df.shape

(100000, 10)

In [8]:
df_all.shape

(27078467, 6)

In [9]:
df_all.columns

Index(['dataset', 'user_id', 'session_id', 'item_id', 'timestamp',
       'interaction_type'],
      dtype='object')