## Part 1: Data Exploration

In [1]:
import pandas as pd

# All three split files are read with the same options: gzip-compressed,
# comma-separated, with the column names taken from the first row.
csv_options = dict(compression='gzip', sep=',', header=0)

book_train_df = pd.read_csv('data/Books.train.csv.gz', **csv_options)
book_val_df = pd.read_csv('data/Books.valid.csv.gz', **csv_options)
book_test_df = pd.read_csv('data/Books.test.csv.gz', **csv_options)

# Show the first rows of the training split as the cell output.
book_train_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,history
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1446304000,5.0,1441260345000,
1,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1564770672,5.0,1441260365000,1446304000
2,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1442450703,5.0,1523093714024,1446304000 1564770672
3,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1780671067,1.0,1611623223325,1446304000 1564770672 1442450703
4,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1645671127,3.0,1612044209266,1446304000 1564770672 1442450703 1780671067


In [2]:
# Stack the train, validation and test interactions into a single frame.
# ignore_index=True discards the per-file row labels and renumbers the
# combined rows from 0.
all_splits = [book_train_df, book_val_df, book_test_df]
merged_df = pd.concat(all_splits, ignore_index=True)

In [5]:
# Compute the headline counts over the combined splits first, then report them.
n_users = merged_df['user_id'].nunique()
n_items = merged_df['parent_asin'].nunique()
n_interactions = len(merged_df)

summary = [
    ("Unique Users in all sets: ", n_users),
    ("Unique Items in all sets: ", n_items),
    ("Interactions across all sets: ", n_interactions),
]
for label, value in summary:
    print(label, value)

Unique Users in all sets:  776370
Unique Items in all sets:  495063
Interactions across all sets:  9488297


In [6]:
# Boolean Series: True where the rating is at least 4.
high_rating_mask = merged_df['rating'].ge(4)

# The mean of a boolean Series is the fraction of True values; scale to percent.
percent_high = 100 * high_rating_mask.mean()
print(f"Percentage of reviews that are 4 or 5 stars: {percent_high:.2f}%")

Percentage of reviews that are 4 or 5 stars: 84.72%


In [2]:
# Preview the first rows of the test split (loaded from Books.test.csv.gz).
book_test_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,history
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,593235657,5.0,1640629604904,1446304000 1564770672 1442450703 1780671067 16...
1,AGKASBHYZPGTEPO6LWZPVJWB2BVA,803736800,4.0,1454676557000,0811849783 0803729952 0735336296 1508558884 08...
2,AGXFEGMNVCSTSYYA5UWXDV7AFSXA,1542046599,5.0,1605649719611,1578052009 1477493395 1594747350 1594749310
3,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,679450815,5.0,1638987703546,B00INIQVJA 1496407903 1974633225 B07KD27RHM 16...
4,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,1250866448,5.0,1669414969335,0920668372 1589255208 2764322836 2764330898 00...


In [3]:
# Preview the first rows of the validation split (loaded from Books.valid.csv.gz).
book_val_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,history
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1782490671,5.0,1640383495102,1446304000 1564770672 1442450703 1780671067 16...
1,AGKASBHYZPGTEPO6LWZPVJWB2BVA,802737803,5.0,1454676232000,0811849783 0803729952 0735336296 1508558884
2,AGXFEGMNVCSTSYYA5UWXDV7AFSXA,1594749310,5.0,1541884305941,1578052009 1477493395 1594747350
3,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,1633573001,5.0,1612225279592,B00INIQVJA 1496407903 1974633225 B07KD27RHM
4,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,451450523,2.0,1635710722120,0920668372 1589255208 2764322836 2764330898 00...


In [6]:
import gzip
import json

file_path = "data/meta_Books.jsonl.gz"

N = 5  # number of lines to preview

# Open the gzip-compressed JSON Lines file in text mode and iterate it line by
# line, stopping after the first N lines instead of loading the whole file.
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= N:
            break
        # Only json.loads can raise JSONDecodeError, so keep the try narrow.
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            print("Error decoding line:", line)
            continue
        # ensure_ascii=False prints non-ASCII characters (e.g. an en dash in a
        # subtitle) as themselves rather than as \uXXXX escapes, which keeps
        # the preview human-readable.
        print(json.dumps(obj, indent=2, ensure_ascii=False))
        print("-" * 50)

{
  "main_category": "Books",
  "title": "Chaucer",
  "subtitle": "Hardcover \u2013 Import, January 1, 2004",
  "author": {
    "avatar": "https://m.media-amazon.com/images/I/21Je2zja9pL._SY600_.jpg",
    "name": "Peter Ackroyd",
    "about": [
      "Peter Ackroyd, (born 5 October 1949) is an English biographer, novelist and critic with a particular interest in the history and culture of London. For his novels about English history and culture and his biographies of, among others, William Blake, Charles Dickens, T. S. Eliot and Sir Thomas More, he won the Somerset Maugham Award and two Whitbread Awards. He is noted for the volume of work he has produced, the range of styles therein, his skill at assuming different voices and the depth of his research.",
      "He was elected a fellow of the Royal Society of Literature in 1984 and appointed a Commander of the Order of the British Empire in 2003.",
      "Bio from Wikipedia, the free encyclopedia."
    ]
  },
  "average_rating": 4.5,
  

In [7]:
# Number of rows (interactions) in the training split.
len(book_train_df)

7935557

In [8]:
# Number of rows (interactions) in the validation split.
len(book_val_df)

776370

In [None]:
# Number of rows (interactions) in the test split.
len(book_test_df)