In [16]:
# Exploratory Data Analysis
import pandas as pd
import matplotlib.pyplot as plt

In [17]:
# Directory holding the CSV inputs, relative to this notebook's location.
# Kept as a plain string with a trailing slash because file names are appended with `+`.
BASE = "../data/"

In [18]:
# Load tables.
# The key columns (ASIN / source / target) are identifiers such as 0804215715 or
# 156101074X, not numbers. Letting pandas infer their dtype turns all-digit values
# into integers, which strips leading zeros and leaves the keys in products, edges
# and reviews unable to match each other. Read them explicitly as strings instead.
products = pd.read_csv(BASE + 'products.csv', dtype={'ASIN': str})
edges    = pd.read_csv(BASE + 'edges.csv', dtype={'source': str, 'target': str})
reviews  = pd.read_csv(BASE + 'reviews.csv', dtype={'ASIN': str}, parse_dates=['date'])

In [19]:
# Preview the leading rows of each table, rendered one after another
display(products.head(), edges.head(), reviews.head())

Unnamed: 0,Id,ASIN,title,salesrank,group,average_rating,num_reviews
0,0,771044445,,300490,,,
1,1,827229534,Patterns of Preaching: A Sermon Sampler,396585,Book,5.0,2.0
2,2,738700797,Candlemas: Feast of Flames,168596,Book,4.5,12.0
3,3,486287785,World War II Allied Fighter Planes Trading Cards,1270652,Book,5.0,1.0
4,4,842328327,Life Application Bible Commentary: 1 and 2 Tim...,631289,Book,4.0,1.0


Unnamed: 0,source,target
0,827229534,0804215715
1,827229534,156101074X
2,827229534,0687023955
3,827229534,0687074231
4,827229534,082721619X


Unnamed: 0,ASIN,date,rating
0,827229534,2000-07-28,5
1,827229534,2003-12-14,5
2,738700797,2001-12-16,5
3,738700797,2002-01-07,4
4,738700797,2002-01-24,5


In [20]:
# Missing-value audit: null count per products column, then nulls in the review dates
print(products.isnull().sum())
print('Review date nulls:', reviews['date'].isnull().sum())

Id                   0
ASIN                 0
title             5868
salesrank            0
group             5868
average_rating    5868
num_reviews       5868
dtype: int64
Review date nulls: 0


In [21]:
# Keep only products that have a title and report how many rows were removed
before = len(products)
products = products[products['title'].notna()]
after = len(products)
print(f"Dropped {before - after} products without titles → {after} remaining")

Dropped 5868 products without titles → 542684 remaining


In [22]:
# Fill any missing group labels with the placeholder 'Unknown'.
# `products` was produced by filtering in the previous cell, so assigning a column
# on it in place can raise SettingWithCopyWarning; build a new frame with `assign`
# and rebind the name instead.
products = products.assign(group=products['group'].fillna('Unknown'))

In [23]:
# Drop reviews with missing or unparseable dates.
# read_csv(parse_dates=...) returns the column unaltered (as strings) when a value
# cannot be parsed, so a plain dropna would never remove an invalid date. Coerce
# explicitly: unparseable values become NaT and are then dropped with real nulls.
before_r = len(reviews)
reviews = reviews.assign(date=pd.to_datetime(reviews['date'], errors='coerce'))
reviews = reviews.dropna(subset=['date'])
after_r = len(reviews)
print(f"Dropped {before_r - after_r} reviews with bad dates → {after_r} remaining")

Dropped 0 reviews with bad dates → 7593109 remaining


In [24]:
# Re-run the null audit after cleaning: nulls = total rows minus non-null rows
print(len(products) - products.count())
print('Review date nulls:', len(reviews) - reviews['date'].count())

Id                0
ASIN              0
title             0
salesrank         0
group             0
average_rating    0
num_reviews       0
dtype: int64
Review date nulls: 0


In [25]:
# Null audit for the reviews table: heading line followed by one count per column
print("Reviews null counts:", reviews.isnull().sum(), sep="\n")

Reviews null counts:
ASIN      0
date      0
rating    0
dtype: int64


In [26]:
# Null audit for the edge list: blank line, heading, then one count per column
print("\nEdges null counts:", edges.isnull().sum(), sep="\n")


Edges null counts:
source    0
target    0
dtype: int64


In [27]:
# Keep only edges whose two endpoints both appear in the cleaned products table.
# NOTE(review): isin() matching is dtype-sensitive. An integer ASIN never equals a
# zero-padded string ASIN, so confirm the key columns share one string representation.
valid_asins = set(products['ASIN'])
before = len(edges)
# Filter in two passes: first on the source endpoint, then on the target endpoint
edges = edges[edges['source'].isin(valid_asins)]
edges = edges[edges['target'].isin(valid_asins)]
after = len(edges)
print(f"Dropped {before - after} edges to discontinued/nonexistent ASINs → {after} remaining")

Dropped 557325 edges to discontinued/nonexistent ASINs → 1231400 remaining


In [28]:
# Remove edges where either endpoint is null and report how many rows went away
before = len(edges)
edges = edges[edges[['source', 'target']].notna().all(axis=1)]
after = len(edges)
print(f"Dropped {before - after} malformed edges → {after} remaining")

Dropped 0 malformed edges → 1231400 remaining
