In [76]:
import pandas as pd
import polars as pl

# Let polars print up to 100 characters of each string value when a frame is displayed.
pl.Config.set_fmt_str_lengths(100)

polars.config.Config

In [77]:
def read_json(path):
    """Read a gzip-compressed, records-oriented JSON file into a polars DataFrame.

    The file is parsed by pandas and the resulting frame is then converted
    with ``pl.from_pandas``.

    Parameters
    ----------
    path
        Location of the file; handed unchanged to ``pd.read_json``.

    Returns
    -------
    pl.DataFrame
        The parsed records as a polars frame.
    """
    pandas_frame = pd.read_json(path, orient="records", compression="gzip")
    return pl.from_pandas(pandas_frame)

In [78]:
# Directories: per-category input files are read from the "selected" tree,
# combined outputs are written to the "final" tree.
path_data_selected = "data/selected"
path_data_final = "data/final"

# File names used for each table (inputs and outputs share the same names).
file_name_products = "products.json.gz"
file_name_reviews = "reviews.json.gz"
file_name_users = "users.json.gz"
file_name_products_categories = "products_categories.json.gz"
file_name_categories = "categories.json.gz"

# Sub-directory names under `path_data_selected`, one per product category.
categories = [
    "cameras",
    "desktops",
    "headphones",
    "laptops",
    "monitors",
    "smartwatches",
    "tablets",
]

Load Products

In [79]:
# Empty products frame that fixes the column order and dtypes the
# per-category files are expected to match.
df_products = pl.DataFrame(
    schema={
        "product_id": pl.Utf8,
        "category": pl.Utf8,
        "name": pl.Utf8,
        "description": pl.Utf8,
        "price": pl.Float64,
        "image_url": pl.List(pl.Utf8),
    }
)

In [80]:
# Read each category's product file, then stack the frames onto a fresh,
# empty frame carrying the declared schema (so a file with a different
# schema still fails loudly). Building a new frame instead of extending
# `df_products` in place keeps this cell idempotent: running it a second
# time no longer appends every row again.
product_frames = [
    read_json(f'{path_data_selected}/{category}/{file_name_products}')
    for category in categories
]

df_products = pl.concat(
    [pl.DataFrame(schema=df_products.schema), *product_frames]
).sort(by=['product_id'])

In [81]:
# Per-column summary (count, null count, mean, std, min, max, median) of the combined products.
df_products.describe()

describe,product_id,category,name,description,price,image_url
str,str,str,str,str,f64,str
"""count""","""1798""","""1798""","""1798""","""1798""",1798.0,"""1798"""
"""null_count""","""0""","""0""","""0""","""0""",0.0,"""0"""
"""mean""",,,,,122.188877,
"""std""",,,,,182.196417,
"""min""","""B00001W0DG""","""Electronics,Camera & Photo,Digital Cameras""",""" Apple 13&quot; MacBook Notebook Computer MB404LL/A (Black)""","""""Experience brilliant PC performance that lets you communicate with friends and family, enjoy enter...",0.0,
"""max""","""B01HIHHRHQ""","""Electronics,Wearable Technology,Smartwatches,Seamleasly integrated with Alexa, Amazon's cloud-based...","""xtraem H2000 Pro Series Studio Style Headphone""","""white colors""",999.99,
"""median""",,,,,45.25,


In [82]:
# Preview the first 10 product rows.
df_products.head(10)

product_id,category,name,description,price,image_url
str,str,str,str,f64,list[str]
"""B00001W0DG""","""Electronics,Headphones,On-Ear Headphones""","""Sony MDR-V500DJ Monitor Series Headphones with Swivel Earcups (Discontinued by Manufacturer)""","""Revel in high-quality audio with the MDR-V500DJ Studio Monitor Series DJ headphones. Featuring reve...",6.61,"[""https://images-na.ssl-images-amazon.com/images/I/21UDx%2BHFMJL.jpg""]"
"""B00004TLW2""","""Electronics,Camera & Photo,Digital Cameras,Point & Shoot Digital Cameras""","""Fujifilm MX2900 2.3MP Digital Camera w/ 3x Optical Zoom Bundle""","""The FujiFilm MX-2900 digital camera includes some of the best features in digital imaging. With 2.3...",112.27,"[""https://images-na.ssl-images-amazon.com/images/I/41VAMXNBNPL.jpg"", ""https://images-na.ssl-images-amazon.com/images/I/41QMVYA6RFL.jpg"", ... ""https://images-na.ssl-images-amazon.com/images/I/318C9AXX78L.jpg""]"
"""B00004VUM1""","""Electronics,Camera &amp; Photo,Digital Cameras""","""Sony MVC-FD95 Mavica 2MP Digital Camera with 10x Optical Zoom""","""w/ Canon SELPHY CP760 Compact Photo Printer 32 MB MultiMediaCard SD Memory Card SDHC Memory Card Mu...",99.0,"[""https://images-na.ssl-images-amazon.com/images/I/51ZQT04EX5L.jpg""]"
"""B00004WFYN""","""Electronics,Headphones,Over-Ear Headphones""","""Plantronics H141 Duoset Convertible Headset (Discontinued by Manufacturer)""","""- Convertible headset<br />- Quick disconnect (PL-M22 or PL-P10 adapter required)<br />- Convertibl...",59.0,"[""https://images-na.ssl-images-amazon.com/images/I/31KN5QDRGGL.jpg"", ""https://images-na.ssl-images-amazon.com/images/I/31B0TQQPP4L.jpg"", ... ""https://images-na.ssl-images-amazon.com/images/I/41PJM3SF33L.jpg""]"
"""B00004XSHN""","""Electronics,Camera &amp; Photo,Digital Cameras,Point &amp; Shoot Digital Cameras""","""Fujifilm FinePix 4900 4.3MP Digital Camera w/ 6x Optical Zoom""","""Fuji's FinePix 4900 is one of a new style of consumer digital cameras appearing on the market. Borr...",60.0,"[""https://images-na.ssl-images-amazon.com/images/I/51ME06E3MRL.jpg"", ""https://images-na.ssl-images-amazon.com/images/I/51ESWN8RSML.jpg""]"
"""B000051TOG""","""Electronics,Camera & Photo,Digital Cameras,DSLR Cameras""","""Olympus E-10 4MP Digital Camera w/4x Optical Zoom (Discontinued by Manufacturer)""","""The Camedia E-10 is a true SLR digital camera that features a 4-megapixel imager and new lens techn...",453.0,"[""https://images-na.ssl-images-amazon.com/images/I/51KQJ5RFY3L.jpg"", ""https://images-na.ssl-images-amazon.com/images/I/51T11BHE9PL.jpg"", ""https://images-na.ssl-images-amazon.com/images/I/517W7WVY7ZL.jpg""]"
"""B0000520I5""","""Electronics,Camera &amp; Photo,Digital Cameras,Point &amp; Shoot Digital Cameras""","""Kodak PalmPix Digital Camera for Palm m100 series Handhelds""","""Turn your Palm m100 into a digital camera with Kodak's latest PalmPix expansion. This new PalmPix i...",30.26,"[""https://images-na.ssl-images-amazon.com/images/I/41XVX2BTHKL.jpg"", ""https://images-na.ssl-images-amazon.com/images/I/41Q28JNVXGL.jpg"", ... ""https://images-na.ssl-images-amazon.com/images/I/41BY9WZEPKL.jpg""]"
"""B00005MEN6""","""Electronics,Camera & Photo,Digital Cameras,Point & Shoot Digital Cameras""","""Olympus Camedia C-4040 4MP Digital Camera w/ 3x Optical Zoom""","""The Olympus C-4040 Zoom succeeds the C-3040, and ushers the C-series into the 4-million pixels cate...",43.27,"[""https://images-na.ssl-images-amazon.com/images/I/515T6YSQPHL.jpg"", ""https://images-na.ssl-images-amazon.com/images/I/5127NEEE8ML.jpg"", ""https://images-na.ssl-images-amazon.com/images/I/4174NV580NL.jpg""]"
"""B00005NKXC""","""Electronics,Computers & Accessories,Monitors""","""Apple M7649Zm/A Studio Display 17&quot; LCD Monitor""","""Requires ADC Port on your Machine.""",175.0,"[""https://images-na.ssl-images-amazon.com/images/I/41G0BEAHEWL.jpg""]"
"""B00005UKBD""","""Electronics,Camera & Photo,Digital Cameras""","""Sony DSCF707 Cyber-shot 5MP Digital Still Camera w/ 5x Optical Zoom""","""Accurately realistic photos are captured using this digital camera that has a 5.0 effective CCD and...",76.0,"[""https://images-na.ssl-images-amazon.com/images/I/41V6SKW8A5L.jpg"", ""https://images-na.ssl-images-amazon.com/images/I/51W0TG780BL.jpg"", ... ""https://images-na.ssl-images-amazon.com/images/I/410RPXECBFL.jpg""]"


In [83]:
# Write the combined products as gzip-compressed, records-oriented JSON
# (via pandas, mirroring how the inputs are read).
products_output_path = f"{path_data_final}/{file_name_products}"
df_products.to_pandas().to_json(
    products_output_path,
    orient="records",
    compression="gzip",
    indent=2,
)

Load Reviews

In [84]:
# Empty reviews frame that fixes the column order and dtypes the
# per-category files are expected to match.
df_reviews = pl.DataFrame(
    schema={
        "user_id": pl.Utf8,
        "product_id": pl.Utf8,
        "ratings": pl.Int64,
        "review_text": pl.Utf8,
        "summary": pl.Utf8,
        "created_at": pl.Datetime("ns"),
    }
)

In [85]:
# Read each category's review file, then stack the frames onto a fresh,
# empty frame carrying the declared schema (so a file with a different
# schema still fails loudly). Building a new frame instead of extending
# `df_reviews` in place keeps this cell idempotent: running it a second
# time no longer appends every row again.
review_frames = [
    read_json(f'{path_data_selected}/{category}/{file_name_reviews}')
    for category in categories
]

df_reviews = pl.concat(
    [pl.DataFrame(schema=df_reviews.schema), *review_frames]
).sort(by=['user_id', 'product_id', 'created_at'])

In [86]:
# Per-column summary (count, null count, mean, std, min, max, median) of the combined reviews.
df_reviews.describe()

describe,user_id,product_id,ratings,review_text,summary,created_at
str,str,str,f64,str,str,str
"""count""","""59561""","""59561""",59561.0,"""59561""","""59561""","""59561"""
"""null_count""","""0""","""0""",0.0,"""0""","""0""","""0"""
"""mean""",,,4.251255,,,
"""std""",,,1.195757,,,
"""min""","""A0203183BAH3TR08FZGB""","""B00001W0DG""",1.0,""" These are fantastic for the price! I bought these as a gift for someone. When I tried them out...",""" 1 GB/s crypto. Great for pfSense!""","""2000-09-18 00:00:00.000000000"""
"""max""","""AZZZF5PSCPILV""","""B01HH8H2WQ""",5.0,"""~~~Update 12/15/2009~~~ Just came back from USPS after spending $10 on shipping it back insured. Ke...","""you're just used to that crap monitor you've been using since college""","""2018-09-28 00:00:00.000000000"""
"""median""",,,5.0,,,


In [87]:
# Preview the first 10 review rows.
df_reviews.head(10)

user_id,product_id,ratings,review_text,summary,created_at
str,str,i64,str,str,datetime[ns]
"""A0203183BAH3TR08FZGB""","""B0043T7FHK""",5,"""I got this to run as a dual monitor. This is my second time purchasing this monitor so now I have ...","""This is my second time purchasing this monitor so now I have two and they are great. I used to use ...",2015-06-30 00:00:00
"""A0261431Y0V4MHWY4B7W""","""B00AFH2E8E""",4,"""Not as good as I had hoped, music is very low, phone volume is pretty good.""","""Bluetooth headset""",2014-08-03 00:00:00
"""A034116598G557EYZ9BC""","""B0013FRNKG""",5,"""Appreciate if product Need to buy one more if any promotion is on to it Need to buy at more competi...","""great value""",2012-11-28 00:00:00
"""A0404374X0HL5T332XSN""","""B00MNOPS1C""",3,"""You get what you pay for""","""Three Stars""",2016-02-02 00:00:00
"""A0431622H67YR5IPJRN""","""B0058UUR6E""",5,"""Arrived in 2 days. working great. Recommend to others.""","""working great. Recommend to others""",2015-03-11 00:00:00
"""A0435554Z2P98AIGLNCS""","""B00XBQ93Q2""",5,"""VERY GOOD!!!!""","""Five Stars""",2016-07-18 00:00:00
"""A0436342QLT4257JODYJ""","""B00AM9WGGK""",5,"""we love it""","""Five Stars""",2014-12-01 00:00:00
"""A0508779FEO1DUNOSQNX""","""B004V4IWKG""",3,"""I was disappointed in the video quality, and sound quality even with an external microphone. The pi...","""Nikon Lenses are nice, but this camera could be better.""",2014-08-31 00:00:00
"""A0508779FEO1DUNOSQNX""","""B00728ZBA2""",5,"""Love this camera, hard to take a bad picture. The video quality is out standing for the price, bett...","""Great Little Camera""",2014-08-31 00:00:00
"""A0526222H977CBZM4DK7""","""B0058UUR6E""",5,"""EXCELLENT""","""Five Stars""",2017-04-26 00:00:00


In [88]:
# Write the combined reviews as gzip-compressed, records-oriented JSON
# (via pandas, mirroring how the inputs are read).
reviews_output_path = f"{path_data_final}/{file_name_reviews}"
df_reviews.to_pandas().to_json(
    reviews_output_path,
    orient="records",
    compression="gzip",
    indent=2,
)

Load Users

In [107]:
# Empty users frame that fixes the column order and dtypes the
# per-category files are expected to match.
df_users = pl.DataFrame(
    schema={
        "user_id": pl.Utf8,
        "username": pl.Utf8,
    }
)

In [108]:
# Read each category's user file, then stack the frames onto a fresh, empty
# frame carrying the declared schema (so a file with a different schema
# still fails loudly). Building a new frame instead of extending `df_users`
# in place keeps this cell idempotent across re-runs.
user_frames = [
    read_json(f'{path_data_selected}/{category}/{file_name_users}')
    for category in categories
]

# The same user can appear under several categories. Pin the dedup rule so
# the surviving row is always the first one met in `categories` order,
# rather than depending on the installed polars version's defaults for
# `keep` / `maintain_order`.
df_users = (
    pl.concat([pl.DataFrame(schema=df_users.schema), *user_frames])
    .unique(subset=['user_id'], keep='first', maintain_order=True)
    .sort(by=['user_id'])
)

In [109]:
# Per-column summary (count, null count, min, max, ...) of the de-duplicated users.
df_users.describe()

describe,user_id,username
str,str,str
"""count""","""55019""","""55019"""
"""null_count""","""0""","""0"""
"""mean""",,
"""std""",,
"""min""","""A0203183BAH3TR08FZGB""",""" """
"""max""","""AZZZF5PSCPILV""","""~Miss~"""
"""median""",,


In [110]:
# Preview the first 10 user rows.
df_users.head(10)

user_id,username
str,str
"""A0203183BAH3TR08FZGB""","""Cynthea M."""
"""A0261431Y0V4MHWY4B7W""","""BoJess"""
"""A034116598G557EYZ9BC""","""Shams"""
"""A0404374X0HL5T332XSN""","""Anthony"""
"""A0431622H67YR5IPJRN""","""Jerry Greeley"""
"""A0435554Z2P98AIGLNCS""","""Belisario Manuel Azuero Sar"""
"""A0436342QLT4257JODYJ""","""James J"""
"""A0508779FEO1DUNOSQNX""","""Hazel"""
"""A0526222H977CBZM4DK7""","""JAIME SCARPITTA"""
"""A0579276Y8S37XMMI539""","""ITgreybeard"""


In [111]:
# Write the de-duplicated users as gzip-compressed, records-oriented JSON
# (via pandas, mirroring how the inputs are read).
users_output_path = f"{path_data_final}/{file_name_users}"
df_users.to_pandas().to_json(
    users_output_path,
    orient="records",
    compression="gzip",
    indent=2,
)

Load Product Categories

In [114]:
# Empty product-to-category link frame that fixes the column order and
# dtypes the per-category files are expected to match.
df_products_categories = pl.DataFrame(
    schema={
        "product_id": pl.Utf8,
        "category_id": pl.Int64,
    }
)

In [115]:
# Read each category's product-to-category file, then stack the frames onto
# a fresh, empty frame carrying the declared schema (so a file with a
# different schema still fails loudly). Building a new frame instead of
# extending `df_products_categories` in place keeps this cell idempotent:
# running it a second time no longer appends every row again.
product_category_frames = [
    read_json(f'{path_data_selected}/{category}/{file_name_products_categories}')
    for category in categories
]

df_products_categories = pl.concat(
    [pl.DataFrame(schema=df_products_categories.schema), *product_category_frames]
).sort(by=['category_id', 'product_id'])

In [116]:
# Per-column summary (count, null count, mean, std, min, max, median) of the product-to-category links.
df_products_categories.describe()

describe,product_id,category_id
str,str,f64
"""count""","""1798""",1798.0
"""null_count""","""0""",0.0
"""mean""",,4.636263
"""std""",,2.070558
"""min""","""B00001W0DG""",2.0
"""max""","""B01HIHHRHQ""",8.0
"""median""",,4.0


In [117]:
# Preview the first 10 product-to-category rows.
df_products_categories.head(10)

product_id,category_id
str,i64
"""B00004TLW2""",2
"""B00004VUM1""",2
"""B00004XSHN""",2
"""B000051TOG""",2
"""B0000520I5""",2
"""B00005MEN6""",2
"""B00005UKBD""",2
"""B000063BGY""",2
"""B000069092""",2
"""B00006I53Z""",2


In [118]:
# Write the product-to-category links as gzip-compressed, records-oriented
# JSON (via pandas, mirroring how the inputs are read).
products_categories_output_path = f"{path_data_final}/{file_name_products_categories}"
df_products_categories.to_pandas().to_json(
    products_categories_output_path,
    orient="records",
    compression="gzip",
    indent=2,
)

Define Categories

In [119]:
# Category table, one row per category: [category_id, name, parent_id].
# 'Electronics' (id 1) has no parent; every other category has parent_id 1.
data_categories = [
    [1, 'Electronics', None],
    [2, 'Cameras', 1],
    [3, 'Headphones', 1],
    [4, 'Laptops', 1],
    [5, 'Monitors', 1],
    [6, 'Smartwatches', 1],
    [7, 'Tablets', 1],
    [8, 'Desktops', 1]
]

# Each inner list is a row, so state the orientation explicitly instead of
# letting polars guess it from the data, and declare the dtypes so that
# `parent_id` (whose first value is None) does not depend on type inference.
df_categories = pl.DataFrame(
    data_categories,
    schema={
        'category_id': pl.Int64,
        'name': pl.Utf8,
        'parent_id': pl.Int64,
    },
    orient='row',
)

In [120]:
# Per-column summary (count, null count, mean, std, min, max, median) of the category table.
df_categories.describe()

describe,category_id,name,parent_id
str,f64,str,f64
"""count""",8.0,"""8""",8.0
"""null_count""",0.0,"""0""",1.0
"""mean""",4.5,,1.0
"""std""",2.44949,,0.0
"""min""",1.0,"""Cameras""",1.0
"""max""",8.0,"""Tablets""",1.0
"""median""",4.5,,1.0


In [122]:
# Show up to the first 10 category rows (the table defined above has 8).
df_categories.head(10)

category_id,name,parent_id
i64,str,i64
1,"""Electronics""",
2,"""Cameras""",1.0
3,"""Headphones""",1.0
4,"""Laptops""",1.0
5,"""Monitors""",1.0
6,"""Smartwatches""",1.0
7,"""Tablets""",1.0
8,"""Desktops""",1.0


In [123]:
# Write the category table as gzip-compressed, records-oriented JSON
# (via pandas, same format as the other output tables).
categories_output_path = f"{path_data_final}/{file_name_categories}"
df_categories.to_pandas().to_json(
    categories_output_path,
    orient="records",
    compression="gzip",
    indent=2,
)