In [29]:
# Load IPython's autoreload extension so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 1: only modules registered through %aimport are reloaded.
%autoreload 1

# Register utils.common for automatic reloading.
%aimport utils.common

# With no arguments, %aimport lists the modules to reload and the modules to skip.
%aimport

Modules to reload:
utils.common

Modules to skip:



In [30]:
import pandas as pd

In [31]:
import csv

from pathlib import Path
from utils.common import (
    download_file,
    read_file_from_zip,
    bulk_insert_df
)
from orm.amazon_products import (
    session_scope,
    Product, ProductScalars
)
from sqlalchemy import select

# Download the data
- If you have trouble with this, just get it from the website directly

In [37]:
# Source archive on Kaggle and the local path it is saved to.
kaggle_url = 'https://www.kaggle.com/api/v1/datasets/download/karkavelrajaj/amazon-sales-dataset'
local_file = 'Downloads/amazon-sales-dataset.zip'
print(local_file)

# Only hit the network when the archive is not already on disk. This keeps
# "Restart & Run All" cheap and respects a copy that was fetched manually from
# the website when the API download does not work.
if Path(local_file).exists():
    print(f"Using existing file: {local_file}")
else:
    download_file(kaggle_url, local_file)

Downloads/amazon-sales-dataset.zip
File downloaded to: Downloads/amazon-sales-dataset.zip


# Parse out what we need
- The only columns we care about are `product_id`, `product_name`, `category`, `rating`, and `rating_count`

In [93]:
# Columns kept from amazon.csv; everything else in the file is ignored.
wanted_columns = ['product_id', 'product_name', 'category', 'rating', 'rating_count']

# Read amazon.csv straight out of the downloaded archive. Every kept column is
# loaded with the pandas "string" dtype, so nothing is parsed as a number here.
df = pd.read_csv(
    read_file_from_zip(zip_path=local_file, file_name='amazon.csv'),
    usecols=wanted_columns,
    dtype='string',
    sep=',',
    quoting=csv.QUOTE_MINIMAL,
)

In [95]:
# Plain-text preview of the first five rows of the freshly loaded frame.
print(df.head(n=5))

   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   
2  B096MSW6CT  Sounce Fast Phone Charging Cable & Data Sync U...   
3  B08HDJ86NZ  boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...   
4  B08CF3B7N1  Portronics Konnect L 1.2M Fast Charging 3A 8 P...   

                                            category rating rating_count  
0  Computers&Accessories|Accessories&Peripherals|...    4.2       24,269  
1  Computers&Accessories|Accessories&Peripherals|...    4.0       43,994  
2  Computers&Accessories|Accessories&Peripherals|...    3.9        7,928  
3  Computers&Accessories|Accessories&Peripherals|...    4.2       94,363  
4  Computers&Accessories|Accessories&Peripherals|...    4.2       16,905  


## To simplify the problem, let's assume we only care about the top-level category (delimited by `|`), so extract that before continuing

In [96]:
# Keep only the first "|"-separated segment of each category path, then cast
# the result back to the pandas "string" dtype.
top_level = df["category"].str.split("|").str.get(0)
df["category"] = top_level.astype("string")
print(df.head(n=5))

   product_id                                       product_name  \
0  B07JW9H4J1  Wayona Nylon Braided USB to Lightning Fast Cha...   
1  B098NS6PVG  Ambrane Unbreakable 60W / 3A Fast Charging 1.5...   
2  B096MSW6CT  Sounce Fast Phone Charging Cable & Data Sync U...   
3  B08HDJ86NZ  boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...   
4  B08CF3B7N1  Portronics Konnect L 1.2M Fast Charging 3A 8 P...   

                category rating rating_count  
0  Computers&Accessories    4.2       24,269  
1  Computers&Accessories    4.0       43,994  
2  Computers&Accessories    3.9        7,928  
3  Computers&Accessories    4.2       94,363  
4  Computers&Accessories    4.2       16,905  


## What EDA would you do to figure out what the DDL in your database should be regarding things like primary key uniqueness, reasonable varchar lengths, numeric precision, nullability etc...?
- If PK violations might occur, maybe just keep the first occurrence for simplicity?

In [97]:
# Does any product_id appear on more than one row?
has_duplicates = df.duplicated(subset="product_id").any()
print(f"Any duplicates by product_id: {has_duplicates}")

# Rows per product_id, most frequent first.
product_counts = (
    df.groupby("product_id")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)
print(product_counts)

# Keep the first row seen for each product_id.
df_unique = df.drop_duplicates(subset="product_id", keep="first")

# Re-run the check on df_unique to confirm the duplicates are gone.
has_duplicates = df_unique.duplicated(subset="product_id").any()
print(f"Any duplicates by product_id: {has_duplicates}")



Any duplicates by product_id: True
     product_id  count
881  B09CMP1SC8      3
771  B08Y1TFSP6      3
877  B09C6HXFC1      3
614  B08DDRGWTJ      3
522  B083342NKJ      3
..          ...    ...
453  B07WGPKMP5      1
452  B07WGPBXY9      1
451  B07WGMMQGP      1
449  B07WFPMGQQ      1
460  B07WJWRNVK      1

[1351 rows x 2 columns]
Any duplicates by product_id: False


In [98]:
# Columns whose dtypes and widths are being profiled.
profile_cols = ["product_id", "product_name", "rating", "rating_count", "category"]
profile = df[profile_cols]

# Get the data types
data_types = profile.dtypes

# Get the max length of each column. Missing values are dropped *before*
# stringifying: astype(str) would otherwise turn them into a literal
# placeholder such as "<NA>", which would be counted as a 4-character value
# and could inflate the maximum of a short column.
max_lengths = profile.apply(lambda col: col.dropna().astype(str).str.len().max())

# output
result = pd.DataFrame({
    "Data Type": data_types,
    "Max Length": max_lengths
})

print(result)

# Note: the derived top-level "category" column had to be cast to string
# explicitly; that cast was added in the category-extraction cell and the
# notebook was rerun.


                   Data Type  Max Length
product_id    string[python]          10
product_name  string[python]         485
rating        string[python]           3
rating_count  string[python]           8
category      string[python]          21


# What type coercions do you need to ensure compatibility with your ORM model / DB tables?   If there's any bad data, maybe set it to null?

In [99]:
# Per-column counts of missing values. isnull() and isna() are aliases in
# pandas, so the first two reports are expected to be identical.
missing_values = df.isnull().sum()
nan_values = df.isna().sum()
# Number of distinct values in each column.
unique_values = df.nunique()

for heading, counts in (
    ("Missing Values:\n", missing_values),
    ("NaN Values:\n", nan_values),
    ("Unique Values Per Column:\n", unique_values),
):
    print(heading, counts)

# Distinct top-level categories left after the split.
print("Unique Categories in top_level_category:", df["category"].unique(), sep="\n")

# Every column was loaded as string, so describe() reports
# count / unique / top style statistics rather than min / max / mean.
print("\nDescriptive Statistics for Numeric Columns:", df.describe(), sep="\n")

Missing Values:
 product_id      0
product_name    0
category        0
rating          0
rating_count    2
dtype: int64
NaN Values:
 product_id      0
product_name    0
category        0
rating          0
rating_count    2
dtype: int64
Unique Values Per Column:
 product_id      1351
product_name    1337
category           9
rating            28
rating_count    1143
dtype: int64
Unique Categories in top_level_category:
<StringArray>
['Computers&Accessories',           'Electronics',    'MusicalInstruments',
        'OfficeProducts',          'Home&Kitchen',       'HomeImprovement',
            'Toys&Games',         'Car&Motorbike',   'Health&PersonalCare']
Length: 9, dtype: string

Descriptive Statistics for Numeric Columns:
        product_id                                       product_name  \
count         1465                                               1465   
unique        1351                                               1337   
top     B077Z65HSD  Fire-Boltt Ninja Call Pro P

# Now, we want to separate our data into two tables compatible with your ORM models / foreign key relationship

In [100]:
# Keep exactly one row per product_id (the first occurrence) before splitting.
# A bare drop_duplicates() only removes rows that are identical in every
# selected column, so a product_id repeated with a different name, rating or
# rating_count would survive and collide on insert.
# NOTE(review): assumes product_id is the primary key of both tables — confirm
# against the ORM models.
df_deduped = df.drop_duplicates(subset="product_id", keep="first")

# Create Products DataFrame
df_products = df_deduped[["product_id", "product_name"]]

# Create ProductScalars DataFrame
df_scalars = df_deduped[["product_id", "category", "rating", "rating_count"]]

# Guard the key invariant so a bad split fails here rather than in the database.
assert df_products["product_id"].is_unique, "duplicate product_id in df_products"
assert df_scalars["product_id"].is_unique, "duplicate product_id in df_scalars"

# NOTE(review): rating and rating_count are still strings at this point
# (rating_count contains thousands separators such as "24,269"); coerce them
# if the target columns are numeric.

# Bulk Import data

Test my connection. Having trouble connecting with `bulk_insert_df`

In [None]:
import os

from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
from sqlalchemy.exc import SQLAlchemyError

# Connection settings are read from the standard libpq environment variables
# so that no password is stored in the notebook. Export PGPASSWORD (and any of
# PGUSER / PGHOST / PGPORT / PGDATABASE that differ from the defaults below)
# before starting the kernel.
db_url = URL.create(
    "postgresql+psycopg2",
    username=os.environ.get("PGUSER", "brett"),
    password=os.environ.get("PGPASSWORD"),
    host=os.environ.get("PGHOST", "localhost"),
    port=int(os.environ.get("PGPORT", "5432")),
    database=os.environ.get("PGDATABASE", "postgres"),
)
engine = create_engine(db_url)

try:
    with engine.connect() as connection:
        # Raw SQL must be wrapped in text(): passing a plain string to
        # Connection.execute is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
        rows = connection.execute(
            text("SELECT datname FROM pg_database WHERE datistemplate = false;")
        )
        databases = [row[0] for row in rows]
        print("Databases:", databases)
except SQLAlchemyError as e:
    # Diagnostic cell: report the failure instead of raising a traceback.
    print(f"Connection failed: {e}")


Databases: ['postgres', 'fetch_rewardsssss', 'amazon_products_db']


In [126]:
# Insert the product rows through the project helper bulk_insert_df, targeting
# the Product ORM model.
# NOTE(review): the error recorded for this cell reports password
# authentication failing for user "user", so bulk_insert_df presumably builds
# its own connection from a separate configuration — check the settings used
# by utils.common / orm.amazon_products.
bulk_insert_df(df_products, Product)
# The ProductScalars insert is currently disabled.
# NOTE(review): presumably it must run after the Product insert because of the
# foreign key relationship — confirm against the ORM models before re-enabling.
# bulk_insert_df(df_scalars, ProductScalars)

OperationalError: (psycopg2.OperationalError) connection to server at "localhost" (127.0.0.1), port 5432 failed: FATAL:  password authentication failed for user "user"
connection to server at "localhost" (127.0.0.1), port 5432 failed: FATAL:  password authentication failed for user "user"

(Background on this error at: https://sqlalche.me/e/14/e3q8)

# Queries
- Show me (in Pandas, SQL & SQLAlchemy) how you'd get the average rating by category, weighted by the rating count, i.e. a rating with a higher rating count is weighted proportionally more within its category
- Show me a couple other interesting queries, using SQL and/or Pandas

In [122]:
# ORM example: select every ProductScalars row into a DataFrame indexed by
# product_id, using nullable pandas dtypes for the result columns.
scalar_dtypes = {
    'product_id': 'string',
    'category': 'string',
    'rating': 'Float64',
    'rating_count': 'Int64',
}
qry = select(ProductScalars)
with session_scope() as session:
    df_db = pd.read_sql_query(
        qry,
        con=session.connection(),
        index_col='product_id',
        dtype=scalar_dtypes,
    )

OperationalError: (psycopg2.OperationalError) connection to server at "localhost" (127.0.0.1), port 5432 failed: FATAL:  password authentication failed for user "user"
connection to server at "localhost" (127.0.0.1), port 5432 failed: FATAL:  password authentication failed for user "user"

(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [None]:
# Raw-SQL example: number of product_scalars rows per category, largest first,
# loaded into a DataFrame indexed by category.
with session_scope() as session:
    qry = """
        SELECT ps.category, count(*) as category_count
        FROM product_scalars ps
        GROUP BY ps.category
        ORDER BY category_count DESC
        """
    count_dtypes = {'category_count': 'Int64'}
    df_db = pd.read_sql_query(
        sql=qry,
        con=session.connection(),
        index_col='category',
        dtype=count_dtypes,
    )