# CANMEX-REPCOL

In [5]:
import gzip
import io
import json
import os

import pandas as pd
from azure.storage.blob import BlobServiceClient

# Importing local modules
from data_ingestion import download_and_create_df

In [6]:
# Pull the storage connection settings from the environment in one pass;
# any variable that is not set comes back as None.
storage_account, container, sas_token, sas_url = (
    os.environ.get(env_key)
    for env_key in ("STORAGE_ACCOUNT", "CONTAINER", "ADLS_SAS_TOKEN", "SAS_URL")
)

In [7]:
# Fail fast when the required settings are missing: os.environ.get() yields
# None for an unset variable, and the f-string below would then silently build
# "https://None.blob.core.windows.net" and only fail later with an obscure error.
if not storage_account or not container:
    raise RuntimeError(
        "STORAGE_ACCOUNT and CONTAINER environment variables must be set "
        "before connecting to Azure Blob Storage."
    )

# Blob endpoint of the storage account
account_url = f"https://{storage_account}.blob.core.windows.net"
# NOTE(review): sas_token is passed through unchecked and may be None if
# ADLS_SAS_TOKEN is unset -- confirm whether unauthenticated access is intended.
service_client = BlobServiceClient(account_url=account_url, credential=sas_token)

# Access the container
container_client = service_client.get_container_client(container)

In [8]:
# Snapshot every blob in the container up front.
blob_list = list(container_client.list_blobs())

# Bucket the '.json.gz' blob names by dataset. The metadata check runs first,
# so a name containing both markers is filed under metadata only.
amazon_metadata, amazon_reviews = [], []
for blob in blob_list:
    blob_name = blob['name']
    if not blob_name.endswith('.json.gz'):
        continue
    if 'amazon_metadata' in blob_name:
        amazon_metadata.append(blob_name)
        continue
    if 'amazon_reviews' in blob_name:
        amazon_reviews.append(blob_name)

In [9]:
# Number of '.json.gz' blob names collected for the reviews dataset
len(amazon_reviews)

5230

In [19]:
# Preview a single metadata file. Index 1 selects the SECOND entry of
# amazon_metadata (index 0 would be the first).
blob_client = container_client.get_blob_client(amazon_metadata[1])
downloaded_blob = blob_client.download_blob().readall()

# Decompress the gzipped payload and decode it to text before parsing.
json_data = gzip.decompress(downloaded_blob).decode('utf-8')
# Wrap the text in StringIO: passing a literal JSON string to pd.read_json
# is deprecated (pandas >= 2.1). lines=True parses one JSON record per line.
df = pd.read_json(io.StringIO(json_data), lines=True)
df

Unnamed: 0,also_buy,also_view,asin,brand,category,date,description,details,feature,fit,image,main_cat,price,rank,similar_item,tech1,tech2,title
0,[],[],B0016MJOQM,99 Volts,"[Clothing, Shoes & Jewelry, Novelty & More, Cl...","<div class=""a-fixed-left-grid a-spacing-none"">...",[Great looking 100% cotton shirt. For big boy'...,{},"[100% Cotton, Machine wash and dry, To purchas...",,[https://images-na.ssl-images-amazon.com/image...,"<img src=""https://images-na.ssl-images-amazon....",$11.94,"24,469,411 in Clothing, Shoes & Jewelry (",,,,Got Orr Little Boy's Kids Tee Shirt
1,[],"[B00SFZYXAK, B0013KD7B0, B01LWU4SZT, B072XTQFJ...",B0016MJWDW,RG Costumes,"[Clothing, Shoes & Jewelry, Costumes & Accesso...",,"[Includes: Jumpsuit, hood. Not included: Shoes.]",{},[90079-S],,[],Toys & Games,$27.25,"["">#720,785 in Toys & Games (See Top 100 in To...",,,,Grey Cute-T Bat Kids Costumes
2,[],[],B0016MJQO2,99 Volts,"[Clothing, Shoes & Jewelry, Novelty & More, Cl...","<div class=""a-fixed-left-grid a-spacing-none"">...",[Great looking 100% cotton shirt. Please refer...,{},"[100% Cotton, Machine wash and dry, <span clas...",,[https://images-na.ssl-images-amazon.com/image...,"<img src=""https://images-na.ssl-images-amazon....",$11.44 - $14.77,"6,110,607 in Clothing, Shoes & Jewelry (",,,,Got Orr Men's Tee Shirt
3,[],[],B0016MLJ1K,Switchables,"[Clothing, Shoes & Jewelry, Novelty & More, Je...","March 27, 2008",[That's right! The original Switchables night-...,{},"[Stained glass, Dog design, Surgical stainless...",,[],Tools & Home Improvement,,"["">#2,134,193 in Tools & Home Improvement (See...",,"class=""a-keyvalue prodDetTable"" role=""present...",,Switchables Stained Glass Dog Earrings
4,[],[],B0016MM1IK,,"[Clothing, Shoes & Jewelry, Women, Shoes, Pump...","<div class=""a-fixed-left-grid a-spacing-none"">...",[From its humble beginnings in 1825 in the tin...,{},"[100% Leather, Synthetic sole, Heel measures a...",,[https://images-na.ssl-images-amazon.com/image...,"<img src=""https://images-na.ssl-images-amazon....",,"18,929,031 in Clothing, Shoes & Jewelry (",,,,CLARKS Women's Geraldine Mary Jane
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,[],"[B01MT636SD, B00XLD3ZIM, B01MU7NWUI, B00XLD41D...",B001A7B844,,"[Clothing, Shoes & Jewelry, Women, Accessories...","<div class=""a-fixed-left-grid a-spacing-none"">...",[This Ariat belt by medium and f western produ...,{},"[100% Leather, Imported, Belt closure, Hand Wa...",,[],"<img src=""https://images-na.ssl-images-amazon....",$35.26 - $62.11,"2,881,512 in Clothing, Shoes & Jewelry (",,,,Ariat Men's Floral Embossed Nail Edge Belt
9996,"[B005L665L6, B019YPUGGY, B005L66ELW, B005L67JR...","[B005L67JRK, B019YPUGGY, B005L665L6, B005L77N0...",B001A710QK,5.11,"[Clothing, Shoes & Jewelry, Men, Clothing, Act...",,[Having the right under shirt for a ballistic ...,{'  Product Dimensions: ': '1 x 1 x 1 ...,"[polyester, Durable, comfortable and functiona...",,[https://images-na.ssl-images-amazon.com/image...,Sports & Outdoors,$27.99 - $38.98,"339,571 in Sports & Outdoors (",,,,5.11 Tactical #40005 Short Sleeve Tight Crew S...
9997,[],[],B001A78NHO,Disney,"[Clothing, Shoes & Jewelry, Luggage & Travel G...","May 24, 2008",[Disney Mickey Mouse designer inspired messeng...,{},"[16"" x 13"" x 4"", Interior krafted nicely with ...",,[],,,"["">#27,918 in Clothing, Shoes & Jewelry > Lugg...",,,,Disney Mickey Mouse Designer Inspired Messenge...
9998,"[B07CBFJXMS, B078G1BG5W, B00OQ21TIA, B01ECYWJ2...","[B07CHY6RP4, B07CBFJXMS, B079FWTBFL, B00MEEO9P...",B001A7B8FI,,"[Clothing, Shoes & Jewelry, Women, Accessories...","<div class=""a-fixed-left-grid a-spacing-none"">...",[This Ariat belt by medium and f western produ...,{},"[100% Leather, Imported, Belt closure, Hand Wa...",,[],"<img src=""https://images-na.ssl-images-amazon....",$32.95 - $55.86,"95,075 in Clothing, Shoes & Jewelry (",,,,Ariat Women's Fatbaby Center Stitch Belt


In [None]:
# Download every reviews file and parse each one into its own DataFrame.
# dataframes collects the successfully parsed frames; failed_files records
# the blob names that raised, so they can be inspected or retried afterwards.
# Note: the name df is rebound on every iteration.
dataframes = []
failed_files = []
for file_name in amazon_reviews:
    try:
        # Download the blob fully into memory
        blob_client = container_client.get_blob_client(file_name)
        downloaded_blob = blob_client.download_blob().readall()

        # Decompress the gzipped data and read it into a DataFrame.
        # StringIO is used because passing a literal JSON string to
        # pd.read_json is deprecated (pandas >= 2.1).
        json_data = gzip.decompress(downloaded_blob).decode('utf-8')
        df = pd.read_json(io.StringIO(json_data), lines=True)
        dataframes.append(df)
    except Exception as e:
        # Best-effort: report the failure and keep going with the next file,
        # but remember which file failed instead of losing it in the output.
        print(f"Error with file {file_name}: {e}")
        failed_files.append(file_name)

In [None]:
# Use the function to download the files and create the DataFrames
# NOTE(review): a single blob name (str) is passed here, whereas the
# df_reviews call passes the whole amazon_reviews list -- confirm that
# download_and_create_df (from data_ingestion) accepts both forms.
df_metadata = download_and_create_df(amazon_metadata[1], container_client)
df_metadata

In [None]:
# Hand the full list of reviews blob names to the data_ingestion helper.
# NOTE(review): presumably this returns one combined DataFrame and overlaps
# with the manual download loop that fills `dataframes` -- TODO confirm.
df_reviews = download_and_create_df(amazon_reviews, container_client)
df_reviews