In [439]:
## Assignment
import gzip
import json
import pandas as pd
import re

## 1. Review Existing Unstructured Data and Diagram a New Structured Relational Data Model


### 1a. Read json data into DataFrame Objects

In [440]:
# Gzipped JSON exports to load; the loading loop walks this list in order and
# the users file (index 2) is re-read separately later.
jsonFiles = [
    "brands.json.gz",
    "receipts.json.gz",
    "users.json.gz",
]

def readJsonGz(file_path):
    """Load a gzipped file holding one JSON document per line into a DataFrame.

    Blank lines are skipped. Every other line is parsed exactly once and the
    parsed records are flattened with ``pd.json_normalize``.

    Parameters
    ----------
    file_path : str or path-like
        Path to the ``.json.gz`` file.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line; empty when the file has no non-blank lines.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON. The 1-based line number and the
        line's content are printed before the error propagates, and the error's
        position refers to the offending line itself.
    """
    with gzip.open(file_path, 'rb') as data_file:
        text = data_file.read().decode("utf-8")

    records = []
    for line_no, line in enumerate(text.split('\n'), start=1):
        if not line.strip():  # Skip empty lines
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError as e:
            # Say where parsing failed, then let the caller decide how to react.
            print(f"Invalid JSON on line {line_no}: {e}")
            print(f"Content: {line}")
            raise

    return pd.json_normalize(records)

def extractData(line):
    """
    Use regex to extract the data wrapped between '{' and '}' in an input string.

    The match is greedy: it runs from the opening '{' to the last '}' that
    follows it, and it never crosses a newline character.

    input: line -- the string to search
    output: the matched substring, or "" when the string has no '{...}' span
    """
    found = re.search(r'\{.*\}', line)
    return found.group(0) if found else ""

def read_json_gz_skip_gzip_lines(file_path):
    """
    Given a path to a json.gz file, `file_path`,
    read the data into parsed JSON and return it as a list.

    Each line of the decompressed text goes through `extractData`; lines that
    yield no '{...}' span are dropped, and whatever surrounds the span on a
    kept line is discarded. The kept spans are joined into one JSON array and
    parsed with a single `json.loads` call, so a span that is not valid JSON
    raises `json.JSONDecodeError`.

    NOTE(review): the junk this works around may be tar framing rather than
    plain corruption -- confirm, and consider `tarfile` if so.
    """
    with gzip.open(file_path, 'r') as f:
        raw_lines = f.read().decode("utf-8").split("\n")

    fragments = (extractData(raw) for raw in raw_lines)
    kept = [fragment for fragment in fragments if fragment != ""]
    return json.loads("[" + ",".join(kept) + "]")

# Module-level accumulators filled in by normalize_rewardsReceiptItems:
# one DataFrame of items per receipt, and a count of receipts without items.
rewardsReceiptItemsLst = []
num_nan = 0


def normalize_rewardsReceiptItems(row, dest = rewardsReceiptItemsLst):
    """
    Extract the 'reward receipt items' from column `rewardsReceiptItemList`
    and map the receipt id to each item.

    row:  a mapping-like receipt record exposing "_id.$oid" and
          "rewardsReceiptItemList"
    dest: list that receives one DataFrame per receipt that has items; it
          defaults to the module-level `rewardsReceiptItemsLst`

    Returns None. A receipt whose item list is missing (a NaN scalar, or a
    list/Series whose entries are all NaN -- which includes an empty list)
    adds nothing to `dest` and increments the global `num_nan` instead.
    """
    global num_nan
    receipt_id = row["_id.$oid"]
    items = row["rewardsReceiptItemList"]

    # pd.isna is element-wise on containers, so reduce it to a single flag there.
    if isinstance(items, (list, pd.Series)):
        is_missing = pd.isna(items).all()
    else:
        is_missing = pd.isna(items)

    if is_missing:
        num_nan += 1
        return

    items_frame = pd.json_normalize(items)
    items_frame["receiptId"] = receipt_id
    dest.append(items_frame)



`users.json.gz` cannot be imported with `readJsonGz` because it contains unexpected characters.  
Unzip it and investigate.

In [441]:
# Read every archive with the strict loader. Files that parse are stored under
# the text before the first "." in their name; files that do not are reported
# and skipped.
dataDict = {}
for file in jsonFiles:
    try:
        data = readJsonGz(file)
    except json.JSONDecodeError as e:
        print(e)
        print("Fail to read {}".format(file))
    else:
        dataDict[file.partition(".")[0]] = data


Invalid JSON on line 1: Expecting value: line 1 column 1 (char 0)
Content: users.json                                                                                          0100777 0000000 0000000 00000254301 14567170355 010072  0                                                                                                    ustar 00                                                                                                                                                                                                                                                       {"_id":{"$oid":"5ff1e194b6a9d73a3a9f1052"},"active":true,"createdDate":{"$date":1609687444800},"lastLogin":{"$date":1609687537858},"role":"consumer","signUpSource":"Email","state":"WI"}
Expecting value: line 1 column 2 (char 1)
Fail to read users.json.gz


In [442]:
!gzip -dk users.json.gz

users.json already exists -- do you wish to overwrite (y or n)? ^C


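Opened the users.json file and found that:  
1) in the first line, there is a run of "\x00" characters (and other header text) in front of the actual data row.  
2) the last row consists entirely of "\x00" characters.  
(The `ustar` marker visible in the failed read above suggests the file is a tar archive that was then gzipped, which would explain both the header and the padding.)  
Therefore, created a function `read_json_gz_skip_gzip_lines` that extracts the data placed between "{" and "}" in each line.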

In [443]:
# Re-read the users archive with the tolerant loader, which keeps only the
# '{...}' part of each line.
file = jsonFiles[2]
try:
    data= read_json_gz_skip_gzip_lines(file)
    dataDict[file.split(".")[0]] = pd.json_normalize(data)
except json.JSONDecodeError as e:
    # The exception must be bound here; without "as e" the print below
    # raises NameError instead of showing the parse error.
    print(e)
    print(file)
dataDict.keys()

dict_keys(['brands', 'receipts', 'users'])

In [444]:
# Re-check which datasets are loaded (same expression as the end of the previous cell).
dataDict.keys()

dict_keys(['brands', 'receipts', 'users'])

In [445]:
# Bind each loaded DataFrame to its own name for the cells that follow.
brandsData = dataDict["brands"]
receiptsData = dataDict["receipts"]
usersData = dataDict["users"]

The receipts DataFrame has a column, *rewardsReceiptItemList*, whose values are lists of nested objects.

In [446]:
# Preview the first five rows of the receipts table.
receiptsData.head()

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,pointsEarned,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,_id.$oid,createDate.$date,dateScanned.$date,finishedDate.$date,modifyDate.$date,pointsAwardedDate.$date,purchaseDate.$date
0,500.0,"Receipt number 2 completed, bonus point schedu...",500.0,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.0,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575,1609687531000,1609687531000,1609688000000.0,1609687536000,1609688000000.0,1609632000000.0
1,150.0,"Receipt number 5 completed, bonus point schedu...",150.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.0,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b,1609687483000,1609687483000,1609687000000.0,1609687488000,1609687000000.0,1609601000000.0
2,5.0,All-receipts receipt bonus,5.0,1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.0,5ff1e1f1cfcf6c399c274b0b,5ff1e1f10a720f052300057a,1609687537000,1609687537000,,1609687542000,,1609632000000.0
3,5.0,All-receipts receipt bonus,5.0,4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.0,5ff1e1eacfcf6c399c274ae6,5ff1e1ee0a7214ada100056f,1609687534000,1609687534000,1609688000000.0,1609687539000,1609688000000.0,1609632000000.0
4,5.0,All-receipts receipt bonus,5.0,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.0,5ff1e194b6a9d73a3a9f1052,5ff1e1d20a7214ada1000561,1609687506000,1609687506000,1609688000000.0,1609687511000,1609688000000.0,1609601000000.0


In [447]:
# The receipts table has a column that contains lists of other objects.
# Reset the accumulators first so that re-running this cell does not append
# the same items, or count the same missing rows, a second time. The list is
# cleared in place because normalize_rewardsReceiptItems holds a reference to
# this exact list object as its default `dest`.
rewardsReceiptItemsLst.clear()
num_nan = 0
receiptsData.apply(normalize_rewardsReceiptItems, axis = 1)
rewardsReceiptItems = pd.concat(rewardsReceiptItemsLst, ignore_index= True)
rewardsReceiptItems.head()

Unnamed: 0,barcode,description,finalPrice,itemPrice,needsFetchReview,partnerItemId,preventTargetGapPoints,quantityPurchased,userFlaggedBarcode,userFlaggedNewItem,...,itemNumber,originalMetaBriteQuantityPurchased,pointsEarned,targetPrice,competitiveProduct,originalFinalPrice,originalMetaBriteItemPrice,deleted,priceAfterCoupon,metabriteCampaignId
0,4011.0,ITEM NOT FOUND,26.0,26.0,False,1,True,5.0,4011.0,True,...,,,,,,,,,,
1,4011.0,ITEM NOT FOUND,1.0,1.0,,1,,1.0,,,...,,,,,,,,,,
2,28400642255.0,DORITOS TORTILLA CHIP SPICY SWEET CHILI REDUCE...,10.0,10.0,True,2,True,1.0,28400642255.0,True,...,,,,,,,,,,
3,,,,,False,1,True,,4011.0,True,...,,,,,,,,,,
4,4011.0,ITEM NOT FOUND,28.0,28.0,False,1,True,4.0,4011.0,True,...,,,,,,,,,,


Validate data integrity after extracting the receipt items from the receipts table

In [448]:
# Missing item lists measured directly on the column: receipts with an id
# minus receipts with a non-null rewardsReceiptItemList.
actual_nan_rri = receiptsData["_id.$oid"].count() - receiptsData.rewardsReceiptItemList.count()

# The message names the real column, 'rewardsReceiptItemList'.
report1 = """
During extracting receipt reward items, found {} receipts that have a nan value in 'rewardsReceiptItemList'\n
Matched with actual missing values in the column: {}\n
Total number of receipt rewards items extracted: {}\n
"""
print(report1.format(num_nan, actual_nan_rri == num_nan, rewardsReceiptItems.shape[0]))



During extracting receipt reward items, found 440 receipts that nan value in 'receiptsRewordsItemList'

Matched with actual missing values in the column: True

Total number of receipt rewards items extracted: 6941


