In [75]:
import json
import pandas as pd
from datetime import datetime

In [76]:
# load json
def load_json(file_path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the file is parsed as one JSON document.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        list: The parsed value of every line that decoded successfully, in
        file order. Empty when the file does not exist.

    Lines that fail to parse are reported on stdout and skipped; a missing
    file is reported on stdout instead of raising.
    """
    data = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                text = line.strip()
                if not text:
                    # Blank lines (e.g. a trailing newline at the end of the
                    # file) carry no record and are not decode errors.
                    continue
                try:
                    data.append(json.loads(text))
                except json.JSONDecodeError as e:
                    print(f"Error decoding line: {text} - {e}")
    except FileNotFoundError:
        print("File not found.")
    return data


def convert_date(timestamp_ms):
    """Convert an epoch timestamp in milliseconds to a ``datetime``.

    Args:
        timestamp_ms: Milliseconds since the Unix epoch, or a missing value
            (``None``, ``NaN``, ``pd.NA``, ``NaT``).

    Returns:
        A naive ``datetime`` in the machine's local timezone (the behaviour
        of ``datetime.fromtimestamp`` without ``tz``), or the input unchanged
        when it is a missing value.
    """
    # pandas stores missing numbers as NaN rather than None, and
    # datetime.fromtimestamp(nan) raises ValueError, so test with pd.isna
    # instead of comparing against None only.
    if pd.isna(timestamp_ms):
        return timestamp_ms

    return datetime.fromtimestamp(timestamp_ms / 1000)

In [77]:
# Convert Users data to table
user_data = load_json('data/users.json')

USER_COLUMNS = ['id', 'active', 'createdDate', 'lastLogin', 'role', 'signUpSource', 'state']
USER_DATE_COLUMNS = ['createdDate', 'lastLogin']

# Collect every row first and build the frame once: growing a DataFrame with
# df.loc[len(df)] = ... copies the frame on each insert, which is quadratic.
user_rows = []
for row in user_data:
    # lastLogin, signUpSource and state are treated as optional fields.
    last_login_ms = row['lastLogin']['$date'] if 'lastLogin' in row else None
    user_rows.append([
        row['_id']['$oid'],
        row['active'],
        row['createdDate']['$date'],
        last_login_ms,
        row['role'],
        row.get('signUpSource'),
        row.get('state'),
    ])

# dtype=object keeps missing values as None (rather than NaN) and leaves the
# millisecond timestamps as plain ints for convert_date.
df_users = pd.DataFrame(user_rows, columns=USER_COLUMNS, dtype=object)

for date_column in USER_DATE_COLUMNS:
    df_users[date_column] = df_users[date_column].apply(convert_date)

# NOTE(review): the displayed output shows the same user id on several rows;
# rows are kept exactly as they appear in the source file.
df_users

Unnamed: 0,id,active,createdDate,lastLogin,role,signUpSource,state
0,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 07:24:04.800,2021-01-03 07:25:37.858,consumer,Email,WI
1,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 07:24:04.800,2021-01-03 07:25:37.858,consumer,Email,WI
2,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 07:24:04.800,2021-01-03 07:25:37.858,consumer,Email,WI
3,5ff1e1eacfcf6c399c274ae6,True,2021-01-03 07:25:30.554,2021-01-03 07:25:30.597,consumer,Email,WI
4,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 07:24:04.800,2021-01-03 07:25:37.858,consumer,Email,WI
...,...,...,...,...,...,...,...
490,54943462e4b07e684157a532,True,2014-12-19 06:21:22.381,2021-03-05 08:52:23.204,fetch-staff,,
491,54943462e4b07e684157a532,True,2014-12-19 06:21:22.381,2021-03-05 08:52:23.204,fetch-staff,,
492,54943462e4b07e684157a532,True,2014-12-19 06:21:22.381,2021-03-05 08:52:23.204,fetch-staff,,
493,54943462e4b07e684157a532,True,2014-12-19 06:21:22.381,2021-03-05 08:52:23.204,fetch-staff,,


In [78]:
# Convert Brands data to table
brand_data = load_json('data/brands.json')

BRAND_COLUMNS = ['id', 'barcode', 'brandCode', 'categoryCode', 'cpgId', 'name', 'topBrand']

# Build all rows first and create the frame once; appending with
# df.loc[len(df)] = ... copies the frame on every insert.
# brandCode, categoryCode and topBrand are treated as optional fields.
brand_rows = [
    [
        row['_id']['$oid'],
        row['barcode'],
        row.get('brandCode'),
        row.get('categoryCode'),
        row['cpg']['$id']['$oid'],
        row['name'],
        row.get('topBrand'),
    ]
    for row in brand_data
]

# dtype=object keeps missing values as None, which later cells test with
# "is not None".
df_brands = pd.DataFrame(brand_rows, columns=BRAND_COLUMNS, dtype=object)

df_brands

Unnamed: 0,id,barcode,brandCode,categoryCode,cpgId,name,topBrand
0,601ac115be37ce2ead437551,511111019862,,BAKING,601ac114be37ce2ead437550,test brand @1612366101024,False
1,601c5460be37ce2ead43755f,511111519928,STARBUCKS,BEVERAGES,5332f5fbe4b03c9a25efd0ba,Starbucks,False
2,601ac142be37ce2ead43755d,511111819905,TEST BRANDCODE @1612366146176,BAKING,601ac142be37ce2ead437559,test brand @1612366146176,False
3,601ac142be37ce2ead43755a,511111519874,TEST BRANDCODE @1612366146051,BAKING,601ac142be37ce2ead437559,test brand @1612366146051,False
4,601ac142be37ce2ead43755e,511111319917,TEST BRANDCODE @1612366146827,CANDY_AND_SWEETS,5332fa12e4b03c9a25efd1e7,test brand @1612366146827,False
...,...,...,...,...,...,...,...
1162,5f77274dbe37ce6b592e90c0,511111116752,,BAKING,5f77274dbe37ce6b592e90bf,test brand @1601644365844,
1163,5dc1fca91dda2c0ad7da64ae,511111706328,DIPPIN DOTS CEREAL,,53e10d6368abd3c7065097cc,Dippin Dots® Cereal,
1164,5f494c6e04db711dd8fe87e7,511111416173,TEST BRANDCODE @1598639215217,CANDY_AND_SWEETS,5332fa12e4b03c9a25efd1e7,test brand @1598639215217,
1165,5a021611e4b00efe02b02a57,511111400608,LIPTON TEA Leaves,,5332f5f6e4b03c9a25efd0b4,LIPTON TEA Leaves,False


In [79]:
# Lookup from brandCode to brand id. Rows whose brandCode is None are left
# out; when a code appears on several rows, the last row wins.
brand_codes = list(df_brands['brandCode'])
ids = list(df_brands['id'])
brand_code_id_dict = {
    code: brand_id
    for code, brand_id in zip(brand_codes, ids)
    if code is not None
}

brand_code_id_dict

{'STARBUCKS': '601c5460be37ce2ead43755f',
 'TEST BRANDCODE @1612366146176': '601ac142be37ce2ead43755d',
 'TEST BRANDCODE @1612366146051': '601ac142be37ce2ead43755a',
 'TEST BRANDCODE @1612366146827': '601ac142be37ce2ead43755e',
 'TEST BRANDCODE @1612366146091': '601ac142be37ce2ead43755b',
 'TEST BRANDCODE @1612366146133': '601ac142be37ce2ead43755c',
 'J.L. KRAFT': '5cdad0f5166eb33eb7ce0faa',
 'CAMPBELLS HOME STYLE': '5ab15636e4b0be0a89bb0b07',
 'TEST': '5c408e8bcd244a1fdb47aee7',
 'TEST BRANDCODE @1598813526777': '5f4bf556be37ce0b4491554d',
 'CALUMET': '588ba07be4b02187f85cdadd',
 '511111205012': '5d6413156d5f3b23d1bc790a',
 'AUNT JEMIMA SYRUP': '585a9611e4b03e62d1ce0e74',
 'MOLSON': '57e5820ce4b0ac389136a311',
 'LOTRIMIN': '5fb6adb8be37ce522e165cb8',
 'TEST BRANDCODE @1597342520277': '5f358338be37ce443bf9d55a',
 'ST IVES': '592486bfe410d61fcea3d139',
 'CHRISIMAGE': '5c4699f387ff3577e203ea29',
 'ALKA SELTZER': '5da6071ea60b87376833e34d',
 "JACK DANIEL'S BARBECUE": '57ebc011e4b0ac389136

In [80]:
# Convert Receipt data to table
receipts_data = load_json('data/receipts.json')

RECEIPT_COLUMNS = ['id', 'createDate', 'dateScanned', 'finishedDate',
                   'modifyDate', 'pointsAwardedDate', 'purchaseDate',
                   'pointsEarned', 'bonusPointsEarned', 'bonusPointsEarnedReason', 'purchasedItemCount',
                   'rewardsReceiptStatus', 'totalSpent', 'userId']
RECEIPT_DATE_COLUMNS = ['createDate', 'dateScanned', 'finishedDate',
                        'modifyDate', 'pointsAwardedDate', 'purchaseDate']


def _optional_date_ms(record, key):
    """Return ``record[key]['$date']``, or None when ``key`` is absent."""
    return record[key]['$date'] if key in record else None


# Collect every row first and build the frame once: growing a DataFrame with
# df.loc[len(df)] = ... copies the frame on each insert, which is quadratic.
receipt_rows = []
for row in receipts_data:
    receipt_rows.append([
        row['_id']['$oid'],
        row['createDate']['$date'],
        row['dateScanned']['$date'],
        _optional_date_ms(row, 'finishedDate'),
        row['modifyDate']['$date'],
        _optional_date_ms(row, 'pointsAwardedDate'),
        _optional_date_ms(row, 'purchaseDate'),
        row.get('pointsEarned'),
        row.get('bonusPointsEarned'),
        row.get('bonusPointsEarnedReason'),
        row.get('purchasedItemCount'),
        row['rewardsReceiptStatus'],
        row.get('totalSpent'),
        row['userId'],
    ])

# dtype=object keeps missing values as None (rather than NaN) and stops
# integer columns with gaps from being turned into floats.
df_receipts = pd.DataFrame(receipt_rows, columns=RECEIPT_COLUMNS, dtype=object)

# Millisecond epoch values -> datetime, one column at a time.
for date_column in RECEIPT_DATE_COLUMNS:
    df_receipts[date_column] = df_receipts[date_column].apply(convert_date)

df_receipts

Unnamed: 0,id,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,purchaseDate,pointsEarned,bonusPointsEarned,bonusPointsEarnedReason,purchasedItemCount,rewardsReceiptStatus,totalSpent,userId
0,5ff1e1eb0a720f0523000575,2021-01-03 07:25:31.000,2021-01-03 07:25:31.000,2021-01-03 07:25:31,2021-01-03 07:25:36.000,2021-01-03 07:25:31,2021-01-02 16:00:00,500.0,500,"Receipt number 2 completed, bonus point schedu...",5,FINISHED,26.00,5ff1e1eacfcf6c399c274ae6
1,5ff1e1bb0a720f052300056b,2021-01-03 07:24:43.000,2021-01-03 07:24:43.000,2021-01-03 07:24:43,2021-01-03 07:24:48.000,2021-01-03 07:24:43,2021-01-02 07:24:43,150.0,150,"Receipt number 5 completed, bonus point schedu...",2,FINISHED,11.00,5ff1e194b6a9d73a3a9f1052
2,5ff1e1f10a720f052300057a,2021-01-03 07:25:37.000,2021-01-03 07:25:37.000,NaT,2021-01-03 07:25:42.000,NaT,2021-01-02 16:00:00,5,5,All-receipts receipt bonus,1,REJECTED,10.00,5ff1e1f1cfcf6c399c274b0b
3,5ff1e1ee0a7214ada100056f,2021-01-03 07:25:34.000,2021-01-03 07:25:34.000,2021-01-03 07:25:34,2021-01-03 07:25:39.000,2021-01-03 07:25:34,2021-01-02 16:00:00,5.0,5,All-receipts receipt bonus,4,FINISHED,28.00,5ff1e1eacfcf6c399c274ae6
4,5ff1e1d20a7214ada1000561,2021-01-03 07:25:06.000,2021-01-03 07:25:06.000,2021-01-03 07:25:11,2021-01-03 07:25:11.000,2021-01-03 07:25:06,2021-01-02 07:25:06,5.0,5,All-receipts receipt bonus,2,FINISHED,1.00,5ff1e194b6a9d73a3a9f1052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,603cc0630a720fde100003e6,2021-03-01 02:22:27.000,2021-03-01 02:22:27.000,NaT,2021-03-01 02:22:28.000,NaT,2020-08-16 17:00:00,25.0,25,COMPLETE_NONPARTNER_RECEIPT,2,REJECTED,34.96,5fc961c3b8cfca11a077dd33
1115,603d0b710a720fde1000042a,2021-03-01 07:42:41.873,2021-03-01 07:42:41.873,NaT,2021-03-01 07:42:41.873,NaT,NaT,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33
1116,603cf5290a720fde10000413,2021-03-01 06:07:37.664,2021-03-01 06:07:37.664,NaT,2021-03-01 06:07:37.664,NaT,NaT,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33
1117,603ce7100a7217c72c000405,2021-03-01 05:07:28.000,2021-03-01 05:07:28.000,NaT,2021-03-01 05:07:29.000,NaT,2020-08-16 17:00:00,25.0,25,COMPLETE_NONPARTNER_RECEIPT,2,REJECTED,34.96,5fc961c3b8cfca11a077dd33


In [81]:
# Get bonus Points Earned Reasons dict
# Every distinct value of the column (the missing-value entry included) gets
# a 1-based integer code, numbered in the order unique() returns them.
bonusPointsEarnedReasons = df_receipts['bonusPointsEarnedReason'].unique()
bonusPointsEarnedReasonsDict = {
    reason: code
    for code, reason in enumerate(bonusPointsEarnedReasons, start=1)
}

bonusPointsEarnedReasonsDict

{'Receipt number 2 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)': 1,
 'Receipt number 5 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)': 2,
 'All-receipts receipt bonus': 3,
 'Receipt number 1 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)': 4,
 'Receipt number 3 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)': 5,
 'Receipt number 6 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)': 6,
 'Receipt number 4 completed, bonus point schedule DEFAULT (5cefdcacf3693e0b50e83a36)': 7,
 None: 8,
 'COMPLETE_PARTNER_RECEIPT': 9,
 'COMPLETE_NONPARTNER_RECEIPT': 10}

In [82]:
# Replace each bonusPointsEarnedReason with its integer code.
# Values that are not keys of the mapping are left unchanged, so running this
# cell again after the column has already been encoded is a no-op instead of
# raising KeyError on the integer codes.
df_receipts['bonusPointsEarnedReason'] = df_receipts['bonusPointsEarnedReason'].apply(
    lambda reason: bonusPointsEarnedReasonsDict.get(reason, reason)
)
df_receipts

Unnamed: 0,id,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,purchaseDate,pointsEarned,bonusPointsEarned,bonusPointsEarnedReason,purchasedItemCount,rewardsReceiptStatus,totalSpent,userId
0,5ff1e1eb0a720f0523000575,2021-01-03 07:25:31.000,2021-01-03 07:25:31.000,2021-01-03 07:25:31,2021-01-03 07:25:36.000,2021-01-03 07:25:31,2021-01-02 16:00:00,500.0,500,1,5,FINISHED,26.00,5ff1e1eacfcf6c399c274ae6
1,5ff1e1bb0a720f052300056b,2021-01-03 07:24:43.000,2021-01-03 07:24:43.000,2021-01-03 07:24:43,2021-01-03 07:24:48.000,2021-01-03 07:24:43,2021-01-02 07:24:43,150.0,150,2,2,FINISHED,11.00,5ff1e194b6a9d73a3a9f1052
2,5ff1e1f10a720f052300057a,2021-01-03 07:25:37.000,2021-01-03 07:25:37.000,NaT,2021-01-03 07:25:42.000,NaT,2021-01-02 16:00:00,5,5,3,1,REJECTED,10.00,5ff1e1f1cfcf6c399c274b0b
3,5ff1e1ee0a7214ada100056f,2021-01-03 07:25:34.000,2021-01-03 07:25:34.000,2021-01-03 07:25:34,2021-01-03 07:25:39.000,2021-01-03 07:25:34,2021-01-02 16:00:00,5.0,5,3,4,FINISHED,28.00,5ff1e1eacfcf6c399c274ae6
4,5ff1e1d20a7214ada1000561,2021-01-03 07:25:06.000,2021-01-03 07:25:06.000,2021-01-03 07:25:11,2021-01-03 07:25:11.000,2021-01-03 07:25:06,2021-01-02 07:25:06,5.0,5,3,2,FINISHED,1.00,5ff1e194b6a9d73a3a9f1052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,603cc0630a720fde100003e6,2021-03-01 02:22:27.000,2021-03-01 02:22:27.000,NaT,2021-03-01 02:22:28.000,NaT,2020-08-16 17:00:00,25.0,25,10,2,REJECTED,34.96,5fc961c3b8cfca11a077dd33
1115,603d0b710a720fde1000042a,2021-03-01 07:42:41.873,2021-03-01 07:42:41.873,NaT,2021-03-01 07:42:41.873,NaT,NaT,,,8,,SUBMITTED,,5fc961c3b8cfca11a077dd33
1116,603cf5290a720fde10000413,2021-03-01 06:07:37.664,2021-03-01 06:07:37.664,NaT,2021-03-01 06:07:37.664,NaT,NaT,,,8,,SUBMITTED,,5fc961c3b8cfca11a077dd33
1117,603ce7100a7217c72c000405,2021-03-01 05:07:28.000,2021-03-01 05:07:28.000,NaT,2021-03-01 05:07:29.000,NaT,2020-08-16 17:00:00,25.0,25,10,2,REJECTED,34.96,5fc961c3b8cfca11a077dd33


In [83]:
# Items table
ITEM_COLUMNS = ['barcode', 'brandId', 'description', 'itemPrice', 'deleted', 'competitiveProduct']
RECEIPT_ITEM_COLUMNS = ['receiptsId', 'itemId', 'discountedItemPrice', 'priceAfterCoupon', 'finalPrice', 'itemPurchased',
                        'needsFetchReview', 'needsFetchReviewReason', 'pointsEarned', 'metabriteCampaignId', 'preventTargetGapPoints']

# Rows are collected in lists and the frames are built once at the end;
# appending with df.loc[len(df)] = ... copies the frame on every insert.
item_rows = []
receipt_item_rows = []
# barcode -> position of that item's row in df_items. Replaces a boolean scan
# of the whole items frame for every receipt line.
item_id_by_barcode = {}

for row in receipts_data:
    receiptsId = row['_id']['$oid']
    for item_info in row.get('rewardsReceiptItemList', []):
        barcode = item_info.get('barcode')
        if barcode is None:
            # Line items without a barcode cannot be keyed to an item row
            # and are skipped in both tables.
            continue

        item_id = item_id_by_barcode.get(barcode)
        if item_id is None:
            # First time this barcode is seen: its attributes are taken from
            # this occurrence and define the item row.
            item_id = len(item_rows)
            item_id_by_barcode[barcode] = item_id
            if 'brandCode' in item_info:
                brand_id = brand_code_id_dict.get(item_info['brandCode'])
            else:
                brand_id = None
            item_rows.append([
                barcode,
                brand_id,
                item_info.get('description'),
                item_info.get('itemPrice'),
                item_info.get('deleted'),
                item_info.get('competitiveProduct'),
            ])

        # One row per receipt line; itemId points at the item's row in df_items.
        receipt_item_rows.append([
            receiptsId,
            item_id,
            item_info.get('discountedItemPrice'),
            item_info.get('priceAfterCoupon'),
            item_info.get('finalPrice'),
            item_info.get('itemPurchased'),
            item_info.get('needsFetchReview'),
            item_info.get('needsFetchReviewReason'),
            item_info.get('pointsEarned'),
            item_info.get('metabriteCampaignId'),
            item_info.get('preventTargetGapPoints'),
        ])

# dtype=object keeps missing values as None and leaves values as read from
# the JSON (no int -> float coercion in columns with gaps).
df_items = pd.DataFrame(item_rows, columns=ITEM_COLUMNS, dtype=object)
df_reward_receipt_items = pd.DataFrame(receipt_item_rows, columns=RECEIPT_ITEM_COLUMNS, dtype=object)


In [84]:
# Display the items table built above (one row per distinct barcode).
df_items

Unnamed: 0,barcode,brandId,description,itemPrice,deleted,competitiveProduct
0,4011,,ITEM NOT FOUND,26.00,,
1,028400642255,,DORITOS TORTILLA CHIP SPICY SWEET CHILI REDUCE...,10.00,,
2,1234,,,2.56,,
3,046000832517,,"Old El Paso Mild Chopped Green Chiles, 4.5 Oz",10.00,,
4,013562300631,,Annie's Homegrown Organic White Cheddar Macaro...,50.00,,
...,...,...,...,...,...,...
563,B076FJ92M4,,mueller austria hypergrind precision electric ...,22.97,,
564,B07BRRLSVC,,thindust summer face mask - sun protection nec...,11.99,,
565,B08BGBHHP6,,spigen thin fit designed for iphone 12 mini ca...,12.59,,
566,B08DQDHR2S,,spigen tempered glass screen protector [glas.t...,12.99,,


In [85]:
# Display the receipt line items; itemId is the index of the matching row in df_items.
df_reward_receipt_items

Unnamed: 0,receiptsId,itemId,discountedItemPrice,priceAfterCoupon,finalPrice,itemPurchased,needsFetchReview,needsFetchReviewReason,pointsEarned,metabriteCampaignId,preventTargetGapPoints
0,5ff1e1eb0a720f0523000575,0,,,26.00,,False,,,,True
1,5ff1e1bb0a720f052300056b,0,,,1,,,,,,
2,5ff1e1bb0a720f052300056b,1,,,10.00,,True,USER_FLAGGED,,,True
3,5ff1e1ee0a7214ada100056f,0,,,28.00,,False,,,,True
4,5ff1e1d20a7214ada1000561,0,,,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
3085,603cc2bc0a720fde100003e9,564,11.99,11.99,11.99,,,,,,
3086,603cc0630a720fde100003e6,563,22.97,22.97,22.97,,,,,,
3087,603cc0630a720fde100003e6,564,11.99,11.99,11.99,,,,,,
3088,603ce7100a7217c72c000405,563,22.97,22.97,22.97,,,,,,


In [86]:
# Write every table to its CSV file. The default index column is written as
# well; for df_items that index is the key that itemId in
# df_reward_receipt_items refers to.
for frame, csv_path in (
    (df_users, "data/df_users.csv"),
    (df_brands, "data/df_brands.csv"),
    (df_receipts, "data/df_receipts.csv"),
    (df_items, "data/df_items.csv"),
    (df_reward_receipt_items, "data/df_reward_receipt_items.csv"),
):
    frame.to_csv(csv_path)
