**Purpose**

Part 1 (ERD) can be seen in the readme. 

From here the following steps are taken below: 

1. Data quality is evaluated 

Investigate and clean the data 

2. Data is properly formatted 

Data will be stored in cleaned CSV files, similar to what can be seen in the ERD

3. Data is exported 

Data will be sent to a folder, from there in a separate jupyter notebook, we can create queries using PANDASQL.  This will ensure that the questions proposed in part 2 can be properly answered and evaluated

**Data Quality and cleaning**

In [1]:
import re
import pandas as pd
import json

In [2]:
#Here we import the files
users_df = pd.read_json('old_data/users.json', orient='columns', lines=True, convert_dates=True)
brands_df = pd.read_json('old_data/brands.json', orient='columns', lines=True, convert_dates=True)
receipts_df = pd.read_json('old_data/receipts.json', orient='columns', lines=True, convert_dates=True)
users_df.head()


Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state
0,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
1,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
2,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
3,{'$oid': '5ff1e1eacfcf6c399c274ae6'},True,{'$date': 1609687530554},{'$date': 1609687530597},consumer,Email,WI
4,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI


In [3]:
#format columns to the way it is in ERD
users_df['userId'] = users_df['_id'].str['$oid']
users_df['createdDate'] = users_df['createdDate'].str['$date'].astype('datetime64[ms]')
users_df['lastLogin'] = users_df['lastLogin'].str['$date'].astype('datetime64[ms]')
users_df = users_df.drop('_id', axis=1)
users_df

Unnamed: 0,active,createdDate,lastLogin,role,signUpSource,state,userId
0,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052
1,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052
2,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052
3,True,2021-01-03 15:25:30.554,2021-01-03 15:25:30.597,consumer,Email,WI,5ff1e1eacfcf6c399c274ae6
4,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI,5ff1e194b6a9d73a3a9f1052
...,...,...,...,...,...,...,...
490,True,2014-12-19 14:21:22.381,2021-03-05 16:52:23.204,fetch-staff,,,54943462e4b07e684157a532
491,True,2014-12-19 14:21:22.381,2021-03-05 16:52:23.204,fetch-staff,,,54943462e4b07e684157a532
492,True,2014-12-19 14:21:22.381,2021-03-05 16:52:23.204,fetch-staff,,,54943462e4b07e684157a532
493,True,2014-12-19 14:21:22.381,2021-03-05 16:52:23.204,fetch-staff,,,54943462e4b07e684157a532


In [4]:
#first cleaning step is looking at our data
print(users_df[~users_df['userId'].isnull()])
users_df.describe()

     active             createdDate               lastLogin         role  \
0      True 2021-01-03 15:24:04.800 2021-01-03 15:25:37.858     consumer   
1      True 2021-01-03 15:24:04.800 2021-01-03 15:25:37.858     consumer   
2      True 2021-01-03 15:24:04.800 2021-01-03 15:25:37.858     consumer   
3      True 2021-01-03 15:25:30.554 2021-01-03 15:25:30.597     consumer   
4      True 2021-01-03 15:24:04.800 2021-01-03 15:25:37.858     consumer   
..      ...                     ...                     ...          ...   
490    True 2014-12-19 14:21:22.381 2021-03-05 16:52:23.204  fetch-staff   
491    True 2014-12-19 14:21:22.381 2021-03-05 16:52:23.204  fetch-staff   
492    True 2014-12-19 14:21:22.381 2021-03-05 16:52:23.204  fetch-staff   
493    True 2014-12-19 14:21:22.381 2021-03-05 16:52:23.204  fetch-staff   
494    True 2014-12-19 14:21:22.381 2021-03-05 16:52:23.204  fetch-staff   

    signUpSource state                    userId  
0          Email    WI  5ff1e194b6a9

  users_df.describe()


Unnamed: 0,active,createdDate,lastLogin,role,signUpSource,state,userId
count,495,495,433,495,447,439,495
unique,2,212,172,2,2,8,212
top,True,2014-12-19 14:21:22.381000,2021-03-05 16:52:23.204000,consumer,Email,WI,54943462e4b07e684157a532
freq,494,20,20,413,443,396,20
first,,2014-12-19 14:21:22.381000,2018-05-07 17:23:40.003000,,,,
last,,2021-02-12 14:11:06.240000,2021-03-05 16:52:23.204000,,,,


In [5]:
#clearly, looks like we always have userId so that is good, we also do not neccesarily
# need the other non primary values (they are ok as null)
#therefore, lets move onto the next df (re-importing for my testing convienence)
brands_df = pd.read_json('old_data/brands.json', orient='columns', lines=True, convert_dates=True)
brands_df['cpg'] = brands_df['cpg'].astype(str)
brands_df['topBrand'] = brands_df['topBrand'].astype(bool)
brands_df

Unnamed: 0,_id,barcode,category,categoryCode,cpg,name,topBrand,brandCode
0,{'$oid': '601ac115be37ce2ead437551'},511111019862,Baking,BAKING,"{'$id': {'$oid': '601ac114be37ce2ead437550'}, ...",test brand @1612366101024,False,
1,{'$oid': '601c5460be37ce2ead43755f'},511111519928,Beverages,BEVERAGES,"{'$id': {'$oid': '5332f5fbe4b03c9a25efd0ba'}, ...",Starbucks,False,STARBUCKS
2,{'$oid': '601ac142be37ce2ead43755d'},511111819905,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146176,False,TEST BRANDCODE @1612366146176
3,{'$oid': '601ac142be37ce2ead43755a'},511111519874,Baking,BAKING,"{'$id': {'$oid': '601ac142be37ce2ead437559'}, ...",test brand @1612366146051,False,TEST BRANDCODE @1612366146051
4,{'$oid': '601ac142be37ce2ead43755e'},511111319917,Candy & Sweets,CANDY_AND_SWEETS,"{'$id': {'$oid': '5332fa12e4b03c9a25efd1e7'}, ...",test brand @1612366146827,False,TEST BRANDCODE @1612366146827
...,...,...,...,...,...,...,...,...
1162,{'$oid': '5f77274dbe37ce6b592e90c0'},511111116752,Baking,BAKING,"{'$ref': 'Cogs', '$id': {'$oid': '5f77274dbe37...",test brand @1601644365844,True,
1163,{'$oid': '5dc1fca91dda2c0ad7da64ae'},511111706328,Breakfast & Cereal,,"{'$ref': 'Cogs', '$id': {'$oid': '53e10d6368ab...",Dippin Dots® Cereal,True,DIPPIN DOTS CEREAL
1164,{'$oid': '5f494c6e04db711dd8fe87e7'},511111416173,Candy & Sweets,CANDY_AND_SWEETS,"{'$ref': 'Cogs', '$id': {'$oid': '5332fa12e4b0...",test brand @1598639215217,True,TEST BRANDCODE @1598639215217
1165,{'$oid': '5a021611e4b00efe02b02a57'},511111400608,Grocery,,"{'$ref': 'Cogs', '$id': {'$oid': '5332f5f6e4b0...",LIPTON TEA Leaves,False,LIPTON TEA Leaves


In [6]:
brands_df['cpg'][0]

"{'$id': {'$oid': '601ac114be37ce2ead437550'}, '$ref': 'Cogs'}"

In [7]:
#similar cleaning for brands df
#likely need special logic for cpg, looks messy so we can string search
# may want to cast topBrand to bool (should not be float)
brands_df['brandId'] = brands_df['_id'].str['$oid']
brands_df['cpg'] = brands_df.cpg.str.extract(r"oid': \s*'([^']+)")
brands_df = brands_df.drop('_id', axis=1)
brands_df

Unnamed: 0,barcode,category,categoryCode,cpg,name,topBrand,brandCode,brandId
0,511111019862,Baking,BAKING,601ac114be37ce2ead437550,test brand @1612366101024,False,,601ac115be37ce2ead437551
1,511111519928,Beverages,BEVERAGES,5332f5fbe4b03c9a25efd0ba,Starbucks,False,STARBUCKS,601c5460be37ce2ead43755f
2,511111819905,Baking,BAKING,601ac142be37ce2ead437559,test brand @1612366146176,False,TEST BRANDCODE @1612366146176,601ac142be37ce2ead43755d
3,511111519874,Baking,BAKING,601ac142be37ce2ead437559,test brand @1612366146051,False,TEST BRANDCODE @1612366146051,601ac142be37ce2ead43755a
4,511111319917,Candy & Sweets,CANDY_AND_SWEETS,5332fa12e4b03c9a25efd1e7,test brand @1612366146827,False,TEST BRANDCODE @1612366146827,601ac142be37ce2ead43755e
...,...,...,...,...,...,...,...,...
1162,511111116752,Baking,BAKING,5f77274dbe37ce6b592e90bf,test brand @1601644365844,True,,5f77274dbe37ce6b592e90c0
1163,511111706328,Breakfast & Cereal,,53e10d6368abd3c7065097cc,Dippin Dots® Cereal,True,DIPPIN DOTS CEREAL,5dc1fca91dda2c0ad7da64ae
1164,511111416173,Candy & Sweets,CANDY_AND_SWEETS,5332fa12e4b03c9a25efd1e7,test brand @1598639215217,True,TEST BRANDCODE @1598639215217,5f494c6e04db711dd8fe87e7
1165,511111400608,Grocery,,5332f5f6e4b03c9a25efd0b4,LIPTON TEA Leaves,False,LIPTON TEA Leaves,5a021611e4b00efe02b02a57


In [51]:
#lets move onto the next df (re-importing for my testing convienence)
receipts_df = pd.read_json('old_data/receipts.json', orient='columns', lines=True, convert_dates=True)
receipts_df

Unnamed: 0,_id,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId
0,{'$oid': '5ff1e1eb0a720f0523000575'},500.0,"Receipt number 2 completed, bonus point schedu...",{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687531000},{'$date': 1609687536000},{'$date': 1609687531000},500.0,{'$date': 1609632000000},5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.00,5ff1e1eacfcf6c399c274ae6
1,{'$oid': '5ff1e1bb0a720f052300056b'},150.0,"Receipt number 5 completed, bonus point schedu...",{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687483000},{'$date': 1609687488000},{'$date': 1609687483000},150.0,{'$date': 1609601083000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.00,5ff1e194b6a9d73a3a9f1052
2,{'$oid': '5ff1e1f10a720f052300057a'},5.0,All-receipts receipt bonus,{'$date': 1609687537000},{'$date': 1609687537000},,{'$date': 1609687542000},,5.0,{'$date': 1609632000000},1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.00,5ff1e1f1cfcf6c399c274b0b
3,{'$oid': '5ff1e1ee0a7214ada100056f'},5.0,All-receipts receipt bonus,{'$date': 1609687534000},{'$date': 1609687534000},{'$date': 1609687534000},{'$date': 1609687539000},{'$date': 1609687534000},5.0,{'$date': 1609632000000},4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.00,5ff1e1eacfcf6c399c274ae6
4,{'$oid': '5ff1e1d20a7214ada1000561'},5.0,All-receipts receipt bonus,{'$date': 1609687506000},{'$date': 1609687506000},{'$date': 1609687511000},{'$date': 1609687511000},{'$date': 1609687506000},5.0,{'$date': 1609601106000},2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.00,5ff1e194b6a9d73a3a9f1052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,{'$oid': '603cc0630a720fde100003e6'},25.0,COMPLETE_NONPARTNER_RECEIPT,{'$date': 1614594147000},{'$date': 1614594147000},,{'$date': 1614594148000},,25.0,{'$date': 1597622400000},2.0,"[{'barcode': 'B076FJ92M4', 'description': 'mue...",REJECTED,34.96,5fc961c3b8cfca11a077dd33
1115,{'$oid': '603d0b710a720fde1000042a'},,,{'$date': 1614613361873},{'$date': 1614613361873},,{'$date': 1614613361873},,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33
1116,{'$oid': '603cf5290a720fde10000413'},,,{'$date': 1614607657664},{'$date': 1614607657664},,{'$date': 1614607657664},,,,,,SUBMITTED,,5fc961c3b8cfca11a077dd33
1117,{'$oid': '603ce7100a7217c72c000405'},25.0,COMPLETE_NONPARTNER_RECEIPT,{'$date': 1614604048000},{'$date': 1614604048000},,{'$date': 1614604049000},,25.0,{'$date': 1597622400000},2.0,"[{'barcode': 'B076FJ92M4', 'description': 'mue...",REJECTED,34.96,5fc961c3b8cfca11a077dd33


In [52]:
#clean up the columns!
receipts_df['receiptId'] = receipts_df['_id'].str['$oid']
receipts_df['createDate'] = receipts_df['createDate'].str['$date'].astype('datetime64[ms]')
receipts_df['dateScanned'] = receipts_df['dateScanned'].str['$date'].astype('datetime64[ms]')
receipts_df['finishedDate'] = receipts_df['finishedDate'].str['$date'].astype('datetime64[ms]')
receipts_df['modifyDate'] = receipts_df['modifyDate'].str['$date'].astype('datetime64[ms]')
receipts_df['purchaseDate'] = receipts_df['purchaseDate'].str['$date'].astype('datetime64[ms]')
receipts_df = receipts_df.drop('_id', axis=1)
receipts_df['pointsEarned'] = receipts_df['pointsEarned'].fillna(0)
receipts_df['purchasedItemCount'] = receipts_df['purchasedItemCount'].fillna(0)
receipts_df['totalSpent'] = receipts_df['totalSpent'].fillna(0)
receipts_df['bonusPointsEarned'] = receipts_df['bonusPointsEarned'].fillna(0)
receipts_df

Unnamed: 0,bonusPointsEarned,bonusPointsEarnedReason,createDate,dateScanned,finishedDate,modifyDate,pointsAwardedDate,pointsEarned,purchaseDate,purchasedItemCount,rewardsReceiptItemList,rewardsReceiptStatus,totalSpent,userId,receiptId
0,500.0,"Receipt number 2 completed, bonus point schedu...",2021-01-03 15:25:31.000,2021-01-03 15:25:31.000,2021-01-03 15:25:31,2021-01-03 15:25:36.000,{'$date': 1609687531000},500.0,2021-01-03 00:00:00,5.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,26.00,5ff1e1eacfcf6c399c274ae6,5ff1e1eb0a720f0523000575
1,150.0,"Receipt number 5 completed, bonus point schedu...",2021-01-03 15:24:43.000,2021-01-03 15:24:43.000,2021-01-03 15:24:43,2021-01-03 15:24:48.000,{'$date': 1609687483000},150.0,2021-01-02 15:24:43,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,11.00,5ff1e194b6a9d73a3a9f1052,5ff1e1bb0a720f052300056b
2,5.0,All-receipts receipt bonus,2021-01-03 15:25:37.000,2021-01-03 15:25:37.000,NaT,2021-01-03 15:25:42.000,,5.0,2021-01-03 00:00:00,1.0,"[{'needsFetchReview': False, 'partnerItemId': ...",REJECTED,10.00,5ff1e1f1cfcf6c399c274b0b,5ff1e1f10a720f052300057a
3,5.0,All-receipts receipt bonus,2021-01-03 15:25:34.000,2021-01-03 15:25:34.000,2021-01-03 15:25:34,2021-01-03 15:25:39.000,{'$date': 1609687534000},5.0,2021-01-03 00:00:00,4.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,28.00,5ff1e1eacfcf6c399c274ae6,5ff1e1ee0a7214ada100056f
4,5.0,All-receipts receipt bonus,2021-01-03 15:25:06.000,2021-01-03 15:25:06.000,2021-01-03 15:25:11,2021-01-03 15:25:11.000,{'$date': 1609687506000},5.0,2021-01-02 15:25:06,2.0,"[{'barcode': '4011', 'description': 'ITEM NOT ...",FINISHED,1.00,5ff1e194b6a9d73a3a9f1052,5ff1e1d20a7214ada1000561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,25.0,COMPLETE_NONPARTNER_RECEIPT,2021-03-01 10:22:27.000,2021-03-01 10:22:27.000,NaT,2021-03-01 10:22:28.000,,25.0,2020-08-17 00:00:00,2.0,"[{'barcode': 'B076FJ92M4', 'description': 'mue...",REJECTED,34.96,5fc961c3b8cfca11a077dd33,603cc0630a720fde100003e6
1115,0.0,,2021-03-01 15:42:41.873,2021-03-01 15:42:41.873,NaT,2021-03-01 15:42:41.873,,0.0,NaT,0.0,,SUBMITTED,0.00,5fc961c3b8cfca11a077dd33,603d0b710a720fde1000042a
1116,0.0,,2021-03-01 14:07:37.664,2021-03-01 14:07:37.664,NaT,2021-03-01 14:07:37.664,,0.0,NaT,0.0,,SUBMITTED,0.00,5fc961c3b8cfca11a077dd33,603cf5290a720fde10000413
1117,25.0,COMPLETE_NONPARTNER_RECEIPT,2021-03-01 13:07:28.000,2021-03-01 13:07:28.000,NaT,2021-03-01 13:07:29.000,,25.0,2020-08-17 00:00:00,2.0,"[{'barcode': 'B076FJ92M4', 'description': 'mue...",REJECTED,34.96,5fc961c3b8cfca11a077dd33,603ce7100a7217c72c000405


In [33]:
#let's create our receipt item DF
receipts_item_df = pd.DataFrame.from_dict(receipts_df[['rewardsReceiptItemList', 'receiptId']]).set_index('receiptId')
receipts_item_df

Unnamed: 0_level_0,rewardsReceiptItemList
receiptId,Unnamed: 1_level_1
5ff1e1eb0a720f0523000575,"[{'barcode': '4011', 'description': 'ITEM NOT ..."
5ff1e1bb0a720f052300056b,"[{'barcode': '4011', 'description': 'ITEM NOT ..."
5ff1e1f10a720f052300057a,"[{'needsFetchReview': False, 'partnerItemId': ..."
5ff1e1ee0a7214ada100056f,"[{'barcode': '4011', 'description': 'ITEM NOT ..."
5ff1e1d20a7214ada1000561,"[{'barcode': '4011', 'description': 'ITEM NOT ..."
...,...
603cc0630a720fde100003e6,"[{'barcode': 'B076FJ92M4', 'description': 'mue..."
603d0b710a720fde1000042a,
603cf5290a720fde10000413,
603ce7100a7217c72c000405,"[{'barcode': 'B076FJ92M4', 'description': 'mue..."


In [34]:
#this was a toughie. I had to explode the DF, but inherintly, json_normalize drops
#index values, so I had to maintain that information in a temp dict.
df_temp = pd.DataFrame(receipts_item_df.rewardsReceiptItemList.explode())
receipts_item_df = pd.json_normalize(receipts_item_df.rewardsReceiptItemList.explode()).set_index(df_temp.index).reset_index()
print(receipts_item_df.columns)
receipts_item_df = receipts_item_df[['receiptId', 'brandCode', 'barcode', 'description', 'finalPrice', 'partnerItemId', 'pointsEarned', 'pointsPayerId', 'quantityPurchased', 'rewardsGroup', 'rewardsProductPartnerId', 'targetPrice', 'userFlaggedNewItem', 'needsFetchReview']]
receipts_item_df

Index(['receiptId', 'barcode', 'description', 'finalPrice', 'itemPrice',
       'needsFetchReview', 'partnerItemId', 'preventTargetGapPoints',
       'quantityPurchased', 'userFlaggedBarcode', 'userFlaggedNewItem',
       'userFlaggedPrice', 'userFlaggedQuantity', 'needsFetchReviewReason',
       'pointsNotAwardedReason', 'pointsPayerId', 'rewardsGroup',
       'rewardsProductPartnerId', 'userFlaggedDescription',
       'originalMetaBriteBarcode', 'originalMetaBriteDescription', 'brandCode',
       'competitorRewardsGroup', 'discountedItemPrice',
       'originalReceiptItemText', 'itemNumber',
       'originalMetaBriteQuantityPurchased', 'pointsEarned', 'targetPrice',
       'competitiveProduct', 'originalFinalPrice',
       'originalMetaBriteItemPrice', 'deleted', 'priceAfterCoupon',
       'metabriteCampaignId'],
      dtype='object')


Unnamed: 0,receiptId,brandCode,barcode,description,finalPrice,partnerItemId,pointsEarned,pointsPayerId,quantityPurchased,rewardsGroup,rewardsProductPartnerId,targetPrice,userFlaggedNewItem,needsFetchReview
0,5ff1e1eb0a720f0523000575,,4011,ITEM NOT FOUND,26.00,1,,,5.0,,,,True,False
1,5ff1e1bb0a720f052300056b,,4011,ITEM NOT FOUND,1,1,,,1.0,,,,,
2,5ff1e1bb0a720f052300056b,,028400642255,DORITOS TORTILLA CHIP SPICY SWEET CHILI REDUCE...,10.00,2,,5332f5fbe4b03c9a25efd0ba,1.0,DORITOS SPICY SWEET CHILI SINGLE SERVE,5332f5fbe4b03c9a25efd0ba,,True,True
3,5ff1e1f10a720f052300057a,,,,,1,,,,,,,True,False
4,5ff1e1ee0a7214ada100056f,,4011,ITEM NOT FOUND,28.00,1,,,4.0,,,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7376,603d0b710a720fde1000042a,,,,,,,,,,,,,
7377,603cf5290a720fde10000413,,,,,,,,,,,,,
7378,603ce7100a7217c72c000405,,B076FJ92M4,mueller austria hypergrind precision electric ...,22.97,0,,,1.0,,,,,
7379,603ce7100a7217c72c000405,,B07BRRLSVC,thindust summer face mask - sun protection nec...,11.99,1,,,1.0,,,,,


In [36]:
#manual searching reveils 2600 brand codes in the raw data... am I retaining that all
receipts_item_df[~receipts_item_df.brandCode.isnull()]
#looks good

Unnamed: 0,receiptId,brandCode,barcode,description,finalPrice,partnerItemId,pointsEarned,pointsPayerId,quantityPurchased,rewardsGroup,rewardsProductPartnerId,targetPrice,userFlaggedNewItem,needsFetchReview
8,5ff1e1cd0a720f052300056f,MISSION,,MSSN TORTLLA,2.23,1009,,,1.0,,,,,
9,5ff1e1a40a720f0523000569,BRAND,046000832517,"Old El Paso Mild Chopped Green Chiles, 4.5 Oz",10.00,0,,5332f5f3e4b03c9a25efd0ae,1.0,OLD EL PASO BEANS & PEPPERS,5332f5f3e4b03c9a25efd0ae,,,
150,5ff29be20a7214ada1000571,KRAFT EASY CHEESE,044000000745,-Cheddar,1.00,1030,,,1.0,SARGENTO RICOTTA CHEESE,5e7cf838f221c312e698a628,,,
308,5ff618e30a7214ada10005fa,PEPSI,012000809941,REGULAR PEPSI SODA COLA CAN 12 CT 144 OZ,1.96,1016,,5332f5fbe4b03c9a25efd0ba,2.0,PEPSI 12 OZ 12 PACK,5332f5fbe4b03c9a25efd0ba,,,
310,5ff618e30a7214ada10005fa,DORITOS,028400642033,DORITOS TORTILLA CHIP NACHO CHEESE BAG 7.625 OZ,6.90,1022,,5332f5fbe4b03c9a25efd0ba,2.0,DORITOS NACHO CHEESE MULTI SERVE,5332f5fbe4b03c9a25efd0ba,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6871,60189cc10a7214ad28000050,BRAND,070470149394,Yoplait Fiber One Non-Fat Yogurt - Peach & Van...,10.00,0,,5332f5f3e4b03c9a25efd0ae,1.0,YOPLAIT FIBER ONE YOGURT,5332f5f3e4b03c9a25efd0ae,,,
6874,60189cae0a7214ad2800004f,MISSION,,MSSN TORTLLA,2.23,1009,,,1.0,,,,,
7123,602454340a720f05a80001ae,VIVA,036000494129,VIVA PAPER TOWEL 1 PACK 2 COUNT 83 SHEET,3.92,1010,39.2,550b2565e4b001d5e9e4146f,1.0,VIVA MULTI SURFACE CLOTH PAPER TOWEL,550b2565e4b001d5e9e4146f,,,
7170,602538b30a7214d8e9000248,BRAND,042800108005,"Totino's Triple Meat Party Pizza, 10.5 Oz",10.00,0,,5332f5f3e4b03c9a25efd0ae,1.0,TOTINO'S PARTY PIZZA - SINGLE PACK,5332f5f3e4b03c9a25efd0ae,,,


**Now we have 4 cleaned dataframes!!!! lets save these**

In [37]:
receipts_item_df.to_csv('clean_data/receipts_items.csv')
receipts_df.to_csv('clean_data/receipts.csv')
brands_df.to_csv('clean_data/brands.csv')
users_df.to_csv('clean_data/users.csv')

In [39]:
#any other data quality issues? 

In [47]:
   def describex(data):
        stats = data.describe()
        skewness = data.skew()
        kurtosis = data.kurtosis()
        skewness_df = pd.DataFrame({'skewness':skewness}).T
        kurtosis_df = pd.DataFrame({'kurtosis':kurtosis}).T
        return stats.append([kurtosis_df,skewness_df])

In [48]:
describex(receipts_item_df)

  skewness = data.skew()
  kurtosis = data.kurtosis()


Unnamed: 0,quantityPurchased,finalPrice,partnerItemId,pointsEarned,targetPrice,userFlaggedNewItem,needsFetchReview
count,6767.0,,,,,,
mean,1.386139,,,,,,
std,1.204363,,,,,,
min,1.0,,,,,,
25%,1.0,,,,,,
50%,1.0,,,,,,
75%,1.0,,,,,,
max,17.0,,,,,,
kurtosis,36.002117,207.389462,-0.136408,5.959663,0.065721,0.0,-0.917245
skewness,5.113736,11.383034,-1.017486,2.6551,-1.437151,0.0,1.041642


In [53]:
describex(receipts_df)

  skewness = data.skew()
  kurtosis = data.kurtosis()


Unnamed: 0,bonusPointsEarned,pointsEarned,purchasedItemCount,totalSpent
count,1119.0,1119.0,1119.0,1119.0
mean,116.137623,318.902055,8.374441,47.554111
std,240.243665,1042.554968,46.614586,273.944502
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,5.0,1.0,1.0
75%,45.0,231.4,2.0,25.0
max,750.0,10199.8,689.0,4721.95
kurtosis,2.401638,46.746804,111.29771,200.366105
skewness,2.004417,6.369047,9.560635,12.990598
