Write a query that:

1. Returns the top 5 users and their emails by total gross orders (only successful orders) in the last 1 year by active vendor type with ordering by the oldest user on the platform
2. Vendor ID and vendor type that did the most in amt in non-cancelled orders in the last 3 years
3. Vendor ID and vendor type with the most amt in any in cancelled orders

Python: implement query 1 in Python using only the standard library. No spark, pandas, etc. Imagine you have a CSV for each table with the same headers.

In [1]:
# import the required modules
from datetime import datetime,timedelta
from collections import Counter
from itertools import groupby
import operator
import csv

In [2]:
# read csv file separated by ;, it has been assumed that file is with headers
def read_csv(filename):
    ls = []
    with open(filename, newline='') as csvfile:
        records = csv.reader(csvfile, delimiter=';', quotechar='"')
        try:
            for row in records:
                ls.append(row)
        except csv.Error as e:
            sys.exit('file {}, line {}: {}'.format(filename, records.line_num, e))
    return ls[1:]

In [3]:
# read all csvs
listusers = read_csv('C:/Users/onepoint/Desktop/DIM_USERS.csv')
listorders = read_csv('C:/Users/onepoint/Desktop/FCT_ORDERS.csv')
listvendors = read_csv('C:/Users/onepoint/Desktop/DIM_VENDORS.csv')

In [4]:
# loop to join and filter only successful orders (status == '0') in the last 1 year (placed_at > currentdate - 1 year) by active vendor type (vendor_is_active == true)
listmerge= []
for (order_id,status,amt,placed_at,cancelled_at,returned_at,failed_at,vendor_id_fk,user_id_fk) in listorders:
    for (user_id, name, phone, added_at, removed_at, email, is_active) in listusers:
        for (vendor_id, vendor_type, vendor_added_at,vendor_removed_at,vendor_is_active,vendor_is_platform_online) in listvendors:
            if user_id == user_id_fk and status == '0' and datetime.strptime(placed_at, '%d-%m-%Y %H:%M:%S') > (datetime.now() - timedelta(days=365)) and vendor_is_active == 'true' :
                Dict = {}
                Dict['order_id'] = int(order_id)
                Dict['user_id'] = int(user_id)
                Dict['added_at'] = datetime.strptime(added_at, '%d-%m-%Y')
                listmerge.append(Dict)

In [5]:
# to get top 5 users and their emails by total gross orders
# group by user_id and get the total count of orders (gross orders)
# output {'user_id':'count'}

d = sorted(listmerge, key = operator.itemgetter('user_id')) # sort the dict first by user_id
orderCount = {}
for uid, data in groupby(d, key = operator.itemgetter('user_id')):
    cnt = 0
    for row in data:
        cnt = cnt + 1
    orderCount[uid] = cnt

In [6]:
# get top 5 user id with most occurrence

top5 = dict(Counter(orderCount).most_common(5))

In [7]:
top5

{12: 3549, 2: 3042, 40: 3042, 64: 3042, 94: 3042}

In [8]:
# now we have top 5 users with most orders, get the other details like name, email from listusers
listOut = []
for uid in top5:
    for (user_id, name, phone, added_at, removed_at, email, is_active) in listusers:
        if (int(user_id) == uid):
            sub = {}
            sub['user_id'] = user_id
            sub['name'] = name
            sub['added_at'] = datetime.strptime(added_at, '%d-%m-%Y')
            sub['email'] = email
            listOut.append(sub)

In [9]:
listOut.sort(key=lambda item:(item['added_at']))

In [10]:
listOut

[{'user_id': '64',
  'name': 'James',
  'added_at': datetime.datetime(2020, 1, 15, 0, 0),
  'email': 'Grover.Taft@GMAIL.COM'},
 {'user_id': '94',
  'name': 'Warren',
  'added_at': datetime.datetime(2020, 1, 29, 0, 0),
  'email': 'Lyndon.Clinton@GMAIL.COM'},
 {'user_id': '40',
  'name': 'Ulysses',
  'added_at': datetime.datetime(2020, 2, 14, 0, 0),
  'email': 'Bill.Pierce@GMAIL.COM'},
 {'user_id': '2',
  'name': 'Herbert',
  'added_at': datetime.datetime(2020, 2, 25, 0, 0),
  'email': 'John.Quincy@GMAIL.COM'},
 {'user_id': '12',
  'name': 'Millard',
  'added_at': datetime.datetime(2020, 2, 26, 0, 0),
  'email': 'George.Fillmore@GMAIL.COM'}]

In [11]:
#Vendor ID and vendor type that did the most in amt in non-cancelled orders in the last 3 years

SELECT TOP 1 v.ID, v.TYPE, SUM(o.AMT) fct_orders o
INNER JOIN dim_vendors v ON v.ID = o.VENDOR_ID
WHERE o.STATUS != -1 AND o.PLACED_AT>= DATEADD(year, -3, GETDATE())
GROUP BY v.ID,v.TYPE
ORDER BY SUM(o.AMT) DESC

In [12]:
#Vendor ID and vendor type with the most in AMT in cancelled orders

SELECT TOP 1 v.ID, v.TYPE, SUM(o.AMT) fct_orders o
INNER JOIN dim_vendors v ON v.ID = o.VENDOR_ID
WHERE o.STATUS = -1
GROUP BY v.ID,v.TYPE
ORDER BY SUM(o.AMT) DESC