In [1]:
import pandas as pd

In [224]:
def _read_raw_csv(path):
    """Read one raw CSV export that uses ';' as separator and ',' as decimal mark."""
    return pd.read_csv(path, sep=';', decimal=',')


customers = _read_raw_csv('../data/01_raw/Customers.csv')

orders = _read_raw_csv('../data/01_raw/Orders.csv')
# Parse the order date into datetimes, then keep the year in its own column
# because several later cells filter on it.
orders["OrderDate"] = pd.to_datetime(orders["OrderDate"])
orders["OrderYear"] = orders["OrderDate"].dt.year

order_details = _read_raw_csv('../data/01_raw/OrderDetails.csv')
# Value of each order line: quantity times unit price.
order_details["orderTotalValue"] = order_details["Quantity"] * order_details["UnitPrice"]

In [212]:
# Orders placed in 2016, reduced to the two keys needed for the join.
is_2016_order = orders["OrderDate"].dt.year == 2016
orders_year = orders.loc[is_2016_order, ["OrderID", "CustomerID"]].reset_index(drop=True)

# Value of every order line; afterwards keep only the columns used below.
# NOTE: this rebinds `order_details` to the narrowed frame for all later cells.
order_details["orderTotalValue"] = order_details["Quantity"] * order_details["UnitPrice"]
order_details = order_details.loc[:, ["OrderID", "UnitPrice", "Quantity", "orderTotalValue"]]

# Attach the customer to each order line of a 2016 order (inner join on OrderID).
orders_with_order_details = pd.merge(order_details, orders_year, on="OrderID")
orders_with_order_details = orders_with_order_details.loc[
    :, ["CustomerID", "OrderID", "orderTotalValue"]
]

# Sum the line values per (customer, order); OrderID is not kept in the output.
per_order_totals = orders_with_order_details.groupby(
    ["CustomerID", "OrderID"], as_index=False
).sum()
orders_grouped = per_order_totals[["CustomerID", "orderTotalValue"]]

# Largest orders first, restricted to orders worth at least 10000.
orders_grouped = orders_grouped.sort_values("orderTotalValue", ascending=False)
orders_grouped = orders_grouped[orders_grouped["orderTotalValue"] >= 10000]

# Query 48

In [313]:
# Order/customer keys of every order placed in 2016 (one row per order).
placed_in_2016 = orders["OrderYear"].eq(2016)
relevant_customers = orders.loc[placed_in_2016, ["OrderID", "CustomerID"]].reset_index(
    drop=True
)

In [314]:
# Attach the value of every order line to the customer who placed the order.
#
# `relevant_customers` already carries CustomerID for each 2016 OrderID, so it
# is joined straight to the order lines. The previous extra join back onto
# `orders` only duplicated CustomerID (as CustomerID_x / CustomerID_y) and then
# needed an in-place rename to undo the suffix.
# NOTE(review): equivalent to the old result as long as OrderID is unique in
# `orders` -- confirm against the raw data.
customer_values_flat = relevant_customers.merge(
    order_details[["OrderID", "orderTotalValue"]], on="OrderID"
)[["CustomerID", "orderTotalValue"]]
customer_values_flat.head()

Unnamed: 0,CustomerID,orderTotalValue
0,OLDWO,760.0
1,OLDWO,900.0
2,WELLI,140.0
3,LAUGB,42.0
4,LAUGB,70.0


In [315]:
# Collapse the order lines to one row per customer holding the summed value.
lines_by_customer = customer_values_flat.groupby("CustomerID", as_index=False)
customer_values_grouped = lines_by_customer.sum()

customer_values_grouped.head()

Unnamed: 0,CustomerID,orderTotalValue
0,ALFKI,2302.2
1,ANATR,514.4
2,ANTON,660.0
3,AROUT,5838.5
4,BERGS,8110.55


In [318]:
# Assign every customer to a value group based on the summed order value.

# Grouping ranges ("from-to", ascending) mapped to the group name, plus the
# name used for everything above the last range.
customer_groups = {
    "0-1000": "low",
    "1001-5000": "medium",
    "5001-10000": "high",
}
default_customer_group = "very high"

# Turn the ranges into contiguous bin edges: the lowest "from" value, every
# "to" value, and an open upper end for the default group. The totals are
# floats, so matching the integer ranges literally left gaps: 1000.50 is
# neither in 0-1000 nor in 1001-5000, stayed unlabelled and was then filled
# with the default "very high". Contiguous right-closed bins give
# [0, 1000] -> low, (1000, 5000] -> medium, (5000, 10000] -> high,
# (10000, inf) -> very high, which agrees with the old labels on every value
# that lies inside one of the listed ranges.
range_bounds = [tuple(int(bound) for bound in key.split("-")) for key in customer_groups]
bin_edges = [range_bounds[0][0], *(upper for _, upper in range_bounds), float("inf")]
group_labels = [*customer_groups.values(), default_customer_group]

# pd.cut replaces the per-range loop over Series.between(..., inclusive=True);
# the boolean `inclusive` argument is deprecated since pandas 1.3 and rejected
# by pandas 2.x. The whole column is rewritten on every run, so re-executing
# the cell cannot keep stale labels.
# Totals below the lowest bound (or missing) are left as NaN instead of being
# silently labelled "very high".
# .astype(object) keeps the column as plain strings rather than a Categorical.
customer_values_grouped["CustomerGroup"] = pd.cut(
    customer_values_grouped["orderTotalValue"],
    bins=bin_edges,
    labels=group_labels,
    include_lowest=True,
).astype(object)

In [None]:
# Sort the result by CustomerID. Reassigning instead of using inplace=True
# leaves the frame produced by the earlier cell untouched and keeps the step
# chainable; the sorted rows are the same.
customer_values_grouped = customer_values_grouped.sort_values("CustomerID")

In [327]:
# Final result: company name next to each customer's total value and group.

# Inner join on CustomerID, so only customers present in both frames remain.
result_columns = ["CustomerID", "CompanyName", "orderTotalValue", "CustomerGroup"]
customers_with_groups = pd.merge(customers, customer_values_grouped, on="CustomerID")
result = customers_with_groups[result_columns]
result.head()

Unnamed: 0,CustomerID,CompanyName,orderTotalValue,CustomerGroup
0,ALFKI,Alfreds Futterkiste,2302.2,medium
1,ANATR,Ana Trujillo Emparedados y helados,514.4,low
2,ANTON,Antonio Moreno Taquería,660.0,low
3,AROUT,Around the Horn,5838.5,high
4,BERGS,Berglunds snabbköp,8110.55,high
