In [1]:
import os
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

import chardet


In [2]:
# Folder holding the Northwind CSV export (a relative path).
dir_root = "../../data/AutoML_DatasetsV1/Northwind/"


def _table_path(file_name):
    """Return the path of ``file_name`` inside ``dir_root``."""
    return os.path.join(dir_root, file_name)


# One path per Northwind table.
path_categories = _table_path("Categories.csv")
path_customers = _table_path("Customers.csv")
path_employees = _table_path("Employees.csv")
path_employee_territories = _table_path("EmployeeTerritories.csv")

path_order_details = _table_path("Order_Details.csv")
path_orders = _table_path("Orders.csv")
path_products = _table_path("Products.csv")
path_region = _table_path("Region.csv")

path_suppliers = _table_path("Suppliers.csv")
path_territories = _table_path("Territories.csv")

# 50-character dashed separator used when printing reports.
STR_LINE = "-" * 50

In [3]:
def detect_encoding(file_path, sample_size=100000):
    """Guess a file's text encoding from its leading bytes using chardet.

    Parameters
    ----------
    file_path : str or path-like
        File to inspect; it is opened in binary mode.
    sample_size : int or None, default 100000
        Maximum number of bytes read from the start of the file and passed to
        ``chardet.detect``. ``None`` (or a negative value) reads the whole file.

    Returns
    -------
    The ``'encoding'`` entry of the dict returned by ``chardet.detect``.
    chardet reports ``None`` there when it cannot decide, so callers should be
    prepared for that value.

    Notes
    -----
    Only a prefix of the file is examined, so the verdict (for example
    ``'ascii'``) is not guaranteed to hold for bytes beyond ``sample_size``.
    The full chardet result dict is printed as a side effect.
    """
    # Read the sample first so the file handle is released before detection.
    with open(file_path, 'rb') as rawdata:
        sample = rawdata.read(sample_size)
    result = chardet.detect(sample)
    print(result)
    return result['encoding']

In [4]:
def print_df(df_data, title="DataFrame"):
    """Print a plain-text overview of ``df_data``.

    The report contains, in order: the title, the shape, ``describe()``,
    the dtypes, ``head()``, and for every column its number of distinct
    values. Columns with fewer than 10 distinct values also get their
    ``value_counts()`` listed. Sections are separated by ``STR_LINE``.

    Nothing is returned; the report is written with ``print``.
    """
    # Fixed leading sections, each followed by a separator line.
    sections = [
        title,
        f"shape: {df_data.shape}",
        f"describe: \n{df_data.describe()}",
        f"dtypes: \n{df_data.dtypes}",
        f"head: \n{df_data.head()}",
    ]
    chunks = [f"{STR_LINE}\n"]
    chunks.extend(f"{section}\n{STR_LINE}\n" for section in sections)
    chunks.append(f"Unique values: \n{STR_LINE}\n")

    # Per-column cardinality; low-cardinality columns show their counts too.
    for col_name in df_data.columns:
        col_values = df_data[col_name]
        distinct = len(col_values.unique())
        chunks.append(f"column: {col_name}: {distinct}\n{STR_LINE}\n")
        if distinct < 10:
            chunks.append(f"{col_values.value_counts()}\n{STR_LINE}\n")

    print("".join(chunks))

In [6]:
def load_table(csv_path, title):
    """Load one CSV file and print its overview report.

    The encoding is detected with ``detect_encoding`` (which prints chardet's
    result), the file is read with ``pd.read_csv`` using that encoding, and
    the resulting frame is reported via ``print_df`` under ``title``.

    Returns the loaded DataFrame.
    """
    table_encoding = detect_encoding(csv_path)
    df_table = pd.read_csv(csv_path, encoding=table_encoding)
    print_df(df_table, title)
    return df_table


# Same tables, same order and same report titles as the original
# copy-pasted statements; the detected encoding is now local to the helper.
df_categories = load_table(path_categories, "Categories")
df_customers = load_table(path_customers, "Customers")
df_orders = load_table(path_orders, "Orders")
df_order_details = load_table(path_order_details, "Order Details")
df_employees = load_table(path_employees, "Employees")
df_regions = load_table(path_region, "Regions")
df_employee_territories = load_table(path_employee_territories, "Employee Territories")
df_products = load_table(path_products, "Products")
df_suppliers = load_table(path_suppliers, "Suppliers")
df_territories = load_table(path_territories, "Territories")

{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
--------------------------------------------------
Categories
--------------------------------------------------
shape: (8, 3)
--------------------------------------------------
describe: 
       CategoryID
count     8.00000
mean      4.50000
std       2.44949
min       1.00000
25%       2.75000
50%       4.50000
75%       6.25000
max       8.00000
--------------------------------------------------
dtypes: 
CategoryID       int64
CategoryName    object
Description     object
dtype: object
--------------------------------------------------
head: 
   CategoryID    CategoryName  \
0           1       Beverages   
1           2      Condiments   
2           3     Confections   
3           4  Dairy Products   
4           5  Grains/Cereals   

                                         Description  
0        Soft drinks, coffees, teas, beers, and ales  
1  Sweet and savory sauces, relishes, spreads, an...  
2                Desserts, 

In [19]:
def total_sale_amt(df_data):
    """Unfinished placeholder: visit every row of ``df_data`` and do nothing.

    The loop body is empty, so no amount is computed and the function
    returns ``None``.
    """
    # TODO: the per-row computation has not been written yet.
    for _ in df_data.iterrows():
        continue

def sale_summary(df_data, max_rows=10, verbose=True):
    """Total the ordered quantity per ``OrderID``.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Order-detail rows; must contain ``OrderID`` and ``Quantity`` columns.
    max_rows : int or None, default 10
        Only the first ``max_rows`` rows are summarised. The default keeps the
        10-row limit this function has always applied; pass ``None`` to
        summarise every row.
    verbose : bool, default True
        When true, print ``df_data.head()`` and, for every order, its rows
        followed by their ``sum``/``min`` aggregate. Pass ``False`` to
        suppress all printing.

    Returns
    -------
    pandas.DataFrame
        One row per order with the columns ``OrderID`` and ``num_products``
        (the sum of ``Quantity``), ordered by ``OrderID``. An empty input
        yields an empty frame that still carries both columns.
    """
    if verbose:
        print(df_data.head())

    df_limited = df_data if max_rows is None else df_data.head(max_rows)

    if verbose:
        # Diagnostic dump of each order's rows and their aggregates.
        for order_id, df_details in df_limited.groupby('OrderID'):
            print(f"{order_id}\n{df_details}")
            df_agg = df_details.agg(['sum', 'min'])
            print(f"{type(df_agg)}{df_agg}")

    # Single vectorized aggregation instead of building records per group.
    df_summary = (
        df_limited.groupby('OrderID')['Quantity']
        .sum()
        .rename('num_products')
        .reset_index()
    )
    return df_summary
    
    
    
# Summarise ordered quantities per order from the order-details table.
# sale_summary only considers the first 10 rows of its input, so this is a
# preview rather than a full summary. As the cell's last expression, the
# returned DataFrame is what the notebook displays.
sale_summary(df_order_details)

   OrderID  ProductID  UnitPrice  Quantity  Discount
0    10248         11       14.0        12       0.0
1    10248         42        9.8        10       0.0
2    10248         72       34.8         5       0.0
3    10249         14       18.6         9       0.0
4    10249         51       42.4        40       0.0
10248
   OrderID  ProductID  UnitPrice  Quantity  Discount
0    10248         11       14.0        12       0.0
1    10248         42        9.8        10       0.0
2    10248         72       34.8         5       0.0
<class 'pandas.core.frame.DataFrame'>     OrderID  ProductID  UnitPrice  Quantity  Discount
sum    30744        125       58.6        27       0.0
min    10248         11        9.8         5       0.0
10249
   OrderID  ProductID  UnitPrice  Quantity  Discount
3    10249         14       18.6         9       0.0
4    10249         51       42.4        40       0.0
<class 'pandas.core.frame.DataFrame'>     OrderID  ProductID  UnitPrice  Quantity  Discount
sum  

Unnamed: 0,OrderID,num_products
0,10248,27
1,10249,49
2,10250,60
3,10251,21
