# **Northwind Traders**

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# file imports

# customers.csv and products.csv were saved in the Windows "ANSI" code page.
# The codec is spelled out as 'cp1252' because Python's 'ansi' alias is only
# registered on Windows and raises LookupError on Linux/macOS.
# NOTE(review): assumes the files are Windows-1252 (the ANSI code page of
# Western-European Windows installs) - confirm if they came from another locale.
customers = pd.read_csv('./data/customers.csv', encoding='cp1252')
products = pd.read_csv('./data/products.csv', encoding='cp1252')

# the remaining files are read with pandas' default encoding
# (categories.csv is loaded once; the original cell read it twice)
categories = pd.read_csv('./data/categories.csv')
employees = pd.read_csv('./data/employees.csv')
order_details = pd.read_csv('./data/order_details.csv')
orders = pd.read_csv('./data/orders.csv')
shippers = pd.read_csv('./data/shippers.csv')

## **Data Manipulation**

In [3]:
# Enrich every order line with the attributes of its product and category.

# Lookup tables keyed by their id so they can be joined on the matching
# column of the left-hand frame.
products_by_id = products.set_index('productID')
categories_by_id = categories.set_index('categoryID')

# Join keys and descriptive columns that are not kept in the combined frame.
unused_columns = ['productID', 'categoryID', 'description', 'discontinued', 'quantityPerUnit']

# A 'unitPrice' column coming from products clashes with the one already in
# order_details and therefore receives the '_p' suffix; the two are renamed
# to tell the price on the order line apart from the product's own price.
price_column_names = {
    'unitPrice': 'sellingPrice',
    'unitPrice_p': 'originalPrice',
}

order_lines_with_product = order_details.join(products_by_id, on='productID', rsuffix='_p')
order_lines_with_category = order_lines_with_product.join(categories_by_id, on='categoryID', rsuffix='__r')

order_details_combined = (
    order_lines_with_category
    .drop(columns=unused_columns)
    .rename(columns=price_column_names)
)

In [4]:
# Build one wide orders frame: every order gets the attributes of its
# shipper, customer and employee attached.

# Lookup tables keyed by their id; columns that are not wanted in the
# combined frame are removed before joining.
shippers_by_id = shippers.set_index('shipperID')
customers_by_id = customers.drop(columns=['contactName', 'contactTitle']).set_index('customerID')
employees_by_id = employees.drop(columns='reportsTo').set_index('employeeID')

# Right-hand columns whose name already exists on the left receive the suffix
# of their join ('_s', '_c', '_e'); this mapping turns the resulting names
# into self-describing ones.
readable_names = {
    'companyName': 'shipperName',
    'companyName_c': 'customerName',
    'city': 'customerCity',
    'country': 'customerCountry',
    'title': 'employeeTitle',
    'city_e': 'employeeCity',
    'country_e': 'employeeCountry',
}

orders_with_lookups = (
    orders
    .join(shippers_by_id, on='shipperID', rsuffix='_s')
    .join(customers_by_id, on='customerID', rsuffix='_c')
    .join(employees_by_id, on='employeeID', rsuffix='_e')
)

# The id columns were only needed as join keys.
orders_combined = (
    orders_with_lookups
    .drop(columns=['customerID', 'employeeID', 'shipperID'])
    .rename(columns=readable_names)
)

In [5]:
# Parse the date columns from strings into datetimes so they support date arithmetic.
for date_column in ('orderDate', 'requiredDate', 'shippedDate'):
    orders_combined[date_column] = pd.to_datetime(orders_combined[date_column])

# Whole days between placing an order and shipping it
# (missing when shippedDate is missing).
shipped_after_order = orders_combined['shippedDate'] - orders_combined['orderDate']
orders_combined['deliveryTime'] = shipped_after_order.dt.days

# Whole days by which shipping overshot the required date. Orders shipped on
# or before the required date (value below 1) are set to NaN, so only real
# delays carry a number.
days_past_required = (orders_combined['shippedDate'] - orders_combined['requiredDate']).dt.days
orders_combined['daysDelayed'] = np.where(days_past_required < 1, np.nan, days_past_required)

orders_combined

Unnamed: 0,orderID,orderDate,requiredDate,shippedDate,freight,shipperName,customerName,customerCity,customerCountry,employeeName,employeeTitle,employeeCity,employeeCountry,deliveryTime,daysDelayed
0,10248,2013-07-04,2013-08-01,2013-07-16,32.38,Federal Shipping,Vins et alcools Chevalier,Reims,France,Steven Buchanan,Sales Manager,London,UK,12.0,
1,10249,2013-07-05,2013-08-16,2013-07-10,11.61,Speedy Express,Toms Spezialitäten,Münster,Germany,Michael Suyama,Sales Representative,London,UK,5.0,
2,10250,2013-07-08,2013-08-05,2013-07-12,65.83,United Package,Hanari Carnes,Rio de Janeiro,Brazil,Margaret Peacock,Sales Representative,New York,USA,4.0,
3,10251,2013-07-08,2013-08-05,2013-07-15,41.34,Speedy Express,Victuailles en stock,Lyon,France,Janet Leverling,Sales Representative,New York,USA,7.0,
4,10252,2013-07-09,2013-08-06,2013-07-11,51.30,United Package,Suprêmes délices,Charleroi,Belgium,Margaret Peacock,Sales Representative,New York,USA,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
825,11073,2015-05-05,2015-06-02,NaT,24.95,United Package,Pericles Comidas clásicas,Mexico City,Mexico,Andrew Fuller,Vice President Sales,New York,USA,,
826,11074,2015-05-06,2015-06-03,NaT,18.44,United Package,Simons bistro,Kobenhavn,Denmark,Robert King,Sales Representative,London,UK,,
827,11075,2015-05-06,2015-06-03,NaT,6.19,United Package,Richter Supermarkt,Genève,Switzerland,Laura Callahan,Sales Manager,New York,USA,,
828,11076,2015-05-06,2015-06-03,NaT,38.28,United Package,Bon app',Marseille,France,Margaret Peacock,Sales Representative,New York,USA,,


In [6]:
# Discount as the fraction by which the selling price is below the product's
# original price, rounded to two decimals. This overwrites the existing
# 'discount' column.
price_ratio = order_details_combined['sellingPrice'] / order_details_combined['originalPrice']
price_cut = np.round(1 - price_ratio, 2)

# Only strictly positive discounts are kept; zero and negative values become
# NaN so that they are left out when the discount is averaged.
order_details_combined['discount'] = np.where(price_cut > 0, price_cut, np.nan)

# Value of the order line: selling price times quantity.
selling_price = order_details_combined['sellingPrice']
quantity = order_details_combined['quantity']
order_details_combined['sale'] = selling_price * quantity

order_details_combined

Unnamed: 0,orderID,sellingPrice,quantity,discount,productName,originalPrice,categoryName,sale
0,10248,14.00,12,0.33,Queso Cabrales,21.00,Dairy Products,168.0
1,10248,9.80,10,0.30,Singaporean Hokkien Fried Mee,14.00,Grains & Cereals,98.0
2,10248,34.80,5,,Mozzarella di Giovanni,34.80,Dairy Products,174.0
3,10249,18.60,9,0.20,Tofu,23.25,Produce,167.4
4,10249,42.40,40,0.20,Manjimup Dried Apples,53.00,Produce,1696.0
...,...,...,...,...,...,...,...,...
2150,11077,33.25,2,,Wimmers gute Semmelknödel,33.25,Grains & Cereals,66.5
2151,11077,17.00,1,,Louisiana Hot Spiced Okra,17.00,Condiments,17.0
2152,11077,15.00,2,,Röd Kaviar,15.00,Seafood,30.0
2153,11077,7.75,4,,Rhönbräu Klosterbier,7.75,Beverages,31.0


In [7]:
# Attach the total sale value of every order to orders_combined as 'order_sale'.

# Sum of the line-item sales per order: a Series indexed by orderID and named
# 'order_sale', so the join below creates a column with that name.
order_sale_totals = (
    order_details_combined
    .groupby('orderID')['sale']
    .sum()
    .rename('order_sale')
)

# Dropping a previously joined 'order_sale' first keeps this cell re-runnable:
# without it, a second execution fails because join() refuses overlapping
# column names when no suffix is given. On the first run nothing is dropped.
orders_combined = (
    orders_combined
    .drop(columns='order_sale', errors='ignore')
    .join(order_sale_totals, on='orderID')
)

## **Exporting files**

In [8]:
# Create the output folder if it does not exist yet. exist_ok=True tolerates
# only the "already exists" case; any other failure (e.g. missing permissions)
# is raised instead of being silently swallowed by a bare except.
os.makedirs('./clean-data', exist_ok=True)

# Written as 'cp1252' rather than 'ansi': the 'ansi' codec alias exists only
# on Windows, so the original call raises LookupError on other platforms.
# NOTE(review): assumes Windows-1252 is the ANSI code page the downstream
# consumer expects - confirm.
orders_combined.to_csv('./clean-data/orders.csv', index=False, encoding='cp1252')
order_details_combined.to_csv('./clean-data/order_details.csv', index=False, encoding='cp1252')