**TEAM SUBMISSION FOR PROJECT CHECKPOINT 4**



In [2]:
# Import necessary packages
import os
import random
import string

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [3]:
# Read the connection string from the DATACO_DB_URL environment variable so
# credentials do not have to live in the notebook. The literal below is only
# the local-development fallback; it keeps the previous behaviour when the
# variable is not set.
conn_url = os.environ.get(
    'DATACO_DB_URL',
    'postgresql://postgres:123@localhost/5310Group',
)

# Create an engine that connects to the PostgreSQL server
engine = create_engine(conn_url)

# Establish the connection that the later cells reuse for raw SQL statements
connection = engine.connect()

In [75]:
# SQL statements that create all 3NF tables.
# Tables are declared parent-first so that every FOREIGN KEY target already
# exists when the referencing table is created.
#
# Two column types differ from the first draft of the schema:
#   * product.supplier_id is a plain integer. It is a foreign key to
#     supplier.supplier_id; declaring it `serial` would attach its own
#     sequence, so a product inserted without a supplier would silently get an
#     auto-generated (and most likely non-existent) supplier id.
#   * customer_phones.phone_number is text. A 10-digit phone number does not
#     fit in a 4-byte integer, and leading zeros / '+' prefixes would be lost.
createCmd = """
CREATE TABLE customer(
      customer_id integer,
      customer_fname varchar(120) NOT NULL,
      customer_lname varchar(120) NOT NULL,
      customer_street varchar(255),
      customer_city varchar(255),
      customer_state varchar(255),
      customer_country varchar(255),
      customer_zipcode integer,
      customer_segment varchar(255),
      PRIMARY KEY (customer_id)
);

CREATE TABLE customer_phones(
      phone_id integer,
      customer_id integer,
      phone_number varchar(20),
      phone_type varchar(10),
      PRIMARY KEY (phone_id),
      FOREIGN KEY (customer_id) REFERENCES customer(customer_id)
);

CREATE TABLE department (
      department_id int,
      department_name varchar(120) NOT NULL,
      PRIMARY KEY (department_id)
);

CREATE TABLE product_category (
      category_id integer,
      category_name varchar (120) NOT NULL,
      department_id int,
      PRIMARY KEY (category_id),
      FOREIGN KEY (department_id) REFERENCES department
);

CREATE TABLE supplier (
      supplier_id serial,
      company_name varchar(50),
      supplier_name varchar(50),
      supplier_contactinfo varchar(50),
      PRIMARY KEY (supplier_id)
);

CREATE TABLE market (
      market_id integer,
      city varchar(120) NOT NULL,
      state varchar (120) NOT NULL,
      country varchar(120) NOT NULL,
      continent varchar(120),
      street varchar(120),
      PRIMARY KEY (market_id)
);

CREATE TABLE shipment (
      ship_id integer,
      days_for_shipping_real int NOT NULL,
      days_for_shipping_scheduled int,
      delivery_status varchar(120),
      late_delivery_risk int,
      shipping_mode varchar(35) NOT NULL,
      shipping_date timestamp NOT NULL,
      market_id int,
      PRIMARY KEY (ship_id),
      FOREIGN KEY (market_id) REFERENCES market
);

CREATE TABLE orders(
      order_id integer,
      order_product_quantity integer NOT NULL,
      customer_id integer,
      order_date timestamp NOT NULL,
      order_status varchar(30) NOT NULL,
      PRIMARY KEY (order_id),
      FOREIGN KEY (customer_id) REFERENCES customer (customer_id)
);

CREATE TABLE order_ship(
      order_id int,
      ship_id int,
      PRIMARY KEY (order_id, ship_id),
      FOREIGN KEY (order_id) REFERENCES orders (order_id),
      FOREIGN KEY (ship_id) REFERENCES shipment (ship_id)
);

CREATE TABLE product(
      product_id integer,
      category_id integer,
      product_name varchar(255) NOT NULL,
      product_unit_price numeric(12,2) NOT NULL,
      product_status integer,
      supplier_id integer,
      PRIMARY KEY (product_id),
      FOREIGN KEY (category_id) REFERENCES product_category (category_id),
      FOREIGN KEY (supplier_id) REFERENCES supplier (supplier_id)
);

CREATE TABLE order_product(
      order_id integer,
      product_id integer,
      PRIMARY KEY (order_id, product_id),
      FOREIGN KEY (order_id) REFERENCES orders(order_id),
      FOREIGN KEY (product_id) REFERENCES product(product_id)
);

CREATE TABLE transaction_details(
      transaction_id int,
      sales numeric(12,2) NOT NULL,
      order_profit_per_order numeric(12,2) NOT NULL,
      order_item_discount_rate numeric(3,2),
      order_id integer,
      PRIMARY KEY (transaction_id),
      FOREIGN KEY (order_id) REFERENCES orders (order_id)
);

CREATE TABLE billing_info(
      billing_id int,
      billing_type varchar(20),
      bill_date timestamp,
      billing_address varchar(120),
      transaction_id int,
      PRIMARY KEY (billing_id),
      FOREIGN KEY (transaction_id) REFERENCES transaction_details
);
"""
# Execute the statement to create tables
connection.execute(createCmd)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1abae214370>

In [61]:
# Read the raw DataCo supply-chain export into a dataframe.
# The file is not UTF-8, hence the explicit ISO-8859-1 encoding.
csv_path = 'C:/Users/123/Downloads/DataCoSupplyChainDataset.csv (1)/DataCoSupplyChainDataset.csv'
df = pd.read_csv(csv_path, encoding="ISO-8859-1")

In [62]:
# Map the CSV headers onto the column names used by the database schema.
# Headers that are not listed here keep their original name.
column_renames = {
    # product hierarchy
    "Category Id": "category_id",
    "Category Name": "category_name",
    "Department Id": "department_id",
    "Department Name": "department_name",
    "Product Card Id": "product_id",
    "Product Name": "product_name",
    "Product Price": "product_unit_price",
    "Product Status": "product_status_name",
    # customer
    "Customer Id": "customer_id",
    "Customer Fname": "customer_fname",
    "Customer Lname": "customer_lname",
    "Customer Street": "customer_street",
    "Customer City": "customer_city",
    "Customer Country": "customer_country",
    "Customer Zipcode": "customer_zipcode",
    "Customer Segment": "customer_segment",
    # market (order destination)
    "Order City": "city",
    "Order Country": "country",
    "Order State": "state",
    "Order Region": "continent",
    # shipment
    "Days for shipping (real)": "days_for_shipping_real",
    "Days for shipment (scheduled)": "days_for_shipping_scheduled",
    "Delivery Status": "delivery_status",
    "Late_delivery_risk": "late_delivery_risk",
    "Shipping Mode": "shipping_mode",
    "shipping date (DateOrders)": "shipping_date",
    # orders
    "Order Id": "order_id",
    "Order Item Quantity": "order_product_quantity",
    "order date (DateOrders)": "order_date",
    "Order Status": "order_status",
    # transaction
    "Sales": "sales",
    "Order Profit Per Order": "order_profit_per_order",
    "Order Item Discount Rate": "order_item_discount_rate",
    "Type": "payment_name",
}
df = df.rename(columns=column_renames)

In [57]:
# Count missing values per column so they can be compared with the NOT NULL
# constraints declared in the schema
df.isna().sum()

payment_name                        0
days_for_shipping_real              0
days_for_shipping_scheduled         0
Benefit per order                   0
Sales per customer                  0
delivery_status                     0
late_delivery_risk                  0
category_id                         0
category_name                       0
customer_city                       0
customer_country                    0
Customer Email                      0
customer_fname                      0
customer_id                         0
customer_lname                      8
Customer Password                   0
customer_segment                    0
Customer State                      0
customer_street                     0
customer_zipcode                    3
department_id                       0
department_name                     0
Latitude                            0
Longitude                           0
Market                              0
city                                0
country     

In [63]:
# customer_lname is NOT NULL in the customer table, so keep only the rows
# that actually have a last name (this removes the 8 rows reported above)
has_last_name = df['customer_lname'].notna()
df = df.loc[has_last_name]

Unnamed: 0,payment_name,days_for_shipping_real,days_for_shipping_scheduled,Benefit per order,Sales per customer,delivery_status,late_delivery_risk,category_id,category_name,customer_city,...,Order Zipcode,product_id,Product Category Id,Product Description,Product Image,product_name,product_unit_price,product_status_name,shipping_date,shipping_mode
0,DEBIT,3,4,91.250000,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.750000,0,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.750000,0,1/18/2018 12:27,Standard Class
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.750000,0,1/17/2018 12:06,Standard Class
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.750000,0,1/16/2018 11:45,Standard Class
4,PAYMENT,2,4,134.210007,298.250000,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.750000,0,1/15/2018 11:24,Standard Class
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180514,CASH,4,4,40.000000,399.980011,Shipping on time,0,45,Fishing,Brooklyn,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/20/2016 3:40,Standard Class
180515,DEBIT,3,2,-613.770019,395.980011,Late delivery,1,45,Fishing,Bakersfield,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/19/2016 1:34,Second Class
180516,TRANSFER,5,4,141.110001,391.980011,Late delivery,1,45,Fishing,Bristol,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/20/2016 21:00,Standard Class
180517,PAYMENT,3,4,186.229996,387.980011,Advance shipping,0,45,Fishing,Caguas,...,,1004,45,,http://images.acmesports.sports/Field+%26+Stre...,Field & Stream Sportsman 16 Gun Fire Safe,399.980011,0,1/18/2016 20:18,Standard Class


In [64]:
# Add the columns that the ERD requires but the CSV does not provide.
# Every placeholder is NaN rather than '' so that it is stored as NULL:
# an empty string is not a valid value for typed columns such as
# bill_date (timestamp) and would make the load fail.
df = df.assign(
    company_name=np.nan,
    supplier_name=np.nan,
    supplier_contactinfo=np.nan,
    billing_type=np.nan,
    bill_date=np.nan,
    billing_address=np.nan,
    phone_number=np.nan,
    phone_type=np.nan,
)

In [67]:
# Columns that exist in the ERD but have no counterpart in the CSV.
# They are created as all-NaN placeholders so they load as NULL.
erd_only_columns = [
    'company_name',
    'supplier_name',
    'supplier_contactinfo',
    'billing_type',
    'bill_date',
    'billing_address',
    'phone_number',
    'phone_type',
]
df = df.assign(**dict.fromkeys(erd_only_columns, np.nan))

In [65]:
# Create surrogate-key columns holding incrementing integers (1..len(df)),
# one value per source row.
# The columns are processed in the original order and inserted at position 0,
# so the resulting column layout is unchanged. Unlike a bare df.insert(), the
# cell can be re-run: an id column that already exists is overwritten instead
# of raising "cannot insert ..., already exists".
surrogate_id_columns = (
    'product_status_id',
    'market_id',        # market table
    'ship_id',          # shipment table
    'payment_id',
    'transaction_id',   # transaction_details table
    'supplier_id',      # supplier table
    'phone_id',         # customer_phones table
    'billing_id',       # billing_info table
)
row_ids = range(1, 1 + len(df))
for id_column in surrogate_id_columns:
    if id_column in df.columns:
        df[id_column] = row_ids
    else:
        df.insert(0, id_column, row_ids)

In [76]:
def _append_to_table(frame, table_name):
    """Append the rows of ``frame`` to the existing table ``table_name``.

    The table must already exist (it is created by the DDL cell); rows are
    appended through the notebook-level ``engine`` and the dataframe index is
    not written.
    """
    frame.to_sql(name=table_name, con=engine, if_exists='append', index=False)


# Tables are loaded parent-first so that every foreign key target is already
# populated when the referencing rows are inserted.

# department
department_df = df[['department_id', 'department_name']].drop_duplicates()
_append_to_table(department_df, 'department')

# product_category
category_df = df[['category_id', 'category_name', 'department_id']].drop_duplicates()
_append_to_table(category_df, 'product_category')

# supplier
supplier_df = df[['supplier_id', 'company_name', 'supplier_name', 'supplier_contactinfo']].drop_duplicates()
_append_to_table(supplier_df, 'supplier')

# product: one row per product_id (the first occurrence wins).
# product_status is filled from the CSV's "Product Status" column, which the
# rename cell exposes as product_status_name; previously it was never loaded.
df2 = df.drop_duplicates(['product_id'])
product_df = df2[
    ['product_id', 'product_name', 'product_unit_price', 'category_id', 'supplier_id', 'product_status_name']
].rename(columns={'product_status_name': 'product_status'})
_append_to_table(product_df, 'product')

# customer: one row per customer_id (the primary key), so a customer whose
# address differs between source rows cannot cause a duplicate-key error.
# customer_state comes from the CSV's "Customer State" column, which the
# rename cell leaves under its original header.
customer_df = (
    df[['customer_id', 'customer_fname', 'customer_lname', 'customer_street', 'customer_city',
        'Customer State', 'customer_country', 'customer_zipcode', 'customer_segment']]
    .rename(columns={'Customer State': 'customer_state'})
    .drop_duplicates(subset=['customer_id'])
)
_append_to_table(customer_df, 'customer')

# customer_phones
customer_phones_df = df[['phone_id', 'customer_id', 'phone_number', 'phone_type']].drop_duplicates()
_append_to_table(customer_phones_df, 'customer_phones')

# market
market_df = df[['market_id', 'city', 'country', 'state', 'continent']].drop_duplicates()
_append_to_table(market_df, 'market')

# shipment
shipment_df = df[['ship_id', 'days_for_shipping_real', 'days_for_shipping_scheduled', 'delivery_status',
                  'late_delivery_risk', 'shipping_mode', 'shipping_date', 'market_id']].drop_duplicates()
_append_to_table(shipment_df, 'shipment')

# orders: one row per order_id (the first occurrence wins)
df1 = df.drop_duplicates(['order_id'])
orders_df = df1[['order_id', 'order_product_quantity', 'customer_id', 'order_date', 'order_status']]
_append_to_table(orders_df, 'orders')

# order_product
order_product_df = df[['order_id', 'product_id']].drop_duplicates()
_append_to_table(order_product_df, 'order_product')

# transaction_details
transaction_df = df[['transaction_id', 'sales', 'order_profit_per_order', 'order_item_discount_rate',
                     'order_id']].drop_duplicates()
_append_to_table(transaction_df, 'transaction_details')

# billing_info
billing_info_df = df[['billing_id', 'billing_type', 'bill_date', 'billing_address', 'transaction_id']]
_append_to_table(billing_info_df, 'billing_info')

# order_ship
order_ship_df = df[['order_id', 'ship_id']].drop_duplicates()
_append_to_table(order_ship_df, 'order_ship')

In [77]:
# Procedure 1: Top 10 sales product?
Cmd1 = """
SELECT product.product_name, ROUND(SUM(product.product_unit_price*orders.order_product_quantity)::NUMERIC,2) AS price
FROM product
JOIN order_product
  ON product.product_id = order_product.product_id
JOIN orders
  ON order_product.order_id = orders.order_id
GROUP BY 1
ORDER BY price DESC
LIMIT 10;
"""
# Execute the statement. The column names are taken from the cursor rather
# than from the first row, so an empty result set produces an empty dataframe
# instead of an IndexError.
cursor1 = connection.execute(Cmd1)

# Extract column names
column_name1 = list(cursor1.keys())

# Fetch the rows
result1 = cursor1.fetchall()

# Store results in a new dataframe
temp_df1 = pd.DataFrame(result1, columns=column_name1)

# Show results
temp_df1

Unnamed: 0,product_name,price
0,Field & Stream Sportsman 16 Gun Fire Safe,9882705.84
1,Diamondback Women's Serene Classic Comfort Bi,5283847.72
2,Pelican Sunstream 100 Kayak,4072796.35
3,Nike Men's CJ Elite 2 TD Football Cleat,3478922.37
4,Perfect Fitness Perfect Rip Deck,2374584.17
5,Nike Men's Free 5.0+ Running Shoe,2172982.68
6,O'Brien Men's Neoprene Life Vest,1777888.56
7,Nike Men's Dri-FIT Victory Golf Polo,1729500.0
8,Under Armour Girls' Toddler Spine Surge Runni,748132.92
9,Dell Laptop,663000.0


In [78]:
# Which day of the week, customers tend to go shopping?
# isodow numbers the days 1 (Monday) through 7 (Sunday).
Cmd2 = """
SELECT EXTRACT(isodow FROM order_date) AS day_of_week, COUNT(order_id) AS frequency
FROM orders
GROUP BY 1
ORDER BY frequency;
"""
# Execute the statement. The column names are taken from the cursor rather
# than from the first row, so an empty result set produces an empty dataframe
# instead of an IndexError.
cursor2 = connection.execute(Cmd2)

# Extract column names
column_name2 = list(cursor2.keys())

# Fetch the rows
result2 = cursor2.fetchall()

# Store results in a new dataframe
temp_df2 = pd.DataFrame(result2, columns=column_name2)

# Show results
temp_df2

Unnamed: 0,day_of_week,frequency
0,7,9367
1,1,9368
2,2,9392
3,4,9393
4,3,9395
5,5,9400
6,6,9429


In [79]:
# How many orders in the different order price range?
# ROUND(..., -2) rounds each sale to the nearest hundred, which defines the bucket.
Cmd3 = """
WITH order_value AS (SELECT orders.order_id, transaction_details.sales
FROM orders
JOIN transaction_details
  ON orders.order_id = transaction_details.order_id)

SELECT ROUND(sales::NUMERIC,-2) AS price_range,COUNT(order_id)
FROM order_value
GROUP BY price_range
ORDER BY price_range;
"""
# Execute the statement. The column names are taken from the cursor rather
# than from the first row, so an empty result set produces an empty dataframe
# instead of an IndexError.
cursor3 = connection.execute(Cmd3)

# Extract column names
column_name3 = list(cursor3.keys())

# Fetch the rows
result3 = cursor3.fetchall()

# Store results in a new dataframe
temp_df3 = pd.DataFrame(result3, columns=column_name3)

# Show results
temp_df3

Unnamed: 0,price_range,count
0,0,11947
1,100,63326
2,200,52086
3,300,27773
4,400,20768
5,500,4123
6,600,21
7,1000,10
8,1500,442
9,2000,15


In [74]:
# Delete tables if needed.
# Tables are dropped child-first so no foreign key still references a table
# when it is removed. IF EXISTS lets the cell run on a fresh database or after
# a partially failed create, where some tables are missing.
deleteCmd = """
DROP TABLE IF EXISTS order_product;
DROP TABLE IF EXISTS product;
DROP TABLE IF EXISTS supplier;
DROP TABLE IF EXISTS product_category;
DROP TABLE IF EXISTS department;
DROP TABLE IF EXISTS billing_info;
DROP TABLE IF EXISTS transaction_details;
DROP TABLE IF EXISTS order_ship;
DROP TABLE IF EXISTS orders;
DROP TABLE IF EXISTS customer_phones;
DROP TABLE IF EXISTS customer;
DROP TABLE IF EXISTS shipment;
DROP TABLE IF EXISTS market;
"""
connection.execute(deleteCmd)

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1abcf2abe80>