## Customer Segmentation Project

### Analysing data with SQL

This is an extra section for the project utilizing SQL to query the required information to answer business questions. 

The data for this section of analysis are extracted from bootcamp.online_transactions_cleaned table in AWS Redshift where the following transformation has been performed on the data:
1. joint the description field stock_description table to bootcamp.online_transactions 
2. removed customer_id = ''
3. removed stock_code = 'BANK CHARGES', 'POST', 'D', 'M', 'CRUK'
4. removed duplicated entries from the data
5. added 'total_order_value'(quantity * price) column

In [1]:
# Import the required libraries

import psycopg2
import pandas as pd
import numpy as np

# using this library for reading the password
from dotenv import load_dotenv
import os

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import environment variables from .env file

load_dotenv()

dbname = os.getenv("dbname")
host = os.getenv("host")
port = os.getenv("port")
user = os.getenv("user")
password = os.getenv("password")

In [3]:
# Connect to AWS Redshift

connect = psycopg2.connect(dbname=dbname, host=host, port=port, user=user, password=password)
cursor = connect.cursor()

In [4]:
# Check the informations in the tables(online_transactions_cleaned)

query = """
SELECT * 
FROM bootcamp.online_transactions_cleaned 
LIMIT 10
"""

online_transactions_cleaned = pd.read_sql(query, connect)
online_transactions_cleaned

Unnamed: 0,invoice,stock_code,description,price,quantity,total_order_value,invoice_date,customer_id,country
0,536370,21791,VINTAGE HEADS AND TAILS CARD GAME,1.25,24,30.0,2010-12-01 08:45:00,u12583,France
1,536385,22783,SET 3 WICKER OVAL BASKETS W LIDS,19.95,1,19.95,2010-12-01 09:56:00,u1742,United Kingdom
2,536392,22128,PARTY CONES CANDY ASSORTED,1.25,12,15.0,2010-12-01 10:29:00,u13705,United Kingdom
3,536404,22469,HEART OF WICKER SMALL,1.65,12,19.8,2010-12-01 11:29:00,u16218,United Kingdom
4,536412,22274,FELTCRAFT DOLL EMILY,2.95,2,5.9,2010-12-01 11:49:00,u1792,United Kingdom
5,536425,22585,PACK OF 6 BIRDY GIFT TAGS,1.25,12,15.0,2010-12-01 12:08:00,u13758,United Kingdom
6,536464,22960,JAM MAKING SET WITH JARS,4.25,1,4.25,2010-12-01 12:23:00,u17968,United Kingdom
7,536520,20754,RETROSPOT RED WASHING UP GLOVES,2.1,1,2.1,2010-12-01 12:43:00,u14729,United Kingdom
8,536526,21135,VICTORIAN METAL POSTCARD SPRING,1.69,16,27.04,2010-12-01 12:58:00,u14001,United Kingdom
9,536381,22411,JUMBO SHOPPER VINTAGE RED PAISLEY,1.95,10,19.5,2010-12-01 09:41:00,u15311,United Kingdom


#### Explore data and provide business information using SQL

In [5]:
# What is the total no of rows in the data?

query = """ 
SELECT COUNT (*)
FROM bootcamp.online_transactions_cleaned;
"""

pd.read_sql(query, connect)

Unnamed: 0,count
0,399841


In [6]:
# What is the total no of invoices?

query = """
SELECT COUNT (DISTINCT invoice) AS total_no_inv
FROM bootcamp.online_transactions_cleaned otc;
"""

pd.read_sql(query, connect)

Unnamed: 0,total_no_inv
0,21791


In [7]:
# When is the first and last invoice date of the orders?

query = """
SELECT MIN(invoice_date) AS first_inv_date, 
       MAX(invoice_date) AS last_inv_date 
FROM bootcamp.online_transactions_cleaned otc;
"""

pd.read_sql(query, connect)

Unnamed: 0,first_inv_date,last_inv_date
0,2010-12-01 08:26:00,2011-12-09 12:50:00


In [8]:
# What is the total_sales of the company?

query = """
SELECT ROUND(SUM(total_order_value)) AS total_co_sales
FROM bootcamp.online_transactions_cleaned otc;
"""

pd.read_sql(query, connect)

Unnamed: 0,total_co_sales
0,8283467.0


In [9]:
# What is the total no of customers?

query = """
SELECT COUNT(DISTINCT customer_id) AS total_no_cust
FROM bootcamp.online_transactions_cleaned otc;
"""

pd.read_sql(query, connect)

Unnamed: 0,total_no_cust
0,4363


In [10]:
# What is the total no of stock purchased?

query = """
SELECT COUNT(DISTINCT stock_code) AS total_stock_purchased
FROM bootcamp.online_transactions_cleaned otc;
"""

pd.read_sql(query, connect)

Unnamed: 0,total_stock_purchased
0,3679


In [11]:
# What is the top 10 most popular stock sold based on quantity sold

query = """
SELECT stock_code, description, sum(quantity) AS total_qty
FROM bootcamp.online_transactions_cleaned otc 
GROUP BY stock_code, description
ORDER BY sum(quantity) DESC
LIMIT 10;
"""

pd.read_sql(query, connect)

Unnamed: 0,stock_code,description,total_qty
0,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS,53119
1,22197,POPCORN HOLDER,48689
2,85099B,JUMBO BAG RED RETROSPOT,44963
3,84879,ASSORTED COLOUR BIRD ORNAMENT,35215
4,85123A,CREAM HANGING HEART T-LIGHT HOLDER,34185
5,21212,PACK OF 72 RETROSPOT CAKE CASES,33386
6,23084,RABBIT NIGHT LIGHT,27045
7,22492,MINI PAINT SET VINTAGE,25880
8,22616,PACK OF 12 LONDON TISSUES,25305
9,21977,PACK OF 60 PINK PAISLEY CAKE CASES,24129


In [12]:
# What is the top 10 most popular stock sold based on revenue

query = """
SELECT stock_code, description, sum(total_order_value) AS total_revenue
FROM bootcamp.online_transactions_cleaned otc 
GROUP BY stock_code, description
ORDER BY sum(total_order_value) DESC
LIMIT 10;
"""

pd.read_sql(query, connect)

Unnamed: 0,stock_code,description,total_revenue
0,22423,REGENCY CAKESTAND 3 TIER,132567.7
1,85123A,CREAM HANGING HEART T-LIGHT HOLDER,93923.15
2,85099B,JUMBO BAG RED RETROSPOT,83056.52
3,47566,PARTY BUNTING,67628.43
4,84879,ASSORTED COLOUR BIRD ORNAMENT,56331.91
5,23084,RABBIT NIGHT LIGHT,51042.84
6,22502,PICNIC BASKET WICKER SMALL,46963.1
7,79321,CHILLI LIGHTS,45915.41
8,22086,PAPER CHAIN KIT 50'S CHRISTMAS,41423.78
9,21137,BLACK RECORD COVER FRAME,38990.63


In [13]:
# What is the average order value?

query = """
SELECT ROUND(AVG(total_order_value),2) AS avg_order_value
FROM bootcamp.online_transactions_cleaned otc;
"""

pd.read_sql(query, connect)

Unnamed: 0,avg_order_value
0,20.72


In [14]:
# What is the total no of invoices and total sales by country?

query = """
SELECT country, COUNT(invoice) AS no_inv, ROUND(SUM(total_order_value),2) as total_order
FROM bootcamp.online_transactions_cleaned otc
GROUP BY country
ORDER BY total_order DESC;
"""

pd.read_sql(query, connect)

Unnamed: 0,country,no_inv,total_order
0,United Kingdom,356158,6815375.09
1,Netherlands,2330,283479.54
2,EIRE,7469,251557.47
3,Germany,9081,200619.66
4,France,8154,181571.54
5,Australia,1256,136922.5
6,Switzerland,1844,51859.4
7,Spain,2463,51746.65
8,Belgium,1971,36662.96
9,Japan,355,35419.79


In [15]:
# What is the total no of customer per country

query = """
SELECT country, COUNT(DISTINCT(customer_id))
FROM bootcamp.online_transactions_cleaned otc 
GROUP BY country 
ORDER BY COUNT(DISTINCT(customer_id)) DESC;
"""

pd.read_sql(query, connect)

Unnamed: 0,country,count
0,United Kingdom,3943
1,Germany,95
2,France,87
3,Spain,30
4,Belgium,25
5,Switzerland,21
6,Portugal,19
7,Italy,14
8,Finland,12
9,Austria,11


In [16]:
# What is the top 10 customer based on total_revenue

query = """
SELECT customer_id, country, SUM(total_order_value) AS total_revenue
FROM bootcamp.online_transactions_cleaned otc 
GROUP BY customer_id, country
ORDER BY total_revenue DESC
LIMIT 10;
"""

pd.read_sql(query, connect)

Unnamed: 0,customer_id,country,total_revenue
0,u14646,Netherlands,278778.02
1,u18102,United Kingdom,259657.3
2,u1745,United Kingdom,189575.53
3,u14911,EIRE,132893.24
4,u12415,Australia,123638.18
5,u14156,EIRE,114335.77
6,u17511,United Kingdom,88138.2
7,u16684,United Kingdom,65920.12
8,u14096,United Kingdom,65164.79
9,u13694,United Kingdom,62961.54


In [17]:
# What is the total revenue and no of invoices generated per month 

query = """
SELECT
    CAST(DATE_PART(YEAR, invoice_date) as int) as year,
    CAST(DATE_PART(MONTH, invoice_date) as int) as month,
    ROUND(SUM(total_order_value), 2) as total_revenue,
    COUNT(DISTINCT invoice) as num_invoices
FROM bootcamp.online_transactions_cleaned
GROUP BY year, month
ORDER BY year, month
"""

pd.read_sql(query, connect)

Unnamed: 0,year,month,total_revenue,num_invoices
0,2010,12,548443.92,1692
1,2011,1,471580.34,1225
2,2011,2,434218.17,1181
3,2011,3,573838.05,1588
4,2011,4,421527.77,1358
5,2011,5,650735.39,1808
6,2011,6,641129.21,1686
7,2011,7,580714.73,1555
8,2011,8,612966.3,1506
9,2011,9,924390.57,2038


In [18]:
# What were the top performing stocks per country based on quantity sold?

query = """
SELECT 
    country, 
    stock_code, 
    description,
    qty_sold
FROM (
    SELECT 
        country, 
        stock_code, 
        description,
        SUM(quantity) as qty_sold,
        RANK() OVER(PARTITION BY country ORDER BY SUM(quantity) DESC) as rank
    FROM bootcamp.online_transactions_cleaned
    GROUP BY country, stock_code, description) as sold_per_country
WHERE rank = 1
ORDER BY qty_sold DESC, country;
"""

pd.read_sql(query, connect)

Unnamed: 0,country,stock_code,description,qty_sold
0,United Kingdom,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS,47886
1,Netherlands,23084,RABBIT NIGHT LIGHT,4801
2,France,23084,RABBIT NIGHT LIGHT,3999
3,Japan,23084,RABBIT NIGHT LIGHT,3401
4,Australia,22492,MINI PAINT SET VINTAGE,2916
...,...,...,...,...
57,Saudi Arabia,22553,PLASTERS IN TIN SKULLS,12
58,Saudi Arabia,22556,PLASTERS IN TIN CIRCUS PARADE,12
59,Saudi Arabia,22969,HOMEMADE JAM SCENTED CANDLES,12
60,Saudi Arabia,22915,ASSORTED BOTTLE TOP MAGNETS,12


In [19]:
# What is the total no of stock with 'Unknown' description

query = """
SELECT COUNT(DISTINCT stock_code)
FROM bootcamp.online_transactions_cleaned otc 
WHERE description = 'Unknown';
"""

pd.read_sql(query, connect)

Unnamed: 0,count
0,18


In [20]:
# What is the total no of invoice with 'Unknown' description

query = """
SELECT COUNT(description)
FROM bootcamp.online_transactions_cleaned otc 
WHERE description = 'Unknown';
"""

pd.read_sql(query, connect)

Unnamed: 0,count
0,1172


In [21]:
# What are the total qty and total revenue of the stock code with 'Unknown' description

query = """
SELECT stock_code, description, SUM(quantity) AS total_qty, SUM(total_order_value) AS total_revenue
FROM bootcamp.online_transactions_cleaned otc 
WHERE description = 'Unknown'
GROUP BY stock_code, description
ORDER BY SUM(quantity) DESC;
"""

pd.read_sql(query, connect)

Unnamed: 0,stock_code,description,total_qty,total_revenue
0,21703,Unknown,9186,3395.23
1,18007,Unknown,5856,383.76
2,21704,Unknown,4999,3852.52
3,46000S,Unknown,1507,2185.15
4,21705,Unknown,1357,1267.61
5,46000M,Unknown,1007,1560.85
6,22686,Unknown,835,967.2
7,22889,Unknown,143,156.18
8,46000U,Unknown,107,133.75
9,46000R,Unknown,101,146.45
