# **Bee Cycle - SQL Queries for Data Analysis**

---



This file contains the SQL queries I used to answer the key business questions for this project, each followed by its result. <br>

## **1) Which products are most profitable?**

In [21]:
# Total profit per product sub-category, most profitable first.
# Profit is SUM(totalprice - totalcost) over fact_sales rows, joined to
# dim_product on product_id; the summed value is cast to INT64.
# gcpdf is defined outside this view — presumably it runs the SQL and returns
# a DataFrame; confirm against its definition.
query = '''
SELECT
  A.sub_category,
  CAST(SUM(B.totalprice - B.totalcost) AS INT64) AS total_profit
FROM `beecycle-413111.BeeCycle.dim_product` A
JOIN `beecycle-413111.BeeCycle.fact_sales` B ON A.product_id = B.product_id
GROUP BY sub_category
ORDER BY total_profit DESC;
'''

df = gcpdf(query)
df

Unnamed: 0,sub_category,total_profit
0,Mountain Bikes,855013
1,Road Bikes,490650
2,Touring Bikes,243571
3,Helmets,9138
4,Tires and Tubes,8067
5,Bottles and Cages,2543
6,Hydration Packs,2241
7,Jerseys,2084
8,Fenders,1812
9,Bike Racks,1525


## **2) Which months are most profitable?**

In [22]:
# Total profit per calendar month name, most profitable first.
# The grouping key is the month name only (FORMAT_DATE with '%B'), so sales
# from the same month in different years are added together in one row.
# NOTE(review): if a per-year breakdown is wanted, the year must be added to
# the grouping key — confirm the intended granularity.
# gcpdf is defined outside this view — presumably it runs the SQL and returns
# a DataFrame; confirm against its definition.
query = '''
SELECT
  FORMAT_DATE('%B', DATE_TRUNC(order_date, MONTH)) AS month,
  CAST(SUM(totalprice - totalcost) AS INT64) AS total_profit
FROM `beecycle-413111.BeeCycle.fact_sales`
GROUP BY month
ORDER BY total_profit DESC;
'''

df = gcpdf(query)
df

Unnamed: 0,month,total_profit
0,July,208525
1,February,183687
2,December,164776
3,October,152871
4,August,151241
5,September,147939
6,November,145787
7,January,124276
8,March,123563
9,April,88455


## **3) Which regions are most profitable?**

In [23]:
# Total profit per sales region, most profitable first.
# United States territories are labelled "<region> United States" so they stay
# separate; every other territory is labelled by its country alone.
# NOTE(review): the output alias `region` has the same name as the source
# column A.region used inside the CASE, and GROUP BY refers to it by that bare
# name. BigQuery documents this alias/column overlap as a source of ambiguity;
# consider a distinct alias or grouping in an outer query — confirm.
# gcpdf is defined outside this view — presumably it runs the SQL and returns
# a DataFrame; confirm against its definition.
query = '''
SELECT
  CASE
    WHEN A.country = 'United States' THEN CONCAT(A.region, ' ', A.country)
    ELSE A.country
    END AS region,
  CAST(SUM(B.totalprice - B.totalcost) AS INT64) AS total_profit
FROM `beecycle-413111.BeeCycle.dim_territory` A
JOIN `beecycle-413111.BeeCycle.fact_sales` B ON A.territory_id = B.territory_id
GROUP BY region
ORDER BY total_profit DESC;
'''

df = gcpdf(query)
df

Unnamed: 0,region,total_profit
0,Australia,579923
1,United Kingdom,214997
2,Southwest United States,210810
3,France,192114
4,Germany,191043
5,Northwest United States,138576
6,Canada,93371
7,Southeast United States,773


## **4) What type of customer generates the most profit?**

In [24]:
# Total profit per customer age band and gender, most profitable first.
#
# The inner query attaches each sale to the buyer's gender and age in whole
# years as of CURRENT_DATE() (so results shift as customers age; it is not the
# age at the time of purchase). The outer query buckets that age and sums
# totalprice - totalcost per (age band, gender).
#
# Fix: the original CASE ended with a bare ELSE '65+', so customers younger
# than 18 and customers with a NULL birthdate (NULL age) were counted as
# "65+". They now get their own 'Under 18' and 'Unknown' bands; every other
# band is unchanged.
# gcpdf is defined outside this view — presumably it runs the SQL and returns
# a DataFrame; confirm against its definition.
query = '''
SELECT
  CASE
    WHEN age IS NULL THEN 'Unknown'
    WHEN age < 18 THEN 'Under 18'
    WHEN age <= 24 THEN '18-24'
    WHEN age <= 34 THEN '25-34'
    WHEN age <= 44 THEN '35-44'
    WHEN age <= 54 THEN '45-54'
    WHEN age <= 64 THEN '55-64'
    ELSE '65+'
    END AS customer_age,
  gender,
  CAST(SUM(totalprice - totalcost) AS INT64) AS total_profit
FROM(
  SELECT
    A.gender,
    -- DATE_DIFF(..., YEAR) counts calendar-year boundaries, so subtract one
    -- when the birthday (compared as month * 100 + day) is still ahead of
    -- today within the current year.
    IF(EXTRACT(MONTH FROM A.birthdate) * 100 + EXTRACT(DAY FROM A.birthdate) > EXTRACT(MONTH FROM CURRENT_DATE()) * 100 + EXTRACT(DAY FROM CURRENT_DATE()),
      DATE_DIFF(CURRENT_DATE(), DATE(A.birthdate), YEAR) - 1,
      DATE_DIFF(CURRENT_DATE(), DATE(A.birthdate), YEAR)) AS age,
    B.totalprice,
    B.totalcost
  FROM `beecycle-413111.BeeCycle.dim_customer` AS A
  JOIN `beecycle-413111.BeeCycle.fact_sales` AS B ON A.customer_id = B.customer_id
)
GROUP BY
    customer_age,
    gender
ORDER BY
    total_profit DESC;
'''

df = gcpdf(query)
df

Unnamed: 0,customer_age,gender,total_profit
0,35-44,F,276495
1,25-34,M,241080
2,45-54,F,236522
3,25-34,F,235469
4,35-44,M,222865
5,45-54,M,208982
6,55-64,M,73487
7,55-64,F,68950
8,65+,M,16125
9,18-24,F,15571


## **5) Which customers spend the most money?**

In [25]:
# Top 10 customers by total amount spent (SUM of totalprice, cast to INT64).
#
# Fix: the original grouped by customer_name only, so two different customers
# who share a name were merged into one row with a combined total. Grouping now
# includes customer_id (the join key), which keeps each customer separate while
# the output columns stay customer_name and total_spent.
# The ORDER BY also gets tie-breakers so the rows kept by LIMIT 10 are
# deterministic when totals are equal.
# gcpdf is defined outside this view — presumably it runs the SQL and returns
# a DataFrame; confirm against its definition.
query = '''
SELECT
  A.customer_name,
  CAST(SUM(B.totalprice) AS INT64) AS total_spent
FROM `beecycle-413111.BeeCycle.dim_customer` AS A
JOIN `beecycle-413111.BeeCycle.fact_sales` AS B ON A.customer_id = B.customer_id
GROUP BY
  A.customer_id,
  A.customer_name
ORDER BY
  total_spent DESC,
  A.customer_name,
  A.customer_id
LIMIT 10;
'''

df = gcpdf(query)
df

Unnamed: 0,customer_name,total_spent
0,Nichole Nara,9307
1,Kaitlyn Henderson,9306
2,Margaret He,9288
3,Randall Dominguez,9286
4,Adriana Gonzalez,9270
5,Rosa Hu,9251
6,Brandi Gill,9237
7,Brad She,9221
8,Francisco Sara,9215
9,Kate Anand,7610


## **6a) Which cities are most profitable?**

In [26]:
# Top 5 cities by total profit.
# The CityData CTE walks dim_geography -> dim_customer -> fact_sales and sums
# totalprice - totalcost per city; the final SELECT keeps the 5 largest.
# NOTE(review): the grouping key is the city name alone, so two places with
# the same name would be combined into one row. Whether dim_geography carries
# a country/state column to disambiguate is not visible here — confirm.
# gcpdf is defined outside this view — presumably it runs the SQL and returns
# a DataFrame; confirm against its definition.
query = '''
WITH CityData AS(
  SELECT
    A.city,
    CAST(SUM(C.totalprice - C.totalcost) AS INT64) AS total_profit
  FROM `beecycle-413111.BeeCycle.dim_geography` AS A
  JOIN `beecycle-413111.BeeCycle.dim_customer` AS B ON A.geography_id = B.geography_id
  JOIN `beecycle-413111.BeeCycle.fact_sales` AS C ON B.customer_id = C.customer_id
  GROUP BY city
)

SELECT
  city,
  total_profit
FROM CityData
ORDER BY total_profit DESC
LIMIT 5;
'''

df = gcpdf(query)
df

Unnamed: 0,city,total_profit
0,London,48625
1,Paris,29522
2,Warrnambool,26034
3,Geelong,25181
4,Newcastle,23339


## **6b) What are the most popular products within the top 5 most profitable cities?**

In [27]:
# Two most-purchased product models in each of the five listed cities.
#
# The inner query counts sales rows per (city, model_name) and ranks the models
# within each city by that count; the outer query keeps ranks 1 and 2. RANK()
# gives tied models the same rank, so a city can return more than two rows.
# The city list is hard-coded and must be kept in sync by hand with whatever
# the "most profitable cities" query returns.
#
# Fix: the original placed its only ORDER BY inside the subquery. Row order of
# a subquery is not carried over to the outer query, so the final result order
# was not guaranteed. The ORDER BY now sits on the outermost SELECT, with
# model_name as a tie-breaker for equal counts.
# gcpdf is defined outside this view — presumably it runs the SQL and returns
# a DataFrame; confirm against its definition.
query = '''
SELECT
  city,
  model_name,
  purchase_count
FROM (
  SELECT
    A.city,
    D.model_name,
    COUNT(C.product_id) AS purchase_count,
    RANK() OVER (PARTITION BY A.city ORDER BY COUNT(C.product_id) DESC) AS ranking
  FROM `beecycle-413111.BeeCycle.dim_geography` A
  JOIN `beecycle-413111.BeeCycle.dim_customer` B ON A.geography_id = B.geography_id
  JOIN `beecycle-413111.BeeCycle.fact_sales` C ON B.customer_id = C.customer_id
  JOIN `beecycle-413111.BeeCycle.dim_product` D ON C.product_id = D.product_id
  WHERE A.city IN ('London', 'Paris', 'Warrnambool', 'Geelong', 'Newcastle')
  GROUP BY
    A.city,
    D.model_name
)
WHERE
  ranking <= 2
ORDER BY
  city,
  purchase_count DESC,
  model_name;
'''

df = gcpdf(query)
df

Unnamed: 0,city,model_name,purchase_count
0,Geelong,Mountain-200,15
1,Geelong,Touring-1000,10
2,London,Mountain-200,25
3,London,Sport-100,24
4,Newcastle,Sport-100,11
5,Newcastle,Mountain-200,10
6,Paris,Mountain-200,27
7,Paris,Sport-100,12
8,Warrnambool,Mountain-200,13
9,Warrnambool,Touring-1000,10


## **6c) For each of the products above, what are the most popular colours in the corresponding city?**

In [28]:
# Purchase counts per colour for the top-ranked models in each listed city.
#
# PopularProducts: sales-row count per (city, model_name) for the five
#   hard-coded cities.
# RankedProducts: keeps the models ranked 1 or 2 by that count within each
#   city (RANK() gives ties the same rank, so a city may keep more than two).
# Final SELECT: re-joins the sales data, restricts it to those (city, model)
#   pairs via the join on RankedProducts, and counts sales rows per colour,
#   ordered by city, model, then count descending.
# The city list is hard-coded; it must be updated by hand if the set of top
# cities changes.
# gcpdf is defined outside this view — presumably it runs the SQL and returns
# a DataFrame; confirm against its definition.
query = '''
WITH PopularProducts AS(
  SELECT
    A.city,
    D.model_name,
    COUNT(C.product_id) AS purchase_count
  FROM `beecycle-413111.BeeCycle.dim_geography` A
  JOIN `beecycle-413111.BeeCycle.dim_customer` B ON A.geography_id = B.geography_id
  JOIN `beecycle-413111.BeeCycle.fact_sales` C ON B.customer_id = C.customer_id
  JOIN `beecycle-413111.BeeCycle.dim_product` D ON C.product_id = D.product_id
  WHERE city IN ('London', 'Paris', 'Warrnambool', 'Geelong', 'Newcastle')
  GROUP BY
    city,
    model_name
),

RankedProducts AS(
  SELECT
    city,
    model_name
  FROM(
    SELECT
      city,
      model_name,
      RANK() OVER (PARTITION BY city ORDER BY purchase_count DESC) AS model_rank
    FROM
      PopularProducts
  )
  WHERE
    model_rank <= 2
)

SELECT
  A.city,
  D.model_name,
  D.color,
  COUNT(C.product_id) AS purchase_count
FROM `beecycle-413111.BeeCycle.dim_geography` A
JOIN `beecycle-413111.BeeCycle.dim_customer` B ON A.geography_id = B.geography_id
JOIN `beecycle-413111.BeeCycle.fact_sales` C ON B.customer_id = C.customer_id
JOIN `beecycle-413111.BeeCycle.dim_product` D ON C.product_id = D.product_id
JOIN RankedProducts E ON A.city = E.city AND D.model_name = E.model_name
GROUP BY
  city,
  model_name,
  color
ORDER BY
  city,
  model_name,
  purchase_count DESC;
'''

df = gcpdf(query)
df

Unnamed: 0,city,model_name,color,purchase_count
0,Geelong,Mountain-200,Black,10
1,Geelong,Mountain-200,Silver,5
2,Geelong,Touring-1000,Yellow,5
3,Geelong,Touring-1000,Blue,5
4,London,Mountain-200,Silver,13
5,London,Mountain-200,Black,12
6,London,Sport-100,Red,11
7,London,Sport-100,Black,7
8,London,Sport-100,Blue,6
9,Newcastle,Mountain-200,Silver,7
