# Development environment setup

## Libraries

In [3]:
import duckdb as db
import pandas as pd
import plotly.express as px
import sys

from importlib import reload
from pathlib import Path
from pprint import pprint


## Project path

In [4]:
# Resolve path to project root
project_path = Path("__file__").resolve().parents[1]

In [5]:
# Add project paht to sys.path
sys.path.append(str(project_path))

## Custom functions

In [6]:
from src import utils

## Parameters

In [7]:
prm = utils.load_parameters()

pprint(prm)

{'plotly': {'render': 'vscode+pdf+png+jpeg', 'theme': 'plotly_white'}}


# Data

## Product

In [8]:
product = pd.read_csv(project_path.joinpath("data", "raw", "Product.csv"))

product.head().T

Unnamed: 0,0,1,2,3,4
product_id,218834610,253458328,466262361,613311395,528943857
producer_id,1,4,6,2,2
registry_date,2020-03-30,2021-05-24,2020-11-13,2018-02-28,2018-09-15
recovery_active,1,1,1,0,1
member_area_active,1,1,1,0,0
deletion_date,,,,,
niche,Marketing,Educação Infantil,Empreendedorismo,Financas,Finanças
base_price,780.00,560.00,800.00,680.00,500.00
type,Curso,Assinatura,Curso,Curso,Curso


## Producer

In [9]:
producer = pd.read_csv(project_path.joinpath("data", "raw", "Producer.csv"))

producer.head().T

Unnamed: 0,0,1,2,3,4
producer_id,1,2,3,4,5
registry_date,2018-02-01,2018-01-04,2020-01-02,2021-01-02,2019-02-11
country,BRASIL,COLOMBIA,BRASIL,COLOMBIA,BRASIL


## Sales

In [10]:
sales = pd.read_csv(project_path.joinpath("data", "raw", "Sales.csv"))

sales.head().T

Unnamed: 0,0,1,2,3,4
purchase_id,546791636,863821559,342304284,421394541,596413254
purchase_date,2020-03-18,2021-09-15,2021-08-28,2021-08-15,2019-12-22
product_id,903828613,264261718,382406448,218834610,603531440
product_price,531.00,560.00,480.00,780.00,406.00
service_tax,53.1,56.0,62.4,101.4,40.6
comission_value,477.90,504.00,417.60,0.00,365.40
has_coupon,1,0,0,0,1
discount,10%,0%,0%,0%,30%
refund,0,0,0,0,0
cancelled,0,0,0,0,0


# Technical case

## 1) The top 10 products that sold the most in each niche with deactivated membership area and activated recovery

In [11]:
query = """
WITH sales_count AS (
    SELECT
        p.niche,
        p.product_id,
        COUNT(s.purchase_id) AS total_sales
    FROM
        product AS p
    INNER JOIN
        sales AS s ON p.product_id = s.product_id
    WHERE
        p.member_area_active = 0
        AND
        p.recovery_active = 1
    GROUP BY
        ALL
),

sales_ranking AS (
    SELECT
        sc.*,
        RANK() OVER (PARTITION BY sc.niche ORDER BY sc.total_sales DESC) AS niche_rank
    FROM
        sales_count AS sc
)

SELECT
    sr.niche,
    sr.product_id,
    sr.total_sales,
    sr.niche_rank
FROM
    sales_ranking AS sr
WHERE
    sr.niche_rank <= 10
ORDER BY
    sr.niche,
    sr.niche_rank
"""

top_10_products_niche = db.query(query).to_df()

top_10_products_niche.head(30)

Unnamed: 0,niche,product_id,total_sales,niche_rank
0,Artes e Design,273593751,54,1
1,Empreendedorismo,983844542,77,1
2,Empreendedorismo,844949729,55,2
3,Empreendedorismo,382406448,37,3
4,Empreendedorismo,555755392,1,4
5,Empreendedorismo,954425664,1,4
6,Finanças,603531440,53,1
7,Finanças,264261718,18,2
8,Finanças,348532296,3,3
9,Finanças,937565380,2,4


## 2) The top 10 producers who joined XPCourses from 2020 onwards and achieved the highest sales using recovery.

Here we'll use the **comission value** of each sale to rank the producers.

All sales with chargeback, refund and cancelled flags will have $0,00 comission value.

In [29]:
query = """
WITH producers_sales AS (
    SELECT
        pr.producer_id,
        COUNT(DISTINCT s.purchase_id) AS total_sales,
        SUM(s.comission_value::FLOAT) AS total_comission
    FROM
        producer AS pr
    INNER JOIN
        product AS p ON pr.producer_id = p.producer_id
    INNER JOIN
        sales AS s ON p.product_id = s.product_id
    WHERE
        YEAR(pr.registry_date::DATE) >= 2020
    GROUP BY
        pr.producer_id
),

producers_ranking AS (
    SELECT
        ps.*,
        RANK() OVER (ORDER BY ps.total_comission DESC) AS producer_rank
    FROM
        producers_sales AS ps
)

SELECT
    pr_r.producer_id,
    pr_r.total_sales,
    pr_r.total_comission,
    pr_r.producer_rank
FROM
    producers_ranking AS pr_r
WHERE
    pr_r.producer_rank <= 10
ORDER BY
    pr_r.producer_rank
"""

top_10_producers = db.query(query).to_df()

top_10_producers

Unnamed: 0,producer_id,total_sales,total_comission,producer_rank
0,3,181,77343.369888,1
1,7,80,37754.649841,2
2,10,59,19052.450089,3
3,4,14,7136.80011,4


## 3) How much more a producer with the recovery feature activated is likely to sell in each niche? Consider only producers who registered from 2020 onwards.

Although we cannot say that the recovery feature is the **cause** of the increase in sales, we will calculate the average of all sales values ​​of products in each niche with and without this feature and then compare this value.

In [32]:
query = """
WITH avg_per_product_niche AS (
    SELECT
        p.niche,
        (SUM(s.product_price::FLOAT) FILTER (p.recovery_active = 0)) / (COUNT(DISTINCT p.product_id) FILTER (p.recovery_active = 0)) AS avg_price_without_recovery,
        (SUM(s.product_price::FLOAT) FILTER (p.recovery_active = 1)) / (COUNT(DISTINCT p.product_id) FILTER (p.recovery_active = 1)) AS avg_price_with_recovery,
    FROM
        producer AS pr
    INNER JOIN
        product AS p ON pr.producer_id = p.producer_id
    INNER JOIN
        sales AS s ON p.product_id = s.product_id
    WHERE
        YEAR(pr.registry_date::DATE) >= 2020
    GROUP BY
        p.niche
)

SELECT
    a.niche,
    a.avg_price_without_recovery,
    a.avg_price_with_recovery,
    a.avg_price_with_recovery - a.avg_price_without_recovery AS recovery_gain,
    a.avg_price_with_recovery / a.avg_price_without_recovery AS recovery_gain_pct
FROM
    avg_per_product_niche AS a
ORDER BY
    a.niche
"""

recovery_agin = db.query(query).to_df()

recovery_agin


Unnamed: 0,niche,avg_price_without_recovery,avg_price_with_recovery,recovery_gain,recovery_gain_pct
0,Artes e Design,,11017.0,,
1,Educação Infantil,,4403.0,,
2,Empreendedorismo,,34183.5,,
3,Finanças,,2040.0,,
4,Saúde e Alimentação,,6604.0,,
5,Tecnologia e Inovação,,26705.0,,


In [40]:
query = """
SELECT
    pr.producer_id,
    pr.registry_date,
    p.niche,
    p.recovery_active
FROM
    producer AS pr
INNER JOIN
    product AS p ON pr.producer_id = p.producer_id
WHERE
    YEAR(pr.registry_date::DATE) >= 2020
ORDER BY
    p.niche,
    p.recovery_active
"""

db.query(query).to_df()

Unnamed: 0,producer_id,registry_date,niche,recovery_active
0,10,2020-01-12,Artes e Design,1
1,10,2020-01-12,Artes e Design,1
2,4,2021-01-02,Educação Infantil,1
3,4,2021-01-02,Educação Infantil,1
4,4,2021-01-02,Educação Infantil,1
5,4,2021-01-02,Educação Infantil,1
6,3,2020-01-02,Empreendedorismo,1
7,3,2020-01-02,Empreendedorismo,1
8,10,2020-01-12,Finanças,1
9,4,2021-01-02,Idiomas,1


All producers registered in 2020 onwards **only have products with the recovery feature activated**, so it is not possible to calculate the gain from this feature.

## 4) The product niche(s) with the highest number of cancellations and refunds.

## 5) Calculate the total money lost by producer due to cancellations and refunds. Is there any difference for producers considering products with the recovery tool activated?

## 6) If you need to create a ranking of the top creators of 2023, which variables you consider crucial for ranking them? You can also create variables from the data. You must explain your reasoning and your choice of variables and show how this reflect in your SQL code.