# Проект: АиФ Доброе сердце (ETL/витрина данных)  

**Заказчик:** Фонд «АиФ. Доброе сердце»  
Фонд был создан в 2005-м году по инициативе еженедельной газеты «Аргументы и факты». За эти годы больше 10 тысяч подопечных со сложными диагнозами получили помощь с оплатой операций, лекарств и медоборудования. Фонд помогает тяжелобольным детям и взрослым из разных регионов России, а также развивает проекты по всесторонней поддержке подопечных семей — чтобы никто не оставался наедине с болезнью.  

**Цель проекта:** Фонд хочет лучше узнать своих благотворителей для более эффективной работы. Для этого основной задачей будет проведения RFM-анализа, когортного анализа (retention, LTV, средний чек), а также расчет основных маркетинговых и продуктовых мет ей.  

**Используемые библиотеки:**
 

In [2]:
import os
from dotenv import load_dotenv, find_dotenv
from sqlalchemy import text, create_engine 
import pandas as pd
import yadisk
from tqdm import tqdm

## Подключение к Яндекс диску и БД

In [5]:
# загрузка переменной окружения с кодами доступа
load_dotenv(find_dotenv())               

True

In [7]:
# подключение к Яндекс Диску
APP_ID = os.getenv('app_id')
SECRET_ID = os.getenv('secret_id')
TOKEN = os.getenv('ya_token')
y = yadisk.YaDisk(APP_ID, SECRET_ID, TOKEN)
y.check_token()

True

In [9]:
# подключение к БД
db_config = {'user': os.getenv('USER'),                       
             'pwd': os.getenv('PASSWORD'),                           
             'host': os.getenv('HOST'),
             'port': os.getenv('PORT'),                                        
             'db': os.getenv('DATABASE')}    

engine = create_engine("postgresql+psycopg2://{user}:{pwd}@{host}:{port}/{db}".format(**db_config))
conn = engine.connect()

## Загрузка и предобработка данных

In [None]:
# создание списка файлов формата .csv на загрузку
list_of_files = []
for el in list(y.listdir('AIF')):
  if el['path'].endswith('.csv'):
    list_of_files.append(el['path'])

In [None]:
# определение пути загрузки
load_path = "C:/Users/aif/"
if not os.path.exists(load_path):
    os.mkdir(load_path)
os.chdir(load_path)

In [None]:
# скачивание файла, если он не был загружен ранее
for file in tqdm(list_of_files):
    if file.split('/')[-1] not in os.listdir():
        y.download(file.split(':')[1], file.split('/')[-1])
    else:
        continue

In [None]:
# загрузка файла со списком загруженных ранее датасетов
loaded = open('loaded.txt', 'a')

In [None]:
# формирование датасетов из файлов при условии, что они не были скачены и загружены ранее

actions = []
orders = []
loaded_files = []

list_of_files = list(map(lambda x: x.split('/')[-1], list_of_files))

for file in tqdm(list_of_files):
    if file not in open('loaded.txt', 'r+').read():
        if 'actions' in file:
            df = pd.read_csv(file, sep=';')
            actions.append(df)
            loaded_files.append(file)
        else:
            df = pd.read_csv(file, sep=';')
            orders.append(df)
            loaded_files.append(file)

In [None]:
# создание записи о том, что файл загружен в датасет

for file in loaded_files:
    loaded.write(f"{file}\n")

In [None]:
# объединение датасетов
try:
    actions_df = pd.concat(actions, ignore_index=True)
    orders_df = pd.concat(orders, ignore_index=True)
except Exception as e:
    print(e)

In [None]:
# удаление полных дубликатов
try:
    actions_df = actions_df.drop_duplicates(keep='last').reset_index(drop=True)
    orders_df = orders_df.drop_duplicates(keep='last').reset_index(drop=True)
except Exception as e:
    print(e)

In [None]:
# удаление лишних столбцов
try: 
    actions_df = actions_df.drop(columns=['CustomerActionActionTemplateIdsSystemName',
                  'CustomerActionBrandIdsSystemName', 
                  'CustomerActionChannelIdsSystemName',
                  'CustomerActionCustomerIdsBackendID', 
                  'CustomerActionCustomerIdsWebsiteID'])
    orders_df = orders_df.drop(columns=['OrderAreaIdsExternalId',
                                       'OrderTransactionIdsExternalId',
                                       'OrderIdsBackendID',
                                       'OrderCustomFieldsNextPayDate',
                                       'OrderLineGiftCardAmount',
                                       'OrderLineGiftCardStatusIdsSystemName',
                                       'OrderLineId',
                                       'OrderLineLineId',
                                       'OrderCustomerIdsBackendID',
                                       'OrderCustomerIdsWebsiteID'])
except Exception as e:
    print(e)

In [None]:
# приведение типов данных
try:
    actions_df['CustomerActionDateTimeUtc'] = actions_df['CustomerActionDateTimeUtc'].apply(lambda x: x.split(' ')[0])
    actions_df['CustomerActionDateTimeUtc'] = pd.to_datetime(actions_df['CustomerActionDateTimeUtc'], format='%d.%m.%Y')
    actions_df['CustomerActionCreationDateTimeUtc'] = actions_df['CustomerActionCreationDateTimeUtc'].apply(lambda x: x.split(' ')[0])
    actions_df['CustomerActionCreationDateTimeUtc'] = pd.to_datetime(actions_df['CustomerActionCreationDateTimeUtc'], format='%d.%m.%Y')

    orders_df['OrderFirstActionDateTimeUtc'] = orders_df['OrderFirstActionDateTimeUtc'].apply(lambda x: x.split(' ')[0])
    orders_df['OrderFirstActionDateTimeUtc'] = pd.to_datetime(orders_df['OrderFirstActionDateTimeUtc'], format='%d.%m.%Y')
except Exception as e:
    print(e)

## Загрузка данных в БД

In [None]:
# загрузка датасета actions
try:
    actions_df.to_sql('actions', con=conn, index=False, if_exists='append')
except Exception as e:
    print(e)

In [None]:
# загрузка датасета orders
try:
    orders_df.to_sql('orders', con=conn, index=False, if_exists='append')
except Exception as e:
    print(e)

In [13]:
# функция для чтения запросов к БД
def sql_query(query):
    return pd.io.sql.read_sql(sql=text(query), con = conn)

### Выделение RFM-сегментов

RFM-сегментация проводится только среди действующих жертвователей, то есть по таблице orders_df

In [15]:
rfm = """WITH base AS (SELECT "OrderCustomerIdsMindboxId", 
                     "OrderFirstActionDateTimeUtc", 
                     "OrderLinePriceOfLine"
              FROM orders
              WHERE "OrderLineStatusIdsExternalId" = 'Paid'),
fm_base AS (SELECT "OrderCustomerIdsMindboxId" AS usr,
              SUM("OrderLinePriceOfLine") AS monetary,
              COUNT("OrderLinePriceOfLine") AS frequency
              FROM base
              GROUP BY "OrderCustomerIdsMindboxId"),
f AS (SELECT PERCENTILE_DISC(0.30) WITHIN GROUP (ORDER BY frequency) AS fr_fs,
             PERCENTILE_DISC(0.70) WITHIN GROUP (ORDER BY frequency) AS fr_sn
       FROM fm_base),
m AS (SELECT PERCENTILE_DISC(0.30) WITHIN GROUP (ORDER BY monetary) AS mn_fs,
             PERCENTILE_DISC(0.70) WITHIN GROUP (ORDER BY monetary) AS mn_sn
       FROM fm_base),
rec AS (SELECT DISTINCT "OrderCustomerIdsMindboxId" AS usr,
             LAST_VALUE("OrderFirstActionDateTimeUtc") OVER(PARTITION BY "OrderCustomerIdsMindboxId" ORDER BY "OrderFirstActionDateTimeUtc" RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_date,
             MAX("OrderFirstActionDateTimeUtc") OVER() AS max_date
             FROM base),
for_rfm AS (SELECT rec.usr,
                   last_date,
                   max_date - last_date AS recency,
                   fm_base.monetary AS monetary,
                   fm_base.frequency AS frequency
            FROM rec
            JOIN fm_base ON rec.usr = fm_base.usr),
r AS (SELECT PERCENTILE_DISC(0.30) WITHIN GROUP (ORDER BY recency) AS r_fs,
             PERCENTILE_DISC(0.70) WITHIN GROUP (ORDER BY recency) AS r_sn
       FROM for_rfm),
rfm AS (SELECT usr,
               last_date,
               recency,
               CASE WHEN recency <= (SELECT r_fs FROM r) THEN '1'
                    WHEN recency <= (select r_sn FROM r) THEN '2'
                    ELSE '3' 
                    END AS R,
               frequency,
               CASE WHEN frequency <= (SELECT fr_fs FROM f) THEN '3'
                    WHEN frequency <= (select fr_sn FROM f) THEN '2'
                    ELSE '1' 
                    END AS F,
               monetary,
               CASE WHEN monetary <= (SELECT mn_fs FROM m) THEN '3'
                    WHEN monetary <= (SELECT mn_sn FROM m) THEN '2'
                    ELSE '1' 
                    END AS M
        FROM for_rfm)
SELECT usr,
       EXTRACT(DAYS FROM recency)::int as recency,
       frequency,
       monetary,
       CONCAT(R,F,M) AS RFM
FROM rfm"""


rfm_segment = sql_query(rfm)

In [17]:
rfm_segment

Unnamed: 0,usr,recency,frequency,monetary,rfm
0,30017,764,1,300.0,333
1,37990,340,2,200.0,223
2,24055,1027,3,600.0,312
3,42207,31,2,250.0,123
4,40555,361,1,300.0,233
...,...,...,...,...,...
22835,20728,934,1,200.0,333
22836,58012,33,1,300.0,133
22837,35823,543,1,500.0,232
22838,23038,36,13,4100.0,111


## Retention

In [19]:
# n-month retention
retention = """WITH sessions AS (SELECT "CustomerActionCustomerIdsMindboxId" AS usr,
                                    MIN(date_trunc('month',"CustomerActionDateTimeUtc"::date)) OVER (PARTITION BY "CustomerActionCustomerIdsMindboxId") AS cohort,
                                    date_trunc('month',"CustomerActionDateTimeUtc"::date) AS mth                         
                             FROM actions),
                cohorts AS (SELECT cohort::date as cohort,
                                   mth,
                                   COUNT(DISTINCT usr) as cnt
                            FROM sessions
                            GROUP BY cohort, mth
                            ORDER BY cohort, mth)
SELECT cohort,
       ROW_NUMBER() OVER(PARTITION BY cohort ORDER BY mth) - 1 AS mth,
       ROUND(cnt::numeric / MAX(cnt) OVER(PARTITION BY cohort)::numeric, 4) *100 AS n_month_retention
FROM cohorts
"""

n_mth_retention = sql_query(retention)

In [20]:
n_mth_retention

Unnamed: 0,cohort,mth,n_month_retention
0,2021-01-01,0,100.00
1,2021-01-01,1,63.46
2,2021-01-01,2,60.34
3,2021-01-01,3,56.09
4,2021-01-01,4,52.12
...,...,...,...
1030,2024-07-01,1,97.24
1031,2024-07-01,2,83.83
1032,2024-08-01,0,100.00
1033,2024-08-01,1,86.50


## Churn rate

In [23]:
churn = """WITH profiles AS (SELECT "CustomerActionCustomerIdsMindboxId" AS usr,
                         MIN(DATE_TRUNC('month', "CustomerActionDateTimeUtc"))::date AS cohort
                  FROM actions
                  GROUP BY "CustomerActionCustomerIdsMindboxId"),
     cohorts AS (SELECT cohort,
                        DATE_TRUNC('month', a."CustomerActionDateTimeUtc")::date AS event_mth,
                        COUNT(DISTINCT a."CustomerActionCustomerIdsMindboxId") AS cnt
                 FROM profiles AS p
                 JOIN actions AS a ON p.usr = a."CustomerActionCustomerIdsMindboxId"
                 GROUP BY cohort, event_mth)
SELECT *,
       LAG(cnt) OVER(PARTITION BY cohort ORDER BY event_mth),
       COALESCE(ROUND((1 - (cnt::NUMERIC / LAG(cnt) OVER(PARTITION BY cohort ORDER BY event_mth)))*100, 2), 0) AS churn
FROM cohorts
"""
churn_rate = sql_query(churn)

In [25]:
churn_rate

Unnamed: 0,cohort,event_mth,cnt,lag,churn
0,2021-01-01,2021-01-01,353,,0.00
1,2021-01-01,2021-02-01,224,353.0,36.54
2,2021-01-01,2021-03-01,213,224.0,4.91
3,2021-01-01,2021-04-01,198,213.0,7.04
4,2021-01-01,2021-05-01,184,198.0,7.07
...,...,...,...,...,...
1030,2024-07-01,2024-08-01,950,977.0,2.76
1031,2024-07-01,2024-09-01,819,950.0,13.79
1032,2024-08-01,2024-08-01,1555,,0.00
1033,2024-08-01,2024-09-01,1345,1555.0,13.50


## LTV

In [27]:
ltv = """WITH profiles AS (SELECT "CustomerActionCustomerIdsMindboxId" AS usr,
                                  MIN(DATE_TRUNC('month', "CustomerActionDateTimeUtc"))::date AS cohort
                           FROM actions
                           GROUP BY "CustomerActionCustomerIdsMindboxId"),
              cohorts AS (SELECT p.cohort,
                                 DATE_TRUNC('month', o."OrderFirstActionDateTimeUtc")::date AS event_mth,
                                 COUNT(p.usr) OVER (PARTITION BY p.cohort) AS cnt,
                                 SUM(o."OrderLinePriceOfLine") OVER (PARTITION BY p.cohort ORDER BY (DATE_TRUNC('month', o."OrderFirstActionDateTimeUtc")::date)) AS summa
                          FROM profiles AS p
                          JOIN orders AS o ON p.usr = o."OrderCustomerIdsMindboxId" 
                                           AND o."OrderLineStatusIdsExternalId" LIKE 'Paid'),
              ltv_tbl AS (SELECT cohort,
                                 event_mth,
                                 ROUND(MAX(summa) / MAX(cnt), 2) AS ltv
                          FROM cohorts
                          GROUP BY cohort, event_mth)
SELECT cohort,
       ROW_NUMBER() OVER (PARTITION BY cohort ORDER BY event_mth) - 1 AS mth,
       ltv
FROM ltv_tbl
"""
ltv = sql_query(ltv)

In [29]:
ltv

Unnamed: 0,cohort,mth,ltv
0,2021-01-01,0,80.79
1,2021-01-01,1,111.46
2,2021-01-01,2,138.77
3,2021-01-01,3,163.80
4,2021-01-01,4,189.23
...,...,...,...
1030,2024-07-01,1,928.23
1031,2024-07-01,2,938.61
1032,2024-08-01,0,1696.30
1033,2024-08-01,1,1713.81


## Average order value (AOV)

In [31]:
aov = """WITH profiles AS (SELECT "CustomerActionCustomerIdsMindboxId" AS usr,
                                  MIN(DATE_TRUNC('month', "CustomerActionDateTimeUtc"))::date AS cohort
                           FROM actions
                           GROUP BY "CustomerActionCustomerIdsMindboxId"), 
              cohorts AS (SELECT p.cohort,
                                 DATE_TRUNC('month', o."OrderFirstActionDateTimeUtc")::date AS event_mth,
                                 COUNT(o."OrderIdsMindboxId") AS cnt,
                                 SUM(o."OrderLinePriceOfLine") AS summa
                          FROM profiles p
                          LEFT JOIN orders o ON p.usr = o."OrderCustomerIdsMindboxId" 
                                             AND o."OrderLineStatusIdsExternalId" like 'Paid'::text
                          GROUP BY p.cohort, (date_trunc('month', o."OrderFirstActionDateTimeUtc")::date))
 SELECT cohort,
        ROW_NUMBER() OVER (PARTITION BY cohort ORDER BY event_mth) - 1 AS mth,
        ROUND(summa /cnt, 2) AS aov
        FROM cohorts"""

aov = sql_query(aov)

In [33]:
aov

Unnamed: 0,cohort,mth,aov
0,2021-01-01,0,1042.24
1,2021-01-01,1,635.48
2,2021-01-01,2,604.79
3,2021-01-01,3,582.70
4,2021-01-01,4,647.05
...,...,...,...
1061,2024-08-01,0,1730.23
1062,2024-08-01,1,892.69
1063,2024-08-01,2,
1064,2024-09-01,0,2065.41


## MAU, WAU, DAU, sticky factor

In [35]:
mau = """SELECT date_trunc('month',"CustomerActionDateTimeUtc")::date AS mth,
                COUNT(DISTINCT "CustomerActionCustomerIdsMindboxId") AS cnt
         FROM actions
        GROUP BY date_trunc('month',"CustomerActionDateTimeUtc")::date
        ORDER BY mth"""
wau = """SELECT EXTRACT(YEAR FROM "CustomerActionDateTimeUtc") AS yr,
                EXTRACT(WEEK FROM "CustomerActionDateTimeUtc") AS wk,
                COUNT(DISTINCT "CustomerActionCustomerIdsMindboxId") AS cnt
         FROM actions
         GROUP BY EXTRACT(YEAR FROM "CustomerActionDateTimeUtc"), EXTRACT(WEEK FROM "CustomerActionDateTimeUtc")
         ORDER BY yr, wk"""
dau = """SELECT "CustomerActionDateTimeUtc"::date AS dt,
                 COUNT(DISTINCT "CustomerActionCustomerIdsMindboxId") AS cnt
         FROM actions
         GROUP BY "CustomerActionDateTimeUtc"::date
         ORDER BY dt"""
sticky_factor = """WITH mau AS (SELECT date_trunc('month',"CustomerActionDateTimeUtc")::date AS mth,
                                       COUNT(DISTINCT "CustomerActionCustomerIdsMindboxId") AS cnt
                                FROM actions
                                GROUP BY date_trunc('month',"CustomerActionDateTimeUtc")::date),
                        dau AS (SELECT "CustomerActionDateTimeUtc"::date AS dt,
                                       COUNT(DISTINCT "CustomerActionCustomerIdsMindboxId") AS cnt
                                FROM actions
                                GROUP BY "CustomerActionDateTimeUtc"::date
                                ORDER BY dt),
                        avgdau AS (SELECT date_trunc('month', dt) AS mth,
                                          avg(cnt) AS cnt
                                   FROM dau
                                   GROUP BY date_trunc('month', dt))
                  SELECT m.mth,
                         round(a.cnt/m.cnt * 100, 2) AS sticky_factor
                  FROM mau AS m
                  JOIN avgdau AS a ON m.mth = a.mth"""

In [37]:
mau = sql_query(mau)
wau = sql_query(wau)
dau = sql_query(dau)
sticky_factor = sql_query(sticky_factor)

In [39]:
mau

Unnamed: 0,mth,cnt
0,2021-01-01,353
1,2021-02-01,395
2,2021-03-01,466
3,2021-04-01,849
4,2021-05-01,571
5,2021-06-01,561
6,2021-07-01,388
7,2021-08-01,639
8,2021-09-01,643
9,2021-10-01,753


In [41]:
wau

Unnamed: 0,yr,wk,cnt
0,2021.0,1.0,78
1,2021.0,2.0,87
2,2021.0,3.0,68
3,2021.0,4.0,121
4,2021.0,5.0,72
...,...,...,...
188,2024.0,32.0,8174
189,2024.0,33.0,3329
190,2024.0,34.0,8481
191,2024.0,35.0,14921


In [43]:
dau

Unnamed: 0,dt,cnt
0,2021-01-01,11
1,2021-01-02,15
2,2021-01-03,14
3,2021-01-04,17
4,2021-01-05,10
...,...,...
1334,2024-08-31,496
1335,2024-09-01,459
1336,2024-09-02,457
1337,2024-09-03,6865


In [45]:
sticky_factor

Unnamed: 0,mth,sticky_factor
0,2021-01-01,3.68
1,2021-02-01,3.95
2,2021-03-01,3.52
3,2021-04-01,3.63
4,2021-05-01,3.72
5,2021-06-01,3.68
6,2021-07-01,3.81
7,2021-08-01,3.73
8,2021-09-01,3.79
9,2021-10-01,3.59
