In [2]:
import pandas as pd
import psycopg2
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import seaborn as sns 
import statsmodels.stats.proportion as proportion
from scipy.stats import ttest_ind,mannwhitneyu,shapiro,norm
from statsmodels.stats.weightstats import ztest
from tqdm import tqdm
import timeit
from scipy import stats
import math
from datetime import date, datetime, timedelta
import time
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
import warnings
warnings.filterwarnings("ignore")
import clickhouse_connect  



from credential import postgres_secret,clickhouse_dwh_secret

def get_engine(user):
    """Build a SQLAlchemy engine for the database that ``user`` belongs to.

    The login is matched against the ``user`` entry of ``postgres_secret`` and
    ``clickhouse_dwh_secret``; password, host and database name are read from
    the matching secret.

    Args:
        user: Login name; must equal the ``user`` value of one of the secrets.

    Returns:
        The engine produced by ``create_engine`` (PostgreSQL URL on port 6432
        or ClickHouse URL on port 8123).

    Raises:
        ValueError: If ``user`` matches neither secret. Without this check the
            function reached ``return engine`` with the name unbound.
    """
    if user == postgres_secret['user']:
        db_name = postgres_secret['db_name']
        password = postgres_secret['password']
        host = postgres_secret['host']
        engine = create_engine(f'postgresql://{user}:{password}@{host}:6432/{db_name}')
    elif user == clickhouse_dwh_secret['user']:
        db_name = clickhouse_dwh_secret['db_name']
        password = clickhouse_dwh_secret['password']
        host = clickhouse_dwh_secret['host']
        engine = create_engine(f'clickhouse://{user}:{password}@{host}:8123/{db_name}')
    else:
        raise ValueError(f"Unknown database user: {user!r}")
    return engine
    
# clickhouse_connect client for the DWH host, bound to the `datamarts`
# database; host and credentials come from clickhouse_dwh_secret.
connection_clickhouse = clickhouse_connect.get_client(
    **{
        'host': clickhouse_dwh_secret['host'],
        'port': '8123',
        'username': clickhouse_dwh_secret['user'],
        'password': clickhouse_dwh_secret['password'],
        'database': 'datamarts',
    }
)

    
def execute(SQL, user):
    """Run one SQL statement as ``user`` and return its rows as a DataFrame.

    A fresh engine is obtained from ``get_engine(user)`` for every call, the
    statement is executed inside a short-lived session, and the elapsed time
    is printed.

    Args:
        SQL: Statement text; it is wrapped in ``text()`` before execution.
        user: Login name passed to ``get_engine`` to pick the database.

    Returns:
        pandas.DataFrame built from ``result.fetchall()`` with
        ``result.keys()`` as column names.
    """
    start_time = time.perf_counter()  # monotonic clock: safe for measuring durations
    engine = get_engine(user)
    try:
        Session = sessionmaker(bind=engine)  # session factory
        with Session() as session:  # the session is closed on leaving the block
            result = session.execute(text(SQL))
            df = pd.DataFrame(result.fetchall(), columns=result.keys())
    finally:
        # Each call creates its own engine; release its connection pool so
        # repeated calls (e.g. in the per-day loops) do not pile up connections.
        engine.dispose()

    execution_time = round(time.perf_counter() - start_time, 4)  # elapsed seconds

    print(f"Время выполнения функции: {execution_time} секунд")
    print()
    return df

In [3]:
pwd

'/Users/kemran/Desktop/work_files/python_files'

In [None]:
# Promo-code activations: promo codes joined to their bonus program, the
# activation record and the user bonus, keeping activations dated between
# 2024-01-01 and 2024-08-01.
# NOTE(review): the f-prefix is unused (the string has no placeholders).
# NOTE(review): bonus_end_at adds `duration` to the date without looking at
# `duration_unit` — presumably only correct when the unit is days; confirm.
query = f'''
                    SELECT 
                    t4.title AS bonus_title,
                    t1.title AS title,
                    JSONExtractInt(t4.metadata,'duration') AS duration,
                    JSONExtractString(t4.metadata,'duration_unit') AS duration_unit,
                    t3.activated_at AS bonus_activated_at,
                    t3.activated_at::date AS bonus_start_at,
                    t3.activated_at::date + JSONExtractInt(t4.metadata,'duration') AS bonus_end_at

                    FROM product_x.promo_codes AS t1

                    LEFT JOIN (SELECT * FROM product_x.bonus_programs bp 
                                ) t4
                    ON t1.bonus_program_id=t4.id

                    LEFT JOIN (SELECT * FROM product_x.promo_code_activations
                               ) AS t2
                    ON 	t1.id=t2.promo_code_id


                    LEFT JOIN (SELECT * FROM  product_x.user_bonuses
                              ) AS t3
                    ON t2.user_bonus_id=t3.id

                    WHERE bonus_activated_at BETWEEN'2024-01-01' AND '2024-08-01'
         '''

# Run the query with the ClickHouse login and keep the rows in `df`.
df = execute(query,user = 'kmekhtiev')
# df_retention = execute(query_retention,user = 'kmekhtiev')
# df_retention['reg_date'] = df_retention['reg_date'].astype('datetime64[ns]')

In [None]:
# Write `df` to an Excel file in the current working directory (no index column).
# NOTE(review): the file name reads "January to February" (and misspells
# "февраль"), while the promo query filters 2024-01-01..2024-08-01 — confirm
# the intended name before sharing the file.
df.to_excel('Активированные коды с января по ферваль.xlsx',index=False)

## Watchtime 

In [None]:
# Per-shard replicated storage for the daily watchtime datamart.
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE fails as soon
# as the table already exists. {shard}/{replica} are ClickHouse macros, so the
# string must stay a plain (non-f) string.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_watchtime_by_day_local ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                client_type_general String,
                client_type_w_sberdevice String,
                bonus_title String,
                item_type String,
                b2c_b2b String,
                user_type String,
                free_days String,
                app_version String,
                watchtime Int32,
                watchtime_session_watch Int32,
                promo_type String,
                session_cnt Int32,
                session_watch Int32,
                session_cnt_ttl Int32,
                session_watch_ttl Int32
             )
             ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/mekhtiev_watchtime_by_day_local', '{replica}')
             ORDER BY date
             '''
execute(query,user = 'kmekhtiev')

In [None]:
# Distributed table over mekhtiev_watchtime_by_day_local; rows are spread
# across shards by rand(). Column list mirrors the local table.
# IF NOT EXISTS keeps the cell re-runnable when the table already exists.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_watchtime_by_day_distr ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                client_type_general String,
                client_type_w_sberdevice String,
                bonus_title String,
                item_type String,
                b2c_b2b String,
                user_type String,
                free_days String,
                app_version String,
                watchtime Int32,
                watchtime_session_watch Int32,
                promo_type String,
                session_cnt Int32,
                session_watch Int32,
                session_cnt_ttl Int32,
                session_watch_ttl Int32
             )
             ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_watchtime_by_day_local, rand())
             '''
execute(query,user = 'kmekhtiev')

In [22]:
# Rebuild the daily watchtime datamart one day at a time: delete the day's rows
# from the local table, then re-insert them through the distributed table.
#
# t1 = sessions per (date, profile, client, item type, user type, ...),
# t2 = watchtime and "watch sessions" per the same keys; they are joined on
#      date / profile_id / client_type / user_type / item_type.
#
# Fixes relative to the earlier version of this cell:
#   * both 'trial' branches of the user_type CASE lacked `AND` before the
#     parenthesised prolongation check, so the check was parsed as a call on
#     `date` instead of being applied as a filter;
#   * in the streaming subquery a comma was missing after `trial_end`, which
#     turned the next name into an alias (`trial_end AS first_prolong_date`)
#     and skewed user_type on the t2 side of the join;
#   * the loop variable is `day` (not `date`) so datetime.date imported at the
#     top is not overwritten, and the SQL interpolates the formatted date_str
#     rather than the Timestamp repr.
list_date = pd.date_range("2024-11-01", "2024-11-07", freq='D')
result = []
for day in list_date:
    date_str = day.strftime('%Y-%m-%d')
    query_delete = f" DELETE FROM datamarts.mekhtiev_watchtime_by_day_local ON CLUSTER 'viasat_cluster' WHERE date = '{date_str}'::date"
    execute(query_delete,user = 'kmekhtiev')
    print(f" Дата {date_str} удалена из таблицы")
    query = f''' INSERT INTO datamarts.mekhtiev_watchtime_by_day_distr
           SELECT 
                t1.date,
                t1.profile_id,
                t1.client_type,
                t1.client_type_w_sberdevice,
                t1.bonus_title,
                t2.item_type,
                t1.b2c_b2b,
                t1.user_type,
                t1.free_days,
                t1.app_version,
                t2.watchtime2 AS watchtime,
                t2.watchtime_session_watch,
                t1.promo_type,
                t1.session_cnt,
                t2.watch_session2 AS session_watch,
                t1.session_cnt_ttl,
                t2.session_watch_ttl
                FROM
                (SELECT  
                    date,  
                    t1.profile_id AS profile_id,  
                    client_type,
                    client_type_w_sberdevice,
                    promo_type, 
                    bonus_title,
                    CASE WHEN item_type LIKE '' THEN 'kinom'
                         WHEN item_type IN ('series','movie','tvchannel') THEN item_type
                         ELSE 'other'
                    END AS item_type,                    
                    CASE WHEN promo_type='cards' THEN 'b2b' ELSE 'b2c' END AS b2c_b2b,
                    CASE WHEN first_prolong_date!='1970-01-01' AND date>=first_prolong_date AND ends_at>=date THEN 'subs'
                         WHEN date>=trial_start AND ends_at>=date AND (first_prolong_date='1970-01-01' OR first_prolong_date>date) THEN 'trial'
                         WHEN date>=reg_date AND (trial_start='1970-01-01' OR trial_start>date) THEN 'reg'
                    END AS user_type,
                    CASE WHEN free_days in (3,14,30,35,45) THEN toString(free_days)
                         WHEN free_days=0 AND promo_type='cards' THEN 'no_trial_b2b' 
                         WHEN free_days=0 AND promo_type!='cards' THEN 'no_trial_b2c'
                         ELSE 'other_trial'
                    END AS free_days,
                    session_cnt_ttl,
                    max(app_version) AS app_version,
                    uniq(session_id) AS session_cnt
                    FROM (SELECT date,profile_id,user_id,promo_type,client_type,event_name,event_page,payload,app_version,session_id,reg_date,bonus_title,
                          created_at::date AS trial_start,free_days,trial_start+free_days AS trial_end,first_prolong_date::date AS first_prolong_date,ends_at,
                          JSONExtractString(payload,'item_type') AS item_type,
                          JSONExtractInt(payload,'duration') AS duration,
                          CASE WHEN app_version like '%sber%' THEN 'sber_device' 
                               WHEN app_version not like '%sber%' AND client_type='android_tv' THEN 'android_tv (without sber)' 
                               ELSE client_type 
                          END AS client_type_w_sberdevice,
                          uniq(session_id) OVER (PARTITION BY date,profile_id) as session_cnt_ttl
                          FROM datamarts.sandbox_data_distr 
                          WHERE date='{date_str}'::date
                          AND client_type != 'backend'  
                          AND reg_date!='1970-01-01'
                          AND profile_id IS NOT NULL
                         ) AS t1
                    GROUP BY  1,2,3,4,5,6,7,8,9,10,11
                    ) AS t1
                    
LEFT JOIN  (SELECT 
                       date,
                       profile_id,
                       client_type,
                       item_type,
                       user_type,
                       bonus_title,
                       free_days,
                       session_watch_ttl,
                       uniq(CASE WHEN watch_session=1 THEN session_id END) AS watch_session2,
                       COALESCE(sum(CASE WHEN watch_session=1 THEN watchtime END),0) AS watchtime_session_watch,
                       sum(watchtime) AS watchtime2
                       FROM 
                           (SELECT 
                            date,
                            client_type,
                            profile_id,
                            item_type,
                            session_id,
                            bonus_title,
                            user_type,
                            free_days,
                            CASE WHEN item_type='kinom' AND duration>0 AND watchtime>=30 THEN 1
                                 WHEN item_type='tvchannel' AND watchtime>=30 THEN 1
                                 WHEN item_type='series' AND duration>600 AND watchtime>=duration*0.05 THEN 1
                                 WHEN item_type='movie' AND duration>600 AND watchtime>=duration*0.05 THEN  1
                                 ELSE 0
                            END AS watch_session,                           
                            watchtime,
                            uniq(CASE WHEN watch_session=1 THEN session_id END) OVER (PARTITION BY date,profile_id) AS session_watch_ttl
                            FROM
                            (SELECT 
                                date,
                                profile_id,
                                client_type,
                                bonus_title,
                                CASE WHEN item_type LIKE '' THEN 'kinom'
                                     WHEN item_type IN ('series','movie','tvchannel') THEN item_type
                                     ELSE 'other'
                                END AS item_type,
                                item_title,
                                duration,
                                session_id,
                                user_type,
                                free_days,
                                sum(viewing_time) AS watchtime
                                FROM 
                                 (SELECT
                                   date,
                                   utc_timestamp,
                                   t1.profile_id AS profile_id,
                                   client_type,
                                   bonus_title,
                                   event_name,
                                   event_page,
                                   session_id,
                                   reg_date,
                                   trial_start,
                                   trial_end,
                                   first_prolong_date,
                                   ends_at,
                                   free_days,
                                   CASE WHEN first_prolong_date!='1970-01-01' AND date>=first_prolong_date AND ends_at>=date THEN 'subs'
                                        WHEN date>=trial_start AND ends_at>=date AND (first_prolong_date='1970-01-01' OR first_prolong_date>date) THEN 'trial'
                                        WHEN date>=reg_date AND (trial_start='1970-01-01' OR trial_start>date) THEN 'reg'
                                   END AS user_type,
                                   JSONExtractInt(payload,'duration') AS duration,
                                   JSONExtractString(payload,'item_type') AS item_type,
                                   JSONExtractString(payload,'item_title') AS item_title,
                                   JSONExtractString(payload,'season') AS item_season, 
                                   JSONExtractString(payload,'episode') AS item_episode,          
                                   CASE WHEN event_page<>'tvchannel'
                                        AND JSONExtractInt(payload,'viewing_time')<= JSONExtractInt(payload,'duration') 
                                   THEN JSONExtractInt(payload,'viewing_time')
                                   WHEN event_page='tvchannel' 
                                        AND JSONExtractInt(payload,'viewing_time') <18000 
                                   THEN JSONExtractInt(payload,'viewing_time')
                                   END AS viewing_time,
                                   count(DISTINCT item_title) OVER (PARTITION BY date,profile_id) AS viewing_cnt,
                                   sum(viewing_time) OVER (PARTITION BY date,profile_id,item_title) AS viewing_time_per_item
                                   FROM (SELECT date,utc_timestamp,profile_id,user_id,client_type,event_name,event_page,payload,app_version,session_id,reg_date,bonus_title,
                                         created_at::date AS trial_start,free_days,trial_start+free_days AS trial_end,first_prolong_date::date AS first_prolong_date,ends_at
                                         FROM datamarts.sandbox_data_distr 
                                         WHERE date='{date_str}'::date
                                         AND event_name IN ('auto_player_streaming','auto_kinom_streaming')
                                         AND reg_date!='1970-01-01' 
                                         )AS t1

                                   WHERE viewing_time IS NOT NULL   
                                 )
                                 GROUP BY 1,2,3,4,5,6,7,8,9,10
                             )      
                          )
                          GROUP BY 1,2,3,4,5,6,7,8       
                ) AS t2
ON  t1.date=t2.date
AND t1.profile_id=t2.profile_id
AND t1.client_type=t2.client_type
AND t1.user_type=t2.user_type
AND t1.item_type=t2.item_type
'''


    df_temp = execute(query,user = 'kmekhtiev')
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()

Время выполнения функции: 7.0054 секунд

 Дата 2024-11-01 удалена из таблицы
Время выполнения функции: 27.707 секунд

Дата '2024-11-01' загружена

Время выполнения функции: 2.1314 секунд

 Дата 2024-11-02 удалена из таблицы
Время выполнения функции: 12.5056 секунд

Дата '2024-11-02' загружена

Время выполнения функции: 8.6158 секунд

 Дата 2024-11-03 удалена из таблицы
Время выполнения функции: 3.0357 секунд

Дата '2024-11-03' загружена

Время выполнения функции: 6.6377 секунд

 Дата 2024-11-04 удалена из таблицы
Время выполнения функции: 2.545 секунд

Дата '2024-11-04' загружена

Время выполнения функции: 7.4882 секунд

 Дата 2024-11-05 удалена из таблицы
Время выполнения функции: 2.5821 секунд

Дата '2024-11-05' загружена

Время выполнения функции: 6.2407 секунд

 Дата 2024-11-06 удалена из таблицы
Время выполнения функции: 2.4653 секунд

Дата '2024-11-06' загружена

Время выполнения функции: 6.7295 секунд

 Дата 2024-11-07 удалена из таблицы
Время выполнения функции: 2.6826 секунд



In [None]:
# Daily totals since 2024-01-01 from the distributed watchtime table:
# summed watchtime divided by 3600 and the number of distinct profiles.
query = ''' SELECT date,sum(watchtime)/3600  AS watchtime,uniq(profile_id) AS cnt_user FROM datamarts.mekhtiev_watchtime_by_day_distr WHERE date>='2024-01-01' GROUP BY 1'''
df = execute(query,user='kmekhtiev')


# Same aggregation over mekhtiev_watchtime_by_day_distr2.
# NOTE(review): that table is not created anywhere in the visible part of this
# notebook — confirm it exists before running.
query2 = '''SELECT date,sum(watchtime)/3600 AS watchtime,uniq(profile_id) AS cnt_user FROM datamarts.mekhtiev_watchtime_by_day_distr2 WHERE date>='2024-01-01' GROUP BY 1'''
df2 = execute(query2,user='kmekhtiev')


In [None]:
# Overlay the daily watchtime series of the two tables. Each line gets an
# explicit label: without labels plt.legend() has nothing to show and the two
# series cannot be told apart.
plt.figure(figsize=(20,8),dpi=150)

sns.lineplot(data=df,x='date',y='watchtime',label='mekhtiev_watchtime_by_day_distr')
sns.lineplot(data=df2,x='date',y='watchtime',label='mekhtiev_watchtime_by_day_distr2')

plt.title('Daily watchtime by source table')
plt.xlabel('date')
plt.ylabel('watchtime, hours')
plt.legend()

plt.show()

## Квантили смотрения ТВ каналов

In [None]:
# TV-channel viewing events from start_date up to yesterday (local clock):
# one row per streaming event with its date, profile, page and viewing_time.
start_date = "2024-09-01"
end_date = datetime.now().date() - timedelta(days=1)
# The inner query extracts viewing_time/duration/item_type from the JSON
# payload of the two streaming events; the outer one keeps event_page='tvchannel'.
query  = f''' SELECT date,
                     profile_id,
                     event_page,
                     viewing_time
                     FROM
                            (SELECT
                                        utc_timestamp::date AS date,
                                        profile_id,
                                        client_type,
                                        event_page,
                                        JSONExtractInt(payload,'viewing_time') AS viewing_time,
                                        JSONExtractInt(payload,'duration') AS duration,
                                        JSONExtractString(payload,'item_type') AS item_type,
                                        payload
                                    FROM
                                    datamarts.sandbox_data_distr x 
                                    WHERE event_name IN ('auto_player_streaming','auto_kinom_streaming')
                                    AND utc_timestamp::date BETWEEN '{start_date}' AND '{end_date}'
                                    AND reg_date!='1970-01-01'
                                    AND profile_id IS NOT NULL
                                    AND viewing_time>=0
                                    ) as t1
                                    WHERE event_page='tvchannel'
              
                                    '''
df_tv = execute(query,user = 'kmekhtiev')
# Normalise the date column to pandas datetime64 for later grouping/plotting.
df_tv['date'] = df_tv['date'].astype('datetime64[ns]')

In [None]:
# Display the TV-channel viewing frame (Jupyter truncates long frames).
df_tv

In [None]:
# Share of profiles that have at least one TV event with viewing_time above
# 18000, relative to all profiles present in df_tv.
df_tv.loc[df_tv['viewing_time'].gt(18000), 'profile_id'].nunique() / df_tv['profile_id'].nunique()

In [None]:
# Number of distinct profiles present in the TV-channel viewing frame.
df_tv['profile_id'].nunique()

In [None]:
# Percentile levels (in percent) to report and the df_tv columns to describe.
quantiles=[1,2,3,4,5,6,7,8,9,10,25,50,75,80,85,90,91,92,93,94,95,96,97,98,99,99.1,99.2,99.3,99.4,99.5,99.6,99.8,99.9]
columns = ['viewing_time']


# Empty frame to hold the results: one row per percentile, one column per metric.
quantiles_data = pd.DataFrame(index=quantiles, columns=columns)

for i in columns:
    # Read the column named by the loop variable, so that adding another name
    # to `columns` really produces that column's percentiles (the loop used to
    # read 'viewing_time' on every iteration).
    quantile_values = np.percentile(df_tv[i], quantiles)
    quantiles_data[i] = quantile_values

quantiles_data

## Retention по регистрации и оформлению подписки

In [None]:
# Per-shard replicated storage for registration-based retention rows.
# IF NOT EXISTS keeps the cell re-runnable when the table already exists.
# {shard}/{replica} are ClickHouse macros — keep this a plain (non-f) string.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_retention_local ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                event_name String,
                reg_date Date,
                bonus_title String,
                retention_day Int32
            ) 
            --ENGINE = MergeTree()
            ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/datamarts.mekhtiev_retention_local', '{replica}')
            ORDER BY reg_date
'''
execute(query, user='kmekhtiev')

In [None]:
# Distributed table over mekhtiev_retention_local, sharded by rand().
# IF NOT EXISTS keeps the cell re-runnable when the table already exists.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_retention_distr ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                event_name String,
                reg_date Date,
                bonus_title String,
                retention_day Int32
             )
             ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_retention_local, rand())
             '''
execute(query,user = 'kmekhtiev')

In [23]:
# Full rebuild of the registration-based retention table: truncate the local
# table on the cluster, then insert events from start_date up to yesterday.
start_date = "2024-01-01"
end_date = datetime.now().date() - timedelta(days=1)

# Step 1: empty the local (per-shard) table.
query_delete = ''' TRUNCATE TABLE datamarts.mekhtiev_retention_local  ON CLUSTER 'viasat_cluster' '''
execute(query_delete,user = 'kmekhtiev')
print ('Таблица очищена')



# Step 2: one row per distinct (date, profile, event, reg_date, bonus_title);
# retention_day is the event date minus reg_date. GROUP BY over all six
# selected columns only de-duplicates — no aggregate is computed.
query_retention = f'''INSERT INTO datamarts.mekhtiev_retention_distr
                        SELECT
                        utc_timestamp::date AS date,
                        profile_id,
                        event_name,
                        reg_date,
                        bonus_title,
                        date - reg_date AS retention_day
                        FROM datamarts.sandbox_data_distr AS t1
                        LEFT JOIN product_x.users AS u ON t1.user_id=u.id
                        WHERE utc_timestamp::date BETWEEN '{start_date}' AND '{end_date}'
                        AND reg_date::date BETWEEN '2023-09-01' AND yesterday()
                        AND profile_id IS NOT NULL
                        GROUP BY 1,2,3,4,5,6
         '''

execute(query_retention,user = 'kmekhtiev')
print('Данные залиты')

Время выполнения функции: 6.1605 секунд

Таблица очищена
Время выполнения функции: 20.3887 секунд

Данные залиты


In [None]:
# Per-shard replicated storage for retention counted from created_date.
# IF NOT EXISTS keeps the cell re-runnable when the table already exists.
# {shard}/{replica} are ClickHouse macros — keep this a plain (non-f) string.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_retention_created_local ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                event_name String,
                created_date Date,
                bonus_title String,
                retention_day Int32
            ) 
            --ENGINE = MergeTree()
            ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/datamarts.mekhtiev_retention_created_local', '{replica}')
            ORDER BY created_date
'''
execute(query, user='kmekhtiev')

In [None]:
# Distributed table over mekhtiev_retention_created_local, sharded by rand().
# IF NOT EXISTS keeps the cell re-runnable when the table already exists.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_retention_created_distr ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                event_name String,
                created_date Date,
                bonus_title String,
                retention_day Int32
             )
             ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_retention_created_local, rand())
             '''
execute(query,user = 'kmekhtiev')

In [24]:
# Full rebuild of the created_at-based retention table: truncate the local
# table on the cluster, then insert events from start_date up to yesterday.
start_date = "2024-01-01"
end_date = datetime.now().date() - timedelta(days=1)

# Step 1: empty the local (per-shard) table.
query_delete = ''' TRUNCATE TABLE datamarts.mekhtiev_retention_created_local  ON CLUSTER 'viasat_cluster' '''
execute(query_delete,user = 'kmekhtiev')
print ('Таблица очищена')



# Step 2: one row per distinct combination of the six selected columns;
# retention_day is the event date minus created_at::date.
# NOTE(review): created_date is selected as raw created_at (no ::date cast)
# while the target column is Date — presumably converted on insert; confirm.
query_retention = f'''INSERT INTO datamarts.mekhtiev_retention_created_distr
                        SELECT
                        utc_timestamp::date AS date,
                        profile_id,
                        event_name,
                        created_at AS created_date,
                        bonus_title,
                        date - created_at::date AS retention_day
                        FROM datamarts.sandbox_data_distr AS t1
                        LEFT JOIN product_x.users AS u ON t1.user_id=u.id
                        WHERE utc_timestamp::date BETWEEN '{start_date}' AND '{end_date}'
                        AND created_at::date BETWEEN '2023-09-01' AND yesterday()
                        AND profile_id IS NOT NULL
                        GROUP BY 1,2,3,4,5,6
         '''

execute(query_retention,user = 'kmekhtiev')
print('Данные залиты')

Время выполнения функции: 6.9516 секунд

Таблица очищена
Время выполнения функции: 10.707 секунд

Данные залиты


## DAU

In [None]:
# There is no b2c_b2b column: it is derived from promo_type, and that field
# exists only for registered users.
# Per-shard replicated storage for the DAU datamart.
# IF NOT EXISTS keeps the cell re-runnable when the table already exists.
# {shard}/{replica} are ClickHouse macros — keep this a plain (non-f) string.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_dau_local ON CLUSTER 'viasat_cluster'
            (
                date Date,
                visitor_id UUID,
                profile_id UUID,
                reg_date Date,
                created_date Date,
                first_prolong_date Date,
                bonus_title String,
                client_type String,
                free_day String,
                user_type String
            ) 
            ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/datamarts.mekhtiev_dau_local', '{replica}')
            ORDER BY date
'''
execute(query, user='kmekhtiev')


In [None]:
# Distributed table over mekhtiev_dau_local, sharded by rand().
# IF NOT EXISTS keeps the cell re-runnable when the table already exists.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_dau_distr ON CLUSTER 'viasat_cluster'
            (
                date Date,
                visitor_id UUID,
                profile_id UUID,
                reg_date Date,
                created_date Date,
                first_prolong_date Date,
                bonus_title String,
                client_type String,
                free_day String,
                user_type String
             )
             ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_dau_local, rand())
             '''
execute(query,user = 'kmekhtiev')

In [25]:
# Rebuild the DAU datamart one day at a time: delete the day's rows from the
# local table, then insert the distinct visitors/profiles seen that day with
# a user_type derived from the prolongation / trial / registration dates.
# The loop variable is `day` (not `date`) so datetime.date imported at the top
# is not overwritten, and the SQL interpolates the formatted date_str rather
# than the Timestamp repr.
list_date = pd.date_range("2024-11-01", "2024-11-07", freq='D')
result=[]
for day in list_date:
    date_str = day.strftime('%Y-%m-%d')
    query_delete = f" DELETE FROM datamarts.mekhtiev_dau_local ON CLUSTER 'viasat_cluster' WHERE date = '{date_str}'::date"
    execute(query_delete,user = 'kmekhtiev')
    print(f" Дата {date_str} удалена из таблицы")
    # Columns are inserted by position: `free_days` lands in the table's
    # `free_day` column.
    query = f'''INSERT INTO datamarts.mekhtiev_dau_distr
                       SELECT
                                    date,
                                    visitor_id,
                                    profile_id,
                                    reg_date,
                                    created_date,
                                    first_prolong_date,
                                    bonus_title,
                                    client_type,
                                    free_days,
                                    CASE
                                        WHEN first_prolong_date!='1970-01-01' AND first_prolong_date<=date THEN 'subs'
                                        WHEN created_date!='1970-01-01' AND created_date<=date THEN 'trial'
                                        WHEN reg_date!='1970-01-01' AND reg_date<=date THEN 'reg'
                                        ELSE 'visitor'
                                    END AS user_type
                                    FROM
                                        (SELECT
                                        date,
                                        visitor_id,
                                        profile_id,
                                        bonus_title,
                                        client_type,
                                        free_days,
                                        profile_id,
                                        reg_date,
                                        promo_type,
                                        created_at::date AS created_date,
                                        first_prolong_date::date AS first_prolong_date
                                        FROM datamarts.sandbox_data_distr
                                        WHERE date='{date_str}'::date AND client_type !='backend'
                                        GROUP BY 1,2,3,4,5,6,7,8,9,10,11
                                        )

                                '''
    df_temp = execute(query,user = 'kmekhtiev')
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()

Время выполнения функции: 7.4601 секунд

 Дата 2024-11-01 удалена из таблицы
Время выполнения функции: 1.2601 секунд

Дата '2024-11-01' загружена

Время выполнения функции: 7.9809 секунд

 Дата 2024-11-02 удалена из таблицы
Время выполнения функции: 1.3448 секунд

Дата '2024-11-02' загружена

Время выполнения функции: 7.3019 секунд

 Дата 2024-11-03 удалена из таблицы
Время выполнения функции: 1.2988 секунд

Дата '2024-11-03' загружена

Время выполнения функции: 7.7545 секунд

 Дата 2024-11-04 удалена из таблицы
Время выполнения функции: 1.2331 секунд

Дата '2024-11-04' загружена

Время выполнения функции: 7.7163 секунд

 Дата 2024-11-05 удалена из таблицы
Время выполнения функции: 1.2921 секунд

Дата '2024-11-05' загружена

Время выполнения функции: 3.1243 секунд

 Дата 2024-11-06 удалена из таблицы
Время выполнения функции: 5.348 секунд

Дата '2024-11-06' загружена

Время выполнения функции: 3.6585 секунд

 Дата 2024-11-07 удалена из таблицы
Время выполнения функции: 1.2743 секунд

Д

## MAU

In [None]:
# Per-shard replicated storage for the MAU datamart.
# IF NOT EXISTS keeps the cell re-runnable when the table already exists.
# {shard}/{replica} are ClickHouse macros — keep this a plain (non-f) string.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_mau_local ON CLUSTER 'viasat_cluster'
            (
                date Date,
                visitor_id UUID,
                profile_id UUID,
                reg_date Date,
                created_date Date,
                first_prolong_date Date,
                bonus_title String,
                client_type String,
                free_day String,
                user_type String
            ) 
            ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/datamarts.mekhtiev_mau_local', '{replica}')
            ORDER BY date
'''
execute(query, user='kmekhtiev')


In [None]:
# Distributed table over mekhtiev_mau_local, sharded by rand().
# IF NOT EXISTS keeps the cell re-runnable when the table already exists.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_mau_distr ON CLUSTER 'viasat_cluster'
            (
                date Date,
                visitor_id UUID,
                profile_id UUID,
                reg_date Date,
                created_date Date,
                first_prolong_date Date,
                bonus_title String,
                client_type String,
                free_day String,
                user_type String
             )
             ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_mau_local, rand())
             '''
execute(query,user = 'kmekhtiev')

In [26]:
# Rebuild the rolling MAU datamart one anchor day at a time: delete the rows
# stored under that day, then insert every distinct visitor/profile seen in
# the window [day - 30 days, day], stamped with the anchor day as `date`.
# The loop variable is `day` (not `date`) so datetime.date imported at the top
# is not overwritten, and the SQL interpolates the formatted date_str rather
# than the Timestamp repr.
# NOTE(review): BETWEEN is inclusive on both ends, so the window spans 31
# calendar days — confirm that matches the intended MAU definition.
list_date = pd.date_range("2024-11-01", "2024-11-07", freq='D')
result=[]
for day in list_date:
    date_str = day.strftime('%Y-%m-%d')
    query_delete = f" DELETE FROM datamarts.mekhtiev_mau_local ON CLUSTER 'viasat_cluster' WHERE date = '{date_str}'::date"
    execute(query_delete,user = 'kmekhtiev')
    print(f" Дата {date_str} удалена из таблицы")
    # Columns are inserted by position: `free_days` lands in the table's
    # `free_day` column.
    query = f'''INSERT INTO datamarts.mekhtiev_mau_distr
                      SELECT
                            dt_month AS date,
                            visitor_id,
                            profile_id,
                            reg_date,
                            created_date,
                            first_prolong_date,
                            bonus_title,
                            client_type,
                            free_days,
                            CASE
                                WHEN first_prolong_date!='1970-01-01'  THEN 'subs'
                                WHEN created_date!='1970-01-01' THEN 'trial'
                                WHEN reg_date!='1970-01-01' THEN 'reg'
                                ELSE 'visitor'
                            END AS user_type
                            FROM
                                (SELECT
                                '{date_str}'::date as dt_month,
                                visitor_id,
                                profile_id,
                                client_type,
                                free_days,
                                profile_id,
                                reg_date,
                                bonus_title,
                                promo_type,
                                created_at::date AS created_date,
                                first_prolong_date::date AS first_prolong_date
                                FROM datamarts.sandbox_data_distr
                                WHERE date BETWEEN '{date_str}'::date - interval '30' DAY AND '{date_str}'::date 
                                AND client_type !='backend'
                                GROUP BY 1,2,3,4,5,6,7,8,9,10,11
                            )
                                '''
    df_temp = execute(query,user = 'kmekhtiev')
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()

Время выполнения функции: 8.6149 секунд

 Дата 2024-11-01 удалена из таблицы
Время выполнения функции: 6.5259 секунд

Дата '2024-11-01' загружена

Время выполнения функции: 3.5123 секунд

 Дата 2024-11-02 удалена из таблицы
Время выполнения функции: 6.5574 секунд

Дата '2024-11-02' загружена

Время выполнения функции: 3.9198 секунд

 Дата 2024-11-03 удалена из таблицы
Время выполнения функции: 6.2992 секунд

Дата '2024-11-03' загружена

Время выполнения функции: 3.6862 секунд

 Дата 2024-11-04 удалена из таблицы
Время выполнения функции: 6.1366 секунд

Дата '2024-11-04' загружена

Время выполнения функции: 4.2433 секунд

 Дата 2024-11-05 удалена из таблицы
Время выполнения функции: 5.1217 секунд

Дата '2024-11-05' загружена

Время выполнения функции: 4.8062 секунд

 Дата 2024-11-06 удалена из таблицы
Время выполнения функции: 4.6752 секунд

Дата '2024-11-06' загружена

Время выполнения функции: 9.3634 секунд

 Дата 2024-11-07 удалена из таблицы
Время выполнения функции: 2.1936 секунд



## Первый день захода в приложение 

In [None]:
# DDL for the local ReplicatedMergeTree table that stores first-visit data,
# created on every node of 'viasat_cluster' and ordered by min_date_visitor.
# The string is deliberately NOT an f-string: {shard} and {replica} are passed
# through to the server unchanged (presumably expanded there as macros — TODO confirm).
query = '''CREATE TABLE datamarts.mekhtiev_min_date_visitor_profile_local ON CLUSTER 'viasat_cluster'
            (
                min_date_visitor Date,
                min_date_profile Date,
                visitor_id UUID,
                client_type_general String,
                bonus_title String,
                user_id Nullable(UUID),
                profile_id Nullable(UUID),
                promo_type String,
                app_version String
                )
             ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/mekhtiev_min_date_visitor_profile_local', '{replica}')
             ORDER BY min_date_visitor
             '''
# One-off statement: re-running it against an existing table is expected to fail.
execute(query,user = 'kmekhtiev')

In [None]:
# DDL for the Distributed table in front of mekhtiev_min_date_visitor_profile_local.
# The column list mirrors the local table; rows are routed to shards by rand().
# The f-prefix has no placeholders to fill here.
query = f'''CREATE TABLE datamarts.mekhtiev_min_date_visitor_profile_distr ON CLUSTER 'viasat_cluster'
            (
                min_date_visitor Date,
                min_date_profile Date,
                visitor_id UUID,
                client_type_general String,
                bonus_title String,
                user_id Nullable(UUID),
                profile_id Nullable(UUID),
                promo_type String,
                app_version String
                )
             
             ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_min_date_visitor_profile_local, rand())
             '''
# One-off statement: re-running it against an existing table is expected to fail.
execute(query,user = 'kmekhtiev')

In [27]:
list_date = pd.date_range("2024-07-01", "2024-11-07", freq='D')
result = []
for date in list_date:
    date_str = date.strftime('%Y-%m-%d')
    query_delete = f" DELETE FROM datamarts.mekhtiev_min_date_visitor_profile_local ON CLUSTER 'viasat_cluster' WHERE min_date_visitor = '{date_str}'"
    execute(query_delete,user = 'kmekhtiev')
    print(f'Дата {date_str} удалена из таблицы')
    query = f'''
    INSERT INTO datamarts.mekhtiev_min_date_visitor_profile_distr
    SELECT
    min_date_visitor,
    min(min_date_visitor) OVER (PARTITION BY profile_id) AS min_date_profile,
    visitor_id,
    client_type2 AS client_type,
    bonus_title,
    user_id2 AS user_id,
    profile_id2 AS profile_id,
    promo_type2 AS promo_type,
    app_version2 AS app_version
    FROM
        (SELECT
        min(utc_timestamp::date) AS min_date_visitor,
        visitor_id,
        user_id,
        promo_type,
        profile_id,
        client_type,
        bonus_title,
        app_version,
        first_value(client_type) OVER (PARTITION BY visitor_id) AS client_type2,
        first_value(user_id) OVER (PARTITION BY visitor_id) AS user_id2,
        first_value(promo_type) OVER (PARTITION BY visitor_id) AS promo_type2,
        first_value(profile_id) OVER (PARTITION BY visitor_id) AS profile_id2,
        first_value(app_version) OVER (PARTITION BY visitor_id) AS app_version2
        FROM datamarts.sandbox_data_distr x
        LEFT JOIN product_x.users AS u ON x.user_id=u.id
        WHERE client_type!='backend'
        AND utc_timestamp::date>='2023-09-01'
        AND (u.email NOT LIKE '%@test%' OR u.email IS NULL)
        GROUP BY 2,3,4,5,6,7,8
        )
    WHERE min_date_visitor='{date_str}' 
    GROUP BY 1,3,4,5,6,7,8,9
                

            '''
    df_temp = execute(query,user = 'kmekhtiev')   
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()
    
df_registration = pd.concat(result)

Время выполнения функции: 8.3857 секунд

Дата 2024-07-01 удалена из таблицы
Время выполнения функции: 30.2234 секунд

Дата '2024-07-01' загружена

Время выполнения функции: 7.9971 секунд

Дата 2024-07-02 удалена из таблицы
Время выполнения функции: 33.99 секунд

Дата '2024-07-02' загружена

Время выполнения функции: 0.7369 секунд

Дата 2024-07-03 удалена из таблицы
Время выполнения функции: 14.211 секунд

Дата '2024-07-03' загружена

Время выполнения функции: 4.5489 секунд

Дата 2024-07-04 удалена из таблицы
Время выполнения функции: 14.1405 секунд

Дата '2024-07-04' загружена

Время выполнения функции: 8.9606 секунд

Дата 2024-07-05 удалена из таблицы
Время выполнения функции: 12.5729 секунд

Дата '2024-07-05' загружена

Время выполнения функции: 5.826 секунд

Дата 2024-07-06 удалена из таблицы
Время выполнения функции: 13.1734 секунд

Дата '2024-07-06' загружена

Время выполнения функции: 7.3816 секунд

Дата 2024-07-07 удалена из таблицы
Время выполнения функции: 12.6294 секунд

Дата

DatabaseException: Orig exception: Code: 241. DB::Exception: Memory limit (total) exceeded: would use 14.08 GiB (attempt to allocate chunk of 0 bytes), maximum: 14.06 GiB. OvercommitTracker decision: Memory overcommit has freed not enough memory.: While executing AggregatingTransform. (MEMORY_LIMIT_EXCEEDED) (version 23.3.1.2823 (official build))


# LTV

In [None]:
# Pull successful invoices per cohort, one query per month start from 2024-01-01 to 2024-09-01.
# A cohort is the set of users in datamarts.marketing_dash_distr whose first_prolong_date
# falls in the month starting at date_str; the literal date_str is returned as trial_month.
# Invoices are limited to paid_at between date_str and the hard-coded '2024-10-01'.
# payment sums price_cents/100 with hard-coded factors for USD (x90) and AMD (x0.25),
# presumably to express everything in rubles — TODO confirm.
list_date = pd.date_range("2024-01-01", "2024-09-01", freq='MS',normalize=True)
result = []
for date in list_date:
    date_str = date.strftime('%Y-%m-%d')
    query = f'''
            SELECT 
                '{date_str}' AS trial_month,
                paid_at::date AS paid_date,
                paid_at,
                date_trunc('month',paid_date) AS paid_month,
                s.user_id AS user_id,
                t2.free_days AS free_days,
                i.subscription_id AS subscription_id,
                s.created_at::date AS created_date,
                t2.first_prolong_date::date AS first_prolong_date,
                t2.reg_source AS reg_source,
                t2.reg_medium AS reg_medium,
                t2.bonus_title AS bonus_title,
                CASE WHEN t2.promo_type='cards' THEN 'b2b' ELSE 'b2c' END AS b2c_b2b,
                i.price_currency AS price_currency,
                sum(CASE WHEN i.price_currency='USD' THEN i.price_cents*90/100
                         WHEN i.price_currency='AMD' THEN i.price_cents*0.25/100
                         ELSE i.price_cents/100
                    END) AS payment
                FROM  product_x.invoices i
                LEFT JOIN product_x.subscriptions s ON s.id = i.subscription_id
                LEFT JOIN product_x.users u ON u.id = s.user_id
                INNER JOIN (SELECT 
                           user_id,
                           first_prolong_date::date AS first_prolong_date,
                           reg_source,
                           reg_medium,
                           bonus_title,
                           free_days,
                           promo_type
                           FROM datamarts.marketing_dash_distr
                           WHERE DATE_TRUNC('month', first_prolong_date)='{date_str}'
                           GROUP BY 1,2,3,4,5,6,7
                           ) AS t2 
                        ON s.user_id=t2.user_id
                WHERE u.user_type = 'regular'
                AND u.vipplay = FALSE
                AND s.state in ('normal_period','trial','canceled','grace_period')
                AND i.paid_at BETWEEN '{date_str}' AND '2024-10-01'
                AND i.state in ('success')
                AND ((u.email NOT ILIKE '%%@test%%' AND u.email NOT ILIKE '%%@viasat%%') OR (u.email IS NULL AND u.phone_number IS NOT NULL))
                AND i.price_cents > 100
                GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14
            '''
    df_temp = execute(query,user = 'kmekhtiev')   
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()

# Stack all cohorts and normalise dtypes for the pandas analysis in the following cells.
df = pd.concat(result)
df['paid_date'] = df['paid_date'].astype('datetime64[ns]')
df['paid_month'] = df['paid_month'].astype('datetime64[ns]')
df['trial_month'] = df['trial_month'].astype('datetime64[ns]')
# The cast to int32 discards any fractional part of the summed payment.
df['payment'] = df['payment'].astype('int32')
# Sorted in place by payment timestamp.
df.sort_values(by='paid_at',inplace=True)

In [None]:
# Overall LTV: revenue and row counts per (cohort month, payment month).
df_agg = (
    df.groupby(['trial_month', 'paid_month'])
    .agg({'payment': 'sum', 'user_id': 'count'})
    .reset_index()
)

# Running revenue inside each cohort, and the cohort "size" used as the denominator.
# NOTE(review): uniq_user is the largest monthly row count of user_id in the cohort,
# not a distinct-user count — confirm this is the intended denominator.
df_agg['cumulative'] = df_agg.groupby('trial_month')['payment'].cumsum()
df_agg['uniq_user'] = df_agg.groupby('trial_month')['user_id'].transform('max')

# LTV = cumulative revenue per cohort user.
df_agg['ltv'] = df_agg['cumulative'].div(df_agg['uniq_user'])

# Number the payment months inside each cohort (1 = first month) and keep the first six.
df_agg['num_of_month'] = (
    df_agg.groupby('trial_month')['paid_month']
    .rank(method='first')
    .astype('int')
)
df_agg = df_agg[df_agg['num_of_month'].lt(7)]

# Cohort x month-number table of LTV, with the cohort size appended as a column.
df_pivot = df_agg.pivot(index='trial_month', columns='num_of_month', values='ltv')
df_pivot['uniq_user'] = df_agg.groupby('trial_month')['uniq_user'].max()

# Inputs of the user-weighted average LTV.
weighted_ltv = pd.pivot(data=df_agg, index='trial_month', columns='num_of_month', values='ltv')
user_counts = pd.pivot(data=df_agg, index='trial_month', columns='num_of_month', values='uniq_user')

# Weighted average LTV per month number, rounded to whole units, as a two-column frame.
weighted_avg_ltv = (
    weighted_ltv.mul(user_counts).sum()
    .div(user_counts.sum())
    .round()
    .astype('int')
    .reset_index()
    .rename(columns={0: 'ltv'})
)

# Line chart of the monthly LTV curve with a value label above every point.
plt.figure(figsize=(20,8))
plt.plot(
    weighted_avg_ltv['num_of_month'],
    weighted_avg_ltv['ltv'],
    marker='o',
    linestyle='-',
    color='#005f80',
    markerfacecolor='white',
    alpha=0.8,
)

for month_no, ltv_value in zip(weighted_avg_ltv['num_of_month'], weighted_avg_ltv['ltv']):
    plt.text(month_no, ltv_value + 10, str(ltv_value), ha='center', fontsize=10, color='purple')

plt.title('Кумулятивная сумма на пользователя по месяцам (Flocktory)')
plt.xlabel('Месяцы')
plt.ylabel('Руб')
plt.grid(True,linewidth=0.4)
plt.xticks(rotation=45)
plt.show()

In [None]:
# DDL for the local ReplicatedMergeTree table holding the LTV curve
# (month number -> LTV), created on every node of 'viasat_cluster'.
# Deliberately not an f-string: {shard} and {replica} are sent to the server unchanged.
query = '''CREATE TABLE datamarts.mekhtiev_ltv_local ON CLUSTER 'viasat_cluster'
            (
                num_of_month Int32,
                ltv Int32
                )
             ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/mekhtiev_ltv_local', '{replica}')
             ORDER BY num_of_month
             '''
# One-off statement: re-running it against an existing table is expected to fail.
execute(query,user = 'kmekhtiev')

In [None]:
# DDL for the Distributed table in front of mekhtiev_ltv_local; rows are routed by rand().
# The f-prefix has no placeholders to fill here.
query = f'''CREATE TABLE datamarts.mekhtiev_ltv_distr ON CLUSTER 'viasat_cluster'
            (
                num_of_month Int32,
                ltv Int32
                )
             
             ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_ltv_local, rand())
             '''
# One-off statement: re-running it against an existing table is expected to fail.
execute(query,user = 'kmekhtiev')

In [None]:
# Publish the LTV curve: empty the local table on every cluster node, then load
# the current weighted_avg_ltv frame through the Distributed table.
# NOTE(review): several cells assign weighted_avg_ltv (one of them with a 'cumsum'
# column instead of 'ltv'); make sure the overall-LTV cell was the last one run.
execute(
    SQL=" TRUNCATE TABLE datamarts.mekhtiev_ltv_local ON CLUSTER 'viasat_cluster' ",
    user='kmekhtiev',
)
connection_clickhouse.insert_df('mekhtiev_ltv_distr', weighted_avg_ltv)

print("Данные залиты")

In [None]:
# Distinct free_days values present in the invoice extract.
pd.unique(df['free_days'])

In [None]:
# LTV split by trial length (free_days), restricted to the listed trial lengths.
segment_keys = ['trial_month', 'free_days']
df_agg = (
    df[df['free_days'].isin([3,14,30,35,45])]
    .groupby(['trial_month','paid_month','free_days'])
    .agg({'payment':'sum','user_id':'count'})
    .reset_index()
)

# Running revenue and the denominator, both per (cohort month, free_days) segment.
# NOTE(review): uniq_user is the largest monthly row count of user_id in the segment,
# not a distinct-user count — confirm this is the intended denominator.
df_agg['cumulative'] = df_agg.groupby(segment_keys)['payment'].cumsum()
df_agg['uniq_user'] = df_agg.groupby(segment_keys)['user_id'].transform('max')

# LTV = cumulative revenue per segment user.
df_agg['ltv'] = df_agg.cumulative/df_agg.uniq_user

# Number the payment months inside each segment and keep the first six.
df_agg['num_of_month'] = df_agg.groupby(segment_keys)['paid_month'].rank(method='first').astype('int')
df_agg = df_agg[df_agg['num_of_month']<7]

# Segment x month-number table of LTV, with the segment size appended as a column.
df_pivot = pd.pivot(data = df_agg,index = segment_keys,columns = 'num_of_month',values = 'ltv')
df_pivot['uniq_user'] = df_agg.groupby(segment_keys)['uniq_user'].max()

# Inputs of the user-weighted average. The index must include free_days: with
# index='trial_month' alone, every cohort holding more than one free_days value
# produces duplicate (trial_month, num_of_month) pairs and pivot() raises ValueError.
weighted_ltv = df_agg.pivot(index=segment_keys, columns='num_of_month', values='ltv')
user_counts = df_agg.pivot(index=segment_keys, columns='num_of_month', values='uniq_user')

# Weighted average LTV per month number across all segments, rounded to whole units.
weighted_avg_ltv = ((weighted_ltv * user_counts).sum() / user_counts.sum()).round().astype('int')


weighted_avg_ltv = weighted_avg_ltv.reset_index()
weighted_avg_ltv = weighted_avg_ltv.rename(columns={0:'cumsum'})


# Line chart of the monthly LTV curve with a value label above every point.
plt.figure(figsize=(20,8))
plt.plot(weighted_avg_ltv['num_of_month'], weighted_avg_ltv['cumsum'], marker='o', linestyle='-', color='#005f80',markerfacecolor='white',alpha=0.8)

for i in range(len(weighted_avg_ltv['num_of_month'])):
    plt.text(weighted_avg_ltv['num_of_month'][i], weighted_avg_ltv['cumsum'][i]+10,
             str(weighted_avg_ltv['cumsum'][i]),
             ha='center', fontsize=10,color='purple')
plt.xlabel('Месяцы')
plt.ylabel('Руб')
plt.title('Кумулятивная сумма на пользователя по месяцам (Flocktory)')
plt.grid(True,linewidth=0.4)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Per-segment LTV table (cohort month x free_days) used to inspect one trial length.
df_agg = df[df['free_days'].isin([3,14,30,35,45])].groupby(['trial_month','paid_month','free_days']).agg({'payment':'sum','user_id':'count'}).reset_index()

# Running revenue and the denominator per (cohort month, free_days) segment.
df_agg['cumulative'] = df_agg.groupby(['trial_month','free_days'])['payment'].cumsum()
df_agg['uniq_user'] = df_agg.groupby(['trial_month','free_days'])['user_id'].transform('max')

# LTV = cumulative revenue per segment user.
df_agg['ltv'] = df_agg.cumulative/df_agg.uniq_user


df_agg['num_of_month'] = df_agg.groupby(['trial_month','free_days'])['paid_month'].rank(method='first').astype('int')

# Whole calendar months between each payment month and the earliest payment month
# in the table. A timedelta cannot be divided by a MonthEnd offset (a month has no
# fixed length), so the offset is computed from the year and month components.
earliest_month = df_agg['paid_month'].min()
df_agg['diff'] = (
    (df_agg['paid_month'].dt.year - earliest_month.year) * 12
    + (df_agg['paid_month'].dt.month - earliest_month.month)
).astype(int)
df_agg = df_agg[df_agg['num_of_month']<7]  # keep the first six months only

df_agg[df_agg['free_days']==14]

In [None]:
 'paid_month': ['2024-02-01', '2024-03-01']  
}  
df = pd.DataFrame(data)  

# Преобразование строк в формат даты
df['paid_month'] = pd.to_datetime(df['paid_month'])  

# Добавление нового столбца, который содержит разницу в месяцах
df['difference'] = (df['paid_month'].shift(-1) - df['paid_month']).dt.days //30# Выбор только первой строки разницыresult = df['difference'].dropna().astype(int).iloc[0]  



In [None]:
# Monthly revenue and payment-row count for the b2c segment only.
(
    df.loc[df['b2c_b2b'] == 'b2c']
    .groupby(['paid_month'])
    .agg({'payment': 'sum', 'user_id': 'count'})
    .reset_index()
)

In [None]:
# Number of non-null user_id rows per payment month and b2c/b2b segment.
df['user_id'].groupby([df['paid_month'], df['b2c_b2b']]).count()

In [None]:
# LTV split by b2c/b2b segment.
segment_keys = ['trial_month', 'b2c_b2b']
df_agg = (
    df.groupby(['trial_month', 'paid_month', 'b2c_b2b'])
    .agg({'payment': 'sum', 'user_id': 'count'})
    .reset_index()
)

# Running revenue and the denominator per (cohort month, segment).
df_agg['cumulative'] = df_agg.groupby(segment_keys)['payment'].cumsum()
df_agg['uniq_user'] = df_agg.groupby(segment_keys)['user_id'].transform('max')

# LTV = cumulative revenue per segment user.
df_agg['ltv'] = df_agg['cumulative'].div(df_agg['uniq_user'])

# Number the payment months inside each segment and keep the first six.
df_agg['num_of_month'] = (
    df_agg.groupby(segment_keys)['paid_month']
    .rank(method='first')
    .astype('int')
)
df_agg = df_agg[df_agg['num_of_month'].lt(7)]

# Segment x month-number table of LTV, with the segment size appended as a column.
df_pivot = df_agg.pivot(index=segment_keys, columns='num_of_month', values='ltv')
df_pivot['uniq_user'] = df_agg.groupby(segment_keys)['uniq_user'].max()

df_pivot

In [None]:
# Display the most recently built pivot (df_pivot is reassigned by several cells).
df_pivot

In [None]:
# Spot check: all rows of one specific user inside the April 2024 cohort.
df.loc[
    (df['trial_month'] == '2024-04-01')
    & (df['user_id'] == '583943d4-7048-4c4b-91ca-6edfb27f492b')
]

In [None]:
# Running payment total per user (in the current row order of df) and the number
# of distinct users in each cohort month, stored as new columns on df in place.
df['cumsum'] = df['payment'].groupby(df['user_id']).cumsum()
df['cnt_user'] = (
    df.groupby('trial_month')['user_id']
    .transform('nunique')
    .astype('int32')
)

In [None]:
# Distinct users per cohort month.
df['user_id'].groupby(df['trial_month']).nunique()

In [None]:
# Sum of the per-user running totals for every (cohort month, payment month) pair.
df['cumsum'].groupby([df['trial_month'], df['paid_month']]).sum()

In [None]:
# Same aggregation as the previous cell (the cell is repeated in the notebook).
df['cumsum'].groupby([df['trial_month'], df['paid_month']]).sum()

In [None]:
# For every day in the range, count distinct profiles per trial_duration whose
# created_at date lies in the 7-day window [day - 6, day] and whose
# first_prolong_date differs from '1970-01-01' (presumably the "no payment yet"
# placeholder — TODO confirm). One query per day.
list_date = pd.date_range("2024-06-01", "2024-07-22", freq='D')
result = []
for date in list_date:
    date_str = date.strftime('%Y-%m-%d')
    query = f'''
                        SELECT 
                        '{date_str}' AS date,
                        trial_duration,
                        count(DISTINCT profile_id) as cnt_user
                        FROM datamarts.marketing_dash_distr
                        WHERE created_at::date BETWEEN '{date_str}'::date - interval '6' DAY AND '{date_str}'::date
                        AND first_prolong_date!='1970-01-01'
                        GROUP BY 1,2
                        '''
    df_temp = execute(query,user = 'kmekhtiev')   
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()
    
# NOTE(review): df_registration is also assigned by other cells with a different schema.
df_registration = pd.concat(result)       

In [None]:
# Wide view: one row per date, one column per trial length (0, 3 and 14 days only).
t1 = df_registration[df_registration.trial_duration.isin([0,3,14])].pivot(
    index='date',
    columns='trial_duration',
)
t1

In [None]:
# For every day in the range, count distinct profiles per trial_duration whose
# first_prolong_date lies in the 7-day window [day - 6, day]. One query per day.
list_date = pd.date_range("2024-06-01", "2024-07-22", freq='D')
result = []
for date in list_date:
    date_str = date.strftime('%Y-%m-%d')
    query = f'''
                        SELECT 
                        '{date_str}' AS date,
                        trial_duration,
                        count(DISTINCT profile_id) as cnt_user
                        FROM datamarts.marketing_dash_distr
                        WHERE first_prolong_date::date BETWEEN '{date_str}'::date - interval '6' DAY AND '{date_str}'::date
                        GROUP BY 1,2
                        '''
    df_temp = execute(query,user = 'kmekhtiev')   
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()
    
# All daily results stacked into one long frame.
df_subs = pd.concat(result)       

In [None]:
# Wide view: one row per date, one column per trial length (0, 3, 14 and 35 days only).
t2 = df_subs[df_subs.trial_duration.isin([0,3,14,35])].pivot(
    index='date',
    columns='trial_duration',
)
t2

In [None]:
# For every day in the range, list the profiles whose first_prolong_date lies in
# the 7-day window [day - 6, day], together with their trial start date, so the
# delay between trial start and first payment can be analysed. One query per day.
list_date = pd.date_range("2024-01-01", "2024-07-22", freq='D')
result = []
for date in list_date:
    date_str = date.strftime('%Y-%m-%d')
    query = f'''
                        SELECT 
                        '{date_str}' AS date,
                        trial_duration,
                        created_at::date AS created_at,
                        first_prolong_date::date AS first_prolong_date,
                        profile_id
                        FROM datamarts.marketing_dash_distr
                        WHERE first_prolong_date::date BETWEEN '{date_str}'::date - interval '6' DAY AND '{date_str}'::date
                        GROUP BY 1,2,3,4,5
                        '''
    df_temp = execute(query,user = 'kmekhtiev')   
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()
    
df_subs_dt = pd.concat(result) 
df_subs_dt['first_prolong_date'] = df_subs_dt['first_prolong_date'].astype('datetime64[ns]')
df_subs_dt['created_at'] = df_subs_dt['created_at'].astype('datetime64[ns]')
# dt = whole days between the trial start (created_at) and the first payment.
df_subs_dt['dt'] = (df_subs_dt.first_prolong_date - df_subs_dt.created_at).dt.days

In [None]:
# Distinct profiles per delay (days from trial start to first payment) for 3-day
# trials, limited to delays below 50 days.
three_day_trials = df_subs_dt['trial_duration'] == 3
df_subs_dt_agg = (
    df_subs_dt[three_day_trials]
    .groupby(['dt'])['profile_id']
    .nunique()
    .reset_index()
)
df_subs_dt_agg = df_subs_dt_agg[df_subs_dt_agg['dt'].lt(50)]

# Bar chart of the delay distribution.
plt.figure(figsize=(15, 8))
plt.bar(df_subs_dt_agg['dt'], df_subs_dt_agg['profile_id'], color='skyblue')
plt.xlabel('Число дней между началом триала и первым списанием')
plt.ylabel('Число пользователей')
plt.xticks(rotation=45)
plt.grid()
plt.show()

In [None]:
# Share of 3-day-trial subscribers who paid after exactly 3 days versus exactly
# 45 days, per report date, drawn as a stacked bar chart.
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter, MultipleLocator
# NOTE(review): this rebinds `df`, which earlier cells use for the invoice extract,
# and `df_pivot` below; re-running earlier LTV cells afterwards will see this frame.
df = (df_subs_dt[(df_subs_dt.dt.isin([3,45])) & (df_subs_dt.trial_duration==3)]
        .groupby(['date','dt'])['profile_id']
        .nunique()
        .reset_index()
        )

# Per-date total and each delay's fraction of it.
df['sum'] = df.groupby('date')['profile_id'].transform('sum')
df['frac'] = df['profile_id']/df['sum']

df_pivot = df.pivot(index='date', columns='dt', values='frac').fillna(0)

# Draw the stacked bar chart
colors = ['#2E8B57', '#98FB98']  # Dark green and light green
ax = df_pivot.plot(kind='bar', stacked=True, figsize=(15, 8),color=colors)
# Y axis: a tick every 0.1, rendered as a whole-number percentage.
ax.yaxis.set_major_locator(MultipleLocator(0.1))
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f'{int(y * 100)}%'))
# NOTE(review): `date` comes from SQL as a string and pandas bar plots presumably use
# categorical positions, so a date-based WeekdayLocator may not pick weekly ticks — verify.
ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=1))
plt.legend(title='dt', loc='upper right')
plt.xticks(rotation=45)  # Rotate the date labels for readability
plt.show()

# Watchtime общий 

In [None]:
def watchtime(df, client_type_general=None):
    """Aggregate viewing metrics per date, optionally split by platform.

    Parameters
    ----------
    df : DataFrame with columns date, profile_id, watchtime, active_day,
        watch_day, and client_type_general when the split is requested.
    client_type_general : any truthy value switches on the additional grouping
        by the 'client_type_general' column; the value itself is not used.

    Returns
    -------
    DataFrame with the grouping column(s), summed watchtime / active_day /
    watch_day, cnt_all_user (distinct profiles) and cnt_watch_user (distinct
    profiles with non-zero watchtime; NaN when a group has none).
    """
    group_keys = ['date', 'client_type_general'] if client_type_general else ['date']

    # Totals over every profile in the group.
    totals = (
        df.groupby(group_keys)
        .agg({'watchtime': 'sum', 'profile_id': 'nunique', 'active_day': 'sum', 'watch_day': 'sum'})
        .reset_index()
        .rename(columns={'profile_id': 'cnt_all_user'})
    )

    # Distinct profiles that actually watched something.
    watchers = (
        df[df['watchtime'] != 0]
        .groupby(group_keys)['profile_id']
        .nunique()
        .rename('cnt_watch_user')
    )

    # Left merge keeps groups without any watcher.
    return pd.merge(totals, watchers, on=group_keys, how='left')

In [None]:
# Total watch time per date; the division by 3600 assumes watchtime is stored
# in seconds — TODO confirm.
data = watchtime(df)
hours_watched = data['watchtime'].div(3600)

plt.figure(figsize=(20, 8))  # figure size
plt.plot(data['date'], hours_watched, marker='o', markerfacecolor='white', alpha=0.8)  # line with point markers
plt.title('Watchtime over 7 days')
plt.xlabel('Date')
plt.ylabel('Watchtime (hours)')
plt.grid()
plt.xticks(rotation=45)  # rotate the X labels for readability
plt.show()

In [None]:
# Total watch time per date, one line per client_type_general value.
data = watchtime(df, 'client_type_general')
grouped_data = data.groupby('client_type_general')

plt.figure(figsize=(20, 10))  # figure size

# One line per platform group.
for client_type, platform_frame in grouped_data:
    plt.plot(
        platform_frame['date'],
        platform_frame['watchtime'].div(3600),
        label=client_type,
        marker='o',
        alpha=0.8,
        markerfacecolor='white',
    )

plt.title('Watchtime over 7 days')
plt.xlabel('Date')
plt.ylabel('Watchtime (hours)')
plt.legend()
plt.grid()
plt.xticks(rotation=45)  # rotate the X labels for readability
plt.show()

# Watchtime на пользователя

In [None]:
# Average watch time per user (all users of the date, watching or not), in hours.
data = watchtime(df)
data['watchtime_per_user'] = data['watchtime'].div(data['cnt_all_user'])

plt.figure(figsize=(20, 8))  # figure size
plt.plot(
    data['date'],
    data['watchtime_per_user'].div(3600),
    marker='o',
    markerfacecolor='white',
    alpha=0.8,
)  # line with point markers
plt.title('Watchtime per user over 7 days')
plt.ylabel('Watchtime per user (hour)')
plt.grid()
plt.xticks(rotation=45)  # rotate the X labels for readability
plt.show()

In [None]:
# Average watch time per user in hours, one line per client_type_general value.
data = watchtime(df, 'client_type_general')
data['watchtime_per_user'] = data['watchtime'].div(data['cnt_all_user'])
grouped_data = data.groupby('client_type_general')

plt.figure(figsize=(20, 10))  # figure size

# One line per platform group.
for client_type, platform_frame in grouped_data:
    plt.plot(
        platform_frame['date'],
        platform_frame['watchtime_per_user'].div(3600),
        label=client_type,
        marker='o',
        alpha=0.8,
        markerfacecolor='white',
    )

plt.title('Watchtime per user over 7 days')
plt.ylabel('Watchtime per user (hour)')
plt.legend()
plt.grid()
plt.xticks(rotation=45)  # rotate the X labels for readability
plt.show()

# Число пользователей 

In [None]:
# Distinct users per date.
data = watchtime(df)

plt.figure(figsize=(20, 8))  # figure size
plt.plot(
    data['date'],
    data['cnt_all_user'],
    marker='o',
    markerfacecolor='white',
    alpha=0.8,
)  # line with point markers
plt.title('Число пользователей')
plt.xlabel('Date')
plt.ylabel('Users')
plt.grid()
plt.xticks(rotation=45)  # rotate the X labels for readability
plt.show()

In [None]:
# Distinct users per date, one line per client_type_general value.
data = watchtime(df, 'client_type_general')
grouped_data = data.groupby('client_type_general')

plt.figure(figsize=(20, 10))  # figure size

# One line per platform group.
for client_type, platform_frame in grouped_data:
    plt.plot(
        platform_frame['date'],
        platform_frame['cnt_all_user'],
        label=client_type,
        marker='o',
        alpha=0.8,
        markerfacecolor='white',
    )

plt.title('Число пользователей')
plt.ylabel('Users')
plt.legend()
plt.grid()
plt.xticks(rotation=45)  # rotate the X labels for readability
plt.show()

# Число активных дней 

In [None]:
# Average number of active days per user, per date.
data = watchtime(df)
data['active_day_per_user'] = data['active_day'].div(data['cnt_all_user'])

plt.figure(figsize=(20, 8))  # figure size
plt.plot(
    data['date'],
    data['active_day_per_user'],
    marker='o',
    markerfacecolor='white',
    alpha=0.8,
)  # line with point markers
plt.title('Число активных дней на пользователя')
plt.ylabel('Число дней')
plt.grid()
plt.xticks(rotation=45)  # rotate the X labels for readability
plt.show()

In [None]:
# Average number of active days per user, one line per client_type_general value.
data = watchtime(df,'client_type_general')
# Add the derived column BEFORE building the groupby object, as the sibling
# per-platform cells do, so the groups are guaranteed to contain it instead of
# relying on the groupby object observing a later in-place mutation of `data`.
data['active_day_per_user'] = data['active_day']/data['cnt_all_user']
grouped_data = data.groupby('client_type_general')

plt.figure(figsize=(20, 10))  # figure size

# One line per platform group.
for client_type, platform_frame in grouped_data:
    plt.plot(platform_frame['date'], platform_frame['active_day_per_user'], label=client_type,marker='o',alpha=0.8,markerfacecolor='white')

plt.xlabel('Date')
plt.ylabel('Число дней')
plt.title('Число активных дней на пользователя')
plt.legend()
plt.grid()
plt.xticks(rotation=45)  # rotate the X labels for readability
plt.show()

# Конверсия из активного дня в день с просмотром

In [None]:
# Share of active days that were also watch days, per date, in percent.
# The mticker alias imported here is also used by later plotting cells.
import matplotlib.ticker as mticker
data = watchtime(df)
data['watch_day_%'] = data['watch_day'].div(data['active_day']).mul(100)

plt.figure(figsize=(20, 8))  # figure size
plt.plot(
    data['date'],
    data['watch_day_%'],
    marker='o',
    markerfacecolor='white',
    alpha=0.8,
)  # line with point markers
plt.title('Конверсия из активного дня в дни с просмотром')
plt.ylabel('%')
plt.grid()
plt.xticks(rotation=45)  # rotate the X labels for readability
plt.gca().yaxis.set_major_formatter(mticker.PercentFormatter())
plt.show()

In [None]:
# Share of active days that were also watch days, in percent, one line per
# client_type_general value. Relies on the `mticker` alias imported in the
# preceding cell.
data = watchtime(df,'client_type_general')
data['watch_day_%'] = data['watch_day']/data['active_day'] * 100
grouped_data = data.groupby('client_type_general')


plt.figure(figsize=(20, 10))  # figure size

# One line per platform group.
for client_type, platform_frame in grouped_data:
    plt.plot(platform_frame['date'], platform_frame['watch_day_%'], label=client_type,marker='o',alpha=0.8,markerfacecolor='white')

plt.xlabel('Date')
plt.ylabel('%')
plt.title('Конверсия из активного дня в дни с просмотром')
plt.legend()
plt.grid()
plt.xticks(rotation=45)  # rotate the X labels for readability (was called twice)
plt.gca().yaxis.set_major_formatter(mticker.PercentFormatter())
plt.show()

# Watchtime на смотрящих пользователей

In [None]:
# Average watch time per watching user (profiles with non-zero watchtime), in hours.
data = watchtime(df)
data['watchtime_per_user'] = data['watchtime'].div(data['cnt_watch_user'])

plt.figure(figsize=(20, 8))  # figure size
plt.plot(
    data['date'],
    data['watchtime_per_user'].div(3600),
    marker='o',
    markerfacecolor='white',
    alpha=0.8,
)  # line with point markers
plt.title('Watchtime на смотрящего пользователя за 7 дней')
plt.xlabel('Date')
plt.ylabel('Watchtime per user (hour)')
plt.grid()
plt.xticks(rotation=45)  # rotate the X labels for readability
plt.show()

In [None]:
# Average watch time per watching user in hours, one line per client_type_general value.
data = watchtime(df, 'client_type_general')
data['watchtime_per_user'] = data['watchtime'].div(data['cnt_watch_user'])
grouped_data = data.groupby('client_type_general')

plt.figure(figsize=(20, 10))  # figure size

# One line per platform group.
for client_type, platform_frame in grouped_data:
    plt.plot(
        platform_frame['date'],
        platform_frame['watchtime_per_user'].div(3600),
        label=client_type,
        marker='o',
        alpha=0.8,
        markerfacecolor='white',
    )

plt.title('Watchtime на смотрящего пользователя за 7 дней')
plt.ylabel('hour')
plt.legend()
plt.grid()
plt.xticks(rotation=45)  # rotate the X labels for readability
plt.show()

In [None]:
# Retention curves: every column of df_retention from the 8th onward is plotted
# against reg_date as a percentage. Relies on `df_retention` and the `mticker`
# alias being defined by other cells.
plt.figure(figsize=(20, 10))  # figure size

# One line per retention column.
for col in df_retention.iloc[:,7:]:
    # markerfacecolor must be a valid colour name; the original literal contained
    # a stray '†' character, which matplotlib rejects.
    plt.plot(df_retention['reg_date'], df_retention[col]*100, label=col,marker='o',alpha=0.8,markerfacecolor='white')

plt.xlabel('Date')
plt.ylabel('%')
plt.title('Retention')
plt.legend()
plt.grid()
plt.xticks(rotation=45)  # rotate the X labels for readability
plt.gca().yaxis.set_major_formatter(mticker.PercentFormatter())
plt.show()

# Новые пользователи и конверсии в регистрацию/триал/подписку

In [None]:
# Distinct visitors per date among rows with fewer than two profiles.
(
    df_registration.loc[df_registration.cnt_profile < 2]
    .groupby('date')['visitor_id']
    .nunique()
    .reset_index()
)

In [None]:
# Distinct registered profiles per date_reg: registration date is set, is not
# earlier than the first visit (min_date), and the row has fewer than two profiles.
(
    df_registration.loc[
        (df_registration.reg_date != '1970-01-01')
        & (df_registration.min_date <= df_registration.reg_date)
        & (df_registration.cnt_profile < 2)
    ]
    .groupby('date_reg')['profile_id']
    .nunique()
    .reset_index()
)

In [None]:
# Distinct profiles with a trial per date: created_date is set, is not earlier
# than the first visit (min_date), and the row has fewer than two profiles.
(
    df_registration.loc[
        (df_registration.created_date != '1970-01-01')
        & (df_registration.min_date <= df_registration.created_date)
        & (df_registration.cnt_profile < 2)
    ]
    .groupby('date')['profile_id']
    .nunique()
    .reset_index()
)

In [None]:
# Distinct paying profiles per date_reg and trial length: both first_prolong_date
# and reg_date are set, and the row has fewer than two profiles.
(
    df_registration.loc[
        (df_registration.first_prolong_date != '1970-01-01')
        & (df_registration.reg_date != '1970-01-01')
        & (df_registration.cnt_profile < 2)
    ]
    .groupby(['date_reg', 'trial_duration'])['profile_id']
    .nunique()
    .reset_index()
)

In [None]:
# Daily funnel table: new visitors, registrations, trials and paying users per
# trial length (0 / 3 / 14 / 35 days), one row per date.
df_registration_agg = df_registration.groupby('date')['visitor_id'].nunique().reset_index()

# Row filters shared by the funnel steps ('1970-01-01' marks a date that is not set).
single_profile = df_registration.cnt_profile < 2
registered = df_registration.reg_date != '1970-01-01'
trial_started = df_registration.created_date != '1970-01-01'
paid_and_registered = (df_registration.first_prolong_date != '1970-01-01') & registered

t1 = (
    df_registration[registered & single_profile]
    .groupby('date')['profile_id'].nunique().reset_index()
    .rename(columns={'profile_id': 'registration_cnt'})
)

t2 = (
    df_registration[trial_started & single_profile]
    .groupby('date')['profile_id'].nunique().reset_index()
    .rename(columns={'profile_id': 'trial_cnt'})
)

t3 = (
    df_registration[paid_and_registered & single_profile & (df_registration.trial_duration == 0)]
    .groupby('date')['profile_id'].nunique().reset_index()
    .rename(columns={'profile_id': 'payd_cnt_0_trial'})
)

t4 = (
    df_registration[paid_and_registered & single_profile & (df_registration.trial_duration == 3)]
    .groupby('date')['profile_id'].nunique().reset_index()
    .rename(columns={'profile_id': 'payd_cnt_3_trial'})
)

t5 = (
    df_registration[paid_and_registered & single_profile & (df_registration.trial_duration == 14)]
    .groupby('date')['profile_id'].nunique().reset_index()
    .rename(columns={'profile_id': 'payd_cnt_14_trial'})
)

t6 = (
    df_registration[paid_and_registered & single_profile & (df_registration.trial_duration == 35)]
    .groupby('date')['profile_id'].nunique().reset_index()
    .rename(columns={'profile_id': 'payd_cnt_35_trial'})
)

# Attach every funnel step to the visitor counts; dates missing in a step stay NaN.
for funnel_step in (t1, t2, t3, t4, t5, t6):
    df_registration_agg = df_registration_agg.merge(funnel_step, on='date', how='left')

df_registration_agg.columns = ['date', 'visitor_id', 'registration_cnt','trial_cnt','payd_cnt_0_trial','payd_cnt_3_trial','payd_cnt_14_trial','payd_cnt_35_trial']

In [None]:
# Distinct paying profiles per trial length (first_prolong_date is set).
(
    df_registration.loc[df_registration.first_prolong_date != '1970-01-01']
    .groupby('trial_duration')['profile_id']
    .nunique()
)

In [None]:
# Step-to-step conversion rates in percent: target column -> (numerator, denominator).
conversion_spec = {
    'register_frac': ('registration_cnt', 'visitor_id'),
    'trial_frac': ('trial_cnt', 'registration_cnt'),
    'payd_0_day_frac': ('payd_cnt_0_trial', 'trial_cnt'),
    'payd_3_day_frac': ('payd_cnt_3_trial', 'trial_cnt'),
    'payd_14_day_frac': ('payd_cnt_14_trial', 'trial_cnt'),
}
for frac_col, (numerator_col, denominator_col) in conversion_spec.items():
    df_registration_agg[frac_col] = (
        df_registration_agg[numerator_col] / df_registration_agg[denominator_col] * 100
    )



In [None]:
# Display the daily funnel table together with the conversion-rate columns.
df_registration_agg

In [None]:
import copy  # NOTE(review): not used in this cell; kept in case another cell relies on the name

# Daily funnel in absolute numbers: six vertically stacked panels, one per metric.
fig, axs = plt.subplots(6, figsize=(20, 25))

# Both cut-offs are derived from one timestamp so the two windows are consistent.
counts_plot_ts = datetime.now()

# The 3-day-trial series drops the last 4 days and the 14-day-trial series the
# last 15 days -- presumably because those cohorts have not finished their trial
# yet (TODO confirm). df_plot4 / df_plot5 stay notebook-level names because
# other cells may read them.
df_plot4 = df_registration_agg[df_registration_agg['date'] <= (counts_plot_ts - timedelta(days=4))]
df_plot5 = df_plot4[df_plot4['date'] <= (counts_plot_ts - timedelta(days=15))]

# One entry per panel, top to bottom: (frame, column, line colour, title).
count_panels = [
    (df_registration_agg, 'visitor_id', 'blue', 'Новые пользователи'),
    (df_registration_agg, 'registration_cnt', 'green', 'Зарегистрированные'),
    (df_registration_agg, 'trial_cnt', 'red', 'Триал'),
    (df_registration_agg, 'payd_cnt_0_trial', 'brown', 'Подписчики c триалом 0 дней'),
    (df_plot4, 'payd_cnt_3_trial', 'brown', 'Подписчики с триалом 3 дня'),
    (df_plot5, 'payd_cnt_14_trial', 'brown', 'Подписчики с триалом 14 дней'),
]
# NOTE(review): payd_cnt_35_trial has no panel in this figure.

for panel_ax, (panel_df, panel_col, panel_color, panel_title) in zip(axs, count_panels):
    panel_ax.plot(panel_df['date'], panel_df[panel_col], color=panel_color, marker='s', alpha=0.7, markerfacecolor='white')
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('Число пользователей')
    panel_ax.tick_params(axis='x', rotation=45)
    panel_ax.grid()

# Increase the spacing between the panels.
# NOTE(review): tight_layout() below recomputes the subplot spacing, so this
# hspace value most likely has no visible effect -- confirm before relying on it.
plt.subplots_adjust(hspace=0.8)

plt.tight_layout()
plt.show()

In [None]:
import copy  # NOTE(review): not used in this cell; kept in case another cell relies on the name

# Daily funnel as conversion rates (percent): five vertically stacked panels.
fig, axs = plt.subplots(5, figsize=(20, 25))

# The date windows are built here from df_registration_agg, so the cell does not
# depend on filtered frames left behind by an earlier cell.
shares_plot_ts = datetime.now()
shares_upto_4d = df_registration_agg[df_registration_agg['date'] <= (shares_plot_ts - timedelta(days=4))]
shares_upto_15d = df_registration_agg[df_registration_agg['date'] <= (shares_plot_ts - timedelta(days=15))]

# One entry per panel, top to bottom: (frame, column, line colour, title).
# Each paid-share series uses the same window as its absolute-count series:
# 0-day trial -> all dates, 3-day trial -> last 4 days dropped,
# 14-day trial -> last 15 days dropped. Previously the 0-day and 3-day shares
# were each drawn from the next, narrower window.
share_panels = [
    (df_registration_agg, 'register_frac', 'green', 'Доля регистраций'),
    (df_registration_agg, 'trial_frac', 'red', 'Доля триальщиков'),
    (df_registration_agg, 'payd_0_day_frac', 'brown', 'Подписчики с триалом 0 дней'),
    (shares_upto_4d, 'payd_3_day_frac', 'brown', 'Подписчики с триалом 3 дня'),
    (shares_upto_15d, 'payd_14_day_frac', 'brown', 'Подписчики c триалом 14 дней'),
]

for panel_ax, (panel_df, panel_col, panel_color, panel_title) in zip(axs, share_panels):
    panel_ax.plot(panel_df['date'], panel_df[panel_col], color=panel_color, marker='s', alpha=0.7, markerfacecolor='white')
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel('%')
    panel_ax.tick_params(axis='x', rotation=45)
    panel_ax.grid()

# Increase the spacing between the panels.
# NOTE(review): tight_layout() below recomputes the subplot spacing, so this
# hspace value most likely has no visible effect -- confirm before relying on it.
plt.subplots_adjust(hspace=0.8)

plt.tight_layout()
plt.show()

In [None]:
# One-off export: every column of datamarts.marketing_dash_distr for rows with
# payer = 1, a bonus_title containing 'GetBlogger', and first_prolong_date
# between '2024-07-09' and '2024-08-14'.
# NOTE(review): if first_prolong_date carries a time component, the upper bound
# may exclude most of 2024-08-14 -- confirm the column type.
query ="""
                SELECT  *
                    FROM datamarts.marketing_dash_distr
                    WHERE  first_prolong_date between '2024-07-09' and '2024-08-14' 
                    AND  bonus_title like '%GetBlogger%'
                    AND payer = 1 
"""
# Run the query through the notebook's execute() helper for the given user;
# the result is presumably a DataFrame (the helper's return is not visible here).
df = execute(query,user='kmekhtiev')

# Write the result to an Excel file in the working directory, without the index column.
df.to_excel('getbloggers_payers_0709_0814.xlsx',index=False)

In [None]:
# One-off export: user_id and the bonus/promo/registration-attribution columns
# from datamarts.marketing_dash_distr for rows with device = 'web' and
# reg_date on or after '2024-08-10' (no upper bound).
query ="""
SELECT 
user_id, 
bonus_title, 
promo, 
reg_source,
reg_campaign, 
reg_medium,device
FROM datamarts.marketing_dash_distr mdd 
WHERE device='web' AND reg_date>='2024-08-10'
"""
# Run the query through the notebook's execute() helper for the given user;
# the result is presumably a DataFrame (the helper's return is not visible here).
df = execute(query,user='kmekhtiev')

# Write the result to an Excel file in the working directory, without the index column.
# NOTE(review): the file name suggests a 12.08-15.08 range, while the query
# filters reg_date >= '2024-08-10' -- confirm which one is intended.
df.to_excel('Выгрузка_зарегов1508_1208.xlsx',index=False)

In [None]:
# One-off export: every column of datamarts.marketing_dash_distr for rows with
# payer = 1 and first_prolong_date between '2024-08-01' and '2024-08-15'.
# The bonus_title filter is commented out inside the SQL text, so all bonus
# titles are included.
# NOTE(review): if first_prolong_date carries a time component, the upper bound
# may exclude most of 2024-08-15 -- confirm the column type.
query = """
                    SELECT  *
                    FROM datamarts.marketing_dash_distr
                    WHERE  first_prolong_date between '2024-08-01' and '2024-08-15' 
                    --AND  bonus_title like '%GetBlogger%'
                    AND payer = 1 
                """

# Run the query through the notebook's execute() helper for the given user;
# the result is presumably a DataFrame (the helper's return is not visible here).
df = execute(query,user='kmekhtiev')
# Write the result to an Excel file in the working directory, without the index column.
df.to_excel('first_pay_0108_1508.xlsx',index=False)