In [8]:
import pandas as pd
import psycopg2
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import seaborn as sns 
import statsmodels.stats.proportion as proportion
from scipy.stats import ttest_ind,mannwhitneyu,shapiro,norm
from statsmodels.stats.weightstats import ztest
from tqdm import tqdm
import timeit
from scipy import stats
import math
from datetime import date, datetime, timedelta
import time
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
import warnings
warnings.filterwarnings("ignore")
import clickhouse_connect  



from credential import postgres_secret,clickhouse_dwh_secret

def get_engine(user):
    if user == postgres_secret['user']:
        db_name = postgres_secret['db_name']
        password = postgres_secret['password']
        host = postgres_secret['host']
        engine = create_engine(f'postgresql://{user}:{password}@{host}:6432/{db_name}')
    elif user == clickhouse_dwh_secret['user']:
            db_name = clickhouse_dwh_secret['db_name'] 
            password = clickhouse_dwh_secret['password']
            host = clickhouse_dwh_secret['host']
            engine = create_engine(f'clickhouse://{user}:{password}@{host}:8123/{db_name}')
    return engine
    
connection_clickhouse = clickhouse_connect.get_client(
    host = clickhouse_dwh_secret['host'],
    port= '8123',
    username = clickhouse_dwh_secret['user'],
    password = clickhouse_dwh_secret['password'],
    database='datamarts'
    )

    
def execute(SQL, user):
    start_time = time.time()  # запоминаем время начала выполнения функции
    engine = get_engine(user)
    Session = sessionmaker(bind=engine)  # sessions factory ()
    with Session() as session: # open session
        result = session.execute(text(SQL))
        df = pd.DataFrame(result.fetchall(), columns=result.keys())
        
    end_time = time.time()  # запоминаем время окончания выполнения функции
    execution_time = round(end_time - start_time,4) # вычисляем время выполнения   
    
    print(f"Время выполнения функции: {execution_time} секунд")
    print()
    return df

In [None]:
pwd

In [None]:
query = f'''
                    SELECT 
                    t4.title AS bonus_title,
                    t1.title AS title,
                    JSONExtractInt(t4.metadata,'duration') AS duration,
                    JSONExtractString(t4.metadata,'duration_unit') AS duration_unit,
                    t3.activated_at AS bonus_activated_at,
                    t3.activated_at::date AS bonus_start_at,
                    t3.activated_at::date + JSONExtractInt(t4.metadata,'duration') AS bonus_end_at

                    FROM product_x.promo_codes AS t1

                    LEFT JOIN (SELECT * FROM product_x.bonus_programs bp 
                                ) t4
                    ON t1.bonus_program_id=t4.id

                    LEFT JOIN (SELECT * FROM product_x.promo_code_activations
                               ) AS t2
                    ON 	t1.id=t2.promo_code_id


                    LEFT JOIN (SELECT * FROM  product_x.user_bonuses
                              ) AS t3
                    ON t2.user_bonus_id=t3.id

                    WHERE bonus_activated_at BETWEEN'2024-01-01' AND '2024-08-01'
         '''

df = execute(query,user = 'kmekhtiev')
# df_retention = execute(query_retention,user = 'kmekhtiev')
# df_retention['reg_date'] = df_retention['reg_date'].astype('datetime64[ns]')

In [None]:
df.to_excel('Активированные коды с января по ферваль.xlsx',index=False)

## Watchtime 

In [5]:
query = '''CREATE TABLE sandbox.mekhtiev_watchtime_local ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                client_type String,
                client_type_w_sberdevice String,
                bonus_title String,
                source String,
                item_type String,
                b2c_b2b String,
                user_type String,
                free_days String,
                app_version String,
                promo_type String,
                country String,
                device_type String,
                os_version String,
                session_cnt_ttl Int32,
                session_watch_ttl Int32,
                session_watch Int32,
                watchtime_session_watch Int32,
                session_cnt Int32,
                watchtime Int32
             )
             ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/mekhtiev_watchtime_local', '{replica}')
             ORDER BY date
             '''
execute(query,user = 'kmekhtiev')

Время выполнения функции: 7.6748 секунд



Unnamed: 0,host,port,status,error,num_hosts_remaining,num_hosts_active
0,dwh02,9000.0,57.0,Code: 57. DB::Exception: Table sandbox.mekhtie...,3.0,0.0
1,dwh04,9000.0,57.0,Code: 57. DB::Exception: Table sandbox.mekhtie...,2.0,0.0
2,dwh01,9000.0,57.0,Code: 57. DB::Exception: Table sandbox.mekhtie...,1.0,0.0
3,dwh03,9000.0,57.0,Code: 57. DB::Exception: Table sandbox.mekhtie...,0.0,0.0
4,Code: 57. DB::Exception: There was an error on...,,,,,


In [7]:
query = '''CREATE TABLE sandbox.mekhtiev_watchtime ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                client_type String,
                client_type_w_sberdevice String,
                bonus_title String,
                source String,
                item_type String,
                b2c_b2b String,
                user_type String,
                free_days String,
                app_version String,
                promo_type String,
                country String,
                device_type String,
                os_version String,
                session_cnt_ttl Int32,
                session_watch_ttl Int32,
                session_watch Int32,
                watchtime_session_watch Int32,
                session_cnt Int32,
                watchtime Int32
             )
             ENGINE = Distributed(viasat_cluster, sandbox, mekhtiev_watchtime_local, rand())
             '''
execute(query,user = 'kmekhtiev')

Время выполнения функции: 5.3526 секунд



Unnamed: 0,host,port,status,error,num_hosts_remaining,num_hosts_active
0,dwh03,9000,0,,3,0
1,dwh01,9000,0,,2,0
2,dwh04,9000,0,,1,0
3,dwh02,9000,0,,0,0


In [12]:
list_date = pd.date_range("2023-09-01", "2023-12-31", freq='D')
result = []
for date in list_date:
    date_str = date.strftime('%Y-%m-%d')
    #query_delete = f" DELETE FROM datamarts.watchtime_by_day ON CLUSTER 'viasat_cluster' WHERE date = '{date}'::date"
    #execute(query_delete,user = 'kmekhtiev')
    #print(f" Дата {date_str} удалена из таблицы")
    query = f''' INSERT INTO datamarts.watchtime_by_day
WITH first_step AS (
    SELECT date,
        utc_timestamp,
        profile_id,
        user_id,
        client_type,
        event_name,
        event_page,
        payload,
        bonus_title,
        app_version,
        app_version,
        session_id,
        created_at::date AS trial_start,
        free_days,
        device_type,
        os_version,
        reg_date,
        trial_start+free_days AS trial_end,
        source,
        promo_type,
        first_prolong_date::date AS first_prolong_date,
        ends_at,
        country
    FROM 
        datamarts.clean_event
    WHERE 
        date='{date}'::date
        AND reg_date!='1970-01-01' 
        AND client_type!='backend'
),

second_step AS (
    SELECT
        date,
        utc_timestamp,
        user_id,
        profile_id,
        client_type,
        bonus_title,
        event_name,
        event_page,
        session_id,
        reg_date,
        app_version,
        country,
        trial_start,
        trial_end,
        promo_type,
        first_prolong_date,
        ends_at,
        source,
        free_days,
        device_type,
        os_version,
        event_name,
        CASE WHEN first_prolong_date!='1970-01-01' 
                AND date>=first_prolong_date 
                AND ends_at>=date THEN 'subs'
                WHEN date>=trial_start 
                AND ends_at>=date (first_prolong_date='1970-01-01' OR first_prolong_date>date) THEN 'trial'
                WHEN date>=reg_date 
                AND (trial_start='1970-01-01' OR trial_start>date) THEN 'reg'
        END AS user_type,
        JSONExtractInt(payload,'duration') AS duration,
        JSONExtractString(payload,'item_type') AS item_type,
        JSONExtractString(payload,'item_title') AS item_title,
        JSONExtractString(payload,'season') AS item_season,
        JSONExtractString(payload,'episode') AS item_episode,
        CASE WHEN event_name IN ('auto_player_streaming','auto_kinom_streaming') 
                AND event_page<>'tvchannel' 
                AND JSONExtractInt(payload,'viewing_time')<= JSONExtractInt(payload,'duration')
                THEN JSONExtractInt(payload,'viewing_time')
                WHEN event_name IN ('auto_player_streaming','auto_kinom_streaming') 
                AND event_page='tvchannel' 
                AND JSONExtractInt(payload,'viewing_time') <18000
                THEN JSONExtractInt(payload,'viewing_time')
        END AS viewing_time,
        count(DISTINCT item_title) OVER (PARTITION BY date,profile_id) AS viewing_cnt,
        uniq(session_id) OVER (PARTITION BY date,profile_id) as session_cnt_ttl,
        sum(viewing_time) OVER (PARTITION BY date,profile_id,item_title) AS viewing_time_per_item
    FROM
        first_step
),

third_step AS (
    SELECT 
        date,
        profile_id,
        client_type,
        promo_type,
        CASE WHEN promo_type='cards' THEN 'b2b' ELSE 'b2c' END AS b2c_b2b,
        bonus_title,
        source,
        CASE WHEN item_type LIKE '' THEN 'kinom'
                WHEN item_type IN ('series','movie','tvchannel') THEN item_type
                ELSE 'other'
        END AS item_type,
        item_title,
        duration,
        session_id,
        user_type,
        free_days,
        session_cnt_ttl,
        app_version,
        country,
        device_type,
        os_version,
        uniq(session_id) AS session_cnt,
        sum(viewing_time) AS watchtime
    FROM
        second_step 
    GROUP BY
        1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
),

fourth_step AS (
    SELECT
        date,
        client_type,
        CASE WHEN app_version like '%sber%' THEN 'sber_device' 
                WHEN app_version not like '%sber%' AND client_type='android_tv' THEN 'android_tv (without sber)' 
                ELSE client_type 
            END AS client_type_w_sberdevice,
        profile_id,
        b2c_b2b,
        promo_type,
        session_cnt_ttl,
        session_cnt,
        app_version,
        country,
        item_type,
        session_id,
        bonus_title,
        source,
        user_type,
        free_days,
        device_type,
        os_version,
        CASE WHEN item_type='kinom' AND duration>0 AND watchtime>=30 THEN 1
                WHEN item_type='tvchannel' AND watchtime>=30 THEN 1
                WHEN item_type='series' AND duration>600 AND watchtime>=duration*0.05 THEN 1
                WHEN item_type='movie' AND duration>600 AND watchtime>=duration*0.05 THEN  1
                ELSE 0
        END AS watch_session,
        watchtime,
        uniq(CASE WHEN watch_session=1 THEN session_id END) OVER (PARTITION BY date,profile_id) AS session_watch_ttl
    FROM
        third_step
)

SELECT 
    date,
    profile_id,
    client_type,
    client_type_w_sberdevice,
    bonus_title,
    source,
    item_type,
    b2c_b2b,
    user_type,
    free_days,
    app_version,
    promo_type,
    country,
    device_type,
    os_version,
    session_cnt_ttl,
    session_watch_ttl,
    uniq(CASE WHEN watch_session=1 THEN session_id END) AS watch_session2,
    COALESCE(sum(CASE WHEN watch_session=1 THEN watchtime END),0) AS watchtime_session_watch,
    uniq(session_cnt) AS session_cnt2,
    sum(watchtime) AS watchtime2
FROM
    fourth_step
GROUP BY
    1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
;


'''
    
    
    df_temp = execute(query,user = 'kmekhtiev')   
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()

Время выполнения функции: 0.9729 секунд

Дата '2023-09-01' загружена

Время выполнения функции: 0.8419 секунд

Дата '2023-09-02' загружена

Время выполнения функции: 0.8414 секунд

Дата '2023-09-03' загружена

Время выполнения функции: 0.7706 секунд

Дата '2023-09-04' загружена

Время выполнения функции: 0.8653 секунд

Дата '2023-09-05' загружена

Время выполнения функции: 0.8237 секунд

Дата '2023-09-06' загружена

Время выполнения функции: 0.8048 секунд

Дата '2023-09-07' загружена

Время выполнения функции: 0.966 секунд

Дата '2023-09-08' загружена

Время выполнения функции: 0.8073 секунд

Дата '2023-09-09' загружена

Время выполнения функции: 0.8155 секунд

Дата '2023-09-10' загружена

Время выполнения функции: 0.8985 секунд

Дата '2023-09-11' загружена

Время выполнения функции: 0.8854 секунд

Дата '2023-09-12' загружена

Время выполнения функции: 0.8702 секунд

Дата '2023-09-13' загружена

Время выполнения функции: 0.82 секунд

Дата '2023-09-14' загружена

Время выполнения функци

In [16]:
query = f''' WITH first_step AS (
    SELECT date,
        utc_timestamp,
        profile_id,
        user_id,
        client_type,
        event_name,
        event_page,
        payload,
        bonus_title,
        app_version,
        app_version,
        session_id,
        created_at::date AS trial_start,
        free_days,
        device_type,
        os_version,
        reg_date,
        trial_start+free_days AS trial_end,
        source,
        promo_type,
        first_prolong_date::date AS first_prolong_date,
        ends_at,
        country
    FROM 
        datamarts.clean_event
    WHERE 
        date='2023-09-01'::date
        AND reg_date!='1970-01-01' 
        AND client_type!='backend'
),

second_step AS (
    SELECT
        date,
        utc_timestamp,
        user_id,
        profile_id,
        client_type,
        bonus_title,
        event_name,
        event_page,
        session_id,
        reg_date,
        app_version,
        country,
        trial_start,
        trial_end,
        promo_type,
        first_prolong_date,
        ends_at,
        source,
        free_days,
        device_type,
        os_version,
        event_name,
        CASE WHEN first_prolong_date!='1970-01-01' 
                AND date>=first_prolong_date 
                AND ends_at>=date THEN 'subs'
                WHEN date>=trial_start 
                AND ends_at>=date (first_prolong_date='1970-01-01' OR first_prolong_date>date) THEN 'trial'
                WHEN date>=reg_date 
                AND (trial_start='1970-01-01' OR trial_start>date) THEN 'reg'
        END AS user_type,
        JSONExtractInt(payload,'duration') AS duration,
        JSONExtractString(payload,'item_type') AS item_type,
        JSONExtractString(payload,'item_title') AS item_title,
        JSONExtractString(payload,'season') AS item_season,
        JSONExtractString(payload,'episode') AS item_episode,
        CASE WHEN event_name IN ('auto_player_streaming','auto_kinom_streaming') 
                AND event_page<>'tvchannel' 
                AND JSONExtractInt(payload,'viewing_time')<= JSONExtractInt(payload,'duration')
                THEN JSONExtractInt(payload,'viewing_time')
                WHEN event_name IN ('auto_player_streaming','auto_kinom_streaming') 
                AND event_page='tvchannel' 
                AND JSONExtractInt(payload,'viewing_time') <18000
                THEN JSONExtractInt(payload,'viewing_time')
        END AS viewing_time,
        count(DISTINCT item_title) OVER (PARTITION BY date,profile_id) AS viewing_cnt,
        uniq(session_id) OVER (PARTITION BY date,profile_id) as session_cnt_ttl,
        sum(viewing_time) OVER (PARTITION BY date,profile_id,item_title) AS viewing_time_per_item
    FROM
        first_step
),

third_step AS (
    SELECT 
        date,
        profile_id,
        client_type,
        promo_type,
        CASE WHEN promo_type='cards' THEN 'b2b' ELSE 'b2c' END AS b2c_b2b,
        bonus_title,
        source,
        CASE WHEN item_type LIKE '' THEN 'kinom'
                WHEN item_type IN ('series','movie','tvchannel') THEN item_type
                ELSE 'other'
        END AS item_type,
        item_title,
        duration,
        session_id,
        user_type,
        free_days,
        session_cnt_ttl,
        app_version,
        country,
        device_type,
        os_version,
        uniq(session_id) AS session_cnt,
        sum(viewing_time) AS watchtime
    FROM
        second_step 
    GROUP BY
        1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
),

fourth_step AS (
    SELECT
        date,
        client_type,
        CASE WHEN app_version like '%sber%' THEN 'sber_device' 
                WHEN app_version not like '%sber%' AND client_type='android_tv' THEN 'android_tv (without sber)' 
                ELSE client_type 
            END AS client_type_w_sberdevice,
        profile_id,
        b2c_b2b,
        promo_type,
        session_cnt_ttl,
        session_cnt,
        app_version,
        country,
        item_type,
        session_id,
        bonus_title,
        source,
        user_type,
        free_days,
        device_type,
        os_version,
        CASE WHEN item_type='kinom' AND duration>0 AND watchtime>=30 THEN 1
                WHEN item_type='tvchannel' AND watchtime>=30 THEN 1
                WHEN item_type='series' AND duration>600 AND watchtime>=duration*0.05 THEN 1
                WHEN item_type='movie' AND duration>600 AND watchtime>=duration*0.05 THEN  1
                ELSE 0
        END AS watch_session,
        watchtime,
        uniq(CASE WHEN watch_session=1 THEN session_id END) OVER (PARTITION BY date,profile_id) AS session_watch_ttl
    FROM
        third_step
)

SELECT 
    date,
    profile_id,
    client_type,
    client_type_w_sberdevice,
    bonus_title,
    source,
    item_type,
    b2c_b2b,
    user_type,
    free_days,
    app_version,
    promo_type,
    country,
    device_type,
    os_version,
    session_cnt_ttl,
    session_watch_ttl,
    uniq(CASE WHEN watch_session=1 THEN session_id END) AS watch_session2,
    COALESCE(sum(CASE WHEN watch_session=1 THEN watchtime END),0) AS watchtime_session_watch,
    uniq(session_cnt) AS session_cnt2,
    sum(watchtime) AS watchtime2
FROM
    fourth_step
GROUP BY
    1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
;


'''
    
df_temp = execute(query,user = 'kmekhtiev')   

Время выполнения функции: 0.8388 секунд



In [18]:
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "6c1026c9-883a-4cb7-9f31-7733f158c33e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import psycopg2\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.ticker as ticker\n",
    "import numpy as np\n",
    "import seaborn as sns \n",
    "import statsmodels.stats.proportion as proportion\n",
    "from scipy.stats import ttest_ind,mannwhitneyu,shapiro,norm\n",
    "from statsmodels.stats.weightstats import ztest\n",
    "from tqdm import tqdm\n",
    "import timeit\n",
    "from scipy import stats\n",
    "import math\n",
    "from datetime import date, datetime, timedelta\n",
    "import time\n",
    "from sqlalchemy import create_engine, text\n",
    "from sqlalchemy.orm import sessionmaker\n",
    "import clickhouse_connect  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "3500a90b-1a1a-449a-82f8-0e563ba2fa42",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import gspread\n",
    "from gspread_dataframe import get_as_dataframe\n",
    "from google.oauth2.service_account import Credentials\n",
    "import requests\n",
    "import xml.etree.ElementTree as ET\n",
    "from oauth2client.service_account import ServiceAccountCredentials"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eb2241f6-afc6-4348-992a-d8a94ee7760b",
   "metadata": {},
   "source": [
    "# Код для прогрузки диапазона дат"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "52df9b0a-eb3e-49c1-b97c-acbab84c2a98",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Курс JPY на 01/01/2025: 0.643746\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 02/01/2025: 0.643746\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 03/01/2025: 0.643746\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 04/01/2025: 0.643746\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 05/01/2025: 0.643746\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 06/01/2025: 0.643746\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 07/01/2025: 0.643746\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 08/01/2025: 0.643746\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 09/01/2025: 0.643746\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 10/01/2025: 0.646063\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 11/01/2025: 0.6434000000000001\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 12/01/2025: 0.6434000000000001\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 13/01/2025: 0.6434000000000001\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 14/01/2025: 0.6484099999999999\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 15/01/2025: 0.6565819999999999\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 16/01/2025: 0.650518\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 17/01/2025: 0.654747\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 18/01/2025: 0.660095\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 19/01/2025: 0.660095\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 20/01/2025: 0.660095\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 21/01/2025: 0.6533669999999999\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 22/01/2025: 0.64036\n",
      "Курс успешно записан в Google Sheets.\n",
      "Курс JPY на 23/01/2025: 0.632028\n",
      "Курс успешно записан в Google Sheets.\n"
     ]
    }
   ],
   "source": [
    "# Функция для получения курса йены  \n",
    "def get_jpy_rate(date):  \n",
    "    formatted_date = date.strftime(\"%d/%m/%Y\")  \n",
    "    url = f\"https://www.cbr.ru/scripts/XML_daily.asp?date_req={formatted_date}\"  \n",
    "    response = requests.get(url)  \n",
    "    response.raise_for_status()  \n",
    "    \n",
    "    tree = ET.fromstring(response.content)  \n",
    "    for currency in tree.findall('Valute'):  \n",
    "        char_code = currency.find('CharCode').text  \n",
    "        if char_code == \"JPY\":  \n",
    "            rate = currency.find('Value').text   \n",
    "            return float(rate.replace(',', '.')) / 100  # Преобразуем строку в число с плавающей запятой  \n",
    "    return None  # Если курс не найден, возвращаем None  \n",
    "\n",
    "def write_to_google_sheets(sheet_name, date, value):\n",
    "    # Авторизация Google Sheets API\n",
    "    scope = [\"https://spreadsheets.google.com/feeds\", \"https://www.googleapis.com/auth/drive\"]\n",
    "    creds = ServiceAccountCredentials.from_json_keyfile_name(\"exchange-rate.json\", scope)\n",
    "    client = gspread.authorize(creds)\n",
    "\n",
    "    # Открытие нужного Google Sheets файла\n",
    "    spreadsheet = client.open(sheet_name)\n",
    "    \n",
    "    # Открытие листа по имени 'exchange_rate'\n",
    "    sheet = spreadsheet.worksheet(\"exchange_rate\")\n",
    "    \n",
    "    # Преобразование даты в формат 'YYYY-MM-DD'\n",
    "    formatted_date = date.strftime(\"%Y-%m-%d\")\n",
    "    sheet.append_row([formatted_date, value], value_input_option=\"USER_ENTERED\")\n",
    "\n",
    "\n",
    "# Основной блок  \n",
    "if __name__ == \"__main__\":  \n",
    "    try:  \n",
    "        start_date = datetime(2025, 1, 1)  # Начальная дата  \n",
    "        end_date = datetime.now()  # Текущая дата  \n",
    "\n",
    "        # Проходим по датам от 01/01/2025 до текущей даты  \n",
    "        current_date = start_date  \n",
    "        while current_date <= end_date:  \n",
    "            jpy_rate = get_jpy_rate(current_date)  \n",
    "            if jpy_rate is not None:  \n",
    "                print(f\"Курс JPY на {current_date.strftime('%d/%m/%Y')}: {jpy_rate}\")  \n",
    "                write_to_google_sheets(\"Japan_trip\", current_date, jpy_rate)  \n",
    "                print(\"Курс успешно записан в Google Sheets.\")  \n",
    "            else:  \n",
    "                print(f\"Курс JPY на {current_date.strftime('%d/%m/%Y')} не найден.\")  \n",
    "            current_date += timedelta(days=1)  # Переход к следующему дню  \n",
    "\n",
    "    except Exception as e:  \n",
    "        print(f\"Ошибка: {e}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bdb98ceb-cd64-4496-bec2-b8c055c7792a",
   "metadata": {},
   "source": [
    "# Код для прогрузки текущей даты "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "9df88e64-7de9-4174-ad7f-e76891576f76",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Курс JPY на 20/01/2025: 0.660095\n",
      "Курс успешно записан в Google Sheets.\n"
     ]
    }
   ],
   "source": [
    "# Функция для получения курса йены  \n",
    "def get_jpy_rate(date):  \n",
    "    formatted_date = date.strftime(\"%d/%m/%Y\")  \n",
    "    url = f\"https://www.cbr.ru/scripts/XML_daily.asp?date_req={formatted_date}\"  \n",
    "    response = requests.get(url)  \n",
    "    response.raise_for_status()  \n",
    "    \n",
    "    # Разбор XML  \n",
    "    tree = ET.fromstring(response.content)  \n",
    "    for currency in tree.findall('Valute'):  \n",
    "        char_code = currency.find('CharCode').text  \n",
    "        if char_code == \"JPY\":  \n",
    "            rate = currency.find('Value').text   \n",
    "            return float(rate.replace(',', '.')) / 100  # Преобразуем строку в число с плавающей запятой  \n",
    "    return None  # Если курс не найден, возвращаем None  \n",
    "\n",
    "def write_to_google_sheets(sheet_name, date, value):  \n",
    "    # Авторизация Google Sheets API  \n",
    "    scope = [\"https://spreadsheets.google.com/feeds\", \"https://www.googleapis.com/auth/drive\"]  \n",
    "    creds = ServiceAccountCredentials.from_json_keyfile_name(\"/Users/kemran/Desktop/work_files/python_files/practice_files/exchange_rate/exchange-rate.json\", scope)  \n",
    "    client = gspread.authorize(creds)  \n",
    "\n",
    "    # Открытие нужного Google Sheets файла  \n",
    "    spreadsheet = client.open(sheet_name)  \n",
    "    \n",
    "    # Открытие листа по имени 'exchange_rate'  \n",
    "    sheet = spreadsheet.worksheet(\"exchange_rate\")  \n",
    "    \n",
    "    # Преобразование даты в формат 'YYYY-MM-DD'  \n",
    "    formatted_date = date.strftime(\"%Y-%m-%d\")  \n",
    "    sheet.append_row([formatted_date, value], value_input_option=\"USER_ENTERED\")  \n",
    "\n",
    "# Основной блок  \n",
    "if __name__ == \"__main__\":  \n",
    "    try:  \n",
    "        today = datetime.now()  # Сегодняшняя дата  \n",
    "        jpy_rate = get_jpy_rate(today)  # Получение курса JPY  \n",
    "        \n",
    "        if jpy_rate is not None:  \n",
    "            print(f\"Курс JPY на {today.strftime('%d/%m/%Y')}: {jpy_rate}\")  \n",
    "            write_to_google_sheets(\"Japan_trip\", today, jpy_rate)  \n",
    "            print(\"Курс успешно записан в Google Sheets.\")  \n",
    "        else:  \n",
    "            print(f\"Курс JPY на {today.strftime('%d/%m/%Y')} не найден.\")  \n",
    "\n",
    "    except Exception as e:  \n",
    "        print(f\"Ошибка: {e}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89022f6e-d991-4bab-9ceb-69ea0f00366f",
   "metadata": {},
   "outputs": [],
   "source": [
    "!which python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "23971e97-5bcb-4ebd-a907-781e5eb80325",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'/Users/kemran/Desktop/work_files/python_files/practice_files/exchange_rate'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41ca2ab6-2bc5-4cfc-b6b0-41da1ac93c25",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


Unnamed: 0,date,profile_id,client_type,client_type_w_sberdevice,bonus_title,source,item_type,b2c_b2b,user_type,free_days,...,promo_type,country,device_type,os_version,session_cnt_ttl,session_watch_ttl,watch_session2,watchtime_session_watch,session_cnt2,watchtime2


In [None]:
df_new = execute(f'''SELECT * FROM datamarts.mekhtiev_watchtime_by_day_distr2  WHERE date>='2024-11-01' ''',user='kmekhtiev')
df_old = execute (f'''SELECT * FROM datamarts.mekhtiev_watchtime_by_day_distr WHERE date>='2024-11-01' ''',user='kmekhtiev')

df_new['date'] = pd.to_datetime(df_new['date'])
df_old['date'] = pd.to_datetime(df_old['date'])                              


In [None]:
plt.figure(figsize=(12,8))
df_new_agg = df_new.groupby('date').agg({'profile_id':'nunique','watchtime':'sum'})
df_old_agg = df_old.groupby('date').agg({'profile_id':'nunique','watchtime':'sum'})
df_merge = pd.merge(df_new_agg,df_old_agg,on='date',how='left',suffixes=('_new','_old')).reset_index()
df_merge['delta'] = df_merge['watchtime_new'] - df_merge['watchtime_old']
plt.xticks(rotation=45)


sns.lineplot(data=df_merge, x='date', y='delta')
#sns.lineplot(data=df_merge, x='date', y='watchtime_old', label='Watchtime Old')

## Квантили смотрения ТВ каналов

In [None]:
start_date = "2024-09-01"
end_date = datetime.now().date() - timedelta(days=1)
query  = f''' SELECT date,
                     profile_id,
                     event_page,
                     viewing_time
                     FROM
                            (SELECT
                                        utc_timestamp::date AS date,
                                        profile_id,
                                        client_type,
                                        event_page,
                                        JSONExtractInt(payload,'viewing_time') AS viewing_time,
                                        JSONExtractInt(payload,'duration') AS duration,
                                        JSONExtractString(payload,'item_type') AS item_type,
                                        payload
                                    FROM
                                    datamarts.sandbox_data_distr x 
                                    WHERE event_name IN ('auto_player_streaming','auto_kinom_streaming')
                                    AND utc_timestamp::date BETWEEN '{start_date}' AND '{end_date}'
                                    AND reg_date!='1970-01-01'
                                    AND profile_id IS NOT NULL
                                    AND viewing_time>=0
                                    ) as t1
                                    WHERE event_page='tvchannel'
              
                                    '''
df_tv = execute(query,user = 'kmekhtiev')
df_tv['date'] = df_tv['date'].astype('datetime64[ns]')

In [None]:
df_tv

In [None]:
df_tv[df_tv['viewing_time']>18000]['profile_id'].nunique()/df_tv['profile_id'].nunique()

In [None]:
df_tv['profile_id'].nunique()

In [None]:
quantiles=[1,2,3,4,5,6,7,8,9,10,25,50,75,80,85,90,91,92,93,94,95,96,97,98,99,99.1,99.2,99.3,99.4,99.5,99.6,99.8,99.9]
columns = ['viewing_time']


# пустой df для хранения результатов
quantiles_data = pd.DataFrame(index=quantiles, columns=columns)

for i in columns:
    quantile_values = np.percentile(df_tv['viewing_time'], quantiles)
    quantiles_data[i] = quantile_values

quantiles_data

## Retention по регистрации и оформлению подписки

In [None]:
query = '''CREATE TABLE datamarts.mekhtiev_retention_local ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                event_name String,
                reg_date Date,
                bonus_title String,
                reg_source String,
                retention_day Int32
            ) 
            --ENGINE = MergeTree()
            ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/datamarts.mekhtiev_retention_local', '{replica}')
            ORDER BY reg_date
'''
execute(query, user='kmekhtiev')

In [None]:
query = '''CREATE TABLE datamarts.mekhtiev_retention_distr ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                event_name String,
                reg_date Date,
                bonus_title String,
                reg_source String,
                retention_day Int32
             )
             ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_retention_local, rand())
             '''
execute(query,user = 'kmekhtiev')

In [None]:
start_date = "2024-01-01"
end_date = datetime.now().date() - timedelta(days=1)

query_delete = ''' TRUNCATE TABLE datamarts.mekhtiev_retention_local  ON CLUSTER 'viasat_cluster' '''
execute(query_delete,user = 'kmekhtiev')
print ('Таблица очищена')



query_retention = f'''INSERT INTO datamarts.mekhtiev_retention_distr
                        SELECT
                        utc_timestamp::date AS date,
                        profile_id,
                        event_name,
                        reg_date,
                        bonus_title,
                        reg_source,
                        date - reg_date AS retention_day
                        FROM datamarts.sandbox_data_distr AS t1
                        LEFT JOIN product_x.users AS u ON t1.user_id=u.id
                        WHERE utc_timestamp::date BETWEEN '{start_date}' AND '{end_date}'
                        AND reg_date::date BETWEEN '2023-09-01' AND yesterday()
                        AND profile_id IS NOT NULL
                        GROUP BY 1,2,3,4,5,6,7
         '''

execute(query_retention,user = 'kmekhtiev')
print('Данные залиты')

In [None]:
query = '''CREATE TABLE datamarts.mekhtiev_retention_created_local ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                event_name String,
                created_date Date,
                bonus_title String,
                reg_source String,
                retention_day Int32
            ) 
            --ENGINE = MergeTree()
            ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/datamarts.mekhtiev_retention_created_local', '{replica}')
            ORDER BY created_date
'''
execute(query, user='kmekhtiev')

In [None]:
query = '''CREATE TABLE datamarts.mekhtiev_retention_created_distr ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                event_name String,
                created_date Date,
                bonus_title String,
                reg_source String,
                retention_day Int32
             )
             ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_retention_created_local, rand())
             '''
execute(query,user = 'kmekhtiev')

In [None]:
start_date = "2024-01-01"
end_date = datetime.now().date() - timedelta(days=1)

query_delete = ''' TRUNCATE TABLE datamarts.mekhtiev_retention_created_local  ON CLUSTER 'viasat_cluster' '''
execute(query_delete,user = 'kmekhtiev')
print ('Таблица очищена')



query_retention = f'''INSERT INTO datamarts.mekhtiev_retention_created_distr
                        SELECT
                        utc_timestamp::date AS date,
                        profile_id,
                        event_name,
                        created_at AS created_date,
                        bonus_title,
                        reg_source,
                        date - created_at::date AS retention_day
                        FROM datamarts.sandbox_data_distr AS t1
                        LEFT JOIN product_x.users AS u ON t1.user_id=u.id
                        WHERE utc_timestamp::date BETWEEN '{start_date}' AND '{end_date}'
                        AND created_at::date BETWEEN '2023-09-01' AND yesterday()
                        AND profile_id IS NOT NULL
                        GROUP BY 1,2,3,4,5,6,7
         '''

execute(query_retention,user = 'kmekhtiev')
print('Данные залиты')

## DAU

In [None]:
#b2c_b2b нет, так как считаем их по полю promo_type, а это поле есть только у зарегов
query = '''CREATE TABLE datamarts.mekhtiev_dau_local ON CLUSTER 'viasat_cluster'
            (
                date Date,
                visitor_id UUID,
                profile_id UUID,
                reg_date Date,
                created_date Date,
                first_prolong_date Date,
                country String,
                bonus_title String,
                source String,
                client_type String,
                free_day String,
                user_type String
            ) 
            ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/datamarts.mekhtiev_dau_local', '{replica}')
            ORDER BY date
'''
execute(query, user='kmekhtiev')


In [None]:
query = '''CREATE TABLE datamarts.mekhtiev_dau_distr ON CLUSTER 'viasat_cluster'
            (
                date Date,
                visitor_id UUID,
                profile_id UUID,
                reg_date Date,
                created_date Date,
                first_prolong_date Date,
                country String,
                bonus_title String,
                source String,
                client_type String,
                free_day String,
                user_type String
             )
             ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_dau_local, rand())
             '''
execute(query,user = 'kmekhtiev')

In [None]:
list_date = pd.date_range("2023-09-01", "2024-01-09", freq='D')
result=[]
for date in list_date:
        date_str = date.strftime('%Y-%m-%d')
        query_delete = f" DELETE FROM datamarts.mekhtiev_dau_local ON CLUSTER 'viasat_cluster' WHERE date = '{date}'::date"
        execute(query_delete,user = 'kmekhtiev')
        print(f" Дата {date_str} удалена из таблицы")
        query = f'''INSERT INTO datamarts.mekhtiev_dau_distr
                       SELECT
                                    date,
                                    visitor_id,
                                    profile_id,
                                    reg_date,
                                    created_date,
                                    first_prolong_date,
                                    country,
                                    bonus_title,
                                    source,
                                    client_type,
                                    free_days,
                                    CASE
                                        WHEN first_prolong_date!='1970-01-01' AND first_prolong_date<=date THEN 'subs'
                                        WHEN created_date!='1970-01-01' AND created_date<=date THEN 'trial'
                                        WHEN reg_date!='1970-01-01' AND reg_date<=date THEN 'reg'
                                        ELSE 'visitor'
                                    END AS user_type
                                    FROM
                                        (SELECT
                                        date,
                                        visitor_id,
                                        profile_id,
                                        bonus_title,
                                        source,
                                        country,
                                        client_type,
                                        free_days,
                                        profile_id,
                                        reg_date,
                                        promo_type,
                                        created_at::date AS created_date,
                                        first_prolong_date::date AS first_prolong_date
                                        FROM datamarts.sandbox_data_distr
                                        WHERE date='{date}'::date AND client_type !='backend'
                                        GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13
                                        )

                                '''
        df_temp = execute(query,user = 'kmekhtiev')   
        result.append(df_temp)
        print(f"""Дата '{date_str}' загружена""")
        print()

## MAU

In [None]:
query = '''CREATE TABLE datamarts.mekhtiev_mau_local ON CLUSTER 'viasat_cluster'
            (
                date Date,
                visitor_id UUID,
                profile_id UUID,
                reg_date Date,
                created_date Date,
                first_prolong_date Date,
                country String,
                bonus_title String,
                source String,
                client_type String,
                free_day String,
                user_type String
            ) 
            ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/datamarts.mekhtiev_mau_local', '{replica}')
            ORDER BY date
'''
execute(query, user='kmekhtiev')


In [None]:
query = '''CREATE TABLE datamarts.mekhtiev_mau_distr ON CLUSTER 'viasat_cluster'
            (
                date Date,
                visitor_id UUID,
                profile_id UUID,
                reg_date Date,
                created_date Date,
                first_prolong_date Date,
                country String,
                bonus_title String,
                source String,
                client_type String,
                free_day String,
                user_type String
             )
             ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_mau_local, rand())
             '''
execute(query,user = 'kmekhtiev')

In [None]:
 WHERE date BETWEEN '{date}'::date - interval '30' DAY AND '{date}'::date 

In [None]:
list_date = pd.date_range("2024-11-20", "2024-12-09", freq='D')
result=[]
for date in list_date:
        date_str = date.strftime('%Y-%m-%d')
        query_delete = f" DELETE FROM datamarts.mekhtiev_mau_local ON CLUSTER 'viasat_cluster' WHERE date = '{date}'::date"
        execute(query_delete,user = 'kmekhtiev')
        print(f" Дата {date_str} удалена из таблицы")
        query = f'''INSERT INTO datamarts.mekhtiev_mau_distr
                      SELECT
                            dt_month AS date,
                            visitor_id,
                            profile_id,
                            reg_date,
                            created_date,
                            first_prolong_date,
                            country,
                            bonus_title,
                            source,
                            client_type,
                            free_days,
                            CASE
                                WHEN first_prolong_date!='1970-01-01'  THEN 'subs'
                                WHEN created_date!='1970-01-01' THEN 'trial'
                                WHEN reg_date!='1970-01-01' THEN 'reg'
                                ELSE 'visitor'
                            END AS user_type
                            FROM
                                (SELECT
                                '{date}'::date as dt_month,
                                visitor_id,
                                profile_id,
                                client_type,
                                free_days,
                                profile_id,
                                reg_date,
                                bonus_title,
                                source,
                                country,
                                promo_type,
                                created_at::date AS created_date,
                                first_prolong_date::date AS first_prolong_date
                                FROM datamarts.sandbox_data_distr
                                WHERE date BETWEEN '{date}'::date - interval '30' DAY AND '{date}'::date 
                                AND client_type !='backend'
                                GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13
                            )
                                '''
        df_temp = execute(query,user = 'kmekhtiev')   
        result.append(df_temp)
        print(f"""Дата '{date_str}' загружена""")
        print()

## Первый день захода в приложение 

In [None]:
query = '''CREATE TABLE datamarts.mekhtiev_min_date_visitor_profile_local ON CLUSTER 'viasat_cluster'
            (
                min_date_visitor Date,
                min_date_profile Date,
                visitor_id UUID,
                client_type String,
                device String,
                bonus_title String,
                user_id Nullable(UUID),
                profile_id Nullable(UUID),
                promo_type String,
                app_version String
                )
             ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/mekhtiev_min_date_visitor_profile_local', '{replica}')
             ORDER BY min_date_visitor
             '''
execute(query,user = 'kmekhtiev')

In [None]:
query = f'''CREATE TABLE datamarts.mekhtiev_min_date_visitor_profile_distr ON CLUSTER 'viasat_cluster'
            (
                min_date_visitor Date,
                min_date_profile Date,
                visitor_id UUID,
                client_type String,
                device String,
                bonus_title String,
                user_id Nullable(UUID),
                profile_id Nullable(UUID),
                promo_type String,
                app_version String
                )
             
             ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_min_date_visitor_profile_local, rand())
             '''
execute(query,user = 'kmekhtiev')

In [None]:
start_date = datetime.now() - timedelta(days=180)
end_date = datetime.now() - timedelta(days=1)

list_date = pd.date_range(start_date, end_date, freq='D')
result = []
for date in list_date:
    date_str = date.strftime('%Y-%m-%d')
    query_delete = f" DELETE FROM datamarts.mekhtiev_min_date_visitor_profile_local ON CLUSTER 'viasat_cluster' WHERE min_date_visitor = '{date_str}'"
    execute(query_delete,user = 'kmekhtiev')
    print(f'Дата {date_str} удалена из таблицы')
    query = f'''
    INSERT INTO datamarts.mekhtiev_min_date_visitor_profile_distr
    SELECT
    min_date_visitor,
    min(min_date_visitor) OVER (PARTITION BY profile_id) AS min_date_profile,
    visitor_id,
    client_type2 AS client_type,
    device,
    bonus_title,
    user_id2 AS user_id,
    profile_id2 AS profile_id,
    promo_type2 AS promo_type,
    app_version2 AS app_version
    FROM
        (SELECT
        min(utc_timestamp::date) AS min_date_visitor,
        visitor_id,
        user_id,
        promo_type,
        profile_id,
        client_type,
        bonus_title,
        app_version,
        device,
        first_value(client_type) OVER (PARTITION BY visitor_id) AS client_type2,
        first_value(user_id) OVER (PARTITION BY visitor_id) AS user_id2,
        first_value(promo_type) OVER (PARTITION BY visitor_id) AS promo_type2,
        first_value(profile_id) OVER (PARTITION BY visitor_id) AS profile_id2,
        first_value(app_version) OVER (PARTITION BY visitor_id) AS app_version2
        FROM datamarts.sandbox_data_distr x
        LEFT JOIN product_x.users AS u ON x.user_id=u.id
        WHERE client_type!='backend'
        AND utc_timestamp::date>='2023-09-01'
        AND (u.email NOT LIKE '%@test%' OR u.email IS NULL)
        GROUP BY 2,3,4,5,6,7,8,9
        )
    WHERE min_date_visitor='{date_str}' 
    GROUP BY 1,3,4,5,6,7,8,9,10
                

            '''
    df_temp = execute(query,user = 'kmekhtiev')   
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()
    
df_registration = pd.concat(result)

# LTV

In [None]:
list_date = pd.date_range("2024-01-01", "2025-01-01", freq='MS',normalize=True)
result = []
for date in list_date:
    date_str = date.strftime('%Y-%m-%d')
    query = f'''
            SELECT 
                '{date_str}' AS trial_month,
                paid_at::date AS paid_date,
                paid_at,
                date_trunc('month',paid_date) AS paid_month,
                s.user_id AS user_id,
                t2.free_days AS free_days,
                i.subscription_id AS subscription_id,
                s.created_at::date AS created_date,
                t2.first_prolong_date::date AS first_prolong_date,
                t2.reg_source AS reg_source,
                t2.reg_medium AS reg_medium,
                t2.bonus_title AS bonus_title,
                t2.device AS device,
                CASE WHEN t2.promo_type='cards' THEN 'b2b' ELSE 'b2c' END AS b2c_b2b,
                i.price_currency AS price_currency,
                sum(CASE WHEN i.price_currency='USD' THEN i.price_cents*90/100
                         WHEN i.price_currency='AMD' THEN i.price_cents*0.25/100
                         ELSE i.price_cents/100
                    END) AS payment
                FROM  product_x.invoices i
                LEFT JOIN product_x.subscriptions s ON s.id = i.subscription_id
                LEFT JOIN product_x.users u ON u.id = s.user_id
                INNER JOIN (SELECT 
                           user_id,
                           first_prolong_date::date AS first_prolong_date,
                           reg_source,
                           reg_medium,
                           bonus_title,
                           free_days,
                           promo_type,
                           device
                           FROM datamarts.marketing_dash_distr
                           WHERE DATE_TRUNC('month', first_prolong_date)='{date_str}'
                           ) AS t2 
                        ON s.user_id=t2.user_id
                WHERE u.user_type = 'regular'
                AND u.vipplay = FALSE
                AND s.state in ('normal_period','trial','canceled','grace_period')
                AND i.paid_at BETWEEN '{date_str}' AND '2024-12-01'
                AND i.state in ('success')
                AND ((u.email NOT ILIKE '%%@test%%' AND u.email NOT ILIKE '%%@viasat%%') OR (u.email IS NULL AND u.phone_number IS NOT NULL))
                AND i.price_cents > 100
                GROUP BY 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
            '''
    df_temp = execute(query,user = 'kmekhtiev')   
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()

df = pd.concat(result)
df['paid_date'] = df['paid_date'].astype('datetime64[ns]')
df['paid_month'] = df['paid_month'].astype('datetime64[ns]')
df['trial_month'] = df['trial_month'].astype('datetime64[ns]')
df['payment'] = df['payment'].astype('int32')
df.sort_values(by='paid_at',inplace=True)

In [None]:
df['trial_month'].nunique()

In [None]:
query = ''' 
SELECT t1.*,t3.bonus_title_new
FROM datamarts.financial_activity_distr AS t1
LEFT JOIN datamarts.marketing_dash_distr t2 ON t1.user_id=t2.user_id
LEFT JOIN datamarts.mekhtiev_bonus_distr AS t3 ON t2.bonus_title=t3.bonus_title_old
WHERE main_date BETWEEN '2024-10-01' AND '2024-11-12'
'''

df = execute(query,user = 'kmekhtiev')

In [None]:
t = df[df['t1.free_days']=='35'].groupby(['t1.main_date','t3.bonus_title_new'])['t1.total_sum_by_day'].sum().reset_index()
t['t1.main_date'] = t['t1.main_date'].astype('datetime64[ns]')
sns.lineplot(t,x='t1.main_date',y='t1.total_sum_by_day',hue='t3.bonus_title_new')

In [None]:
df2 = df[(df['free_days']==14)]
df_agg = df2.groupby(['trial_month','paid_month']).agg({'payment':'sum','user_id':'count'}).reset_index()

# Расчет кумулятивной суммы, и добавления поля по числу пользователей когорты 
df_agg['cumulative'] = df_agg.groupby('trial_month')['payment'].cumsum()
df_agg['uniq_user'] = df_agg.groupby('trial_month')['user_id'].transform('max')

df_agg['trial_month'] = df_agg['trial_month'].astype('datetime64[ns]')

In [None]:
df_agg

In [None]:
plt.figure(figsize=(13,8),dpi=100)
#df_agg['trial_month'] = df_agg['trial_month'].dt.strftime('%Y-%m-%d')  
sns.lineplot(data=df_agg,x='paid_month',y='cumulative',hue='trial_month')
plt.xlabel('Месяцы')
plt.ylabel('Кумулятивная сумма (руб)')
plt.legend(title='Когорты')
plt.show()

In [None]:
df['trial_month'].unique()

In [None]:
# LTV общий 
#df2 = df[(df['free_days']==14) & (df['device']=='web')]
df_agg = df.groupby(['trial_month','paid_month']).agg({'payment':'sum','user_id':'count'}).reset_index()

# Расчет кумулятивной суммы, и добавления поля по числу пользователей когорты 
df_agg['cumulative'] = df_agg.groupby('trial_month')['payment'].cumsum()
df_agg['uniq_user'] = df_agg.groupby('trial_month')['user_id'].transform('max')

#Расчет ltv
df_agg['ltv'] = df_agg.cumulative/df_agg.uniq_user

# Нумерация месяцев
df_agg['num_of_month'] = df_agg.groupby('trial_month')['paid_month'].rank(method='first').astype('int')
df_agg = df_agg[df_agg['num_of_month']<13] # Ограничиваем 6-ью месяцами

# Сделаем pivot 
df_pivot = pd.pivot(data = df_agg,index = 'trial_month',columns = 'num_of_month',values = 'ltv')
df_pivot['uniq_user'] = df_agg.groupby('trial_month')['uniq_user'].max()

# Создаем DataFrame с данными для расчета средневзвешенного LTV
weighted_ltv = df_agg.pivot(index='trial_month', columns='num_of_month', values='ltv')
user_counts = df_agg.pivot(index='trial_month', columns='num_of_month', values='uniq_user')

# Рассчитываем средневзвешенное значение LTV для каждого 'num_of_month'
weighted_avg_ltv = ((weighted_ltv * user_counts).sum() / user_counts.sum()).round().astype('int')


weighted_avg_ltv = weighted_avg_ltv.reset_index()
weighted_avg_ltv = weighted_avg_ltv.rename(columns={0:'ltv'})


# Построение графика месячного LTV
plt.figure(figsize=(20,8))
plt.plot(weighted_avg_ltv['num_of_month'], weighted_avg_ltv['ltv'], marker='o', linestyle='-', color='#005f80',markerfacecolor='white',alpha=0.8)

for i in range(len(weighted_avg_ltv['num_of_month'])):  
    plt.text(weighted_avg_ltv['num_of_month'][i], weighted_avg_ltv['ltv'][i]+10,
             str(weighted_avg_ltv['ltv'][i]),
             ha='center', fontsize=10,color='purple')  
plt.xlabel('Месяцы')
plt.ylabel('Руб')
plt.title('LTV по месяцам')
plt.grid(True,linewidth=0.4)
plt.xticks(ticks=range(1, len(weighted_avg_ltv['num_of_month'])+1),rotation=45)  
plt.show()

In [None]:
query = '''CREATE TABLE datamarts.mekhtiev_ltv_local ON CLUSTER 'viasat_cluster'
            (
                num_of_month Int32,
                ltv Int32
                )
             ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/mekhtiev_ltv_local', '{replica}')
             ORDER BY num_of_month
             '''
execute(query,user = 'kmekhtiev')

In [None]:
query = f'''CREATE TABLE datamarts.mekhtiev_ltv_distr ON CLUSTER 'viasat_cluster'
            (
                num_of_month Int32,
                ltv Int32
                )
             
             ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_ltv_local, rand())
             '''
execute(query,user = 'kmekhtiev')

In [None]:
execute(SQL = f" TRUNCATE TABLE datamarts.mekhtiev_ltv_local ON CLUSTER 'viasat_cluster' ", user='kmekhtiev')
connection_clickhouse.insert_df('mekhtiev_ltv_distr', weighted_avg_ltv)

print("Данные залиты")

In [None]:
df['free_days'].unique()

In [None]:
# LTV в разбивке b2c_b2b
df_agg = df[df['free_days'].isin([3,14,30,35,45])].groupby(['trial_month','paid_month','free_days']).agg({'payment':'sum','user_id':'count'}).reset_index()

# Расчет кумулятивной суммы, и добавления поля по числу пользователей когорты 
df_agg['cumulative'] = df_agg.groupby(['trial_month','free_days'])['payment'].cumsum()
df_agg['uniq_user'] = df_agg.groupby(['trial_month','free_days'])['user_id'].transform('max')

#Расчет ltv
df_agg['ltv'] = df_agg.cumulative/df_agg.uniq_user

# Нумерация месяцев
df_agg['num_of_month'] = df_agg.groupby(['trial_month','free_days'])['paid_month'].rank(method='first').astype('int')
df_agg = df_agg[df_agg['num_of_month']<7] # Ограничиваем 6-ью месяцами

# Сделаем pivot 
df_pivot = pd.pivot(data = df_agg,index = ['trial_month','free_days'],columns = 'num_of_month',values = 'ltv')
df_pivot['uniq_user'] = df_agg.groupby(['trial_month','free_days'])['uniq_user'].max()

# Создаем DataFrame с данными для расчета средневзвешенного LTV
weighted_ltv = df_agg.pivot(index='trial_month', columns='num_of_month', values='ltv')
user_counts = df_agg.pivot(index='trial_month', columns='num_of_month', values='uniq_user')

# Рассчитываем средневзвешенное значение LTV для каждого 'num_of_month'
weighted_avg_ltv = ((weighted_ltv * user_counts).sum() / user_counts.sum()).round().astype('int')


weighted_avg_ltv = weighted_avg_ltv.reset_index()
weighted_avg_ltv = weighted_avg_ltv.rename(columns={0:'cumsum'})


# Построение графика месячного LTV
plt.figure(figsize=(20,8))
plt.plot(weighted_avg_ltv['num_of_month'], weighted_avg_ltv['cumsum'], marker='o', linestyle='-', color='#005f80',markerfacecolor='white',alpha=0.8)

for i in range(len(weighted_avg_ltv['num_of_month'])):  
    plt.text(weighted_avg_ltv['num_of_month'][i], weighted_avg_ltv['cumsum'][i]+10,
             str(weighted_avg_ltv['cumsum'][i]),
             ha='center', fontsize=10,color='purple')  
plt.xlabel('Месяцы')
plt.ylabel('Руб')
plt.title('Кумулятивная сумма на пользователя по месяцам (Flocktory)')
plt.grid(True,linewidth=0.4)
plt.xticks(rotation=45)
plt.show()

In [None]:
df_agg = df[df['free_days'].isin([3,14,30,35,45])].groupby(['trial_month','paid_month','free_days']).agg({'payment':'sum','user_id':'count'}).reset_index()

# Расчет кумулятивной суммы, и добавления поля по числу пользователей когорты 
df_agg['cumulative'] = df_agg.groupby(['trial_month','free_days'])['payment'].cumsum()
df_agg['uniq_user'] = df_agg.groupby(['trial_month','free_days'])['user_id'].transform('max')

#Расчет ltv
df_agg['ltv'] = df_agg.cumulative/df_agg.uniq_user


df_agg['num_of_month'] = df_agg.groupby(['trial_month','free_days'])['paid_month'].rank(method='first').astype('int')

df_agg['diff'] = ((df_agg['paid_month'] - df_agg['paid_month'].min()) / pd.offsets.MonthEnd(1)).astype(int)
df_agg = df_agg[df_agg['num_of_month']<7] # Ограничиваем 6-ью месяцами

df_agg[df_agg['free_days']==14]

In [None]:
 'paid_month': ['2024-02-01', '2024-03-01']  
}  
df = pd.DataFrame(data)  

# Преобразование строк в формат даты
df['paid_month'] = pd.to_datetime(df['paid_month'])  

# Добавление нового столбца, который содержит разницу в месяцах
df['difference'] = (df['paid_month'].shift(-1) - df['paid_month']).dt.days //30# Выбор только первой строки разницыresult = df['difference'].dropna().astype(int).iloc[0]  



In [None]:
df[df['b2c_b2b']=='b2c'].groupby(['paid_month']).agg({'payment':'sum','user_id':'count'}).reset_index()

In [None]:
df.groupby(['paid_month','b2c_b2b'])['user_id'].count()

In [None]:
df_agg = df.groupby(['trial_month','paid_month','b2c_b2b']).agg({'payment':'sum','user_id':'count'}).reset_index()

# Расчет кумулятивной суммы, и добавления поля по числу пользователей когорты 
df_agg['cumulative'] = df_agg.groupby(['trial_month','b2c_b2b'])['payment'].cumsum()
df_agg['uniq_user'] = df_agg.groupby(['trial_month','b2c_b2b'])['user_id'].transform('max')

#Расчет ltv
df_agg['ltv'] = df_agg.cumulative/df_agg.uniq_user


# Нумерация месяцев
df_agg['num_of_month'] = df_agg.groupby(['trial_month','b2c_b2b'])['paid_month'].rank(method='first').astype('int')
df_agg = df_agg[df_agg['num_of_month']<7] # Ограничиваем 6-ью месяцами

df_pivot = pd.pivot(data = df_agg,index = ['trial_month','b2c_b2b'],columns = 'num_of_month',values = 'ltv')
df_pivot['uniq_user'] = df_agg.groupby(['trial_month','b2c_b2b'])['uniq_user'].max()

df_pivot

In [None]:
df_pivot

In [None]:
df[(df['trial_month']=='2024-04-01') & (df['user_id']=='583943d4-7048-4c4b-91ca-6edfb27f492b')]

In [None]:
df['cumsum'] = df.groupby('user_id')['payment'].cumsum()
df['cnt_user'] = df.groupby('trial_month')['user_id'].transform('nunique').astype('int32')

In [None]:
df.groupby('trial_month')['user_id'].nunique()

In [None]:
df.groupby(['trial_month','paid_month'])['cumsum'].sum()

In [None]:
df.groupby(['trial_month','paid_month'])['cumsum'].sum()

In [None]:
list_date = pd.date_range("2024-06-01", "2024-07-22", freq='D')
result = []
for date in list_date:
    date_str = date.strftime('%Y-%m-%d')
    query = f'''
                        SELECT 
                        '{date_str}' AS date,
                        trial_duration,
                        count(DISTINCT profile_id) as cnt_user
                        FROM datamarts.marketing_dash_distr
                        WHERE created_at::date BETWEEN '{date_str}'::date - interval '6' DAY AND '{date_str}'::date
                        AND first_prolong_date!='1970-01-01'
                        GROUP BY 1,2
                        '''
    df_temp = execute(query,user = 'kmekhtiev')   
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()
    
df_registration = pd.concat(result)       

In [None]:
t1 = (pd.pivot(data=df_registration[df_registration.trial_duration.isin([0,3,14])],
              index='date',
              columns='trial_duration')
     )
t1

In [None]:
list_date = pd.date_range("2024-06-01", "2024-07-22", freq='D')
result = []
for date in list_date:
    date_str = date.strftime('%Y-%m-%d')
    query = f'''
                        SELECT 
                        '{date_str}' AS date,
                        trial_duration,
                        count(DISTINCT profile_id) as cnt_user
                        FROM datamarts.marketing_dash_distr
                        WHERE first_prolong_date::date BETWEEN '{date_str}'::date - interval '6' DAY AND '{date_str}'::date
                        GROUP BY 1,2
                        '''
    df_temp = execute(query,user = 'kmekhtiev')   
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()
    
df_subs = pd.concat(result)       

In [None]:
t2 = (pd.pivot(data=df_subs[df_subs.trial_duration.isin([0,3,14,35])],
              index='date',
              columns='trial_duration')
     )
t2

In [None]:
list_date = pd.date_range("2024-01-01", "2024-07-22", freq='D')
result = []
for date in list_date:
    date_str = date.strftime('%Y-%m-%d')
    query = f'''
                        SELECT 
                        '{date_str}' AS date,
                        trial_duration,
                        created_at::date AS created_at,
                        first_prolong_date::date AS first_prolong_date,
                        profile_id
                        FROM datamarts.marketing_dash_distr
                        WHERE first_prolong_date::date BETWEEN '{date_str}'::date - interval '6' DAY AND '{date_str}'::date
                        GROUP BY 1,2,3,4,5
                        '''
    df_temp = execute(query,user = 'kmekhtiev')   
    result.append(df_temp)
    print(f"""Дата '{date_str}' загружена""")
    print()
    
df_subs_dt = pd.concat(result) 
df_subs_dt['first_prolong_date'] = df_subs_dt['first_prolong_date'].astype('datetime64[ns]')
df_subs_dt['created_at'] = df_subs_dt['created_at'].astype('datetime64[ns]')
df_subs_dt['dt'] = (df_subs_dt.first_prolong_date - df_subs_dt.created_at).dt.days

In [None]:
df_subs_dt_agg = (df_subs_dt[#(df_registration2.first_prolong_date == '2024-07-20') &
                    #& (df_registration2['dt']==35)]
                     (df_subs_dt['trial_duration']==3)]
    .groupby(['dt'])['profile_id'] 
    .nunique() 
    .reset_index() 
)
df_subs_dt_agg = df_subs_dt_agg[df_subs_dt_agg.dt<50]

# Столбчатая диаграмма  
plt.figure(figsize=(15, 8))  
plt.bar(df_subs_dt_agg['dt'], df_subs_dt_agg['profile_id'], color='skyblue')  
plt.ylabel('Число пользователей')
plt.xlabel('Число дней между началом триала и первым списанием')
plt.xticks(rotation=45)
plt.grid()
plt.show() 

In [None]:
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter, MultipleLocator
df = (df_subs_dt[(df_subs_dt.dt.isin([3,45])) & (df_subs_dt.trial_duration==3)]
        .groupby(['date','dt'])['profile_id']
        .nunique()
        .reset_index()
        )

df['sum'] = df.groupby('date')['profile_id'].transform('sum')
df['frac'] = df['profile_id']/df['sum']

df_pivot = df.pivot(index='date', columns='dt', values='frac').fillna(0)

# Построение столбчатой диаграммы
colors = ['#2E8B57', '#98FB98']  # Темно-зеленый и светло-зеленый
ax = df_pivot.plot(kind='bar', stacked=True, figsize=(15, 8),color=colors)
ax.yaxis.set_major_locator(MultipleLocator(0.1))
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: f'{int(y * 100)}%'))
ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=1))
plt.legend(title='dt', loc='upper right')
plt.xticks(rotation=45)  # Поворот меток даты для улучшения читаемости
plt.show()

# Watchtime общий 

In [None]:
def watchtime (df,client_type_general=None):
        if client_type_general:
            result = df.groupby(['date','client_type_general']).agg({'watchtime':'sum','profile_id':'nunique','active_day':'sum','watch_day':'sum'}).reset_index()
            result.rename(columns={'profile_id':'cnt_all_user'},inplace=True)
            
            cnt_watch_user = df[df['watchtime'] != 0].groupby(['date','client_type_general'])['profile_id'].nunique()
            result = pd.merge(result,cnt_watch_user,on=['date','client_type_general'],how='left')
            result.rename(columns={'profile_id':'cnt_watch_user'},inplace=True)
            
        else:
            result = df.groupby('date').agg({'watchtime':'sum','profile_id':'nunique','active_day':'sum','watch_day':'sum'}).reset_index()
            result.rename(columns={'profile_id':'cnt_all_user'},inplace=True)
                          
            cnt_watch_user = df[df['watchtime'] != 0].groupby(['date'])['profile_id'].nunique()
            result = pd.merge(result,cnt_watch_user,on=['date'],how='left')
            result.rename(columns={'profile_id':'cnt_watch_user'},inplace=True)
        return result

In [None]:
data = watchtime(df)
plt.figure(figsize=(20, 8))  # Устанавливаем размер графика
plt.plot(data['date'], data['watchtime']/3600, marker='o',markerfacecolor='white',alpha=0.8)  # Добавляем точки на график
plt.xlabel('Date')
plt.ylabel('Watchtime (hours)')
plt.title('Watchtime over 7 days')
plt.grid()
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.show()

In [None]:
data = watchtime(df,'client_type_general')
grouped_data = data.groupby('client_type_general')

plt.figure(figsize=(20, 10))  # Устанавливаем размер графика

# Перебираем каждую группу и строим график для каждого client_type
for client_type, data in grouped_data:
    plt.plot(data['date'], data['watchtime']/3600, label=client_type,marker='o',alpha=0.8,markerfacecolor='white')

plt.xlabel('Date')
plt.ylabel('Watchtime (hours)')
plt.title('Watchtime over 7 days')
plt.legend()  
plt.grid()
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.show()

# Watchtime на пользователя

In [None]:
data = watchtime(df)

data['watchtime_per_user'] = data['watchtime']/data['cnt_all_user']

plt.figure(figsize=(20, 8))  # Устанавливаем размер графика
plt.plot(data['date'], data['watchtime_per_user']/3600, marker='o',markerfacecolor='white',alpha=0.8)  # Добавляем точки на график
plt.ylabel('Watchtime per user (hour)')
plt.title('Watchtime per user over 7 days')
plt.grid()
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.show()

In [None]:
data = watchtime(df,'client_type_general')
data['watchtime_per_user'] = data['watchtime']/data['cnt_all_user']
grouped_data = data.groupby('client_type_general')

plt.figure(figsize=(20, 10))  # Устанавливаем ра"змер графика

# Перебираем каждую группу и строим график для каждого client_type
for client_type, data in grouped_data:
    plt.plot(data['date'], data['watchtime_per_user']/3600, label=client_type,marker='o',alpha=0.8,markerfacecolor='white')

plt.ylabel('Watchtime per user (hour)')
plt.title('Watchtime per user over 7 days')
plt.legend()  
plt.grid()
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.show()

# Число пользователей 

In [None]:
data = watchtime(df)

plt.figure(figsize=(20, 8))  # Устанавливаем размер графика
plt.plot(data['date'], data['cnt_all_user'], marker='o',markerfacecolor='white',alpha=0.8)  # Добавляем точки на график
plt.xlabel('Date')
plt.ylabel('Users')
plt.title('Число пользователей')
plt.grid()
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.show()

In [None]:
data = watchtime(df,'client_type_general')
grouped_data = data.groupby('client_type_general')

plt.figure(figsize=(20, 10))  # Устанавливаем размер графика

# Перебираем каждую группу и строим график для каждого client_type
for client_type, data in grouped_data:
    plt.plot(data['date'], data['cnt_all_user'], label=client_type,marker='o',alpha=0.8,markerfacecolor='white')

plt.ylabel('Users')
plt.title('Число пользователей')
plt.legend()  
plt.grid()
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.show()

# Число активных дней 

In [None]:
data = watchtime(df)
data['active_day_per_user'] = data['active_day']/data['cnt_all_user']

plt.figure(figsize=(20, 8))  # Устанавливаем размер графика
plt.plot(data['date'], data['active_day_per_user'], marker='o',markerfacecolor='white',alpha=0.8)  # Добавляем точки на график
plt.ylabel('Число дней')
plt.title('Число активных дней на пользователя')
plt.grid()
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.show()

In [None]:
data = watchtime(df,'client_type_general')
grouped_data = data.groupby('client_type_general')
data['active_day_per_user'] = data['active_day']/data['cnt_all_user']

plt.figure(figsize=(20, 10))  # Устанавливаем размер графика

# Перебираем каждую группу и строим график для каждого client_type
for client_type, data in grouped_data:
    plt.plot(data['date'], data['active_day_per_user'], label=client_type,marker='o',alpha=0.8,markerfacecolor='white')

plt.xlabel('Date')
plt.ylabel('Число дней')
plt.title('Число активных дней на пользователя')
plt.legend()  
plt.grid()
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.show()

# Конверсия в день просмотр из активного дня 

In [None]:
import matplotlib.ticker as mticker
data = watchtime(df)
data['watch_day_%'] = data['watch_day']/data['active_day'] * 100

plt.figure(figsize=(20, 8))  # Устанавливаем размер графика
plt.plot(data['date'], data['watch_day_%'], marker='o',markerfacecolor='white',alpha=0.8)  # Добавляем точки на график
plt.ylabel('%')
plt.title('Конверсия из активного дня в дни с просмотром')
plt.grid()
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.gca().yaxis.set_major_formatter(mticker.PercentFormatter())
plt.show()

In [None]:
data = watchtime(df,'client_type_general')
data['watch_day_%'] = data['watch_day']/data['active_day'] * 100
grouped_data = data.groupby('client_type_general')


plt.figure(figsize=(20, 10))  # Устанавливаем размер графика

# Перебираем каждую группу и строим график для каждого client_type
for client_type, data in grouped_data:
    plt.plot(data['date'], data['watch_day_%'], label=client_type,marker='o',alpha=0.8,markerfacecolor='white')

plt.xlabel('Date')
plt.ylabel('%')
plt.title('Конверсия из активного дня в дни с просмотром')
plt.legend()  
plt.grid()
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.gca().yaxis.set_major_formatter(mticker.PercentFormatter())
plt.show()

# Watchtime на смотрящих пользователей

In [None]:
data = watchtime(df)

data['watchtime_per_user'] = data['watchtime']/data['cnt_watch_user']

plt.figure(figsize=(20, 8))  # Устанавливаем размер графика
plt.plot(data['date'], data['watchtime_per_user']/3600, marker='o',markerfacecolor='white',alpha=0.8)  # Добавляем точки на график
plt.xlabel('Date')
plt.ylabel('Watchtime per user (hour)')
plt.title('Watchtime на смотрящего пользователя за 7 дней')
plt.grid()
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.show()

In [None]:
data = watchtime(df,'client_type_general')
data['watchtime_per_user'] = data['watchtime']/data['cnt_watch_user']
grouped_data = data.groupby('client_type_general')

plt.figure(figsize=(20, 10))  # Устанавливаем размер графика

# Перебираем каждую группу и строим график для каждого client_type
for client_type, data in grouped_data:
    plt.plot(data['date'], data['watchtime_per_user']/3600, label=client_type,marker='o',alpha=0.8,markerfacecolor='white')

plt.ylabel('hour')
plt.title('Watchtime на смотрящего пользователя за 7 дней')
plt.legend()  
plt.grid()
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.show()

In [None]:
plt.figure(figsize=(20, 10))  # Устанавливаем размер графика

# Перебираем каждую группу и строим график для каждого client_type
for col in df_retention.iloc[:,7:]:
        plt.plot(df_retention['reg_date'], df_retention[col]*100, label=col,marker='o',alpha=0.8,markerfacecolor='whi†te')

plt.xlabel('Date')
plt.ylabel('%')
plt.title('Retention')
plt.legend()  
plt.grid()
plt.xticks(rotation=45)  # Поворачиваем подписи оси X для лучшей читаемости
plt.gca().yaxis.set_major_formatter(mticker.PercentFormatter())
plt.show()

# Новые пользователи и конверсии в регистрацию/триал/подписку

In [None]:
df_registration[df_registration.cnt_profile<2].groupby('date')['visitor_id'].nunique().reset_index()

In [None]:
df_registration[(df_registration.reg_date!='1970-01-01') & (df_registration.min_date<=df_registration.reg_date) & (df_registration.cnt_profile<2)].groupby('date_reg')['profile_id'].nunique().reset_index()

In [None]:
df_registration[(df_registration.created_date!='1970-01-01') & (df_registration.min_date<=df_registration.created_date) & (df_registration.cnt_profile<2)].groupby('date')['profile_id'].nunique().reset_index()

In [None]:
df_registration[(df_registration.first_prolong_date!='1970-01-01') & (df_registration.reg_date!='1970-01-01') & (df_registration.cnt_profile<2)].groupby(['date_reg','trial_duration'])['profile_id'].nunique().reset_index()

In [None]:
df_registration_agg = df_registration.groupby('date')['visitor_id'].nunique().reset_index()

t1 = df_registration[(df_registration.reg_date!='1970-01-01') & (df_registration.cnt_profile<2)].groupby('date')['profile_id'].nunique().reset_index()
t1 = t1.rename(columns={'profile_id':'registration_cnt'})

t2 = df_registration[(df_registration.created_date!='1970-01-01') & (df_registration.cnt_profile<2)].groupby('date')['profile_id'].nunique().reset_index()
t2 = t2.rename(columns={'profile_id':'trial_cnt'})


t3 = df_registration[(df_registration.first_prolong_date!='1970-01-01') & (df_registration.reg_date!='1970-01-01') & (df_registration.cnt_profile<2) & (df_registration.trial_duration==0)].groupby('date')['profile_id'].nunique().reset_index()
t3 = t3.rename(columns={'profile_id':'payd_cnt_0_trial'})

t4 = df_registration[(df_registration.first_prolong_date!='1970-01-01') & (df_registration.reg_date!='1970-01-01') & (df_registration.cnt_profile<2) & (df_registration.trial_duration==3)].groupby('date')['profile_id'].nunique().reset_index()
t4 = t4.rename(columns={'profile_id':'payd_cnt_3_trial'})

t5 = df_registration[(df_registration.first_prolong_date!='1970-01-01') & (df_registration.reg_date!='1970-01-01') & (df_registration.cnt_profile<2) & (df_registration.trial_duration==14)].groupby('date')['profile_id'].nunique().reset_index()
t5 = t5.rename(columns={'profile_id':'payd_cnt_14_trial'})

t6 = df_registration[(df_registration.first_prolong_date!='1970-01-01') & (df_registration.reg_date!='1970-01-01') & (df_registration.cnt_profile<2) & (df_registration.trial_duration==35)].groupby('date')['profile_id'].nunique().reset_index()
t6 = t6.rename(columns={'profile_id':'payd_cnt_35_trial'})

# Объединение данных с помощью метода merge
df_registration_agg = df_registration_agg.merge(t1, on='date', how='left')
df_registration_agg = df_registration_agg.merge(t2, on='date', how='left')
df_registration_agg = df_registration_agg.merge(t3, on='date', how='left')
df_registration_agg = df_registration_agg.merge(t4, on='date', how='left')
df_registration_agg = df_registration_agg.merge(t5, on='date', how='left')
df_registration_agg = df_registration_agg.merge(t6, on='date', how='left')


df_registration_agg.columns = ['date', 'visitor_id', 'registration_cnt','trial_cnt','payd_cnt_0_trial','payd_cnt_3_trial','payd_cnt_14_trial','payd_cnt_35_trial']

In [None]:
df_registration[(df_registration.first_prolong_date!='1970-01-01')].groupby('trial_duration')['profile_id'].nunique()

In [None]:
df_registration_agg['register_frac'] = df_registration_agg.registration_cnt/df_registration_agg.visitor_id * 100
df_registration_agg['trial_frac'] = df_registration_agg.trial_cnt/df_registration_agg.registration_cnt * 100
df_registration_agg['payd_0_day_frac'] = df_registration_agg.payd_cnt_0_trial/df_registration_agg.trial_cnt * 100
df_registration_agg['payd_3_day_frac'] = df_registration_agg.payd_cnt_3_trial/df_registration_agg.trial_cnt * 100
df_registration_agg['payd_14_day_frac'] = df_registration_agg.payd_cnt_14_trial/df_registration_agg.trial_cnt * 100



In [None]:
df_registration_agg

In [None]:
import copy
fig, axs = plt.subplots(6, figsize=(20, 25))

# График 1: visitor_id
axs[0].plot(df_registration_agg['date'], df_registration_agg['visitor_id'], color='blue', marker='s', alpha=0.7,markerfacecolor='white')
axs[0].set_title('Новые пользователи')
axs[0].set_ylabel('Число пользователей')
axs[0].tick_params(axis='x', rotation=45)
axs[0].grid()

# График 2: registration_cnt
axs[1].plot(df_registration_agg['date'], df_registration_agg['registration_cnt'], color='green', marker='s', alpha=0.7,markerfacecolor='white')
axs[1].set_title('Зарегистрированные')
axs[1].set_ylabel('Число пользователей')
axs[1].tick_params(axis='x', rotation=45)
axs[1].grid()

# График 3: trial_cnt
axs[2].plot(df_registration_agg['date'], df_registration_agg['trial_cnt'], color='red', marker='s', alpha=0.7,markerfacecolor='white')
axs[2].set_title('Триал')
axs[2].set_ylabel('Число пользователей')
axs[2].tick_params(axis='x', rotation=45)
axs[2].grid()

# Создайте копию исходного DataFrame
df_plot4 = df_registration_agg.copy()


# График 4: trial_cnt_3_day
axs[3].plot(df_plot4['date'], df_plot4['payd_cnt_0_trial'], color='brown', marker='s', alpha=0.7,markerfacecolor='white')
axs[3].set_title('Подписчики c триалом 0 дней')
axs[3].set_ylabel('Число пользователей')
axs[3].tick_params(axis='x', rotation=45)
axs[3].grid()

df_plot4 = df_plot4[df_plot4['date'] <= (datetime.now() - timedelta(days=4))]

# График 5: trial_cnt_3_day
axs[4].plot(df_plot4['date'], df_plot4['payd_cnt_3_trial'], color='brown', marker='s', alpha=0.7,markerfacecolor='white')
axs[4].set_title('Подписчики с триалом 3 дня')
axs[4].set_ylabel('Число пользователей')
axs[4].tick_params(axis='x', rotation=45)
axs[4].grid()

# Создайте копию исходного DataFrame
df_plot5 = df_registration_agg.copy()


df_plot5 = df_plot4[df_plot4['date'] <= (datetime.now() - timedelta(days=15))]

# График 6: trial_cnttrial_cnt_14_day
axs[5].plot(df_plot5['date'], df_plot5['payd_cnt_14_trial'], color='brown', marker='s', alpha=0.7,markerfacecolor='white')
axs[5].set_title('Подписчики с триалом 14 дней')
axs[5].set_ylabel('Число пользователей')
axs[5].tick_params(axis='x', rotation=45)
axs[5].grid()


# # График 6: trial_cnttrial_cnt_35_day
# axs[5].plot(df_registration_agg['min_date'], df_registration_agg['payd_cnt_35_trial'], color='brown', marker='s', alpha=0.7,markerfacecolor='white')
# axs[5].set_title('Подписчики')
# axs[5].set_ylabel('trial_cnt')
# axs[5].tick_params(axis='x', rotation=45)
# axs[5].grid()


# Увеличение расстояния между графиками
plt.subplots_adjust(hspace=0.8)


plt.tight_layout()
plt.show()

In [None]:
import copy
fig, axs = plt.subplots(5, figsize=(20, 25))


# График 1: registration_cnt
axs[0].plot(df_registration_agg['date'], df_registration_agg['register_frac'], color='green', marker='s', alpha=0.7,markerfacecolor='white')
axs[0].set_title('Доля регистраций')
axs[0].set_ylabel('%')
axs[0].tick_params(axis='x', rotation=45)
axs[0].grid()

# График 2: trial_cnt
axs[1].plot(df_registration_agg['date'], df_registration_agg['trial_frac'], color='red', marker='s', alpha=0.7,markerfacecolor='white')
axs[1].set_title('Доля триальщиков')
axs[1].set_ylabel('%')
axs[1].tick_params(axis='x', rotation=45)
axs[1].grid()


# График 3: trial_cnttrial_cnt_14_day
axs[2].plot(df_plot4['date'], df_plot4['payd_0_day_frac'], color='brown', marker='s', alpha=0.7,markerfacecolor='white')
axs[2].set_title('Подписчики с триалом 0 дней')
axs[2].set_ylabel('%')
axs[2].tick_params(axis='x', rotation=45)
axs[2].grid()


# График 4: trial_cnttrial_cnt_35_day
axs[3].plot(df_plot5['date'], df_plot5['payd_3_day_frac'], color='brown', marker='s', alpha=0.7,markerfacecolor='white')
axs[3].set_title('Подписчики с триалом 3 дня')
axs[3].set_ylabel('%')
axs[3].tick_params(axis='x', rotation=45)
axs[3].grid()

# График 5: trial_cnttrial_cnt_35_day
axs[4].plot(df_plot5['date'], df_plot5['payd_14_day_frac'], color='brown', marker='s', alpha=0.7,markerfacecolor='white')
axs[4].set_title('Подписчики c триалом 14 дней')
axs[4].set_ylabel('%')
axs[4].tick_params(axis='x', rotation=45)
axs[4].grid()


# Увеличение расстояния между графиками
plt.subplots_adjust(hspace=0.8)


plt.tight_layout()
plt.show()

In [None]:
query ="""
                SELECT  *
                    FROM datamarts.marketing_dash_distr
                    WHERE  first_prolong_date between '2024-07-09' and '2024-08-14' 
                    AND  bonus_title like '%GetBlogger%'
                    AND payer = 1 
"""
df = execute(query,user='kmekhtiev')

df.to_excel('getbloggers_payers_0709_0814.xlsx',index=False)

In [None]:
query ="""
SELECT 
user_id, 
bonus_title, 
promo, 
reg_source,
reg_campaign, 
reg_medium,device
FROM datamarts.marketing_dash_distr mdd 
WHERE device='web' AND reg_date>='2024-08-10'
"""
df = execute(query,user='kmekhtiev')

df.to_excel('Выгрузка_зарегов1508_1208.xlsx',index=False)

In [None]:
query = """
                    SELECT  *
                    FROM datamarts.marketing_dash_distr
                    WHERE  first_prolong_date between '2024-08-01' and '2024-08-15' 
                    --AND  bonus_title like '%GetBlogger%'
                    AND payer = 1 
                """

df = execute(query,user='kmekhtiev')
df.to_excel('first_pay_0108_1508.xlsx',index=False)