In [None]:
import pandas as pd
import psycopg2
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import seaborn as sns 
import statsmodels.stats.proportion as proportion
from scipy.stats import ttest_ind,mannwhitneyu,shapiro,norm
from statsmodels.stats.weightstats import ztest
from tqdm import tqdm
import timeit
from scipy import stats
import math
from datetime import date, datetime, timedelta
import time
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
import warnings
warnings.filterwarnings("ignore")

from credential import postgres_secret,clickhouse_dwh_secret

def get_engine(user):
    """Build a SQLAlchemy engine for the database that `user` belongs to.

    `user` is compared with the ``'user'`` entry of `postgres_secret` first
    and of `clickhouse_dwh_secret` second; the first match decides which
    dialect, port and credentials go into the connection URL.

    Args:
        user: Login name; must equal the ``'user'`` value of one of the two
            secret dictionaries.

    Returns:
        The object returned by `create_engine` for the matching database.

    Raises:
        ValueError: If `user` matches neither secret. Previously this case
            fell through to ``return engine`` and failed with an
            UnboundLocalError that did not say what was wrong.
    """
    if user == postgres_secret['user']:
        secret, dialect, port = postgres_secret, 'postgresql', 6432
    elif user == clickhouse_dwh_secret['user']:
        secret, dialect, port = clickhouse_dwh_secret, 'clickhouse', 8123
    else:
        raise ValueError(f'No database credentials are configured for user {user!r}')

    db_name = secret['db_name']
    password = secret['password']
    host = secret['host']
    return create_engine(f'{dialect}://{user}:{password}@{host}:{port}/{db_name}')
    

        
def execute(SQL, user):
    """Run one SQL statement as `user` and return its result as a DataFrame.

    A fresh engine is obtained from `get_engine(user)` for every call and is
    disposed again before returning, so no pooled connections are left open.
    The elapsed wall-clock time is printed after a successful run.

    Args:
        SQL: Text of a single SQL statement.
        user: Login name passed to `get_engine`.

    Returns:
        pandas.DataFrame with the fetched rows and the result's column names.
        For statements that produce no result set (DDL, INSERT, TRUNCATE) an
        empty DataFrame is returned.
    """
    start_time = time.time()  # remember when the call started
    engine = get_engine(user)
    try:
        Session = sessionmaker(bind=engine)  # session factory bound to this engine
        with Session() as session:  # the session is closed on exit
            result = session.execute(text(SQL))
            if result.returns_rows:
                df = pd.DataFrame(result.fetchall(), columns=result.keys())
            else:
                # No result set to fetch; asking for rows here would raise.
                df = pd.DataFrame()
            # Closing a session discards uncommitted work, so persist writes
            # explicitly on backends that are transactional.
            session.commit()
    finally:
        # Release the connection pool that belongs to this one-off engine.
        engine.dispose()

    end_time = time.time()  # remember when the call finished
    execution_time = round(end_time - start_time,4)  # elapsed seconds

    print(f"Время выполнения функции: {execution_time} секунд")
    print()
    return df

In [None]:
# Storage layer of the daily watchtime datamart: a replicated MergeTree table
# created on the cluster. IF NOT EXISTS keeps the cell re-runnable; without it
# the statement fails as soon as the table is already there.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_watchtime_days_local ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                item_title String,
                item_type String,
                watchtime Int32
             )
             ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/datamarts.mekhtiev_watchtime_days_local', '{replica}')
             ORDER BY date
             '''
execute(query,user = 'kmekhtiev')

# Distributed table over the *_local table above; rand() is the sharding key.
# It declares the same columns as the local table.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_watchtime_days_distr ON CLUSTER 'viasat_cluster'
            (
                date Date,
                profile_id UUID,
                item_title String,
                item_type String,
                watchtime Int32
             )
            ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_watchtime_days_local, rand())
             '''
execute(query,user = 'kmekhtiev')

In [None]:
# Empty the local watchtime table across the cluster before it is refilled
# by the INSERT below.
query_truncate = """TRUNCATE TABLE datamarts.mekhtiev_watchtime_days_local  ON CLUSTER 'viasat_cluster' """
execute(query_truncate,user='kmekhtiev')


# Refill the watchtime datamart through its Distributed table: one row per
# (date, profile_id, item_title, item_type) with the summed viewing time.
# The inner SELECT reads the two streaming events between 2024-04-01 and
# 2024-08-15 and keeps a viewing_time value only when
#   - the page is not 'tvchannel' and viewing_time <= duration, or
#   - the page is 'tvchannel' and viewing_time < 18000;
# every other row gets NULL from the CASE (it has no ELSE branch).
# Rows with promo_type 'cards' and e-mails matching '%@test%' are excluded.
# NOTE(review): `viewing_time IS NOT NULL` presumably refers to the CASE alias
# and not to a source column of the same name — confirm against the table schema.
# NOTE(review): 18000 looks like a cap in seconds (5 hours) — confirm the unit.
query = """ INSERT INTO datamarts.mekhtiev_watchtime_days_distr
                                        SELECT 
                                                date,
                                                profile_id,
                                                item_title,
                                                item_type,
                                                sum(viewing_time) AS watchtime
                                                FROM 
                                                    (SELECT 
                                                                date,
                                                                profile_id,                                                      
                                                                event_page,
                                                            JSONExtractInt(payload,'duration') AS duration,
                                                            JSONExtractString(payload,'item_type') AS item_type,
                                                            JSONExtractString(payload,'item_title') AS item_title,
                                                            CASE WHEN event_page<>'tvchannel'
                                                                      AND JSONExtractInt(payload,'viewing_time')<= JSONExtractInt(payload,'duration') 
                                                                      THEN JSONExtractInt(payload,'viewing_time')
                                                                 WHEN event_page='tvchannel' 
                                                                      AND JSONExtractInt(payload,'viewing_time') <18000 
                                                                      THEN JSONExtractInt(payload,'viewing_time')
                                                            END AS viewing_time                                
                                                            FROM datamarts.sandbox_data_distr AS t1
                                                            LEFT JOIN product_x.users AS u ON t1.user_id=u.id
                                                            WHERE date BETWEEN '2024-04-01'  AND '2024-08-15'
                                                            AND event_name IN ('auto_player_streaming','auto_kinom_streaming')
                                                            AND viewing_time IS NOT NULL   
                                                            AND (promo_type IS NULL OR promo_type!='cards')
                                                            AND (u.email NOT LIKE '%@test%' OR u.email IS NULL)
                                                            AND created_at>='2024-04-01'
                                                            and reg_date>='2024-04-01'
                                                          )
                                                          GROUP BY 1,2,3,4
"""
# Run the INSERT; execute() returns a DataFrame and prints the elapsed time.
execute(query,user='kmekhtiev')

In [None]:
# Storage layer of the trial-user datamart: a replicated MergeTree table
# created on the cluster. IF NOT EXISTS keeps the cell re-runnable; without it
# the statement fails as soon as the table is already there.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_trial_user_local ON CLUSTER 'viasat_cluster'
            (
                profile_id UUID,
                reg_date Date,
                trial_start Date,
                trial_end Date,
                first_prolong_date Date,
                free_days Int32,
                subs_type Int32
             )
             ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/datamarts.mekhtiev_trial_user_local', '{replica}')
             ORDER BY reg_date
             '''
execute(query,user = 'kmekhtiev')



# Distributed table over the *_local table above; rand() is the sharding key.
# It declares the same columns as the local table.
query = '''CREATE TABLE IF NOT EXISTS datamarts.mekhtiev_trial_user_distr ON CLUSTER 'viasat_cluster'
            (
                profile_id UUID,
                reg_date Date,
                trial_start Date,
                trial_end Date,
                first_prolong_date Date,
                free_days Int32,
                subs_type Int32
             )
            ENGINE = Distributed(viasat_cluster, datamarts, mekhtiev_trial_user_local, rand())
             '''
execute(query,user = 'kmekhtiev')

In [None]:
# Empty the per-shard storage table before refilling it. The rows live in the
# *_local table; a TRUNCATE aimed at the Distributed table leaves them in
# place, so every re-run of this cell would append a duplicate copy of the data.
query_truncate = """TRUNCATE TABLE datamarts.mekhtiev_trial_user_local  ON CLUSTER 'viasat_cluster' """
execute(query_truncate,user='kmekhtiev')

# Refill the trial-user datamart through its Distributed table.
# trial_start is the date part of created_at and trial_end = trial_start + free_days.
# subs_type is 0 when first_prolong_date equals '1970-01-01' and 1 otherwise.
# NOTE(review): '1970-01-01' is presumably the "no prolongation yet" default — confirm.
# Only trials of 3/14/30/35/45 free days created and registered from 2024-04-01
# on are kept; promo_type 'cards' and '%@test%' e-mails are excluded.
query = """ INSERT INTO datamarts.mekhtiev_trial_user_distr
                SELECT
                    profile_id,
                    reg_date,
                    created_at::date AS trial_start,
                    trial_start + free_days AS trial_end,
                    first_prolong_date,
                    free_days,
                    CASE WHEN first_prolong_date='1970-01-01' THEN 0 ELSE 1 END AS subs_type
                    FROM datamarts.marketing_dash_distr AS t1
                    LEFT JOIN product_x.users AS u ON t1.user_id=u.id
                    WHERE created_at>='2024-04-01' 
                    AND reg_date>='2024-04-01'
                    AND free_days IN (3,14,30,35,45)
                    AND (promo_type IS NULL OR promo_type!='cards')
                    AND (u.email NOT LIKE '%@test%' OR u.email IS NULL)
"""
execute(query,user='kmekhtiev')

In [None]:
# Pull both datamarts into pandas in full (no filters, every column).
df_watchtime = execute(
    " SELECT * FROM datamarts.mekhtiev_watchtime_days_distr",
    user='kmekhtiev',
)
df_trial_user = execute(
    " SELECT * FROM datamarts.mekhtiev_trial_user_distr",
    user='kmekhtiev',
)

In [None]:
# Columns that identify one trial; also the grouping keys of the aggregate.
trial_keys = ['profile_id','free_days','trial_start','trial_end','subs_type']

# One row per trial. Identical duplicate rows would repeat every watch row in
# the join below and inflate the summed watchtime.
trial_users = df_trial_user[trial_keys].drop_duplicates()

# Attach every daily watch row of the profile to its trial(s).
# 'profile_type' is the merge indicator ('both' / 'left_only').
df_merge = pd.merge(trial_users
                    ,df_watchtime[['date','profile_id','watchtime','item_title','item_type']]
                    ,how='left'
                    ,on='profile_id'
                    ,indicator = 'profile_type')
# Assign the result back: fillna(inplace=True) on a column selection may act
# on a temporary copy and leave the frame unchanged.
df_merge['watchtime'] = df_merge['watchtime'].fillna(0)

# Watch rows that fall inside the trial window. Comparisons with a missing
# `date` are False, so trials without any views drop out at this step ...
in_trial = df_merge[(df_merge.date<=df_merge.trial_end) & (df_merge.date>=df_merge.trial_start) & (df_merge.watchtime>=0)]
watched_in_trial = (in_trial
                    .groupby(trial_keys)
                    .agg({'watchtime':'sum','item_title':'nunique'})
                    .reset_index()
                   )

# ... and are brought back here with zero watchtime and zero titles. Without
# this step the non-watchers the left join was meant to keep never reach the
# analysis, which biases everything computed from df_merge_agg.
df_merge_agg = trial_users.merge(watched_in_trial, how='left', on=trial_keys)
df_merge_agg[['watchtime','item_title']] = df_merge_agg[['watchtime','item_title']].fillna(0)

# NOTE(review): presumably converts seconds to minutes — confirm the unit of watchtime.
df_merge_agg['watchtime']=df_merge_agg.watchtime/60

In [None]:
# Histogram of watchtime restricted to values strictly below the
# 95th percentile of the column.
df_merge_agg.loc[
    df_merge_agg['watchtime'].lt(np.percentile(df_merge_agg['watchtime'], 95)),
    'watchtime',
].hist()

In [None]:
# Keep only rows whose watchtime is strictly below the column's 95th percentile.
# NOTE: the frame is reassigned from itself, so running this cell again trims
# another 5% off the already trimmed data.
df_merge_agg = df_merge_agg.loc[
    df_merge_agg['watchtime'].lt(np.percentile(df_merge_agg['watchtime'], 95))
]
                 

## Коэффициент корреляции между watchtime и булевым значением «заплатит / не заплатит»

In [None]:
# Correlation between trial watchtime and the 0/1 subs_type flag, one value
# per trial length.
# The coefficient computed here is Pearson's (stats.pearsonr); the heading
# used to announce Spearman. The label is corrected so the printed numbers
# stay what they were. If a rank correlation was intended, switch the call to
# stats.spearmanr and rename the heading accordingly.
print('Коэффициент корреляции Пирсона для:')
for free_days in (3,14,30,35,45):
    df_trial = df_merge_agg[df_merge_agg.free_days==free_days]
    if len(df_trial) < 2:
        # pearsonr needs at least two observations and raises otherwise.
        print(f'триал {free_days}:','недостаточно данных')
        continue
    corr = stats.pearsonr(df_trial['watchtime'], df_trial['subs_type'])[0]

    print(f'триал {free_days}:',round(corr,3))

## Коэффициент корреляции между количеством просмотренных единиц контента и булевым значением «заплатит / не заплатит»

In [None]:
# Correlation between the item_title column of df_merge_agg (number of
# distinct titles per trial) and the 0/1 subs_type flag, one value per trial
# length.
# The coefficient computed here is Pearson's (stats.pearsonr); the heading
# used to announce Spearman. The label is corrected so the printed numbers
# stay what they were. If a rank correlation was intended, switch the call to
# stats.spearmanr and rename the heading accordingly.
print('Коэффициент корреляции Пирсона для:')
for free_days in (3,14,30,35,45):
    df_trial = df_merge_agg[df_merge_agg.free_days==free_days]
    if len(df_trial) < 2:
        # pearsonr needs at least two observations and raises otherwise.
        print(f'триал {free_days}:','недостаточно данных')
        continue
    corr = stats.pearsonr(df_trial['item_title'], df_trial['subs_type'])[0]

    print(f'триал {free_days}:',round(corr,3))