In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from read_db.CH import Getch
import pandas as pd
from datetime import datetime, timedelta

# Notebook-wide plot styling: 16x10 default figure size, white-grid style,
# and fonts scaled by a factor of 0.8.
sns.set(rc={'figure.figsize': (16, 10)}, style='whitegrid', font_scale=0.8)

In [2]:
# Cohort-retention query over simulator_20220320.feed_actions.
#   t1: one row per (user_id, source) with start_day = the earliest activity
#       date, keeping only cohorts that started on or after 2022-03-28.
#   t2: the distinct (user_id, day) pairs on which each user was active.
# Joining t1 and t2 on user_id pairs every user with each of their active
# days; the outer query then counts those rows per (start_day, day, source).
query = """SELECT start_day,
          day,
          source,
          count(user_id) AS users
   FROM
     (SELECT user_id,
             source,
             start_day,
             day
      FROM
        (SELECT user_id,
                source,
                min(toDate(time)) AS start_day
         FROM simulator_20220320.feed_actions
         GROUP BY user_id,
                  source
         HAVING start_day >= toDate('2022-03-28')) t1
      JOIN
        (SELECT DISTINCT user_id,
                         toDate(time) AS day
         FROM simulator_20220320.feed_actions) t2 USING user_id)
   GROUP BY start_day,
            day,
            source
   ORDER BY start_day,
            day"""

In [3]:
# Run the query through the project-local Getch helper and keep its `.df`
# attribute (presumably a pandas DataFrame, since the following cells call
# DataFrame methods on it -- confirm against read_db.CH).
both_df = Getch(query).df

In [4]:
# Sanity check of the result shape: one row per (start_day, day, source)
# with the number of active users in `users`.
both_df.head()

Unnamed: 0,start_day,day,source,users
0,2022-03-28,2022-03-28,ads,3688
1,2022-03-28,2022-03-28,organic,756
2,2022-03-28,2022-03-29,ads,106
3,2022-03-28,2022-03-29,organic,256
4,2022-03-28,2022-03-30,organic,260


In [5]:
# Iterating a groupby yields (key, frame) pairs, so after this line each name
# holds a (source, DataFrame) tuple rather than a DataFrame. The unpacking
# only succeeds when there are exactly two distinct sources, and which name
# receives which source depends on the order in which the groups are yielded.
ads_df, org_df = tuple(both_df.groupby(by='source'))

In [6]:
# Build one frame per traffic source by filtering `both_df` on the source
# label itself. Unlike indexing into previously unpacked groupby tuples, this
# does not depend on how many sources exist or on group order, and the cell
# can be re-run safely because it never reads the names it assigns.
# The now-constant `source` column is dropped from each frame.
ads_df = both_df.loc[both_df['source'] == 'ads'].drop(columns='source')
org_df = both_df.loc[both_df['source'] == 'organic'].drop(columns='source')

In [7]:
# All ads cohorts that had activity on 2022-03-29.
ads_df.loc[ads_df['day'] == '2022-03-29']

Unnamed: 0,start_day,day,users
2,2022-03-28,2022-03-29,106
18,2022-03-29,2022-03-29,586


In [8]:
# All organic cohorts that had activity on 2022-03-29.
org_df.loc[org_df['day'] == '2022-03-29']

Unnamed: 0,start_day,day,users
3,2022-03-28,2022-03-29,256
19,2022-03-29,2022-03-29,729


In [9]:
def add_retention_col(df):
    """Return a new frame with a ``retention`` column appended.

    ``retention`` is ``users`` divided by the size of the row's cohort, where
    the cohort size is the ``users`` value of the row whose ``day`` equals its
    ``start_day``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``start_day``, ``day`` and ``users`` columns, with at
        most one ``day == start_day`` row per ``start_day``. Not modified.

    Returns
    -------
    pandas.DataFrame
        The merged frame (fresh integer index) holding the columns of ``df``
        plus ``retention``. Because the merge is an inner join, cohorts that
        have no ``day == start_day`` row are absent from the result.

    Raises
    ------
    pandas.errors.MergeError
        If some ``start_day`` has more than one ``day == start_day`` row
        (for example when the frame still mixes several sources). Without
        this check the merge would silently duplicate rows and produce
        misleading retention values.
    """
    # Cohort size = users seen on the cohort's own first day.
    cohort_sizes = (
        df.loc[df['day'] == df['start_day'], ['start_day', 'users']]
        .rename(columns={'users': 'start_users'})
    )
    # many_to_one: every start_day must map to exactly one cohort size.
    merged = df.merge(cohort_sizes, on='start_day', validate='many_to_one')
    merged['retention'] = merged['users'] / merged['start_users']
    # start_users was only needed as the denominator.
    return merged.drop(columns='start_users')

In [10]:
def get_days_following(df, day_str):
    """Rows of the cohort that started on ``day_str``, minus the start day.

    Keeps the rows of ``df`` whose ``start_day`` equals ``day_str`` and whose
    ``day`` differs from ``day_str``.
    """
    in_cohort = df['start_day'] == day_str
    not_start_day = df['day'] != day_str
    return df.loc[in_cohort & not_start_day]

In [11]:
# Attach the retention ratio to the ads cohorts and preview the result.
ads_df = ads_df.pipe(add_retention_col)
ads_df.head()

Unnamed: 0,start_day,day,users,retention
0,2022-03-28,2022-03-28,3688,1.0
1,2022-03-28,2022-03-29,106,0.028742
2,2022-03-28,2022-03-30,124,0.033623
3,2022-03-28,2022-03-31,122,0.03308
4,2022-03-28,2022-04-01,108,0.029284


In [12]:
# Ads cohort that started on 2022-03-28, excluding the start day itself.
following_days_ads = get_days_following(df=ads_df, day_str='2022-03-28')

In [13]:
# Show the returning users and retention of the 2022-03-28 ads cohort on each later day.
following_days_ads

Unnamed: 0,start_day,day,users,retention
1,2022-03-28,2022-03-29,106,0.028742
2,2022-03-28,2022-03-30,124,0.033623
3,2022-03-28,2022-03-31,122,0.03308
4,2022-03-28,2022-04-01,108,0.029284
5,2022-03-28,2022-04-02,123,0.033351
6,2022-03-28,2022-04-03,93,0.025217
7,2022-03-28,2022-04-04,84,0.022777
8,2022-03-28,2022-04-05,80,0.021692


In [14]:
# Summary statistics of `users` and `retention` for the 2022-03-28 ads cohort
# over the days after its start day.
following_days_ads.describe()

Unnamed: 0,users,retention
count,8.0,8.0
mean,105.0,0.028471
std,17.703914,0.0048
min,80.0,0.021692
25%,90.75,0.024607
50%,107.0,0.029013
75%,122.25,0.033148
max,124.0,0.033623


In [15]:
# Same steps for organic traffic: add retention, then take the 2022-03-28
# cohort without its start day.
org_df = org_df.pipe(add_retention_col)
following_days_org = get_days_following(df=org_df, day_str='2022-03-28')

In [16]:
# Summary statistics of `users` and `retention` for the 2022-03-28 organic
# cohort over the days after its start day.
following_days_org.describe()

Unnamed: 0,users,retention
count,8.0,8.0
mean,224.375,0.296792
std,26.795189,0.035443
min,181.0,0.239418
25%,208.75,0.276124
50%,227.5,0.300926
75%,238.0,0.314815
max,260.0,0.343915


In [17]:
# Ads rows whose activity day falls within 2022-03-28..2022-03-29 (both ends inclusive).
ads_df.loc[ads_df['day'].between('2022-03-28', '2022-03-29')]

Unnamed: 0,start_day,day,users,retention
0,2022-03-28,2022-03-28,3688,1.0
1,2022-03-28,2022-03-29,106,0.028742
9,2022-03-29,2022-03-29,586,1.0


In [19]:
# Organic rows whose activity day falls within 2022-03-28..2022-03-29 (both ends inclusive).
org_df.loc[org_df['day'].between('2022-03-28', '2022-03-29')]

Unnamed: 0,start_day,day,users,retention
0,2022-03-28,2022-03-28,756,1.0
1,2022-03-28,2022-03-29,256,0.338624
9,2022-03-29,2022-03-29,729,1.0
