## Product Analytics

It's easiest to grasp concepts like metrics, A/B testing and general analysis with coded examples. It's not easy to get our hands on actual product user data, but we can generate some with distributions and common data models.

In [62]:
import pandas as pd
import numpy as np
import scipy
import datetime
import pdb

In [1]:
# Acquisition ---> (Activation) ---> Retention ---> Monetization

# revenue = (revenue per user) * number_of_users
# number_of_users = new_users + (retention * existing_users) + resurrection; where retention <= 1

### High level metrics; broad but applicable across industries and products

In [2]:

# Acquisition, new users

#-- Baselining metrics
def market_penetration(total_users, tam):
    '''Return the fraction of the total addressable market already reached.

    The total addressable market (``tam``) can come from market research.
    The resulting share indicates what stage of growth you are at, and
    whether to switch focus from new user acquisition to retention of
    existing users.
    '''
    penetration = total_users / tam
    return penetration

def user_to_install_ratio(monthly_active_users, monthly_installs):
    '''Return monthly active users divided by monthly installs.

    This ratio is often read together with market penetration. You want
    it to grow as penetration grows; otherwise installs are dropping and
    retention is not making up for it.
    '''
    ratio = monthly_active_users / monthly_installs
    return ratio


# --- Actual acquisition metrics
def active_users(df, frequency):
    '''Sum ``number_of_active_users`` per day, week or month.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime ``date`` column and a numeric
        ``number_of_active_users`` column. The frame is not modified.
    frequency : str
        One of ``'daily'``, ``'weekly'`` or ``'monthly'``.

    Returns
    -------
    pandas.DataFrame or str
        A frame with one row per period and two columns: the period
        (named ``day``, ``week`` or ``month``) and the summed
        ``number_of_active_users``. For any other ``frequency`` the
        string ``'Invalid frequency'`` is returned.
    '''
    # Output column name and pandas period alias for each frequency.
    period_specs = {
        'daily': ('day', 'D'),
        'weekly': ('week', 'W'),
        'monthly': ('month', 'M'),
    }

    if frequency not in period_specs:
        # Kept as a string return (not an exception) for existing callers.
        return 'Invalid frequency'

    column, alias = period_specs[frequency]

    # Build a fresh two-column frame so that the grouping key is always
    # present and the caller's DataFrame is left untouched.
    bucketed = pd.DataFrame({
        column: df['date'].dt.to_period(alias),
        'number_of_active_users': df['number_of_active_users'],
    })

    return bucketed.groupby(by=column, as_index=False).sum()

def change_active_users(df, frequency):
    '''Sum active users per period and report the change between periods.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime ``date`` column and a numeric
        ``number_of_active_users`` column. The frame is not modified.
    frequency : str
        One of ``'daily'``, ``'weekly'`` or ``'monthly'``.

    Returns
    -------
    pandas.DataFrame or str
        One row per period with the columns: the period (``day``,
        ``week`` or ``month``), ``number_of_active_users`` (sum for the
        period), ``previous`` (the preceding row's sum, NaN for the first
        row) and ``change`` (current minus previous). For any other
        ``frequency`` the string ``'Invalid frequency'`` is returned.
    '''
    # Output column name and pandas period alias for each frequency.
    period_specs = {
        'daily': ('day', 'D'),
        'weekly': ('week', 'W'),
        'monthly': ('month', 'M'),
    }

    if frequency not in period_specs:
        # Kept as a string return (not an exception) for existing callers.
        return 'Invalid frequency'

    column, alias = period_specs[frequency]

    # Build a fresh two-column frame so that the grouping key is always
    # present and the caller's DataFrame is left untouched.
    bucketed = pd.DataFrame({
        column: df['date'].dt.to_period(alias),
        'number_of_active_users': df['number_of_active_users'],
    })
    totals = bucketed.groupby(by=column, as_index=False).sum()

    # groupby sorts by period, so shift(1) lines each row up with the
    # previous period present in the data.
    totals['previous'] = totals['number_of_active_users'].shift(1)
    totals['change'] = totals['number_of_active_users'] - totals['previous']

    return totals

def new_pct(new_users, monthly_active_users):
    '''Return the share of the monthly active userbase that is newly acquired.

    Tells you how much of your normal business userbase comes from new
    acquisitions. The target value depends on the stage of growth.
    '''
    share_new = new_users / monthly_active_users
    return share_new
    

In [149]:
# Retention
def cohort_breakdown(df, cohort_freq, retention_freq, retention_start, retention_end):
    '''Count, per cohort, the distinct users active in each retention period.

    A user's cohort is the ``cohort_freq`` period of their earliest
    ``date`` in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Activity log with a ``user_id`` column and a datetime ``date``
        column, one row per observed activity. Rows with a missing
        ``user_id`` or ``date`` are ignored. The frame is not modified.
    cohort_freq : str
        pandas period alias used to bucket users into cohorts (e.g. ``'M'``).
    retention_freq : str
        pandas period alias of the retention columns (e.g. ``'D'``,
        ``'W'``, ``'M'``).
    retention_start, retention_end
        Bounds passed to ``pandas.period_range``; every ``retention_freq``
        period from the one containing ``retention_start`` through the one
        containing ``retention_end`` gets a column.

    Returns
    -------
    pandas.DataFrame
        One row per cohort, in order of first appearance in ``df``. The
        ``cohorts`` column holds the cohort period; each further column is
        labelled with a ``pandas.Period`` and holds the number of distinct
        users of that cohort active in that period (0 when none).
    '''
    # period_range yields the periods directly and includes partial
    # periods at either end of the window.
    range_dates = pd.period_range(start=retention_start,
                                  end=retention_end,
                                  freq=retention_freq)

    # Work on a copy so the helper columns never leak into the caller's frame.
    df = df.dropna(subset=['user_id', 'date']).copy()

    # Earliest activity per user, bucketed at cohort frequency, is the cohort.
    first_seen = df.groupby('user_id')['date'].transform('min')
    df['cohort'] = first_seen.dt.to_period(cohort_freq)

    # Retention period each activity row falls into.
    df['retention_freq'] = df['date'].dt.to_period(retention_freq)

    # Set up output df: one row per cohort.
    cohorts = pd.DataFrame(
        {'cohorts': df['cohort'].drop_duplicates().reset_index(drop=True)}
    )

    for period in range_dates:
        # Distinct users per cohort among rows active in this period only.
        active = df[df['retention_freq'] == period]
        counts = active.groupby('cohort')['user_id'].nunique().to_dict()

        # Look counts up by cohort (not by row position) so each value
        # lands on the right cohort row; cohorts with no activity get 0.
        cohorts[period] = [counts.get(cohort, 0) for cohort in cohorts['cohorts']]

    return cohorts


# Daily Active Users vs Monthly Active Users
def dau_mau_ratio(daily_active_users, monthly_active_users):
    '''Return daily active users divided by monthly active users.

    This ratio is used to understand stickiness: of your monthly
    userbase (which is more stable given new user acquisition and
    churn), how many are logging in and engaging daily?
    '''
    stickiness = daily_active_users / monthly_active_users
    return stickiness


# Open Rate
def open_rate(number_of_opens, installs):
    '''Return the number of opens divided by the number of installs.

    Just because someone installed the app (and maybe signed up, or not)
    doesn't mean they are opening it. Installs are used as the base so
    that customers who downloaded or set up the app without creating an
    account are captured too.
    '''
    rate = number_of_opens / installs
    return rate


def period_retention_pct(df, end_date, start_date):
    '''Relative change in distinct active users between two dates.

    Computes ``(ending_users - starting_users) / starting_users``, where
    each side is the number of distinct ``user_id`` values whose
    ``active_date`` equals the given date. Missing user ids are ignored.

    NOTE: this is a net change in active users. It does not check that the
    users active on ``end_date`` are the same ones active on ``start_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``user_id`` and ``active_date`` columns.
    end_date, start_date
        Values compared for equality against ``active_date``. Note the
        argument order: ``end_date`` comes first.

    Returns
    -------
    float
        The relative change, or NaN when no user is active on ``start_date``.
    '''
    starting_users = df.loc[df['active_date'] == start_date, 'user_id'].nunique()
    ending_users = df.loc[df['active_date'] == end_date, 'user_id'].nunique()

    if starting_users == 0:
        # The ratio is undefined without a starting population.
        return float('nan')

    return (ending_users - starting_users) / starting_users


def cohort_retention_pct(df, cohort_filter, cohort_freq, end_date, start_date):
    '''Relative change in distinct active users of one cohort between two dates.

    A user's cohort is the ``cohort_freq`` period of their earliest
    ``active_date`` in ``df``. Only users whose cohort equals
    ``cohort_filter`` are considered. For those users the function
    computes ``(ending_users - starting_users) / starting_users``, where
    each side is the number of distinct ``user_id`` values active on the
    given date.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``user_id`` column and a datetime ``active_date`` column.
        The frame is not modified.
    cohort_filter
        Cohort to keep, compared for equality against the period-valued
        cohort (e.g. ``pandas.Period('2021-01', 'M')``).
    cohort_freq : str
        pandas period alias used to bucket users into cohorts.
    end_date, start_date
        Values compared for equality against ``active_date``. Note the
        argument order: ``end_date`` comes first.

    Returns
    -------
    float
        The relative change, or NaN when no cohort user is active on
        ``start_date``.
    '''
    # Assign the cohort per user (first activity), not per row. Otherwise
    # every activity outside the cohort period would be filtered out and
    # any later end_date would always count zero users.
    first_active = df.groupby('user_id')['active_date'].transform('min')
    in_cohort = first_active.dt.to_period(cohort_freq) == cohort_filter
    cohort_rows = df[in_cohort]

    on_start = cohort_rows['active_date'] == start_date
    on_end = cohort_rows['active_date'] == end_date
    starting_users = cohort_rows.loc[on_start, 'user_id'].nunique()
    ending_users = cohort_rows.loc[on_end, 'user_id'].nunique()

    if starting_users == 0:
        # The ratio is undefined without a starting population.
        return float('nan')

    return (ending_users - starting_users) / starting_users

# Retention days is a sort of hybrid with time spent: it doesn't just look at
# start and end dates, but also gives some info about behaviour between the dates

def period_retention_days(df, start_date, end_date):
    '''Average number of active days per user within a date window.

    Instead of a retention percentage, this gives the average count of
    distinct ``active_date`` values per user between ``start_date`` and
    ``end_date`` (both inclusive). Only users with at least one activity
    in the window are part of the average.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``user_id`` and ``active_date`` columns, one row per
        observed activity.
    start_date, end_date
        Inclusive bounds compared against ``active_date``.

    Returns
    -------
    float
        Active days divided by active users, or NaN when no user is
        active in the window.
    '''
    in_window = df[(df['active_date'] >= start_date) & (df['active_date'] <= end_date)]

    # Distinct dates per user, so duplicate rows for the same user and
    # date do not inflate the day count.
    days_per_user = in_window.groupby('user_id')['active_date'].nunique()

    if days_per_user.empty:
        # The average is undefined without any active user.
        return float('nan')

    number_of_users = len(days_per_user)
    number_of_days = days_per_user.sum()

    return number_of_days / number_of_users

def cohort_retention_days(df, cohort_filter, cohort_freq, start_date, end_date):
    '''Average number of active days per user, for one cohort, within a window.

    A user's cohort is the ``cohort_freq`` period of their earliest
    ``active_date`` in ``df``. Only users whose cohort equals
    ``cohort_filter`` are considered. For those users the function
    averages the count of distinct ``active_date`` values between
    ``start_date`` and ``end_date`` (both inclusive). Only cohort users
    with at least one activity in the window are part of the average.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``user_id`` column and a datetime ``active_date`` column.
        The frame is not modified.
    cohort_filter
        Cohort to keep, compared for equality against the period-valued
        cohort (e.g. ``pandas.Period('2021-01', 'M')``).
    cohort_freq : str
        pandas period alias used to bucket users into cohorts.
    start_date, end_date
        Inclusive bounds compared against ``active_date``.

    Returns
    -------
    float
        Active days divided by active users, or NaN when no cohort user
        is active in the window.
    '''
    # Assign the cohort per user (first activity), not per row, so that
    # activity after the cohort period is still attributed to the cohort.
    first_active = df.groupby('user_id')['active_date'].transform('min')
    in_cohort = first_active.dt.to_period(cohort_freq) == cohort_filter
    in_window = (df['active_date'] >= start_date) & (df['active_date'] <= end_date)
    selected = df[in_cohort & in_window]

    # Distinct dates per user, so duplicate rows for the same user and
    # date do not inflate the day count.
    days_per_user = selected.groupby('user_id')['active_date'].nunique()

    if days_per_user.empty:
        # The average is undefined without any active user.
        return float('nan')

    number_of_users = len(days_per_user)
    number_of_days = days_per_user.sum()

    return number_of_days / number_of_users
    
def period_over_period_retention(df, period_freq):
    '''Period-over-period relative change in distinct active users.

    Same idea as the period calculation, but evaluated for every pair of
    consecutive periods present in the data. Good for monitoring core users.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``user_id`` column and a datetime ``date`` column. The
        frame is not modified.
    period_freq : str
        pandas period alias used to bucket ``date`` (e.g. ``'W'``, ``'M'``).

    Returns
    -------
    pandas.DataFrame
        One row per period, sorted by period, with the columns ``period``,
        ``count`` (distinct users active in the period),
        ``last_period_count`` (the preceding row's count, NaN for the first
        row) and ``period_over_period_retention``
        (``(count - last_period_count) / last_period_count``).
    '''
    # Build a fresh frame so the caller's DataFrame is left untouched.
    bucketed = pd.DataFrame({
        'period': df['date'].dt.to_period(period_freq),
        'user_id': df['user_id'],
    })

    # Count distinct users so that a user with several rows in one period
    # is counted once.
    result = bucketed.groupby('period')['user_id'].nunique().reset_index(name='count')

    result['last_period_count'] = result['count'].shift(1)
    result['period_over_period_retention'] = (
        (result['count'] - result['last_period_count']) / result['last_period_count']
    )

    return result
    

In [152]:
# --- Engagement --- #
# Still part of retention, but deeper look into activity and behaviour
# This is very specific to a product's engagement model

def time_spent_over_dau(df, today=None):
    '''Average time logged per distinct user active on a given day.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``user_id``, ``active_date`` and ``time_logged`` columns.
    today : optional
        Day to evaluate, compared for equality against ``active_date``.
        Defaults to the current local date at midnight.
        NOTE(review): the default assumes ``active_date`` holds midnight
        timestamps -- confirm against the data model.

    Returns
    -------
    float
        Total ``time_logged`` for that day divided by the number of
        distinct users active that day, or NaN when nobody was active.
    '''
    if today is None:
        # Truncate to midnight: a timestamp carrying the current time of
        # day would never equal a date-valued ``active_date``.
        today = pd.Timestamp.today().normalize()

    todays_rows = df[df['active_date'] == today]
    n_active = todays_rows['user_id'].nunique()

    if n_active == 0:
        # The average is undefined without any active user.
        return float('nan')

    time_spent = np.sum(todays_rows['time_logged'])

    return time_spent / n_active
    

In [None]:
# ---- Monetization --- #



### More Industry Specific Examples