#### I establish the baseline group metrics for adopters and non adopters. In order to determine the impact of the release, I must establish a baseline so that metrics can be compared **relative to** a baseline

In [1]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import duckdb

base_dir = Path('c:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Revenue-Sustainability-Analysis')
data_dir = Path(base_dir / 'Dataset')

## Define adopters vs. non adopters


In [2]:
pd.set_option('display.max_rows',10)

In [16]:
feats = pd.read_parquet(data_dir / 'feature_usage.parquet')

First, I want to filter to only those who have any feature_usage history pre release

In [17]:
# Pre has all information for both adopters and non adopters 90 days before the release date
# Pre is at the feature usage event level
pre = feats.loc[(feats['usage_date'] < '2023-06-12')].copy()

In [18]:
pre.sort_values(by=['subscription_id', 'usage_date'])

Unnamed: 0,usage_id,subscription_id,usage_date,feature_name,usage_count,usage_duration_secs,error_count,is_beta_feature
3757,U-af47d3,S-001561,2023-03-18,feature_36,8,1872,,True
21020,U-e302b3,S-001561,2023-05-20,feature_4,15,8760,3,True
16545,U-c8f052,S-001561,2023-05-28,,8,4384,0,True
13700,U-46fba8,S-0027d3,2023-01-15,feature_16,24,12528,0,True
5749,U-ea0de3,S-0027d3,2023-05-06,feature_23,8,2568,0,True
...,...,...,...,...,...,...,...,...
35930,U-f6e4d870c937,,2023-06-10,feature_39,8,4125,2,True
3795,,,2023-06-11,feature_13,9,,2,True
15459,,,2023-06-11,feature_16,6,3588,0,True
27357,U-657f6a463a19,,2023-06-11,feature_23,8,3653,1,True


In [None]:
# Normalize the grain of the table to be one row per sub_id, usage_id
pre = pre.groupby(['subscription_id', 'usage_date'], as_index=False) \
   .agg({
        'usage_count': 'sum',
        'usage_duration_secs': 'sum',
        'error_count': 'sum',
        'is_beta_feature': 'max'
   })

# Count how many unique days each person used the platform
distinct_days = pre.groupby('subscription_id')['usage_date'].nunique().reset_index(name='distinct_usage_days').astype('Int64')

# # Calculate number of active days
# num_active_days = pre.groupby('subscription_id')['usage_date'].nunique().reset_index(name='distinct_active_days')
# pre['distinct_usage_days'] = pre['distinct_usage_days'].astype('Int64')

# # Calculate avg gaps between days
# pre['usage_date'] = pd.to_datetime(pre['usage_date'])
# pre = pre.sort_values(['subscription_id', 'usage_date'])

# pre['gaps'] = pre.groupby('subscription_id')['usage_date'] \
#     .diff() \
#     .dt.days \
#     .astype('Int64')

# avg_gaps = pre.groupby('subscription_id')['gaps'] \
#             .mean() \
#             .reset_index(name='avg_gaps')

# pre_cust = num_active_days.merge(
#             avg_gaps,
#             on="subscription_id",
#             how='outer'
# )

# # Calculate avg_usage_per_day
# total_usage = pre.groupby('subscription_id')['usage_count'].sum() \
#                 .reset_index(name='total_usage')

# total_usage_duration = pre.groupby('subscription_id')['usage_duration_secs'].sum() \
#                 .reset_index(name='total_usage_duration')

# # # Quick Validation
# # pre.loc[pre['subscription_id'] == 'S-003647']

# # Cust is aggregated at the customer level, still has both adopters and non adopters
# pre_cust = pre_cust.merge(
#         total_usage,
#         on='subscription_id',
#         how='left'
# )

# pre_cust['avg_usage_per_day'] = (pre_cust['total_usage'] / pre_cust['distinct_active_days']).round(2)

# # Calculate avg usage duration per day
# pre_cust = pre_cust.merge(
#         total_usage_duration,
#         on='subscription_id',
#         how='left'
# )
# pre_cust['avg_usage_duration_per_day'] = (pre_cust['total_usage_duration'] / pre_cust['distinct_active_days']).round(2)

### Partition users into 3 tiered groups: 
* Non-Users (0+ distinct usage days)
* Experimenters (1-2 distinct usage days)
* Adopters (3+ distinct usage days)

### Calculate pre-release metrics at the customer level

Define pre-release window as 90 days prior to June 16, 2023 => March 14, 2023

In [6]:
# Validation Check

# Each sub_id must be unique
print(f"Sub_id doesn't have dupes: {pre_cust['subscription_id'].is_unique}")

# These cols cannot have negative vals
pre_cust[['distinct_active_days', 'avg_gaps', 'total_usage', 'avg_usage_per_day', 'total_usage_duration', 'avg_usage_duration_per_day']].min()

## Check for any nonsense Nulls
# cust.loc[(cust['distinct_active_days'] == 1) & ~(cust['avg_gaps'].isna())]

NameError: name 'pre_cust' is not defined

### Before i aggregate groups. Ensure that individuals have history for both pre release and short term post release (90 days after release date)

In [None]:
# Figure out those who have usages in both post short term and post long term
post_short_ids = feats.loc[(feats['usage_date'] >= '2023-06-12') & (feats['usage_date'] < '2023-09-11')]['subscription_id']
post_long_ids = feats.loc[feats['usage_date'] >= '2023-09-11']['subscription_id']


# Ensure that the short term analysis has individuals that have both pre history and short term history
pre_short_ids = pre.loc[pre['subscription_id'].isin(post_short_ids)]['subscription_id']
pre_long_ids  = pre.loc[pre['subscription_id'].isin(post_long_ids)]['subscription_id']

In [None]:
adopt_repeat = pre_cust.agg({
    'distinct_active_days':'mean',
    'avg_gaps':'mean',
    'avg_usage_per_day':'mean',
    'avg_usage_duration_per_day':'mean',
    'total_usage_duration':'sum',
    'total_usage':'sum'
})

adopt_repeat.index = ['avg_distinct_days',
       'avg_gaps',
       'avg_daily_usage',
       'avg_daily_usage_duration',
       'total_usage_duration',
       'total_usage']

one_time.agg({
    'distinct_active_days':'mean',
    'avg_gaps':'mean',
    'avg_usage_per_day':'mean',
    'avg_usage_duration_per_day':'mean',
    'total_usage_duration':'sum',
    'total_usage':'sum'
})

one_time.index = ['avg_distinct_days',
       'avg_gaps',
       'avg_daily_usage',
       'avg_daily_usage_duration',
       'total_usage_duration',
       'total_usage']

In [None]:
adopt_repeat