#### Establish baseline group metrics for adopters and non-adopters. To determine the impact of the release, I first establish a pre-release baseline so that post-release metrics can be compared **relative to** it.

In [116]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import duckdb

# Project root. Defaults to the author's local checkout; set the
# REVENUE_SUSTAINABILITY_DIR environment variable to run the notebook from
# any other machine without editing this cell.
base_dir = Path(os.environ.get(
    'REVENUE_SUSTAINABILITY_DIR',
    'c:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Revenue-Sustainability-Analysis'
))
# Folder the input files (e.g. feature_usage.parquet) are read from
data_dir = base_dir / 'Dataset'

## Define adopters vs. non adopters


In [117]:
# Truncate DataFrame displays to at most 10 rows
pd.options.display.max_rows = 10

In [118]:
# Load the feature-usage table from the dataset folder
feature_usage_path = data_dir / 'feature_usage.parquet'
feats = pd.read_parquet(feature_usage_path)

First, I want to filter to only those subscriptions that have any feature_usage history before the release.

In [119]:
# `pre` keeps every feature-usage event dated before the 2023-06-12 cutoff,
# for adopters and non-adopters alike. No lower date bound is applied here;
# the 90-day window is cut in a later cell.
# `pre` stays at the feature-usage-event level; .copy() detaches it from feats.
is_before_release = feats['usage_date'] < '2023-06-12'
pre = feats.loc[is_before_release].copy()

# Distinct, non-null subscription ids that have any pre-release event
pre_subscription_col = pre['subscription_id']
pre_ids = pre_subscription_col.dropna().unique()

In [120]:
# Parse usage_date into pandas datetimes so the date arithmetic below (.diff(), .dt.days) works
pre = pre.assign(usage_date=pd.to_datetime(pre['usage_date']))

In [121]:
# Keep only pre-release events on or after 2023-03-14 (the start of the baseline window)
in_baseline_window = pre['usage_date'] >= '2023-03-14'
pre_feat_history = pre.loc[in_baseline_window]

In [123]:
# Build `pre_metrics`: one row per subscription_id summarising activity inside
# the baseline window (days used, gaps between days, usage volume, errors, tier).

# Normalize the grain of the table to one row per (subscription_id, usage_date)
pre_feat_history = pre_feat_history.groupby(['subscription_id', 'usage_date'], as_index=False) \
   .agg({
        'usage_count': 'sum',
        'usage_duration_secs': 'sum',
        'error_count': 'sum',
        'is_beta_feature': 'max'
   })

# Count how many unique days each subscription used the platform
distinct_usage_days = pre_feat_history.groupby('subscription_id')['usage_date'] \
          .nunique() \
          .reset_index(name='distinct_usage_days')

# Calculate avg gaps (in days) between consecutive usage dates.
# The first row of each subscription has no previous date, so its gap is <NA>;
# a subscription with a single usage date therefore has an all-<NA> gap column.
pre_feat_history = pre_feat_history.sort_values(['subscription_id', 'usage_date'])
pre_feat_history['gaps'] = pre_feat_history.groupby('subscription_id')['usage_date'] \
    .diff() \
    .dt.days \
    .astype('Int64')

avg_gaps = pre_feat_history.groupby('subscription_id')['gaps'] \
            .mean() \
            .reset_index(name='avg_gaps')

pre_metrics = distinct_usage_days.merge(
            avg_gaps,
            on="subscription_id",
            how='outer'
)

# Totals used for the per-day averages below
total_usage = pre_feat_history.groupby('subscription_id')['usage_count'].sum() \
                .reset_index(name='total_usage')

total_usage_duration = pre_feat_history.groupby('subscription_id')['usage_duration_secs'].sum() \
                .reset_index(name='total_usage_duration')

# # Quick Validation
# pre.loc[pre['subscription_id'] == 'S-003647']

# pre_metrics is aggregated at the subscription level and still holds both
# adopters and non adopters
pre_metrics = pre_metrics.merge(
        total_usage,
        on='subscription_id',
        how='left'
)

# Calculate avg usage per distinct usage day
pre_metrics['avg_usage_per_day'] = (pre_metrics['total_usage'] / pre_metrics['distinct_usage_days']).round(2)

# Calculate avg usage duration per distinct usage day
pre_metrics = pre_metrics.merge(
        total_usage_duration,
        on='subscription_id',
        how='left'
)
pre_metrics['avg_usage_duration_per_day'] = (pre_metrics['total_usage_duration'] / pre_metrics['distinct_usage_days']).round(2)

# Calculate total error count for each subscription
total_error_count = pre_feat_history.groupby('subscription_id')['error_count'].sum()
pre_metrics['error_count'] = pre_metrics['subscription_id'].map(total_error_count)

# Partition subscriptions based on distinct_usage_days
# bins = (-1,1] ; (1,3] ; (3,inf]
# The open-ended top edge is np.inf rather than the observed max: using the max
# makes pd.cut raise ValueError (non-increasing / duplicate edges) whenever no
# subscription has more than 3 usage days, or when the frame is empty.
# NOTE(review): these edges give 0-1, 2-3 and 4+ days, while the markdown below
# describes 1-2 and 3+ days — confirm which partition is intended.
pre_metrics['tier'] = pd.cut(
    pre_metrics['distinct_usage_days'],
    bins=[-1, 1, 3, np.inf],
    labels=['Tier 3 (Low)', 'Tier 2 (Med)', 'Tier 1 (High)']
)

# Replace missing metric values with 0. This also turns the <NA> avg_gaps of
# single-day subscriptions into 0, so 0 there means "no gap could be measured".
metric_cols = ['distinct_usage_days',
               'avg_gaps', 'total_usage',
               'avg_usage_per_day',
               'total_usage_duration',
               'avg_usage_duration_per_day',
               'error_count']
pre_metrics[metric_cols] = pre_metrics[metric_cols].fillna(0)

## Validation
# pre_metrics.groupby('tier')['distinct_usage_days'].describe()
# pre_metrics['tier'].value_counts()

In [124]:
# Display the per-subscription baseline metrics table built in the previous cell
pre_metrics

Unnamed: 0,subscription_id,distinct_usage_days,avg_gaps,total_usage,avg_usage_per_day,total_usage_duration,avg_usage_duration_per_day,error_count,tier
0,S-001561,3,35.5,31,10.33,15016,5005.33,3,Tier 2 (Med)
1,S-0027d3,1,0.0,8,8.0,2568,2568.0,0,Tier 3 (Low)
2,S-003647,2,14.0,16,8.0,3239,1619.5,0,Tier 2 (Med)
3,S-004eb4,1,0.0,8,8.0,1496,1496.0,1,Tier 3 (Low)
4,S-006fed,3,25.0,21,7.0,12889,4296.33,0,Tier 2 (Med)
...,...,...,...,...,...,...,...,...,...
2670,S-ff681f,1,0.0,9,9.0,946,946.0,1,Tier 3 (Low)
2671,S-ff78bf,1,0.0,9,9.0,720,720.0,2,Tier 3 (Low)
2672,S-ff93b5,2,54.0,16,8.0,7684,3842.0,2,Tier 2 (Med)
2673,S-ffb6cc,1,0.0,7,7.0,1981,1981.0,0,Tier 3 (Low)


In [None]:
# Left-join the (subscription_id, usage_date) level history against the list of
# pre-release subscription ids.
# NOTE(review): this rebinds `pre_metrics`, replacing the per-subscription table
# built earlier. Because the right-hand frame holds only the join key and every
# id in pre_feat_history comes from `pre`, the join adds no rows or columns.
# Presumably the intent was to start from pre_ids and join the metrics onto
# them — confirm.
pre_id_frame = pd.DataFrame({'subscription_id': pre_ids})
pre_metrics = pd.merge(
    pre_feat_history,
    pre_id_frame,
    on='subscription_id',
    how='left',
)

In [None]:
# Display pre_metrics after the reassignment in the previous cell: it now holds
# one row per (subscription_id, usage_date), not the per-subscription metrics
pre_metrics

Unnamed: 0,subscription_id,usage_date,usage_count,usage_duration_secs,error_count,is_beta_feature,gaps
0,S-001561,2023-03-18,8,1872,0,True,
1,S-001561,2023-05-20,15,8760,3,True,63
2,S-001561,2023-05-28,8,4384,0,True,8
3,S-0027d3,2023-05-06,8,2568,0,True,
4,S-003647,2023-05-19,9,1755,0,True,
...,...,...,...,...,...,...,...
4026,S-ff78bf,2023-04-12,9,720,2,True,
4027,S-ff93b5,2023-03-27,7,3661,0,True,
4028,S-ff93b5,2023-05-20,9,4023,2,True,54
4029,S-ffb6cc,2023-03-20,7,1981,0,True,


In [None]:
# NOTE(review): `cust` is never assigned in the visible notebook, so this cell
# raises NameError. Presumably it is an old name for the per-subscription
# metrics table — confirm and rename, or remove the cell.
cust

NameError: name 'cust' is not defined

### Partition users into 3 tiered groups: 
* Non-Users (0 distinct usage days)
* Experimenters (1-2 distinct usage days)
* Adopters (3+ distinct usage days)

### Calculate pre-release metrics at the customer level

Define pre-release window as 90 days prior to June 12, 2023 => March 14, 2023

In [None]:
# Validation check on the per-subscription table.
# NOTE(review): `pre_cust` is never assigned in the visible notebook (the
# recorded output is a NameError), and the per-subscription table built earlier
# names its column `distinct_usage_days`, not `distinct_active_days` — confirm
# the intended frame and column names before relying on this cell.

# Each sub_id must be unique
print(f"Sub_id doesn't have dupes: {pre_cust['subscription_id'].is_unique}")

# These cols cannot have negative vals, so every displayed minimum should be >= 0
pre_cust[['distinct_active_days', 'avg_gaps', 'total_usage', 'avg_usage_per_day', 'total_usage_duration', 'avg_usage_duration_per_day']].min()

## Check for any nonsense Nulls
# cust.loc[(cust['distinct_active_days'] == 1) & ~(cust['avg_gaps'].isna())]

NameError: name 'pre_cust' is not defined

### Before I aggregate groups, ensure that individuals have history for both pre-release and short-term post-release (90 days after the release date)

In [None]:
# Subscription ids seen in each post-release window. Each result is a Series
# with one entry per matching event row, so ids may repeat.
#   short term: 2023-06-12 <= usage_date < 2023-09-11
#   long term : usage_date >= 2023-09-11
usage_dates = feats['usage_date']
in_post_short = (usage_dates >= '2023-06-12') & (usage_dates < '2023-09-11')
in_post_long = usage_dates >= '2023-09-11'
post_short_ids = feats.loc[in_post_short, 'subscription_id']
post_long_ids = feats.loc[in_post_long, 'subscription_id']


# Pre-release event rows whose subscription also appears in the short-term
# (resp. long-term) post-release window, so each analysis only covers
# subscriptions with history on both sides of the release
has_post_short = pre['subscription_id'].isin(post_short_ids)
has_post_long = pre['subscription_id'].isin(post_long_ids)
pre_short_ids = pre.loc[has_post_short, 'subscription_id']
pre_long_ids = pre.loc[has_post_long, 'subscription_id']

In [None]:
def _summarize_group(group_metrics):
    """Collapse a per-subscription metrics frame into one summary Series.

    Takes the mean of the per-subscription day count, gap and per-day averages,
    and the sum of the usage totals, then relabels the result with summary names.

    Parameters
    ----------
    group_metrics : pd.DataFrame
        Must contain the columns 'distinct_active_days', 'avg_gaps',
        'avg_usage_per_day', 'avg_usage_duration_per_day',
        'total_usage_duration' and 'total_usage'.

    Returns
    -------
    pd.Series
        Indexed by 'avg_distinct_days', 'avg_gaps', 'avg_daily_usage',
        'avg_daily_usage_duration', 'total_usage_duration', 'total_usage'.
    """
    summary = group_metrics.agg({
        'distinct_active_days': 'mean',
        'avg_gaps': 'mean',
        'avg_usage_per_day': 'mean',
        'avg_usage_duration_per_day': 'mean',
        'total_usage_duration': 'sum',
        'total_usage': 'sum'
    })
    # Relabel by name rather than by position so a reordered aggregation
    # cannot silently mislabel a metric
    return summary.rename(index={
        'distinct_active_days': 'avg_distinct_days',
        'avg_usage_per_day': 'avg_daily_usage',
        'avg_usage_duration_per_day': 'avg_daily_usage_duration'
    })


# NOTE(review): `pre_cust` and `one_time` are not assigned anywhere in the
# visible notebook — confirm where these per-subscription frames are built.
adopt_repeat = _summarize_group(pre_cust)

# The aggregation result used to be discarded, and the six summary labels were
# then assigned to the index of the `one_time` frame itself. Keep the summary
# in its own variable and leave the source frame untouched.
one_time_summary = _summarize_group(one_time)

In [None]:
# Display the group-level summary assigned to adopt_repeat in the previous cell
adopt_repeat