In [1]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import duckdb

# Project root. Set the REVENUE_ANALYSIS_BASE_DIR environment variable to run
# the notebook from another checkout; when it is unset, the original local
# path is used as the fallback.
base_dir = Path(os.environ.get(
    'REVENUE_ANALYSIS_BASE_DIR',
    'c:\\Users\\henry\\OneDrive\\Personal Career\\Personal Projects\\GitHub\\Revenue-Sustainability-Analysis',
))
# Folder holding the parquet inputs read below.
data_dir = base_dir / 'Dataset'

## Define adopters vs. non-adopters


In [2]:
# Cap rendered DataFrames/Series at 10 rows so cell outputs stay short.
pd.options.display.max_rows = 10

In [3]:
# Load the feature usage table from the dataset folder.
feature_usage_path = data_dir / 'feature_usage.parquet'
feats = pd.read_parquet(feature_usage_path)

In [4]:
# Usage rows for the new AI feature only.
users = feats[feats['feature_name'] == 'feature_newai']

# Number of distinct dates on which each subscription used the feature.
distinct_days = users.groupby('subscription_id')['usage_date'].nunique().reset_index(name='distinct_usage_days')
# Adopter (1) = used the feature on at least 3 distinct days; otherwise 0.
distinct_days["adoption_flag"] = (distinct_days['distinct_usage_days'] >= 3).astype(int)

# Distribution of subscriptions by number of distinct usage days. Stored in a
# variable because a bare expression in the middle of a cell is evaluated and
# then discarded without being displayed.
usage_day_distribution = (
    distinct_days.groupby('distinct_usage_days')['subscription_id']
    .nunique()
    .reset_index(name='num_users')
    .sort_values('distinct_usage_days')
)

# Attach the adoption columns to every usage row. Copies left over from a
# previous run of this cell are dropped first; otherwise re-running the cell
# merges onto an already-merged frame and yields `_x` / `_y` suffixed columns.
adoption_cols = ['distinct_usage_days', 'adoption_flag']
feats = feats.drop(columns=adoption_cols, errors='ignore').merge(distinct_days, on='subscription_id', how='left')
# Left join: subscriptions with no 'feature_newai' rows get NaN in both columns.

# Percentage of users who interacted with product who are adopters = 25.42%
distinct_days['adoption_flag'].mean()

0.2542105263157895

### Calculate pre-release metrics at the customer level

Define pre-release window as 90 days prior to June 12, 2023 => March 14, 2023

In [5]:
# Pre-release window: usage dated from 2023-03-14 up to, but not including,
# 2023-06-12.
pre_window_start = '2023-03-14'
pre_window_end = '2023-06-12'
pre = feats.loc[(feats['usage_date'] < pre_window_end) & (feats['usage_date'] >= pre_window_start)].copy()

# Normalize the grain of the table to be one row per subscription_id, usage_date
pre = pre.groupby(['subscription_id', 'usage_date'], as_index=False) \
   .agg({
        # Activity measures are additive across the rows of a given day.
        'usage_count': 'sum',
        'usage_duration_secs': 'sum',
        'error_count': 'sum',

        # Subscription-level attributes repeated on every row: take 'max' so
        # the value is carried through unchanged. Summing would multiply
        # distinct_usage_days by the number of rows in the group and turn
        # all-NaN groups into 0.
        'distinct_usage_days': 'max',
        'is_beta_feature': 'max',
        'adoption_flag': 'max'
   })

# Calculate number of active days
num_active_days = pre.groupby('subscription_id')['usage_date'].nunique().reset_index(name='distinct_active_days')
# Nullable integer dtype keeps missing values as <NA> instead of forcing floats.
pre['distinct_usage_days'] = pre['distinct_usage_days'].astype('Int64')

# Calculate avg gaps between days
pre['usage_date'] = pd.to_datetime(pre['usage_date'])
pre = pre.sort_values(['subscription_id', 'usage_date'])

# Days since the same subscription's previous active date. The first row of
# each subscription has no predecessor, so its gap is <NA>.
pre['gaps'] = pre.groupby('subscription_id')['usage_date'] \
    .diff() \
    .dt.days \
    .astype('Int64')

avg_gaps = pre.groupby('subscription_id')['gaps'] \
            .mean() \
            .reset_index(name='avg_gaps')

cust = num_active_days.merge(
            avg_gaps,
            on="subscription_id",
            how='outer'
)

# Calculate avg_usage_per_day
total_usage = pre.groupby('subscription_id')['usage_count'].sum() \
                .reset_index(name='total_usage')

total_usage_duration = pre.groupby('subscription_id')['usage_duration_secs'].sum() \
                .reset_index(name='total_usage_duration')

cust = cust.merge(
        total_usage,
        on='subscription_id',
        how='left'
)

cust['avg_usage_per_day'] = (cust['total_usage'] / cust['distinct_active_days']).round(2)

# Calculate avg usage duration per day
cust = cust.merge(
        total_usage_duration,
        on='subscription_id',
        how='left'
)
cust['avg_usage_duration_per_day'] = (cust['total_usage_duration'] / cust['distinct_active_days']).round(2)

In [6]:
# Validation Check
# The invariants are asserted as well as shown, so a violation stops a
# Run All instead of relying on someone reading the output.

# Each sub_id must be unique
sub_ids_unique = cust['subscription_id'].is_unique
print(f"Sub_id doesn't have dupes: {sub_ids_unique}")
assert sub_ids_unique, "cust has duplicate subscription_id rows"

# These cols cannot have negative vals
non_negative_cols = [
    'distinct_active_days',
    'avg_gaps',
    'total_usage',
    'avg_usage_per_day',
    'total_usage_duration',
    'avg_usage_duration_per_day',
]
col_mins = cust[non_negative_cols].min()
# dropna(): a column with no non-null values has a missing minimum, which is
# not a negative value.
assert (col_mins.dropna() >= 0).all(), "negative value found in a metric column that must be >= 0"

## Check for any nonsense Nulls
# cust.loc[(cust['distinct_active_days'] == 1) & ~(cust['avg_gaps'].isna())]

# Display the per-column minimums as the cell output.
col_mins

Sub_id doesn't have dupes: True


distinct_active_days          1.0
avg_gaps                      1.0
total_usage                   0.0
avg_usage_per_day             0.0
total_usage_duration          0.0
avg_usage_duration_per_day    0.0
dtype: Float64

In [11]:
# Distinguish between one time and repeat users
# NOTE: `cust` is narrowed to repeat users below. Re-running this cell without
# first re-running the cell that builds `cust` therefore leaves `one_time`
# empty.
active_days = cust['distinct_active_days']

# .copy() gives each subset its own data (as is done for `pre`), so later
# column assignments do not operate on a slice of the frame being replaced.
one_time = cust.loc[active_days == 1].copy()

cust = cust.loc[active_days > 1].copy()
 