In [None]:
from typing import NamedTuple, Dict, List, Optional

import matplotlib.pyplot as plt

import datetime
import json
import math
import numpy as np
import os
import pandas as pd
import sys

KHULNASOFT-LAB_ROOT = os.path.join(os.getenv("HOME"), "go/src/github.com/khulnasoft-lab/fastnode")
sys.path.append(os.path.join(KHULNASOFT-LAB_ROOT, "fastnode-exp/telemetry-analysis/completions-metrics/"))

%reload_ext autoreload
%autoreload 2

FILENAME = os.path.join(KHULNASOFT-LAB_ROOT, "fastnode-exp/telemetry-analysis/completions-metrics/out/completions.json")
PICKLE_FILENAME = os.path.join(KHULNASOFT-LAB_ROOT, "fastnode-exp/telemetry-analysis/completions-metrics/df.pickle")

def with_percent(series, total=None):
    """Pair a series of counts with each count's share of a total, in percent.

    Args:
        series: Counts, e.g. the result of ``df.resample('W').size()``.
        total: Denominator for the percentages. When omitted, the number of
            rows of the notebook-level ``df`` is used, so ``df`` must already
            be loaded in that case.

    Returns:
        A dataframe indexed like ``series`` with a ``count`` column (the input)
        and a ``percent`` column (``count / total * 100``).
    """
    if total is None:
        # Fall back to the global frame so existing one-argument calls keep working.
        total = len(df.index)
    return pd.DataFrame({'count': series, 'percent': series / total * 100})

def weekdays(df):
    """Keep only the entries of ``df`` whose index falls on Monday-Friday.

    ``df`` must have a datetime-like index; in this notebook it is called on
    both dataframes and series.
    """
    saturday, sunday = 5, 6  # pandas numbers weekdays starting at Monday=0
    on_weekend = df.index.weekday.isin((saturday, sunday))
    return df[~on_weekend]

In [None]:
# Load the completions data from JSON to a dataframe and pickle it for quicker access later
# This can be skipped if we already have the pickled file built

from analysis.plots import read_logs

# Parse the raw JSON log into a dataframe, then cache that frame on disk.
df = read_logs(FILENAME)
pd.to_pickle(df, PICKLE_FILENAME)

In [None]:
# Fast path: if the pickle written by the load-and-pickle cell already exists,
# read the dataframe from it instead of re-parsing the JSON log.

df = pd.read_pickle(PICKLE_FILENAME)

In [None]:
# Show ten randomly chosen rows; no random_state is passed, so the rows differ
# from run to run.
df.sample(n=10)

In [None]:
# Print pandas' structural summary of the loaded frame.
df.info()

In [None]:
# Number of distinct values in the user_id column.
print("unique users:", df.user_id.nunique())

In [None]:
# Row count per weekly bin, alongside each week's share of all rows in df.
with_percent(df.resample('W').size())

In [None]:
from typing import Optional

def by_engaged_users(df: pd.DataFrame, fn, threshold=5, agg_fn=lambda g: g.median(), from_date: Optional[pd.Timestamp]=None):
    """Compute one aggregated metric value per weekday over that day's engaged users.

    For every weekday covered by ``df``: users whose summed ``ones`` for that
    day is at least ``threshold`` are kept, their rows are summed per user,
    ``fn`` turns those per-user sums into a metric, and ``agg_fn`` collapses
    the per-user values into a single number for the day.

    Args:
        df: Event frame with a datetime index and at least the ``user_id`` and
            ``ones`` columns, plus whatever columns ``fn`` reads. The index is
            grouped on by the level name ``timestamp``, so it must carry that
            name.
        fn: Callable taking the (day, user_id)-indexed frame of per-user sums
            and returning a series of metric values on the same index.
        threshold: Minimum summed ``ones`` a user needs on a day to be included
            for that day.
        agg_fn: Callable applied to the per-day groupby of metric values;
            defaults to the median.
        from_date: If given, only rows whose index is strictly later than this
            value are used. Callers in this notebook also pass date strings.

    Returns:
        A series indexed by weekday holding the aggregated metric, with the
        final day dropped.
    """
    if from_date is not None:
        df = df[df.index > from_date]

    # Only the daily bin labels are needed, so count rows per day rather than
    # summing every column of the whole frame.
    days = list(weekdays(df.resample('D').size()).index)

    one_day = datetime.timedelta(days=1)
    per_day = {}

    for day in days:
        for_day = df[(df.index >= day) & (df.index < day + one_day)]

        # Engagement depends on the `ones` column alone, so only that column is summed.
        ones_by_user = for_day.groupby('user_id')['ones'].sum()
        engaged_users = set(ones_by_user[ones_by_user >= threshold].index)
        engaged_df = for_day[for_day.user_id.isin(engaged_users)]

        # <by date> <by user> sums that the metric function consumes.
        by_date_user = engaged_df.groupby([pd.Grouper(freq='1D'), 'user_id']).sum()
        metric = fn(by_date_user)
        # Discard undefined values (infinities and NaN, e.g. from ratio metrics
        # with a zero denominator) before aggregating.
        metric = metric.replace([np.inf, -np.inf], np.nan).dropna()

        # for_day spans a single day, so this holds at most one value.
        agg_by_user = agg_fn(metric.groupby(level=['timestamp']))
        # NOTE(review): when no user passes the threshold the aggregate appears
        # to be empty and sum() records 0 for the day rather than NaN - confirm
        # that this is intended for ratio metrics.
        per_day[day] = sum(agg_by_user)

    # The last day is excluded (presumably because it is incomplete - TODO confirm).
    return pd.Series(per_day)[:-1]

In [None]:
# [title, fn] pairs. Each fn receives the per-day, per-user summed frame built
# by by_engaged_users and returns the metric series to aggregate.
# Titles are used as subplot titles, so each one must identify its metric.
metrics = [
    ["events w/ completions", lambda df: df.ones],
    ["requested expected", lambda df: df.requested_expected],
    ["shown", lambda df: df.shown],
    ["shown / requested expected", lambda df: df.shown / df.requested_expected],
    ["at_least_one_shown", lambda df: df.at_least_one_shown],
    ["shown / events", lambda df: df.shown / df.ones],
    ["requested expected / events", lambda df: df.requested_expected / df.ones],
    ["selected", lambda df: df.selected_num],
    ["selected / requested expected", lambda df: df.selected_num / df.requested_expected],
    ["selected2", lambda df: df.selected_2_num],
    # Previously titled "selected / requested expected", which duplicated the
    # title two entries above although this ratio uses selected_2_num.
    ["selected2 / requested expected", lambda df: df.selected_2_num / df.requested_expected],
]

In [None]:
# One subplot per metric: weekly mean of the daily across-user *mean*, once for
# users passing threshold=10 on `ones` and once with threshold=0.
n_rows = int(math.ceil(len(metrics) / 2))
plt.figure(figsize=(16,22))
for i, m in enumerate(metrics):
    title, fn = m
    plt.subplot(n_rows, 2, i+1)
    s = by_engaged_users(df, fn, threshold=10, agg_fn=lambda g: g.mean())
    s.resample('1W').mean().plot(marker='o')
    s2 = by_engaged_users(df, fn, threshold=0, agg_fn=lambda g: g.mean())
    s2.resample('1W').mean().plot(marker='o')
    plt.title("mean " + title)
    plt.xlabel('')
    # Label the two curves in plotting order so they can be told apart.
    plt.legend(['threshold=10', 'threshold=0'])
plt.tight_layout()
plt.show()

In [None]:
# One subplot per metric: weekly mean of the daily across-user *median* (the
# default agg_fn), once for users passing threshold=10 on `ones` and once with
# threshold=0.
n_rows = int(math.ceil(len(metrics) / 2))
plt.figure(figsize=(16,22))
for i, m in enumerate(metrics):
    title, fn = m
    plt.subplot(n_rows, 2, i+1)
    s = by_engaged_users(df, fn, threshold=10)
    s.resample('1W').mean().plot(marker='o')
    s2 = by_engaged_users(df, fn, threshold=0)
    s2.resample('1W').mean().plot(marker='o')
    plt.title("median " + title)
    plt.xlabel('')
    # Label the two curves in plotting order so they can be told apart.
    plt.legend(['threshold=10', 'threshold=0'])
plt.tight_layout()
plt.show()

In [None]:
# Daily median over users passing threshold=10, for `shown` and
# `at_least_one_shown`, restricted to rows from 2019-05-01 onwards.
since_may = df[df.index >= '2019-05-01']
plt.figure(figsize=(12,6))
for column in ('shown', 'at_least_one_shown'):
    # Bind `column` as a default so each lambda reads its own column.
    daily = by_engaged_users(since_may, lambda sums, column=column: getattr(sums, column), threshold=10)
    weekdays(daily).plot(marker='o')
plt.legend(['shown', 'at least one shown'])
plt.show()

In [None]:
# Daily median over users passing threshold=10, for `requested` and
# `requested_expected`.
plt.figure(figsize=(16,8))
by_engaged_users(df, lambda df: df.requested, threshold=10).plot(marker='o')
by_engaged_users(df, lambda df: df.requested_expected, threshold=10).plot(marker='o')
# Label the curves in plotting order; without a legend they are indistinguishable.
plt.legend(['requested', 'requested expected'])
plt.show()

In [None]:
# [title, fn] pairs for the attribute-model / call-model / mtac columns; each fn
# receives the per-day, per-user summed frame built by by_engaged_users.
# The first four entries used to be listed twice verbatim, so each was computed
# and plotted twice under the same title; the list now holds each metric once.
# NOTE(review): if the repeated entries were meant to be different columns
# (e.g. variants without the "_2"), add them here with distinct titles.
mtac_metrics = [
    ["selected_2_attribute_model / requested expected", lambda df: df.selected_2_attribute_model / df.requested_expected],
    ["selected_2_attribute_model / requested", lambda df: df.selected_2_attribute_model / df.requested],
    ["selected_2_call_model / requested expected", lambda df: df.selected_2_call_model / df.requested_expected],
    ["selected_2_call_model / requested", lambda df: df.selected_2_call_model / df.requested],
    ["selected_2_attribute_model", lambda df: df.selected_2_attribute_model],
    ["selected_2_call_model", lambda df: df.selected_2_call_model],
    ["selected_mtac", lambda df: df.selected_mtac],
    ["selected_2_mtac", lambda df: df.selected_2_mtac],
]

# One subplot per mtac metric: weekly mean of the daily across-user mean, once
# for users passing threshold=10 on `ones` and once with threshold=0.
# The grid is sized from mtac_metrics itself; it used to be sized from the
# unrelated `metrics` list and only fit by coincidence.
n_rows = int(math.ceil(len(mtac_metrics) / 2))
plt.figure(figsize=(16,24))
for i, m in enumerate(mtac_metrics):
    title, fn = m
    plt.subplot(n_rows, 2, i+1)
    s = by_engaged_users(df, fn, threshold=10, agg_fn=lambda g: g.mean())
    s.resample('W').mean().plot(marker='o')
    s2 = by_engaged_users(df, fn, threshold=0, agg_fn=lambda g: g.mean())
    s2.resample('W').mean().plot(marker='o')
    plt.title("mean " + title)
    plt.xlabel('')
    # Label the two curves in plotting order so they can be told apart.
    plt.legend(['threshold=10', 'threshold=0'])
plt.tight_layout()
plt.show()

In [None]:
# Number of rows with a positive selected_2_call_model value.
has_positive_call_model = df.selected_2_call_model > 0
int(has_positive_call_model.sum())

In [None]:
# [title, fn] pairs; every fn maps the per-day, per-user summed frame produced
# by by_engaged_users to the series that gets aggregated.
more_metrics = [
    # raw counts
    ["selected_num", lambda sums: sums.selected_num],
    ["selected_2_num", lambda sums: sums.selected_2_num],
    ["at_least_one_shown", lambda sums: sums.at_least_one_shown],
    ["at_least_one_shown_call_model", lambda sums: sums.at_least_one_shown_call_model],
    # selections relative to the matching "at least one shown" count
    ["selected_2_call_model / at_least_one_shown_call_model", lambda sums: sums.selected_2_call_model / sums.at_least_one_shown_call_model],
    ["selected_2_mtac / at_least_one_shown_mtac", lambda sums: sums.selected_2_mtac / sums.at_least_one_shown_mtac],
    # selections relative to the overall "at least one shown" count
    ["selected_2_call_model / at_least_one_shown", lambda sums: sums.selected_2_call_model / sums.at_least_one_shown],
    ["selected_2_mtac / at_least_one_shown", lambda sums: sums.selected_2_mtac / sums.at_least_one_shown],
    # selections relative to requested_expected
    ["selected_2_mtac / requested_expected", lambda sums: sums.selected_2_mtac / sums.requested_expected],
    ["selected_2_call_model / requested_expected", lambda sums: sums.selected_2_call_model / sums.requested_expected],
]

# One subplot per entry of more_metrics: daily across-user mean for rows after
# 2019-05-01, once for users passing threshold=10 on `ones` and once with
# threshold=0.
# The grid is sized from more_metrics itself; sizing it from the unrelated
# `metrics` list reserved a row that was never used.
n_rows = int(math.ceil(len(more_metrics) / 2))
plt.figure(figsize=(16,22))
for i, m in enumerate(more_metrics):
    title, fn = m
    plt.subplot(n_rows, 2, i+1)
    s = by_engaged_users(df, fn, threshold=10, agg_fn=lambda g: g.mean(), from_date='2019-05-01')
    s.plot(marker='o')
    s2 = by_engaged_users(df, fn, threshold=0, agg_fn=lambda g: g.mean(), from_date='2019-05-01')
    s2.plot(marker='o')
    plt.title("mean " + title)
    # Label the two curves in plotting order so they can be told apart.
    plt.legend(['threshold=10', 'threshold=0'])
plt.tight_layout()
plt.show()

In [None]:
# Daily totals, for rows after 2019-05-01, of the selected_<type> columns plus
# the overall selected_num column.
plt.figure(figsize=(16,8))
later = df[df.index > '2019-05-01'].resample('D').sum()
typs = ["traditional", "attribute_model", "call_model", "keyword_model", "expr_model"]
for typ in typs:
    later[f"selected_{typ}"].plot(marker='o')
later["selected_num"].plot(marker='o')
# Legend entries follow the plotting order above.
plt.legend(typs + ["num"])
# Render explicitly, like the other plotting cells, instead of echoing the Legend repr.
plt.show()

In [None]:
# Daily total of requested_expected for rows from 2019-04-01 onwards.
plt.figure(figsize=(16,8))
# Select the one column before resampling so unrelated columns are not summed.
requested_expected_daily = df.loc[df.index >= '2019-04-01', 'requested_expected'].resample('D').sum()
requested_expected_daily.plot(marker='o')
# Render explicitly, like the other plotting cells, instead of echoing the Axes repr.
plt.show()