In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Column names used for the combined frame, in the order they are presented.
columns = ['IP', 'RPM', 'DateTime']
# Source field name -> column name; applied by read_data via DataFrame.rename.
column_mappings = dict(ip=columns[0], ts=columns[2], c=columns[1])


def read_data(api_name):
    """Load every JSON file under ``./retrieve/ds/<api_name>`` into one DataFrame.

    Each file is read with ``pd.read_json``; its ``ts`` field is interpreted as
    epoch milliseconds (UTC) and converted to the ``Asia/Singapore`` time zone.
    Fields are then renamed according to ``column_mappings``.

    Args:
        api_name: Name of the sub-folder of ``./retrieve/ds`` to read.

    Returns:
        A DataFrame whose leading columns are those listed in ``columns``
        (any other fields found in the files follow), with ``RPM`` cast to
        ``int32`` and only the rows where ``RPM > 0`` kept.

    Raises:
        AssertionError: if the folder does not exist or holds no ``*.json`` file.
    """
    folder = Path(f'./retrieve/ds/{api_name}')
    assert folder.is_dir(), f"folder {folder} doesn't exist"
    # Sort so that row order (and therefore the resulting index) is reproducible;
    # glob itself yields files in arbitrary, filesystem-dependent order.
    files = sorted(folder.glob('**/*.json'))
    assert len(files) > 0, f"folder {folder} is empty"

    frames = []
    for f in files:
        df = pd.read_json(f)
        df["ts"] = pd.to_datetime(df['ts'], unit='ms', utc=True).dt.tz_convert('Asia/Singapore')
        frames.append(df.rename(columns=column_mappings))

    # Concatenate once: growing a frame inside the loop copies all previous rows
    # on every iteration, and seeding it with an empty object-dtype frame can
    # degrade the dtypes of the real data.
    full_df = pd.concat(frames, ignore_index=True)

    # Keep the canonical columns first (created as NaN if a field is absent),
    # followed by any additional fields present in the files.
    ordered = list(columns) + [c for c in full_df.columns if c not in columns]
    full_df = full_df.reindex(columns=ordered)

    # convert data type
    full_df['RPM'] = full_df['RPM'].astype(np.int32)

    return full_df[full_df['RPM'] > 0]


def get_max_count(df):
    """Return the row of ``df`` whose ``RPM`` value is the largest.

    The row is looked up by position, so it works regardless of the index
    labels; on ties the first occurrence is returned.
    """
    rpm_values = df['RPM']
    return df.iloc[rpm_values.argmax()]


def get_stats(df, title, show_chart=False, bins=50):
    """Print summary statistics for ``df`` and optionally plot its ``RPM`` column.

    The printed table holds the transposed ``df.describe()`` output with the
    85th, 90th, 95th, 98th and 99th percentiles of the numeric columns
    appended as extra columns.

    Args:
        df: DataFrame to summarise; needs an ``RPM`` column when charting.
        title: Chart title (only used when ``show_chart`` is true).
        show_chart: When true, draw a histogram of ``RPM`` with seaborn.
        bins: Number of histogram bins handed to ``sns.displot``.
    """
    extra_quantiles = (
        (0.85, '85%'),
        (0.9, '90%'),
        (0.95, '95%'),
        (0.98, '98%'),
        (0.99, '99%'),
    )

    pieces = [df.describe().transpose()]
    for fraction, label in extra_quantiles:
        percentile = df.quantile(fraction, numeric_only=True)
        # The Series name becomes the column header after concatenation.
        percentile.name = label
        pieces.append(percentile)
    print(pd.concat(pieces, axis=1))

    if not show_chart:
        return

    sns.set(style="white", rc={'figure.figsize': (10, 8)})
    sns.displot(df, x='RPM', bins=bins)
    plt.grid(color='gray', linestyle='-.', linewidth=0.25)
    plt.title(title)
    plt.xlabel('RPM per IP')

In [3]:
# Root folder that holds one sub-folder of JSON files per API.
folder = Path('./retrieve/ds/')
# Only real, non-hidden sub-directories are datasets: a plain file or a
# tool-generated folder (e.g. .ipynb_checkpoints) would otherwise trip the
# assertions in read_data and abort the whole report. Sorting keeps the report
# order stable across runs and machines.
dir_list = sorted(
    entry.name
    for entry in folder.iterdir()
    if entry.is_dir() and not entry.name.startswith('.')
)

# Emit one markdown section per API: the busiest row, then the summary table.
for api in dir_list:
    print(f'\n## {api}\n')
    print('```')
    df = read_data(api)
    print(get_max_count(df))
    print('\n')
    get_stats(df, api)
    print('```')



## getfavouriteeventscount

```
IP                     103.199.56.197
RPM                             12981
DateTime    2022-10-21 19:45:00+08:00
Name: 54359, dtype: object


       count       mean         std  min  25%  50%  75%      max   85%   90%  \
RPM  34426.0  60.150119  415.249544  1.0  2.0  4.0  8.0  12981.0  10.0  14.0   

      95%    98%     99%  
RPM  52.0  731.0  1613.0  
```

## getsportcountbysportid

```
IP                     120.197.197.19
RPM                               162
DateTime    2022-10-28 22:08:00+08:00
Name: 35042, dtype: object


       count      mean       std  min  25%  50%  75%    max  85%  90%  95%  \
RPM  23409.0  1.602589  1.830033  1.0  1.0  1.0  2.0  162.0  2.0  3.0  4.0   

     98%  99%  
RPM  5.0  7.0  
```

## getupcomingeventlist

```
IP                    223.147.224.192
RPM                                14
DateTime    2022-10-29 22:28:00+08:00
Name: 22442, dtype: object


      count      mean       std  min  25%  50%  75%   max  85%  