# About
https://redmine.devops.rcos.nii.ac.jp/issues/31135

Binderの利用ログを集計して月次レポートを生成する。
集計対象は以下の通り
* Binderで構築に成功したPodに関する情報
  * 表: 1列目=構築元リポジトリURL、2列目=Pod数
  * 表: 1列目=ユーザーの所属機関（mailのドメイン名部分）、2列目=Pod数
  * 折れ線グラフ: 横軸=日付、縦軸=Pod数
  * 折れ線グラフ: 横軸=日付、縦軸=ユニークユーザー数
* Binderで構築に失敗したPodに関する情報
  * 表: 1列目=構築元リポジトリURL、2列目=Pod数


In [None]:
import datetime

In [None]:
# Start of the aggregation window: 00:00:00 on the day 31 days before now.
START_DATE = (
    datetime.datetime.now() - datetime.timedelta(days=31)
).replace(hour=0, minute=0, second=0, microsecond=0)

# Echo the value as the cell output.
START_DATE

In [None]:
# End of the aggregation window: 23:59:59 on the day before today.
END_DATE = (
    datetime.datetime.now() - datetime.timedelta(days=1)
).replace(hour=23, minute=59, second=59, microsecond=0)

# Echo the value as the cell output.
END_DATE

In [None]:
# Directory in which the log files to aggregate are stored.
TARGET_DIR = "/home/jovyan/.jenkins/binder-logs/"
TARGET_DIR

In [None]:
# Directory into which the generated PDF report is written.
OUTPUT_DIR = "/home/jovyan/.jenkins/binder-monthly-report/"

In [None]:
# Offset from UTC, in hours; it is added to every timestamp parsed from the logs.
TZ_OFFSET = 9

In [None]:
# Install WeasyPrint, which is used at the end of the notebook to render the HTML report to PDF.
!pip install weasyprint

In [None]:
import os
import re
import datetime
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Temporary working directory that receives the intermediate report.html
# and the chart PNG files before they are rendered to PDF.
import tempfile
work_dir = tempfile.mkdtemp()
work_dir

In [None]:
# HTML and CSS settings used for rendering the report to PDF.
# Stylesheet for the report tables; it is embedded into the page template below.
css = """
table, th, td {
   border: 1px solid;
   border-collapse: collapse;
}
th {
   font-weight: normal;
}
th, td {
   padding:8px;
}
"""

# Page skeleton of the report. The @success_repo_thtml, @error_repo_thtml and
# @domain_thtml markers are placeholders that are replaced with rendered HTML
# tables later in the notebook; the two <img> tags reference the chart PNGs by
# relative path, so they must be in the same directory as the HTML file.
template_html = f"""
<!DOCTYPE html>
<html lang="ja">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <style>
        {css}
    </style>
    <title>月間利用状況レポート</title>
</head>
<body>
    <div>
        リポジトリ毎の成功pod数<br>
        @success_repo_thtml
    </div>
    <br>
    <div>
        リポジトリ毎の失敗pod数<br>
        @error_repo_thtml
    </div>
    <br>
    <div>
        ドメイン毎の成功pod数<br>
        @domain_thtml
    </div>
    <img src="./pods_per_day.png">
    <img src="./users_per_day.png">
</body>
</html>"""

# ログファイルの読み込み

In [None]:
log_files =  !ls -t {TARGET_DIR}
print(log_files)
log_lines = []
for file in log_files:
    if(file.endswith('log')):
        log_lines += log_lines + open(os.path.join(TARGET_DIR, file), "r").readlines()
log_lines

# ログの構文解析を行って行単位の辞書に変換

In [None]:
# Parse every log line into a record with the fields
# ['log_level', 'date', 'time', 'log_id', 'datetime', 'message', 'url', 'mail_address', 'domain'].
# The result is dict_index_data: {1-based line number: record dict}.
timestamp_pattern = r'(?<=\[).+?(?=\])'
mail_pattern =  r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
domain_pattern = r"(@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"

log_columns = ['log_level', 'date', 'time', 'log_id', 'datetime', 'message', 'url', 'mail_address', 'domain']
dict_index_data = {}

# The bracketed header must hold exactly the first four columns
# (log_level, date, time, log_id); the remaining five are derived below.
header_field_count = 4

for i, line in enumerate(log_lines):
    # The header is the text between the first "[" and the following "]".
    header_match = re.search(timestamp_pattern, line)
    if header_match is None:
        continue

    log_split = header_match.group(0).split(" ")
    # A header with a different number of fields would shift every later
    # column when zipped with log_columns, so such lines are skipped.
    if len(log_split) != header_field_count:
        continue
    try:
        log_dt = datetime.datetime.strptime(log_split[1]+log_split[2], "%y%m%d%H:%M:%S") + datetime.timedelta(hours=TZ_OFFSET)
    except ValueError:
        # The date/time fields are not in the expected yymmdd HH:MM:SS form.
        continue
    # Keep only lines inside the aggregation window.
    if log_dt < START_DATE or END_DATE < log_dt:
        continue
    log_message = line[line.find("]")+1:]

    # Extract the repository URL from the message part of the log.
    message_tokens = log_message.split(" ")
    url = ""
    # ex) Launching pod for https://github.com/example/2022: 34 other pods running this repo (250 total)
    if 'Launching pod for' in log_message and len(message_tokens) > 4:
        url = message_tokens[4][:-1]  # [:-1] drops the trailing ":"
    # ex) Launched https://github.com/sample/2022 in 14s
    if 'Launched' in log_message and len(message_tokens) > 2:
        url = message_tokens[2]

    # First mail address found in the message, or "" when there is none.
    mail_match = re.search(mail_pattern, log_message)
    mail_address = mail_match.group(1) if mail_match else ""

    # Domain part of that address, including the leading "@".
    domain_match = re.search(domain_pattern, mail_address)
    domain = domain_match.group(1) if domain_match else ""

    record = log_split + [log_dt, log_message, url, mail_address, domain]
    dict_index_data[i+1] = dict(zip(log_columns, record))

# 行ごとに該当ログの出現をチェック＆カウント

In [None]:
# Classify build and launch attempts as success or error.
# Each result dict keeps two parallel lists:
#   'id'       - line number used for bookkeeping (see the branches below)
#   'log_info' - the record of the line that started the attempt
builder_success = {'id':[], 'log_info':[]}
builder_error = {'id':[], 'log_info':[]}
launcher_success = {'id':[], 'log_info':[]}
launcher_error = {'id':[], 'log_info':[]}

for line_index, log_data in dict_index_data.items():
    if 'Launching pod for' not in log_data['message'] and \
            'Starting server' not in log_data['message']:
        continue

    # A follow-up line is only considered if it is logged strictly after the
    # current line and less than two hours later.
    current_dt = log_data['datetime']
    limit_dt = current_dt + datetime.timedelta(hours=2)

    if 'Launching pod for' in log_data['message']:
        # A build succeeded when a 'Launched' line for the same URL follows.
        # The 'Launched' test must be applied to the candidate line: the current
        # line is a 'Launching pod for' message, so testing it would accept any
        # later line with the same URL. Each 'Launched' line is consumed once.
        launched_index = next(
            (index for index, item in dict_index_data.items()
             if current_dt < item['datetime'] < limit_dt
             and 'Launched' in item['message']
             and item['url'] == log_data['url']
             and index not in builder_success['id']),
            None)

        if launched_index is None:
            builder_error['id'].append(line_index)
            builder_error['log_info'].append(log_data)
        else:
            # Remember the matched 'Launched' line so it is not matched again.
            builder_success['id'].append(launched_index)
            builder_success['log_info'].append(log_data)

    elif 'Starting server' in log_data['message']:
        # A launch failed when an 'Error starting server' line with the same
        # domain follows. Each error line is consumed once.
        error_index = next(
            (index for index, item in dict_index_data.items()
             if current_dt < item['datetime'] < limit_dt
             and 'Error starting server' in item['message']
             and item['domain'] == log_data['domain']
             and index not in launcher_error['id']),
            None)

        if error_index is not None:
            launcher_error['id'].append(error_index)
            launcher_error['log_info'].append(log_data)
        else:
            launcher_success['id'].append(line_index)
            launcher_success['log_info'].append(log_data)

# 各種データ集計、html化

In [None]:
# Build the three summary tables and write the report HTML into work_dir.
# Passing the column list keeps the expected columns present even when a
# result list is empty, so the lookups below do not raise KeyError.
success_df = pd.DataFrame(builder_success['log_info'], columns=log_columns)
error_df = pd.DataFrame(builder_error['log_info'], columns=log_columns)
user_df = pd.DataFrame(launcher_success['log_info'], columns=log_columns)

# Name the count column explicitly: the default name of the value_counts()
# result differs between pandas versions, so renaming it afterwards by the
# source column name does not always take effect.

# Number of successfully built pods per repository.
success_repo_thtml = success_df['url'].value_counts().to_frame(name='pods').to_html()

# Number of pods that failed to build per repository.
error_repo_thtml = error_df['url'].value_counts().to_frame(name='pods').to_html()

# Number of successfully launched pods per mail domain.
domain_thtml = user_df['domain'].value_counts().to_frame(name='pods').to_html()

# Fill the placeholders of the page template with the rendered tables.
content = template_html

content = content.replace('@success_repo_thtml', success_repo_thtml)
content = content.replace('@error_repo_thtml', error_repo_thtml)
content = content.replace('@domain_thtml', domain_thtml)

# The template declares charset UTF-8 and contains non-ASCII text, so the file
# is written as UTF-8 regardless of the locale's default encoding.
with open(os.path.join(work_dir, 'report.html'), 'w', encoding='utf-8') as wf:
    wf.write(content)

In [None]:
%matplotlib inline

success_df['datetime'] = pd.to_datetime(success_df['datetime'])
success_df.set_index(success_df['datetime'], inplace=True)
days_pod_df = success_df.resample('1d').agg('size')
ax = days_pod_df.plot()
ax.set_xlabel('Day')
ax.set_ylabel('Pods')
ax.grid(color='b', linestyle=':', linewidth=0.3)
ax.figure.savefig(os.path.join(work_dir, "pods_per_day.png"))

In [None]:
# Line chart: number of distinct mail addresses per day among successful launches.
user_df['datetime'] = pd.to_datetime(user_df['datetime'])
user_df.set_index('datetime', inplace=True)
# Per calendar day, count the distinct values of every column.
days_user_df = user_df.resample('1d').nunique()
ax = days_user_df['mail_address'].plot()
ax.set(xlabel='Day', ylabel='Unique users')
ax.grid(color='b', linestyle=':', linewidth=0.3)
# Saved under the file name referenced by the report template.
users_chart_path = os.path.join(work_dir, "users_per_day.png")
ax.figure.savefig(users_chart_path)

In [None]:
from weasyprint import HTML

# Render the report HTML in work_dir to a PDF named after the aggregation period.
# The output directory is created from Python rather than through a shell
# command, so unusual characters in the path are handled and a failure raises
# instead of being ignored.
os.makedirs(OUTPUT_DIR, exist_ok=True)
pdf_path = os.path.join(OUTPUT_DIR, f'{START_DATE:%Y%m%d}-{END_DATE:%Y%m%d}.pdf')
HTML(os.path.join(work_dir, 'report.html')).write_pdf(pdf_path)

In [None]:
from ipynb.fs.full.report import create_report, post_to_slack

# Announce the generated PDF on Slack. The report is identified by its
# aggregation period, formatted the same way as in the PDF file name.
report_period = f'{START_DATE:%Y%m%d}-{END_DATE:%Y%m%d}'
post_to_slack(pdf_path, f"Monthly Binder report on {report_period} is created!")