In [8]:
import os
import pandas as pd
import datetime as dt
from typing import List, Dict
from langfuse import Langfuse
import requests
from requests.auth import HTTPBasicAuth
import glob
import logging
from dateutil import parser
import pytz
from src.blob_storage import BlobStorageHandler

from dotenv import load_dotenv
load_dotenv()



logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Clear existing handlers to avoid duplicate logs
if logger.hasHandlers():
    logger.handlers.clear()

# Add a handler to display log in notebook directly
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

logger.addHandler(handler)

function_names_mapping = {
    "meeting_minutes": ["minutes"],
    "jera_knowledge": ["jera_knowledge"],
    "file_difference": ["file_diff"],
    "presentation_generator": ["ppt_generator"],
    "summarize": ["summarize"],
    "chat_with_image": ["image_to_text"],
    "docx_translator": ["docx_file_translator"],
    "pptx_translator": ["pptx_file_translator"],
    "pdf_translator": ["pdf_file_translator"],
    "txt_translator": ["txt_file_translator"],
    "chat_with_file": ["rag"],
    "speech_generator": ["speech_draft_generator"],
    "meeting_setup": ["meeting_setup"],
    "text_translator": ["text_translator", "transText_extractor"],
    "simple_web_search":["simple_web_searcher"],
    "research_report":["research_report"],
    "outlook_redirect": ["outlook_redirect"],
    "outlook_triggered": ["outlook_adaptive_card_triggered"],
}


def upload_to_blob(project:str, container_name, fpath: str, overwrite: bool = False):
    conn_str = os.environ[f"{project}_CONNECTION_STRING_PROD"]
    bsh = BlobStorageHandler(conn_str=conn_str, container_name=container_name)

    file_name = os.path.basename(fpath)
    to_file_path = os.path.join(file_name)

    if bsh.blob_exists(to_file_path) and not overwrite:
        print(f"`{to_file_path}` already exists in {container_name}.")
    else:
        bsh.upload_file(fpath, to_file_path, overwrite)
        print(f"`{to_file_path}` uploaded to blob {container_name}.")

def convert_to_local(utc_time: str, formatter: str = "%Y-%m-%d %H:%M:%S"):
    utc_time = dt.datetime.strptime(utc_time[:19], "%Y-%m-%dT%H:%M:%S")
    # Set the timezone to UTC for the datetime object
    utc_time = utc_time.replace(tzinfo=pytz.UTC)
    # Convert to Japan Standard Time (JST)
    jst_time = utc_time.astimezone(pytz.timezone("Asia/Tokyo"))
    jst_time = dt.datetime.strftime(jst_time, formatter)

    return jst_time

def format_datetime(val):
    try:
        dt = parser.isoparse(val)
        return dt.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
    except Exception as e:
        return val  # or raise, log, or return None if desired

def add_new_name_by_quid(qid_to_new_names: dict, qid: str, new_name: str):
    if qid not in qid_to_new_names:
        qid_to_new_names[qid] = new_name
    else:
        logger.info(f"Question ID: {qid} already exists, the new function name is {qid_to_new_names[qid]}")
    return qid_to_new_names

def rename_funtion_name(df):
    df = df.copy()

    qid_to_new_fname = {}
    qid_col_name = "question_uuid"
    qid_to_org_fnames = dict(df[["name", qid_col_name]].groupby(qid_col_name)['name'].apply(lambda x: sorted(set(x))).reset_index().values)
    qid_to_traceid =  dict(df[["id", qid_col_name]].groupby([qid_col_name])['id'].apply(list).reset_index().values)

    for qid, org_fn in qid_to_org_fnames.items():
        channel = org_fn[0].split("_")[0]
        org_f_names = ["_".join(i.split("_")[1:]) for i in org_fn]
        
        # function names
        find_agent = False
        for new_f_name, mpping_org_f_names in function_names_mapping.items():
            if len([org_fn for org_fn in org_f_names if org_fn in mpping_org_f_names]) > 0:
                find_agent = True
                add_new_name_by_quid(qid_to_new_fname, qid, new_f_name)
                break

        # sob function
        if not find_agent:
            if org_f_names == ['language_detection']:
                find_agent = True
                add_new_name_by_quid(qid_to_new_fname, qid, 'other')
            else:
                find_agent = True
                # if org_f_names == ['sob'] or org_f_names == ['is_outlook_meeting']:
                add_new_name_by_quid(qid_to_new_fname, qid, 'sob')

        if not find_agent:
            logger.info(f"channel: {channel}, Question ID: {qid}, Function Names: {org_f_names}, qid_to_traceid: {qid_to_traceid[qid]}")

    df['agent_name'] = df['question_uuid'].astype(str).map(qid_to_new_fname)
    return df

def add_whisper_cost(trace_df, obs_df):
    trace_df = trace_df.copy()
    obs_df = obs_df.copy()
    trace_mm = trace_df[trace_df["agent_name"] =="meeting_minutes"][[
        "id", "question_uuid", "audio_length", "timestamp"]].dropna().drop_duplicates(["question_uuid", "audio_length"])

    data = []
    for i in range(len(trace_mm)):
        obs_id = trace_mm["id"].values[i] + "-tts"
        if obs_id in set(obs_df["id"]):
            print(f"{obs_id} already exists")
        else:
            tmp = {
                "id": trace_mm["id"].values[i] + "-tts",
                "name": "AgentExecutor",
                "startTime": trace_mm["timestamp"].values[i],
                "endTime": trace_mm["timestamp"].values[i],
                "parentObservationId": None,
                "type": "TTS",
                "model": "whisper",
                "completionTokens": 0,
                "promptTokens": 0,
                "totalTokens": 0,
                "version": None,
                "traceId": trace_mm["id"].values[i],
                "totalCost": 0.006*trace_mm["audio_length"].values[i]/(60000),
                "input": None,
                "date": convert_to_local(trace_mm["timestamp"].values[i])[:10]
            }
            data.append(tmp)
            logger.debug(tmp)
    tmp = pd.DataFrame(data)
    
    obs_df = pd.concat([obs_df, tmp], ignore_index=True).sort_values("startTime", ascending=False)
    return obs_df

def convert_columns(df):
    df = df.copy()
    df['startTime'] = df['startTime'].apply(format_datetime)
    df['endTime'] = df['endTime'].apply(format_datetime)
    df['date'] = [convert_to_local(i)[:10] for i in df["startTime"]]
    df["totalTokens"] = df["completionTokens"] + df["promptTokens"]
    df["totalCost"] = df["calculatedInputCost"] + df["calculatedOutputCost"]
    df['input'] = None
    return df

## Score

In [2]:
# Folder containing the CSV files
folder_path = './data/yui/score'  # change this to your folder path

# Get list of all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, '2025-04*.csv'))
# print(csv_files)

# Read and merge all CSV files
df_list = [pd.read_csv(file) for file in sorted(csv_files)]
merged_score_df = pd.concat(df_list, ignore_index=True)
merged_score_df['date'] = [convert_to_local(i)[:10] for i in merged_score_df["timestamp"]]
merged_score_df = merged_score_df.sort_values(["timestamp"], ascending=False)
merged_score_df

Unnamed: 0,id,timestamp,name,value,comment,traceId,observationId,trace,date
45,10a9c30f-1f2e-479a-97e3-e8ade36a65a3,2025-04-14T02:28:45.852Z,井上 奈津季(Natsuki Inoue),1,LIKE,8bb8ab18-0f37-4eab-a073-d312935633e7,,"{'userId': '井上 奈津季(Natsuki Inoue)', 'tags': []}",2025-04-14
46,c489cec3-5642-449b-80a6-2b2ba47a486e,2025-04-14T01:59:34.706Z,熊 浩次(Koji Kuma),-1,Type: その他 - OMKの図面を呼び出せるようにしてほしい\n,d375011c-e858-42fb-9b5e-1c4253183134,,"{'userId': '熊 浩次(Koji Kuma)', 'tags': []}",2025-04-14
47,7633a69d-fc00-4936-ba9c-f0c03b7f2327,2025-04-14T01:58:23.292Z,熊 浩次(Koji Kuma),-1,Type: その他 - 図面の場所や図面を開けるようにしてほしい\n,e23faf9c-c315-4535-b7b3-6d2b5bfcf86b,,"{'userId': '熊 浩次(Koji Kuma)', 'tags': []}",2025-04-14
48,506de06f-5331-4617-b482-f30217243b15,2025-04-13T23:42:34.403Z,井上 奈津季(Natsuki Inoue),1,LIKE,320bc6dd-4f89-4512-86a8-aa8eaa06b4a8,,"{'userId': '井上 奈津季(Natsuki Inoue)', 'tags': []}",2025-04-14
43,2e280397-c09b-4bf5-b31d-851997387365,2025-04-11T08:03:57.884Z,吉野 ひろみ(Hiromi Yoshino),-1,Type: 不適切な回答 - 社給iPhoneでAcrobatLeaderへログインできない...,e383e66f-7742-4e89-9bf4-503548ed005b,,"{'userId': '吉野 ひろみ(Hiromi Yoshino)', 'tags': []}",2025-04-11
44,467e0397-afeb-4ce9-ab1e-daae3bbc9b3d,2025-04-11T04:40:31.506Z,野間 洸輝(Koki Noma),1,LIKE,9948a746-478d-4918-bb19-12aa04387b22,,"{'userId': '野間 洸輝(Koki Noma)', 'tags': []}",2025-04-11
40,d44ccc66-682e-4e5b-af24-6b0fb5426847,2025-04-10T06:19:54.362Z,青木 直之(Naoyuki Aoki),-1,Type: その他 - 他の翻訳エージェントでは正常に処理できた。\n事前に処理できないファ...,aa9f2d07-7f47-4550-a607-5e3710100ce6,,"{'userId': '青木 直之(Naoyuki Aoki)', 'tags': []}",2025-04-10
41,c82c5fca-127b-4f04-996a-f7c458a1ab46,2025-04-10T04:35:54.626Z,三國 弘樹(Hiroki Mikuni),1,LIKE,e0731aaf-021d-4577-844c-0e6065538448,,"{'userId': '三國 弘樹(Hiroki Mikuni)', 'tags': []}",2025-04-10
42,270012c3-9cf1-47ef-bf5a-ac3d4a688afa,2025-04-10T02:02:16.643Z,熊谷 祐稀(Yuki Kumagai),1,LIKE,6b597e25-a6f5-4a72-9711-1c997e5bb8df,,"{'userId': '熊谷 祐稀(Yuki Kumagai)', 'tags': []}",2025-04-10
31,ca220a8b-308a-4e12-90c3-d94221c8efae,2025-04-09T09:28:15.190Z,渡辺 清人(Kiyohito Watanabe),1,LIKE,448d11d5-ebfb-4f47-88e9-58f44658a463,,"{'userId': '渡辺 清人(Kiyohito Watanabe)', 'tags':...",2025-04-09


In [3]:
yyyymm = "2025-04"
to_file_path = f"data/yui/monthly/score/{yyyymm}.csv"
merged_score_df.to_csv(to_file_path, index=False, quoting=1)

In [8]:
upload_to_blob("SDCP", "score", to_file_path, overwrite=True)

2025-04-14 15:57:26,188 - azure.core.pipeline.policies.http_logging_policy - INFO - Request URL: 'https://sdcpbeproddata.blob.core.windows.net/score/2025-04.csv'
Request method: 'HEAD'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.25.0 Python/3.10.4 (Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': 'b93a3344-18fd-11f0-9bc0-6df91223bea8'
    'Authorization': 'REDACTED'
No body was attached to the request
2025-04-14 15:57:26,352 - azure.core.pipeline.policies.http_logging_policy - INFO - Response status: 200
Response headers:
    'Content-Length': '8806'
    'Content-Type': 'application/octet-stream'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Thu, 10 Apr 2025 04:28:25 GMT'
    'Accept-Ranges': 'REDACTED'
    'ETag': '"0x8DD77E82272F4D0"'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': 'cadf

`2025-04.csv` uploaded to blob score.


## Trace

In [12]:
folder_path = './data/yui/trace'  # change this to your folder path

# Get list of all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, '2025-04*.csv'))
# print(csv_files)
df_list = [pd.read_csv(file) for file in sorted(csv_files)]
merged_trace_df = pd.concat(df_list, ignore_index=True)
merged_trace_df = rename_funtion_name(merged_trace_df)
merged_trace_df['timestamp'] = merged_trace_df['timestamp'].apply(format_datetime)
merged_trace_df['date'] = [convert_to_local(i)[:10] for i in merged_trace_df["timestamp"]]
merged_trace_df['using'] = merged_trace_df['using'].fillna('LLM8')
merged_trace_df = merged_trace_df.rename(columns={"user_email": 'email'})
merged_trace_df["new_question_uuid"] = merged_trace_df["question_uuid"]
trace_for_obs = merged_trace_df.copy()
merged_trace_df = merged_trace_df[['id', 'name', 'timestamp', 'userId', 'scores', 'version',
       'azure_user_id', 'question_uuid', 'department_name', 'using', 'email',
       'channel', 'date', 'agent_name', 'new_question_uuid']]

## Observation

In [11]:
folder_path = './data/yui/obs'  # change this to your folder path

csv_files = glob.glob(os.path.join(folder_path, '2025-04*.csv'))
df_list = [pd.read_csv(file) for file in sorted(csv_files)]
merged_obs_df = pd.concat(df_list, ignore_index=True)
converted_obs_df = convert_columns(merged_obs_df)
converted_obs_df = converted_obs_df[['id', 'name', 'startTime', 'endTime', 'parentObservationId', 'type',
       'model', 'completionTokens', 'promptTokens', 'totalTokens', 'version',
       'traceId', 'totalCost', 'input', 'date']]
obs_df = add_whisper_cost(trace_for_obs, converted_obs_df)
obs_df[obs_df["model"] == "whisper"]

Unnamed: 0,id,name,startTime,endTime,parentObservationId,type,model,completionTokens,promptTokens,totalTokens,version,traceId,totalCost,input,date
65289,8cabab44-1ba1-4d8e-8f51-8d16ea42f402-tts,AgentExecutor,2025-04-14T12:03:21.853Z,2025-04-14T12:03:21.853Z,,TTS,whisper,0,0,0,,8cabab44-1ba1-4d8e-8f51-8d16ea42f402,0.741091,,2025-04-14
65290,fdd0d6af-9878-4618-91c8-2c99fab81d73-tts,AgentExecutor,2025-04-14T05:10:40.729Z,2025-04-14T05:10:40.729Z,,TTS,whisper,0,0,0,,fdd0d6af-9878-4618-91c8-2c99fab81d73,0.506899,,2025-04-14
65291,5e675a74-c843-4aad-a6c8-cdc10bec3849-tts,AgentExecutor,2025-04-14T01:59:58.965Z,2025-04-14T01:59:58.965Z,,TTS,whisper,0,0,0,,5e675a74-c843-4aad-a6c8-cdc10bec3849,0.280841,,2025-04-14
65292,ba54e73f-4aca-49f5-90dc-56ce549ede1c-tts,AgentExecutor,2025-04-13T23:22:14.327Z,2025-04-13T23:22:14.327Z,,TTS,whisper,0,0,0,,ba54e73f-4aca-49f5-90dc-56ce549ede1c,0.041594,,2025-04-14
65288,bfba5dca-e123-4b06-bbaa-6e7b81733de7-tts,AgentExecutor,2025-04-11T15:36:34.664Z,2025-04-11T15:36:34.664Z,,TTS,whisper,0,0,0,,bfba5dca-e123-4b06-bbaa-6e7b81733de7,0.177168,,2025-04-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65177,16fc9e1c-df4d-4cf8-8495-46274425d251-tts,AgentExecutor,2025-04-01T02:53:49.348Z,2025-04-01T02:53:49.348Z,,TTS,whisper,0,0,0,,16fc9e1c-df4d-4cf8-8495-46274425d251,0.057192,,2025-04-01
65178,e833108c-35ff-43e7-9b5d-f81d3d51da9a-tts,AgentExecutor,2025-04-01T02:51:05.058Z,2025-04-01T02:51:05.058Z,,TTS,whisper,0,0,0,,e833108c-35ff-43e7-9b5d-f81d3d51da9a,0.088338,,2025-04-01
65179,12a342a5-59e0-4595-99b6-b421d5da73ef-tts,AgentExecutor,2025-04-01T01:19:51.799Z,2025-04-01T01:19:51.799Z,,TTS,whisper,0,0,0,,12a342a5-59e0-4595-99b6-b421d5da73ef,0.239971,,2025-04-01
65180,481a2abe-df27-4e98-821a-010f584a571f-tts,AgentExecutor,2025-03-31T23:23:40.007Z,2025-03-31T23:23:40.007Z,,TTS,whisper,0,0,0,,481a2abe-df27-4e98-821a-010f584a571f,0.037314,,2025-04-01


## Save Trace

In [13]:
trace_df = merged_trace_df.sort_values(["timestamp"], ascending=False)
to_file_path = f"data/yui/monthly/trace/{yyyymm}.csv"
trace_df.to_csv(to_file_path, index=False, quoting=1)
print(to_file_path)

data/yui/monthly/trace/2025-04.csv


In [17]:
upload_to_blob("SDCP", "trace", to_file_path, overwrite=True)

2025-04-14 15:59:22,872 - azure.core.pipeline.policies.http_logging_policy - INFO - Request URL: 'https://sdcpbeproddata.blob.core.windows.net/trace/2025-04.csv'
Request method: 'HEAD'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.25.0 Python/3.10.4 (Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': 'fec6cefe-18fd-11f0-9bc0-6df91223bea8'
    'Authorization': 'REDACTED'
No body was attached to the request
2025-04-14 15:59:23,031 - azure.core.pipeline.policies.http_logging_policy - INFO - Response status: 200
Response headers:
    'Content-Length': '3432197'
    'Content-Type': 'application/octet-stream'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Thu, 10 Apr 2025 04:35:40 GMT'
    'Accept-Ranges': 'REDACTED'
    'ETag': '"0x8DD77E926259E35"'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': '8

`2025-04.csv` uploaded to blob trace.


### Save to OBS

In [15]:
obs_df = obs_df.sort_values(["startTime"], ascending=False)
to_file_path = f"data/yui/monthly/obs/{yyyymm}.csv"
obs_df.to_csv(to_file_path, index=False, quoting=1)
print(to_file_path)

data/yui/monthly/obs/2025-04.csv


In [20]:
upload_to_blob("SDCP", "obs", to_file_path, overwrite=True)

2025-04-14 16:00:38,280 - azure.core.pipeline.policies.http_logging_policy - INFO - Request URL: 'https://sdcpbeproddata.blob.core.windows.net/obs/2025-04.csv'
Request method: 'HEAD'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.25.0 Python/3.10.4 (Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '2bb92056-18fe-11f0-9bc0-6df91223bea8'
    'Authorization': 'REDACTED'
No body was attached to the request
2025-04-14 16:00:38,396 - azure.core.pipeline.policies.http_logging_policy - INFO - Response status: 200
Response headers:
    'Content-Length': '11268068'
    'Content-Type': 'application/octet-stream'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Thu, 10 Apr 2025 04:36:38 GMT'
    'Accept-Ranges': 'REDACTED'
    'ETag': '"0x8DD77E948B103AB"'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': '6c

`2025-04.csv` uploaded to blob obs.
