In [1]:
LANGFUSE_PUBLIC_KEY = "pk-lf-be7ee4cb-12c8-4c14-825e-a564da8157ca"
LANGFUSE_SECRET_KEY = "sk-lf-81e0768d-8a19-4458-9375-931e6ef1fb3e"
LANGFUSE_HOSTNAME = "https://langfuse.dev.jera-stg.com"

In [2]:
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Clear existing handlers to avoid duplicate logs
if logger.hasHandlers():
    logger.handlers.clear()

# Add a handler to display log in notebook directly
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

logger.addHandler(handler)

In [3]:
import os
import pandas as pd
import datetime as dt
from typing import List, Dict
from langfuse import Langfuse
import tempfile

# helper function
def pydantic_list_to_dataframe(pydantic_list):
    """
    Convert a list of pydantic objects to a pandas dataframe.
    """
    data = []
    for item in pydantic_list:
        data.append(item.dict())
    return pd.DataFrame(data)


def convert_to_utc(date_str: str, date_type: str, local_tz=dt.timezone(dt.timedelta(hours=9))):
    local_date = dt.datetime.strptime(date_str, "%Y-%m-%d")  # Convert string to datetime
    local_date = dt.datetime(local_date.year, local_date.month, local_date.day, 
                                0, 0, 0, 0, tzinfo=local_tz)  # Set time to 00:00:00
    if date_type == 'to':
        local_date = local_date - dt.timedelta(microseconds=1) # End of yesterday
    return local_date.astimezone(dt.timezone.utc)  # Convert to UTC

def get_timestamps(from_timestamp=None, to_timestamp=None, local_tz=dt.timezone(dt.timedelta(hours=9))):

    if not from_timestamp:
        from_timestamp = dt.datetime.strftime(dt.datetime.now() - dt.timedelta(days=1), "%Y-%m-%d")
    from_timestamp = convert_to_utc(from_timestamp, date_type='from', local_tz=local_tz)

    if not to_timestamp:
        to_timestamp = dt.datetime.strftime(dt.datetime.now(), "%Y-%m-%d")
    else:
        to_timestamp = dt.datetime.strftime(dt.datetime.strptime(to_timestamp, "%Y-%m-%d") + dt.timedelta(days=1), "%Y-%m-%d")
    to_timestamp = convert_to_utc(to_timestamp, date_type='to', local_tz=local_tz)

    return from_timestamp, to_timestamp

def expend_metadata(df, metadata_keys: List[str] | None = None):
        # Select specific keys to extract
        # metadata_keys = ['channel', 'user_email', 'department_name', 'azure_user_id', 'question_uuid', 'costs']

        if metadata_keys is None or metadata_keys == []:
            # Convert None/NaN to an empty dictionary before expanding
            df_expanded = df['metadata'].apply(lambda x: pd.Series(x) if isinstance(x, dict) else pd.Series(dtype='object'))

            # Concatenate original DataFrame (excluding 'metadata') with expanded columns
            df = pd.concat([df, df_expanded], axis=1)
        else:
            # Create new columns with selected keys
            for key in metadata_keys:
                df[key] = df['metadata'].apply(lambda x: x.get(key, None) if isinstance(x, dict) else None)  # Check if x is a dict

        # Drop the original 'metadata' column if not needed
        df = df.drop(columns=['metadata'])
        return df

def convert_to_jst(utc_dt):
    # Original datetime in UTC
    # utc_time = dt.datetime.fromisoformat('2025-03-15 15:00:00+00:00')

    # Convert to JST (UTC+9)
    jst_time = utc_dt.astimezone(dt.timezone(dt.timedelta(hours=9)))

    return jst_time

class LangfuseAPI(object):
    def __init__(self, public_key: str, secret_key: str, host: str):
        self.langfuse = Langfuse(public_key = public_key, secret_key = secret_key, host = host)

    def fetch_traces(
            self, 
            limit: int, 
            page: int, 
            user_id: str | None = None, 
            from_timestamp: str | None = None, 
            to_timestamp: str | None =None, 
            max_traces: int | None = None,
            metadata_keys: List[str] | None = None  # default keys to extract
            ):

        from_timestamp, to_timestamp = get_timestamps(from_timestamp, to_timestamp)
        logger.info(f"Collect data from JST {convert_to_jst(from_timestamp)} to {convert_to_jst(to_timestamp)}")
        # logger.debug(f"Collect data from uTC {from_timestamp} to {to_timestamp}")
        all_traces = []
        while True:
            # logger.debug(f"Page#{page}")
            traces = self.langfuse.fetch_traces(limit=limit, page=page, user_id=user_id, from_timestamp=from_timestamp, to_timestamp=to_timestamp)
            all_traces.extend(traces.data)

            if len(traces.data) == 0:
                logger.info("No more traces found.")
                break

            if page % 10 == 0:
                logger.info(f"Page#{page}, {traces.data[0].timestamp} ~ {traces.data[-1].timestamp}")

            if len(traces.data) < limit:
                logger.info("All traces have been collected as the number of records is less than limit per page.")
                break
            
            if max_traces is not None and len(all_traces) >= max_traces:
                logger.info("The number of traces has been collected exceeds the total limit")
                break
            
            page += 1
        
        logger.info(f"Retrieved {len(all_traces)} traces.")
        all_traces_df = pydantic_list_to_dataframe(all_traces)

        if len(all_traces) > 0 and 'metadata' in all_traces_df.columns:
            all_traces_df = expend_metadata(all_traces_df, metadata_keys)
        return all_traces_df

def extract_question_answer(df):
    # Modified extraction logic
    df = df.copy()
    df['question'] = df['input'].apply(lambda x: x.get('question') if isinstance(x, dict) else None)
    df['final_answer'] = df['output'].apply(lambda x: x.get('final_answer') if isinstance(x, dict) else None)
    return df

def mapping_service_id_to_name(df, mapping_dict):
    # Map caller_id to Name
    df = df.copy()
    df['service'] = df['business_service'].map(mapping_dict)
    return df

def mapping_service_id_to_category(df, mapping_dict):
    # Map caller_id to Name
    df = df.copy()
    df['category'] = df['business_service'].map(mapping_dict)
    return df

def mapping_service_id_incident_creation(df, mapping_dict):
    df = df.copy()
    df['is_incident_creation'] = df['business_service'].map(mapping_dict) == "incident creation"
    return df

def extract_ids_from_intput(df):
    # Modified extraction logic
    df = df.copy()
    df['question_uuid'] = df['input'].apply(lambda x: x.get('question_uuid') if isinstance(x, dict) else None)
    df['conversation_uuid'] = df['input'].apply(lambda x: x.get('conversation_uuid') if isinstance(x, dict) else None)
    return df

from dotenv import load_dotenv
load_dotenv()

from src.blob_storage import BlobStorageHandler

def upload_to_blob(project:str, container_name, from_file_path: str, to_file_path: str | None = None, overwrite: bool = False):
    conn_str = os.environ[f"{project}_CONNECTION_STRING_PROD"]
    bsh = BlobStorageHandler(conn_str=conn_str, container_name=container_name)

    if to_file_path is None:
        file_name = os.path.basename(from_file_path)
        to_file_path = os.path.join(file_name)

    if bsh.blob_exists(to_file_path) and not overwrite:
        logger.info(f"`{to_file_path}` already exists in {container_name}.")
    else:
        bsh.upload_file(from_file_path, to_file_path, overwrite)
        logger.info(f"`{to_file_path}` uploaded to blob {container_name}.")
        
def load_from_blob(project:str, container_name, from_file_path: str):
    conn_str = os.environ[f"{project}_CONNECTION_STRING_PROD"]
    bsh = BlobStorageHandler(conn_str=conn_str, container_name=container_name)

    if not bsh.blob_exists(from_file_path):
        logger.info(f"`{from_file_path}` does not exist in {container_name}.")
        return None
    else:
        # Create a temporary directory
        with tempfile.TemporaryDirectory() as tmpdir:
            file_name = from_file_path.split("/")[-1]
            to_file_path = os.path.join(tmpdir, file_name)
            bsh.download_file(from_file_path, to_file_path)
            logger.info(f"save data to temporary place`{to_file_path}`.")
            df = pd.read_csv(to_file_path)
            return df

def conver_to_yyyymm(df, datetime_col:str):
    df = df.copy()

    df[datetime_col] = pd.to_datetime(df[datetime_col], format='mixed', utc=True)
    # 3. Convert to JST (UTC+9)
    df[datetime_col] = df[datetime_col].dt.tz_convert('Asia/Tokyo')

    # Step 3: Standardize JST timestamp format as ISO8601 string
    # df[datetime_col] = df[datetime_col].dt.strftime('%Y-%m-%dT%H:%M:%S.%f%z')

    # 4. Create year-month column
    df['year_month'] = df[datetime_col].dt.strftime('%Y-%m')

    # Step 4: Convert JST timestamp to ISO format string (optional: overwrite or add new column)
    df[datetime_col] = df[datetime_col].dt.strftime('%Y-%m-%dT%H:%M:%S.%f%z')
    
    return df

# answer_df = pd.read_csv(answer_tmp_fpath)

import tempfile

def save_data(df, project: str, container: str, overwrite: bool = False):
    df_new = df.copy()
    df_new = conver_to_yyyymm(df_new, 'timestamp')
    for month, group in df_new.groupby('year_month'):
        group.drop(columns=['year_month'], inplace=True)  # optional
        logger.info(f"New data: {len(group)}")
        # Create a temporary file
        with tempfile.NamedTemporaryFile(mode='w+', suffix='.csv', delete=True) as tmp:

            # read year_month from blob
            df_old = load_from_blob(project, container, f"{month}.csv")
            if df_old is not None:
                logger.info(f"Find {len(df_old)} rows data of {month} in Blob.")

                all_exist = group['id'].isin(df_old['id']).all()
                if all_exist:
                    if not overwrite:
                        logger.warning(f"All new data already existing, will not overwrite.")
                        continue
                    else:
                        logger.warning(f"Overwrite all.")
                        df_old = df_old[~df_old['id'].isin(group['id'])]
                else:
                    if not overwrite:
                        logger.warning(f"Will not overwrite any old data, but remove new data if they are already existing.")
                        group = group[~group['id'].isin(df_old['id'])]
                        logger.info(f"remaining {len(group)} new data.")
                    else:
                        logger.warning(f"Overwrite old data if have.")
                        df_old = df_old[~df_old['id'].isin(group['id'])]
                        logger.info(f"remaining {len(df_old)} old data.")
                # else:
                #     logger.warning(f"All are new data.")
                    
                result = pd.concat([df_old, group], ignore_index=True).sort_values(["timestamp"], ascending=False)
                logger.info(f"Merged data: {len(result)}")
                
            else:
                logger.info(f"No {month} data, will directly save new data to blob")
                result = group.copy()

            # Save the DataFrame to CSV
            result.to_csv(tmp.name, index=False, quoting=1)
            logger.info(f'Temporary {container} CSV saved at: {tmp.name}')
            upload_to_blob(project, container, tmp.name, f"{month}.csv", True)
    
    return True

# Collection new data

In [5]:
langfuseapi = LangfuseAPI(LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOSTNAME)

limit = 100  # Adjust as needed to balance performance and data retrieval.
page = 1
max_traces = None
# user_id = "余 翔(Xiang Yu)"
user_id = None

from_timestamp = None
to_timestamp = None

from_timestamp = "2025-04-14"
to_timestamp = "2025-04-14"

metadata_keys = None

trace_df = langfuseapi.fetch_traces(
    limit=limit, 
    page=page, 
    user_id=user_id, 
    from_timestamp=from_timestamp, 
    to_timestamp=to_timestamp,
    metadata_keys=metadata_keys
)
trace_df = trace_df[["id", "timestamp", "name", "userId", "department_name", 'service_category', "service_name","question_id", 'incident_category','input', 'output']]

2025-04-15 10:36:35,793 - root - INFO - Collect data from JST 2025-04-14 00:00:00+09:00 to 2025-04-14 23:59:59.999999+09:00
2025-04-15 10:36:36,158 - root - INFO - All traces have been collected as the number of records is less than limit per page.
2025-04-15 10:36:36,159 - root - INFO - Retrieved 192 traces.


## Answer

In [None]:
trace_df_answer = trace_df[trace_df["name"] == "HDA_Answer"].copy()
trace_df_answer = extract_question_answer(trace_df_answer)
trace_df_answer = trace_df_answer.drop(columns=['input', 'output'])


In [8]:
save_data(trace_df_answer, 'NOVA', 'answer', False)

2025-04-15 10:36:44,666 - root - INFO - New data: 69


2025-04-15 10:36:44,796 - azure.core.pipeline.policies.http_logging_policy - INFO - Request URL: 'https://hdabeproddata.blob.core.windows.net/answer/2025-04.csv'
Request method: 'HEAD'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.25.0 Python/3.10.4 (Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '16e08da8-199a-11f0-8c4c-13849cff4d4f'
    'Authorization': 'REDACTED'
No body was attached to the request
2025-04-15 10:36:44,922 - azure.core.pipeline.policies.http_logging_policy - INFO - Response status: 200
Response headers:
    'Content-Length': '677528'
    'Content-Type': 'application/octet-stream'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Mon, 14 Apr 2025 02:03:21 GMT'
    'Accept-Ranges': 'REDACTED'
    'ETag': '"0x8DD7AF88838CB7F"'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': 'cb

True

In [45]:
# column_name = "final_answer"
# nan_rows = trace_df_answer[trace_df_answer[column_name].isnull()]
# nan_rows

## knowledage

In [9]:
trace_df_know = trace_df[trace_df["name"] == "HDA_Knowledge"]
trace_df_know = trace_df_know.drop(columns=['input', 'output'])
trace_df_know

Unnamed: 0,id,timestamp,name,userId,department_name,service_category,service_name,question_id,incident_category
107,e5833a6c-be38-4dd9-b9c9-1b53381d6a01,2025-04-14 04:48:40.533000+00:00,HDA_Knowledge,細貝 知宏(Tomohiro Hosokai),デジタルクリエーション部 デジタルトランスフォーメーションU,99その他,その他（OA）,0feb4a64-12d6-481a-b3af-0ba7368113c8,oa


In [None]:
# know_tmp_fpath = "data/nova/knowledge/tmp.csv"
# trace_df_know.to_csv(know_tmp_fpath, index=False, quoting=1)

In [10]:
save_data(trace_df_know, 'NOVA', 'knowledge', False)

2025-04-15 10:37:02,674 - root - INFO - New data: 1
2025-04-15 10:37:02,679 - azure.core.pipeline.policies.http_logging_policy - INFO - Request URL: 'https://hdabeproddata.blob.core.windows.net/knowledge/2025-04.csv'
Request method: 'HEAD'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.25.0 Python/3.10.4 (Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '21894c5e-199a-11f0-8c4c-13849cff4d4f'
    'Authorization': 'REDACTED'
No body was attached to the request
2025-04-15 10:37:02,791 - azure.core.pipeline.policies.http_logging_policy - INFO - Response status: 200
Response headers:
    'Content-Length': '8940'
    'Content-Type': 'application/octet-stream'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Mon, 14 Apr 2025 02:06:14 GMT'
    'Accept-Ranges': 'REDACTED'
    'ETag': '"0x8DD7AF8EFC98F54"'
    'Server': 'Windows-Azure-Blob/

True

## Incident

In [11]:
trace_df_inc = trace_df[trace_df["name"] == "HDA_Incident"]
trace_df_inc = trace_df_inc.drop(columns=['input', 'output'])
trace_df_inc

Unnamed: 0,id,timestamp,name,userId,department_name,service_category,service_name,question_id,incident_category
0,5a27b0db-bf10-4767-ac2e-76fb65101452,2025-04-14 12:34:00.811000+00:00,HDA_Incident,黄 凱凱(Kaikai Huang),企画統括部 JERA Global Institute 経済・エネルギーモデル開発・分析U,20経理,会計伝票_経営企画,,business
1,4ce3a271-140e-4296-bef8-59b792e71d67,2025-04-14 11:15:17.146000+00:00,HDA_Incident,飛田 萌里(Moeri Tobita),グローバル再生可能エネルギー統括部 再生可能エネルギー計画部 再エネ総括U,90IT基盤,SAP S/4 HANA(共通),,business
2,4d120cf0-d4c0-4dcd-8584-1d30a545fdc8,2025-04-14 11:12:14.148000+00:00,HDA_Incident,川井 一平(Ippei Kawai),国内ゼロエミッション火力推進統括部 国内事業計画部 事業計画U,20経理,JUPITER(経理),,business
3,114fa304-5b3a-44bb-8491-784085e02f03,2025-04-14 10:35:56.393000+00:00,HDA_Incident,安平 周作(Shusaku Yasuhira),グローバル再生可能エネルギー統括部 国内洋上風力事業部 国内洋上風力第一U,10労務・人事,労務に関するお問い合わせ,,business
4,49598d6a-d508-44a9-8eac-782b2c44b297,2025-04-14 10:29:28.123000+00:00,HDA_Incident,粟野 亮(Ryo Awano),最適化統括部 統合ポート戦略部 統合ポート戦略U,90IT基盤,SAP S/4 HANA(共通),,business
...,...,...,...,...,...,...,...,...,...
183,99b849b7-f488-46de-b76a-f9c1c4eae8b2,2025-04-14 00:06:32.438000+00:00,HDA_Incident,丸山 泰信(Yoshinobu Maruyama),OM・エンジ戦略統括部 OM・エンジ戦部 西OM・エンジ計部 K推進U,90IT基盤,SAP S/4 HANA(共通),,business
185,6a2ea0f8-7378-466b-940a-d11c41a5b7ed,2025-04-14 00:01:59.470000+00:00,HDA_Incident,那須川 敦(Atsushi Nasukawa),資材調達統括部 購買契約部 購買契約第一U,10労務・人事,SuccessFactors_その他操作,,business
187,733ef1b8-4773-4655-8c59-a85344af9645,2025-04-14 00:00:55.258000+00:00,HDA_Incident,南 匠(Takumi Minami),ICTマネ部 ICTオペU,10労務・人事,労務に関するお問い合わせ,,business
189,68651edc-7a06-46d1-9edd-d8e67a58df27,2025-04-13 19:43:32.135000+00:00,HDA_Incident,白木 将(Masashi Shiraki),販売統括部 需給オペ部 西プラ運用C 西プラ運用U,10労務・人事,労務に関するお問い合わせ,,business


In [None]:
# inc_tmp_fpath = "data/nova/incident/tmp.csv"
# trace_df_inc.to_csv(inc_tmp_fpath, index=False, quoting=1)

In [12]:
save_data(trace_df_inc, 'NOVA', 'incident', False)

2025-04-15 10:37:08,028 - root - INFO - New data: 122
2025-04-15 10:37:08,032 - azure.core.pipeline.policies.http_logging_policy - INFO - Request URL: 'https://hdabeproddata.blob.core.windows.net/incident/2025-04.csv'
Request method: 'HEAD'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.25.0 Python/3.10.4 (Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '24ba2b8c-199a-11f0-8c4c-13849cff4d4f'
    'Authorization': 'REDACTED'
No body was attached to the request
2025-04-15 10:37:08,128 - azure.core.pipeline.policies.http_logging_policy - INFO - Response status: 200
Response headers:
    'Content-Length': '248463'
    'Content-Type': 'application/octet-stream'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Mon, 14 Apr 2025 02:06:50 GMT'
    'Accept-Ranges': 'REDACTED'
    'ETag': '"0x8DD7AF904AB54D6"'
    'Server': 'Windows-Azure-Bl

True

## Merge Data

In [13]:
df_list = [trace_df_answer, trace_df_know, trace_df_inc]
merged_alltrace_df = pd.concat(df_list, ignore_index=True).sort_values(["timestamp"], ascending=False)
merged_alltrace_df

Unnamed: 0,id,timestamp,name,userId,department_name,service_category,service_name,question_id,incident_category,question,final_answer
70,5a27b0db-bf10-4767-ac2e-76fb65101452,2025-04-14 12:34:00.811000+00:00,HDA_Incident,黄 凱凱(Kaikai Huang),企画統括部 JERA Global Institute 経済・エネルギーモデル開発・分析U,20経理,会計伝票_経営企画,,business,,
71,4ce3a271-140e-4296-bef8-59b792e71d67,2025-04-14 11:15:17.146000+00:00,HDA_Incident,飛田 萌里(Moeri Tobita),グローバル再生可能エネルギー統括部 再生可能エネルギー計画部 再エネ総括U,90IT基盤,SAP S/4 HANA(共通),,business,,
72,4d120cf0-d4c0-4dcd-8584-1d30a545fdc8,2025-04-14 11:12:14.148000+00:00,HDA_Incident,川井 一平(Ippei Kawai),国内ゼロエミッション火力推進統括部 国内事業計画部 事業計画U,20経理,JUPITER(経理),,business,,
73,114fa304-5b3a-44bb-8491-784085e02f03,2025-04-14 10:35:56.393000+00:00,HDA_Incident,安平 周作(Shusaku Yasuhira),グローバル再生可能エネルギー統括部 国内洋上風力事業部 国内洋上風力第一U,10労務・人事,労務に関するお問い合わせ,,business,,
74,49598d6a-d508-44a9-8eac-782b2c44b297,2025-04-14 10:29:28.123000+00:00,HDA_Incident,粟野 亮(Ryo Awano),最適化統括部 統合ポート戦略部 統合ポート戦略U,90IT基盤,SAP S/4 HANA(共通),,business,,
...,...,...,...,...,...,...,...,...,...,...,...
189,733ef1b8-4773-4655-8c59-a85344af9645,2025-04-14 00:00:55.258000+00:00,HDA_Incident,南 匠(Takumi Minami),ICTマネ部 ICTオペU,10労務・人事,労務に関するお問い合わせ,,business,,
67,5ac0fa43-b6c2-4983-8e2b-c16c0a6e867c,2025-04-13 23:54:54.313000+00:00,HDA_Answer,高梨 敦史(Atsushi Takanashi),ICTマネ部 ネットワーク総計U,99その他,その他（OA）,19b7bb4e-fb00-4b9c-8c38-f38cb18ab88a,oa,Wrikeで外部ユーザーを登録するときはどうやって登録するの？,提供されたデータには、Wrikeで外部ユーザーを登録する具体的な手順は記載されていません。W...
190,68651edc-7a06-46d1-9edd-d8e67a58df27,2025-04-13 19:43:32.135000+00:00,HDA_Incident,白木 将(Masashi Shiraki),販売統括部 需給オペ部 西プラ運用C 西プラ運用U,10労務・人事,労務に関するお問い合わせ,,business,,
68,b29dae2a-096f-448d-80fa-473e4dfc586a,2025-04-13 19:04:29.198000+00:00,HDA_Answer,川上 佳洋(Yoshihiro Kawakami),最適化統括部,90IT基盤,iPhone/iPad,fdbc7448-dfd1-4986-a881-a7d116ef60e9,oa,JERA Wi-Fiの設定時の STEP２ タップ時に「接続がプライベートではありません」と...,JERA Wi-Fiの設定時に「接続がプライベートではありません」と表示される問題については...


In [14]:
save_data(merged_alltrace_df, 'NOVA', 'alltrace', False)

2025-04-15 10:37:15,103 - root - INFO - New data: 192
2025-04-15 10:37:15,106 - azure.core.pipeline.policies.http_logging_policy - INFO - Request URL: 'https://hdabeproddata.blob.core.windows.net/alltrace/2025-04.csv'
Request method: 'HEAD'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.25.0 Python/3.10.4 (Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '28f1848e-199a-11f0-8c4c-13849cff4d4f'
    'Authorization': 'REDACTED'
No body was attached to the request
2025-04-15 10:37:15,215 - azure.core.pipeline.policies.http_logging_policy - INFO - Response status: 200
Response headers:
    'Content-Length': '941011'
    'Content-Type': 'application/octet-stream'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Mon, 14 Apr 2025 02:10:05 GMT'
    'Accept-Ranges': 'REDACTED'
    'ETag': '"0x8DD7AF9792D7C7B"'
    'Server': 'Windows-Azure-Bl

True

In [29]:
# for month, group in merged_alltrace_df.groupby('year_month'):
#     group.drop(columns=['year_month'], inplace=True)  # optional
#     fpath = f'data/nova/monthly/alltrace/{month}.csv'
#     if os.path.exists(fpath):
#         logger.info(f"{fpath} already exisiting")
#         old_df = pd.read_csv(fpath)
#         logger.info(f"Old data: {len(old_df)}")
#         merged_df = pd.concat([old_df, group], ignore_index=True).sort_values(["timestamp"], ascending=False)

#         all_exist = group['id'].isin(old_df['id']).all()
#         if all_exist:
#             logger.warning(f"all new data already existing, no need to save")
#         else:
#             duplicates = merged_df[merged_df.duplicated('id', keep=False)]
#             # Print the duplicate rows (if any)
#             logger.warning(f"{len(duplicates)} data already existing")

#             merged_df = merged_df.drop_duplicates(["id"], keep='first')
#             logger.info(f"removed duplicated data")
#             logger.info(f"New data: {len(merged_df)}")
#             logger.info(f"save new {month} data to {fpath}")


#             # merged_df.to_csv(fpath, index=False, quoting=1)
#     else:
#         logger.info(f"save new month data to {fpath}")
#         logger.info(f"New data: {len(group)}")


#         # group.to_csv(fpath, index=False, quoting=1)
        
#     # upload_to_blob("NOVA", "alltrace", fpath, overwrite = True)

In [23]:
# import glob

# # Folder containing the CSV files
# folder_path = './data/nova/monthly/answer'  # change this to your folder path

# # Get list of all CSV files in the folder
# csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

# # Folder containing the CSV files
# folder_path = './data/nova/monthly/incident'  # change this to your folder path

# # Get list of all CSV files in the folder
# csv_files += glob.glob(os.path.join(folder_path, '*.csv'))

# # Folder containing the CSV files
# folder_path = './data/nova/monthly/knowledge'  # change this to your folder path

# # Get list of all CSV files in the folder
# csv_files += glob.glob(os.path.join(folder_path, '*.csv'))


# print(csv_files)

# # Read and merge all CSV files
# df_list = [pd.read_csv(file) for file in sorted(csv_files)]
# merged_all_df = pd.concat(df_list, ignore_index=True).sort_values(["timestamp"], ascending=False)

# # Display merged DataFrame
# merged_all_df

In [None]:
# merged_trace_df = conver_to_yyyymm(merged_all_df, 'timestamp')
# # merged_trace_df

# for month, group in merged_trace_df.groupby('year_month'):
#     group.drop(columns=['year_month'], inplace=True)  # optional
#     fpath = f'data/nova/monthly/alltrace/{month}.csv'
#     if os.path.exists(fpath):
#         logger.info(f"{fpath} already exisiting")
#         old_df = pd.read_csv(fpath)
#         merged_df = pd.concat([old_df, group], ignore_index=True).sort_values(["timestamp"], ascending=False)
#         merged_df = merged_df.drop_duplicates(["id"], keep='first')
#         logger.info(f"save new {month} data to {fpath}")
#         merged_df.to_csv(fpath, index=False, quoting=1)
#     else:
#         logger.info(f"save new month data to {fpath}")
#         group.to_csv(fpath, index=False, quoting=1)
        
#     upload_to_blob("NOVA", "alltrace", fpath, overwrite = True)

2025-04-11 12:24:27,249 - root - INFO - data/nova/monthly/alltrace/2025-03.csv already exisiting
2025-04-11 12:24:27,296 - root - INFO - save new 2025-03 data to data/nova/monthly/alltrace/2025-03.csv
2025-04-11 12:24:27,324 - azure.core.pipeline.policies.http_logging_policy - INFO - Request URL: 'https://hdabeproddata.blob.core.windows.net/alltrace/2025-03.csv'
Request method: 'HEAD'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.25.0 Python/3.10.4 (Linux-5.15.146.1-microsoft-standard-WSL2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '79312df0-1684-11f0-9bc0-6df91223bea8'
    'Authorization': 'REDACTED'
No body was attached to the request
2025-04-11 12:24:27,529 - azure.core.pipeline.policies.http_logging_policy - INFO - Response status: 200
Response headers:
    'Content-Length': '2824622'
    'Content-Type': 'application/octet-stream'
    'Content-MD5': 'REDACTED'
  

`2025-03.csv` uploaded to blob alltrace.


2025-04-11 12:24:29,061 - azure.core.pipeline.policies.http_logging_policy - INFO - Response status: 201
Response headers:
    'Content-Length': '0'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Fri, 11 Apr 2025 03:24:29 GMT'
    'ETag': '"0x8DD78A85E6B58F8"'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': '6368ad55-a01e-007d-4291-aa32ae000000'
    'x-ms-client-request-id': '79312df3-1684-11f0-9bc0-6df91223bea8'
    'x-ms-version': 'REDACTED'
    'x-ms-content-crc64': 'REDACTED'
    'x-ms-request-server-encrypted': 'REDACTED'
    'Date': 'Fri, 11 Apr 2025 03:24:28 GMT'


`2025-04.csv` uploaded to blob alltrace.
