**<h3 style="text-align: center; color: #edc9af;">CONVERT AND MERGE FILES INTO ONE FILE</h3>**

In [3]:
import os
import json
import pandas as pd
from datetime import datetime

#### **Login event**

In [28]:
# Event type handled in this section; it forms the `tmp_<event_file>` filename
# prefix used when selecting input files.
event_file = "login"

In [6]:
input_folder = "data/data_json"
output_folder = "data"

# Collect this event's JSONL exports. They are sorted by name because later
# cells take the first and last entries as the start/end of the covered range.
jsonl_files = sorted([
    f for f in os.listdir(input_folder)
    if f.startswith(f"tmp_{event_file}") and f.endswith(".jsonl")
])

if not jsonl_files:
    # Raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down (losing all state), whereas an exception simply stops the
    # run at this cell and shows what was being looked for.
    raise FileNotFoundError(
        f"Cannot find tmp_{event_file}*.jsonl in {input_folder}/"
    )

In [4]:
def extract_datetime_from_filename(filename: str, part: str) -> str:
    """Return one side of the ``<start>_to_<end>`` range embedded in a filename.

    The ``.jsonl`` suffix is removed, everything up to and including the
    first ``tmp_`` is discarded, and the remainder is split on ``_to_``.

    Args:
        filename: Name shaped like ``tmp_<start>_to_<end>.jsonl``.
        part: ``'start'`` selects the text before ``_to_``; any other value
            selects the text after it.

    Returns:
        The selected piece, unmodified. Note that the start piece keeps
        whatever follows ``tmp_`` (e.g. ``login_2025-05-04_00-00-00``).

    Raises:
        IndexError: If ``tmp_`` does not occur in ``filename``.
        ValueError: If the remainder does not contain exactly one ``_to_``.
    """
    stem = filename.replace(".jsonl", "")
    after_prefix = stem.split("tmp_")[1]
    start_str, end_str = after_prefix.split("_to_")
    if part == 'start':
        return start_str
    return end_str

def format_for_filename(ts: str) -> str:
    """Make a timestamp string filename-friendly.

    Every ``:`` becomes ``-`` and every ``T`` becomes ``_``; all other
    characters are returned unchanged.
    """
    substitutions = str.maketrans({":": "-", "T": "_"})
    return ts.translate(substitutions)

In [7]:
# Start of the covered range: the "start" piece of the first (sorted) input file.
first_time_raw = extract_datetime_from_filename(jsonl_files[0], 'start')
first_time = format_for_filename(first_time_raw)

# End of the covered range: the "end" piece of the last (sorted) input file.
last_time_raw = extract_datetime_from_filename(jsonl_files[-1], 'end')
last_time = format_for_filename(last_time_raw)

# Name the merged CSV after the range it covers and place it in the output folder.
output_csv_name = f"m952_{first_time}_to_{last_time}.csv"
output_csv_path = os.path.join(output_folder, output_csv_name)

In [33]:
# Flattened (dot-separated, as produced by pd.json_normalize) fields to export.
columns_keep = [
    "@timestamp",
    "user.id",
    "event.its.properties.gold",
    "event.its.properties.diamond",
    "event.its.properties.power_point",
    "event.its.properties.level",
    "event.its.properties.vip_level",
    "event.its.properties.dragon_gold"
]

if os.path.exists(output_csv_path):
    # Never overwrite an existing export; remove the file to regenerate it.
    print(f"File exists: {output_csv_path} , stopped.")

else:
    is_first = True      # first successful write creates the file and emits the header
    total_records = 0

    for file in jsonl_files:
        full_path = os.path.join(input_folder, file)

        # Read and parse the whole file first, so the handle is closed before
        # the (much slower) normalise-and-write step below.
        lines = []
        with open(full_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines carry no record; skipping them avoids
                    # reporting them as decode errors.
                    continue
                try:
                    obj = json.loads(line)
                    lines.append(obj)
                except json.JSONDecodeError:
                    print(f"Error decoding line in file: {file}")

        if lines:
            # reindex keeps only the wanted columns, in order, and creates any
            # column absent from this file as empty values.
            df = pd.json_normalize(lines).reindex(columns=columns_keep)
            df.to_csv(output_csv_path, index=False, mode='w' if is_first else 'a', header=is_first)
            total_records += len(df)
            is_first = False
            print(f"Done: {file} — {len(df)} records")

    print(f"\nCompleted convert {total_records:,} records to: {output_csv_path}")


Done: tmp_login_2025-05-04_00-00-00_to_2025-05-05_00-00-00.jsonl — 475205 records
Done: tmp_login_2025-05-05_00-00-00_to_2025-05-06_00-00-00.jsonl — 589090 records
Done: tmp_login_2025-05-06_00-00-00_to_2025-05-07_00-00-00.jsonl — 554523 records
Done: tmp_login_2025-05-07_00-00-00_to_2025-05-08_00-00-00.jsonl — 596651 records
Done: tmp_login_2025-05-08_00-00-00_to_2025-05-09_00-00-00.jsonl — 562847 records
Done: tmp_login_2025-05-09_00-00-00_to_2025-05-10_00-00-00.jsonl — 588048 records
Done: tmp_login_2025-05-10_00-00-00_to_2025-05-11_00-00-00.jsonl — 562190 records
Done: tmp_login_2025-05-11_00-00-00_to_2025-05-12_00-00-00.jsonl — 557629 records
Done: tmp_login_2025-05-12_00-00-00_to_2025-05-13_00-00-00.jsonl — 574555 records
Done: tmp_login_2025-05-13_00-00-00_to_2025-05-14_00-00-00.jsonl — 551753 records
Done: tmp_login_2025-05-14_00-00-00_to_2025-05-15_00-00-00.jsonl — 585040 records
Done: tmp_login_2025-05-15_00-00-00_to_2025-05-16_00-00-00.jsonl — 379240 records
Done: tmp_login_

#### **Events other than login**

In [8]:
# Event type handled in this section; it forms the `tmp_<event_file>` filename
# prefix used when selecting input files and appears in the output CSV name.
event_file = "shopflow"

In [9]:
# Alternative column selection, currently disabled:
# columns_keep = [
#     "@timestamp", "user.id",
#     'event.its.properties.product_name',
#     'event.its.properties.payment_amount',
# ]

# Flattened (dot-separated) fields exported for this event type.
columns_keep = [
    "@timestamp",
    "user.id",
    "event.its.properties.i_money_type",
    "event.its.properties.i_zone_area_id",
    "event.its.properties.quantity",
    "event.its.properties.i_money_value",
    "event.its.properties.power_point",
    "event.its.properties.i_shop_type",
]

In [None]:
def convert_jsonl_to_csv(
    jsonl_files: list,
    input_folder: str,
    output_csv_path: str,
    columns_keep: list
) -> None:
    """Merge several JSONL files into a single CSV with a fixed set of columns.

    Each file is read line by line, every non-blank line is parsed as JSON,
    and the parsed records are flattened with ``pd.json_normalize`` (nested
    keys become dot-separated column names). Only ``columns_keep`` are
    written, in that order; a column absent from a file is written as empty
    values. The first file that yields records creates/overwrites the CSV
    and writes the header; later files are appended without a header.

    Blank lines are skipped silently. Lines that are not valid JSON are
    reported on stdout and skipped. Progress and a final total are printed.

    Args:
        jsonl_files: File names (relative to ``input_folder``), processed in
            the given order.
        input_folder: Directory containing the JSONL files.
        output_csv_path: Destination CSV path.
        columns_keep: Flattened column names to export.

    Returns:
        None. If no file yields any record, no CSV is written.
    """
    is_first = True      # first successful write creates the file and emits the header
    total_records = 0

    for file in jsonl_files:
        full_path = os.path.join(input_folder, file)

        lines = []
        with open(full_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines carry no record; skipping them avoids
                    # reporting them as decode errors.
                    continue
                try:
                    lines.append(json.loads(line))
                except json.JSONDecodeError:
                    print(f"Error decoding line in file: {file}")

        if lines:
            # reindex keeps only the wanted columns, in order, and creates any
            # column absent from this file as empty values.
            df = pd.json_normalize(lines).reindex(columns=columns_keep)

            df.to_csv(output_csv_path, index=False, mode='w' if is_first else 'a', header=is_first)
            total_records += len(df)
            is_first = False

            print(f"Done: {file} ({len(df)} records)")

    print(f"\nConverted {total_records:,} records to: {output_csv_path}")


In [11]:
input_folder = "data/data_json"

# Every "tmp_<event_file>*.jsonl" export in the input folder, in name order.
jsonl_files = sorted(
    name
    for name in os.listdir(input_folder)
    if name.endswith(".jsonl") and name.startswith(f"tmp_{event_file}")
)

In [12]:
# Range labels used only to name the merged CSV; they are set by hand here
# rather than derived from the input filenames.
start_date = "2025-05-04_00-00-00"
end_date = "2025-06-19_00-00-00"
output_csv_path = f"data/m952_{event_file}_{start_date}_to_{end_date}.csv"

# Merge all selected JSONL files into the single CSV named above.
convert_jsonl_to_csv(jsonl_files, input_folder, output_csv_path, columns_keep)

Done: tmp_shopflow_2025-05-04_00-00-00_to_2025-05-05_00-00-00.jsonl (2414669 records)
Done: tmp_shopflow_2025-05-05_00-00-00_to_2025-05-06_00-00-00.jsonl (2066940 records)
Done: tmp_shopflow_2025-05-06_00-00-00_to_2025-05-07_00-00-00.jsonl (1895001 records)
Done: tmp_shopflow_2025-05-07_00-00-00_to_2025-05-08_00-00-00.jsonl (1951789 records)
Done: tmp_shopflow_2025-05-08_00-00-00_to_2025-05-09_00-00-00.jsonl (1939449 records)
Done: tmp_shopflow_2025-05-09_00-00-00_to_2025-05-10_00-00-00.jsonl (1903325 records)
Done: tmp_shopflow_2025-05-10_00-00-00_to_2025-05-11_00-00-00.jsonl (1916484 records)
Done: tmp_shopflow_2025-05-11_00-00-00_to_2025-05-12_00-00-00.jsonl (1874801 records)
Done: tmp_shopflow_2025-05-12_00-00-00_to_2025-05-13_00-00-00.jsonl (1847127 records)
Done: tmp_shopflow_2025-05-13_00-00-00_to_2025-05-14_00-00-00.jsonl (1542640 records)
Done: tmp_shopflow_2025-05-14_00-00-00_to_2025-05-15_00-00-00.jsonl (1796364 records)
Done: tmp_shopflow_2025-05-15_00-00-00_to_2025-05-16_0