**<h3 style="text-align: center; color: #edc9af;">INGEST DATA FROM ITS</h3>**

#### **Import libraries and .env**

In [4]:
import pandas as pd
import requests
import os
from dotenv import load_dotenv
import json
import psutil
import csv
from datetime import datetime,timedelta
import urllib3  
from concurrent.futures import ThreadPoolExecutor
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # shut down warnings

In [5]:
# Optional resource limits (currently disabled): cap the thread pools of
# OpenMP/BLAS-backed libraries at 8 threads. Uncomment to enable.
# os.environ["OMP_NUM_THREADS"] = "8"
# os.environ["OPENBLAS_NUM_THREADS"] = "8"
# os.environ["MKL_NUM_THREADS"] = "8"
# os.environ["NUMEXPR_NUM_THREADS"] = "8"
# os.environ["VECLIB_MAXIMUM_THREADS"] = "8"

# Optional (currently disabled): pin this process to CPU cores 0-7 via psutil.
# try:
#     p = psutil.Process()
#     p.cpu_affinity([0, 1, 2, 3, 4, 5, 6, 7]) 
#     print("Limit core and thread successful.")
# except Exception as e:
#     print(f"Error while limiting resources: {e}")

# Load environment variables via python-dotenv, then read the Elasticsearch
# credential. The value is sent verbatim as the HTTP Authorization header by the
# fetch functions in this notebook.
# NOTE(review): os.getenv returns None when ELASTIC_KEY is unset — confirm the
# key is present before running the fetch cells.
load_dotenv()
API_KEY_ELASTIC=os.getenv("ELASTIC_KEY")

In [None]:
def sanitize_filename(ts: str) -> str:
    """Return ``ts`` with the characters that are awkward in file names replaced.

    Every ":" becomes "-" and every "T" becomes "_", so an ISO-8601 timestamp
    such as "2025-05-15T00:00:00" turns into "2025-05-15_00-00-00".
    """
    substitutions = str.maketrans({":": "-", "T": "_"})
    return ts.translate(substitutions)

def split_time_range(start, end, delta_days=1):
    """Split the half-open interval [start, end) into consecutive windows.

    Args:
        start: Beginning of the interval (a ``datetime``).
        end: End of the interval (exclusive, a ``datetime``).
        delta_days: Length of each window in days; must be positive.

    Returns:
        A list of ``(window_start, window_end)`` tuples in chronological order.
        Every window is ``delta_days`` long except possibly the last one, which
        is clipped to ``end``. The list is empty when ``start >= end``.

    Raises:
        ValueError: If ``delta_days`` does not describe a positive duration.
            Without this check a zero or negative step would never advance
            ``start`` past ``end`` and the loop would not terminate.
    """
    step = timedelta(days=delta_days)
    if step <= timedelta(0):
        raise ValueError(f"delta_days must be positive, got {delta_days!r}")

    ranges = []
    while start < end:
        next_start = start + step
        # Clip the final window so it never extends beyond `end`.
        ranges.append((start, min(next_start, end)))
        start = next_start
    return ranges

In [None]:
# Overall query window [start_query, end_query) as naive datetimes; the fetch
# functions send them to Elasticsearch together with "time_zone": "+07:00".
start_query = datetime.fromisoformat("2025-05-15T00:00:00")
end_query = datetime.fromisoformat("2025-06-16T00:00:00")
# One (start, end) tuple per day; each tuple becomes one fetch task and one
# output file in the cells below.
ranges = split_time_range(start_query, end_query, delta_days=1)

#### **Pull login event**

In [None]:
def fetch_and_save_elastic_data(
    index_name,
    start_query,
    end_query,
    event_action,
    event_module,
    output_file_json,
    api_key,
    timeout=(10, 300),
):
    """Export login events from Elasticsearch to a JSON Lines file via the scroll API.

    Searches ``index_name`` for documents whose ``@timestamp`` is in
    ``[start_query, end_query)`` (sent with ``"time_zone": "+07:00"``) and that
    match ``event.action``, ``event.module`` and ``agent.name`` = "playerlogin".
    The ``_source`` of every hit is appended to ``output_file_json``, one JSON
    object per line. An existing file at that path is deleted first, and the
    parent directory is created when missing.

    HTTP/JSON failures are reported with ``print`` and end the export early;
    they are not raised. Errors while writing the file do propagate, but the
    scroll contexts opened so far are still released.

    Args:
        index_name: Elasticsearch index (or pattern) to search.
        start_query: Inclusive lower bound for ``@timestamp``.
        end_query: Exclusive upper bound for ``@timestamp``.
        event_action: Value matched against ``event.action``.
        event_module: Value matched against ``event.module``.
        output_file_json: Path of the JSONL file to write.
        api_key: Sent verbatim as the ``Authorization`` header value.
        timeout: ``requests`` timeout, ``(connect, read)`` in seconds. Without
            one, a stalled connection would block the calling thread forever.

    Returns:
        None.
    """
    url = f'https://103.9.206.216:9200/{index_name}/_search?scroll=2m'
    scroll_url = 'https://103.9.206.216:9200/_search/scroll'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'{api_key}'
    }

    query = {
        "size": 10000,
        "sort": ["@timestamp"],
        "_source": True,
        "query": {
            "bool": {
                "must": [
                    {"range": {
                        "@timestamp": {
                            "gte": start_query,
                            "lt": end_query,
                            "time_zone": "+07:00"
                        }
                    }},
                    {"match": {"event.action": event_action}},
                    {"match": {"event.module": event_module}},
                    {"match": {"agent.name": "playerlogin"}}
                ]
            }
        }
    }

    # Start from a clean file, and make sure its directory exists so the first
    # append below cannot fail with FileNotFoundError.
    if os.path.exists(output_file_json):
        os.remove(output_file_json)
    output_dir = os.path.dirname(output_file_json)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def write_jsonl(hits):
        """Append the ``_source`` of each hit to the output file, one per line."""
        with open(output_file_json, 'a', encoding='utf-8') as f:
            for hit in hits:
                source = hit.get('_source')
                if source:
                    f.write(json.dumps(source, ensure_ascii=False) + '\n')

    # Initial fetch
    # NOTE(review): verify=False disables TLS certificate verification for this
    # host; prefer passing a CA bundle if one is available.
    try:
        response = requests.post(
            url, headers=headers, json=query, verify=False, timeout=timeout
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        print(f"[{start_query}] Initial fetch error:", e)
        return

    hits = data.get('hits', {}).get('hits', [])
    scroll_id = data.get('_scroll_id')

    total = len(hits)
    scroll_count = 0
    completed = True
    seen_scroll_ids = {scroll_id} if scroll_id else set()

    try:
        write_jsonl(hits)

        if not scroll_id:
            print(f"[{start_query}] No scroll_id returned.")
            return

        print(f"[{start_query}] Initial batch: {len(hits)} records")

        # Scroll loop
        while True:
            scroll_payload = {"scroll": "2m", "scroll_id": scroll_id}
            try:
                response = requests.post(
                    scroll_url,
                    headers=headers,
                    json=scroll_payload,
                    verify=False,
                    timeout=timeout,
                )
                response.raise_for_status()
                data = response.json()
            except Exception as e:
                print(f"[{start_query}] Scroll fetch error:", e)
                completed = False
                break

            hits = data.get('hits', {}).get('hits', [])
            new_scroll_id = data.get('_scroll_id')

            if not hits or not new_scroll_id:
                print(f"[{start_query}] No more hits or missing scroll_id.")
                break

            seen_scroll_ids.add(new_scroll_id)
            scroll_id = new_scroll_id

            write_jsonl(hits)
            scroll_count += 1
            total += len(hits)

            print(f"[{start_query}] Scroll #{scroll_count}: {len(hits)} records — Total: {total}")
    finally:
        # Cleanup scroll context — runs even if writing raised, so the
        # server-side scroll is not left open until its keep-alive expires.
        if seen_scroll_ids:
            try:
                requests.delete(
                    scroll_url,
                    headers=headers,
                    json={"scroll_id": list(seen_scroll_ids)},
                    verify=False,
                    timeout=timeout,
                )
            except Exception as e:
                print(f"[{start_query}] Scroll delete warning:", e)

    if completed:
        print(f"[{start_query}] :)) Done: {total} records written to {output_file_json}")
    else:
        # Do not report success when the scroll was cut short by an error.
        print(f"[{start_query}] Incomplete: {total} records written to {output_file_json} before the scroll failed")

In [None]:
# Query parameters for the login pull; run_fetch below reads them as globals.
index_name = "event_gamo_m952"
event_action = "its_login"
event_module = "sources"


def run_fetch(start_dt, end_dt):
    """Pull login events for one time window and return the JSONL path used.

    Both bounds are converted to ISO-8601 strings, which serve as the query
    range and, after sanitizing, as part of the file name under
    data/data_json/. The index, event filters and API key are taken from the
    module-level variables.

    The path is returned regardless of the outcome of the fetch, since
    fetch_and_save_elastic_data prints HTTP errors instead of raising them.
    """
    start_str, end_str = (bound.isoformat() for bound in (start_dt, end_dt))
    jsonl_path = f"data/data_json/tmp_login_{sanitize_filename(start_str)}_to_{sanitize_filename(end_str)}.jsonl"

    fetch_kwargs = {
        "index_name": index_name,
        "start_query": start_str,
        "end_query": end_str,
        "event_action": event_action,
        "event_module": event_module,
        "output_file_json": jsonl_path,
        "api_key": API_KEY_ELASTIC,
    }
    fetch_and_save_elastic_data(**fetch_kwargs)
    return jsonl_path


In [None]:
# Fan the time windows out over 15 worker threads. Every task is submitted
# before any result is awaited, so result_paths follows the order of `ranges`.
with ThreadPoolExecutor(max_workers=15) as executor:
    futures = [executor.submit(run_fetch, *window) for window in ranges]
    result_paths = [future.result() for future in futures]

#### **Pull non-login events - make sure nothing conflicts with the login pull**

In [1]:
def fetch_and_save_elastic_data_not_login(
    index_name,
    start_query,
    end_query,
    event_action,
    event_module,
    output_file_json,
    api_key,
    timeout=(10, 300),
):
    """Export non-login events from Elasticsearch to a JSON Lines file via the scroll API.

    Same export as ``fetch_and_save_elastic_data`` but without the
    ``agent.name`` filter: searches ``index_name`` for documents whose
    ``@timestamp`` is in ``[start_query, end_query)`` (sent with
    ``"time_zone": "+07:00"``) and that match ``event.action`` and
    ``event.module``. The ``_source`` of every hit is appended to
    ``output_file_json``, one JSON object per line. An existing file at that
    path is deleted first, and the parent directory is created when missing.

    HTTP/JSON failures are reported with ``print`` and end the export early;
    they are not raised. Errors while writing the file do propagate, but the
    scroll contexts opened so far are still released.

    Args:
        index_name: Elasticsearch index (or pattern) to search.
        start_query: Inclusive lower bound for ``@timestamp``.
        end_query: Exclusive upper bound for ``@timestamp``.
        event_action: Value matched against ``event.action``.
        event_module: Value matched against ``event.module``.
        output_file_json: Path of the JSONL file to write.
        api_key: Sent verbatim as the ``Authorization`` header value.
        timeout: ``requests`` timeout, ``(connect, read)`` in seconds. Without
            one, a stalled connection would block the calling thread forever.

    Returns:
        None.
    """
    url = f'https://103.9.206.216:9200/{index_name}/_search?scroll=2m'
    scroll_url = 'https://103.9.206.216:9200/_search/scroll'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'{api_key}'
    }

    query = {
        "size": 10000,
        "sort": ["@timestamp"],
        "_source": True,
        "query": {
            "bool": {
                "must": [
                    {"range": {
                        "@timestamp": {
                            "gte": start_query,
                            "lt": end_query,
                            "time_zone": "+07:00"
                        }
                    }},
                    {"match": {"event.action": event_action}},
                    {"match": {"event.module": event_module}},
                    # Deliberately no agent.name filter for non-login events:
                    #{"match": {"agent.name": "playerlogin"}}
                ]
            }
        }
    }

    # Start from a clean file, and make sure its directory exists so the first
    # append below cannot fail with FileNotFoundError.
    if os.path.exists(output_file_json):
        os.remove(output_file_json)
    output_dir = os.path.dirname(output_file_json)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    def write_jsonl(hits):
        """Append the ``_source`` of each hit to the output file, one per line."""
        with open(output_file_json, 'a', encoding='utf-8') as f:
            for hit in hits:
                source = hit.get('_source')
                if source:
                    f.write(json.dumps(source, ensure_ascii=False) + '\n')

    # Initial fetch
    # NOTE(review): verify=False disables TLS certificate verification for this
    # host; prefer passing a CA bundle if one is available.
    try:
        response = requests.post(
            url, headers=headers, json=query, verify=False, timeout=timeout
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        print(f"[{start_query}] Initial fetch error:", e)
        return

    hits = data.get('hits', {}).get('hits', [])
    scroll_id = data.get('_scroll_id')

    total = len(hits)
    scroll_count = 0
    completed = True
    seen_scroll_ids = {scroll_id} if scroll_id else set()

    try:
        write_jsonl(hits)

        if not scroll_id:
            print(f"[{start_query}] No scroll_id returned.")
            return

        print(f"[{start_query}] Initial batch: {len(hits)} records")

        # Scroll loop
        while True:
            scroll_payload = {"scroll": "2m", "scroll_id": scroll_id}
            try:
                response = requests.post(
                    scroll_url,
                    headers=headers,
                    json=scroll_payload,
                    verify=False,
                    timeout=timeout,
                )
                response.raise_for_status()
                data = response.json()
            except Exception as e:
                print(f"[{start_query}] Scroll fetch error:", e)
                completed = False
                break

            hits = data.get('hits', {}).get('hits', [])
            new_scroll_id = data.get('_scroll_id')

            if not hits or not new_scroll_id:
                print(f"[{start_query}] No more hits or missing scroll_id.")
                break

            seen_scroll_ids.add(new_scroll_id)
            scroll_id = new_scroll_id

            write_jsonl(hits)
            scroll_count += 1
            total += len(hits)

            print(f"[{start_query}] Scroll #{scroll_count}: {len(hits)} records — Total: {total}")
    finally:
        # Cleanup scroll context — runs even if writing raised, so the
        # server-side scroll is not left open until its keep-alive expires.
        if seen_scroll_ids:
            try:
                requests.delete(
                    scroll_url,
                    headers=headers,
                    json={"scroll_id": list(seen_scroll_ids)},
                    verify=False,
                    timeout=timeout,
                )
            except Exception as e:
                print(f"[{start_query}] Scroll delete warning:", e)

    if completed:
        print(f"[{start_query}] :)) Done: {total} records written to {output_file_json}")
    else:
        # Do not report success when the scroll was cut short by an error.
        print(f"[{start_query}] Incomplete: {total} records written to {output_file_json} before the scroll failed")

In [2]:
# Query parameters for the non-login pull.
# NOTE: these reassign the module-level index_name / event_module /
# event_action that the login section also uses.
index_name = "event_gamo_m952"
event_module = "sources"
event_action = "shopflow"
# Tag embedded in the output file names (tmp_<file_name>_...).
file_name = "shopflow"

def run_fetch(start_dt, end_dt):
    """Pull non-login events for one time window and return the JSONL path used.

    Both bounds are converted to ISO-8601 strings, which serve as the query
    range and, after sanitizing, as part of the file name under
    data/data_json/ (tagged with the module-level ``file_name``). The index,
    event filters and API key are taken from the module-level variables.

    The path is returned regardless of the outcome of the fetch, since
    fetch_and_save_elastic_data_not_login prints HTTP errors instead of
    raising them.

    NOTE(review): this definition shadows the run_fetch of the login section;
    re-running the login executor cell after this one would call this version.
    """
    start_str, end_str = (bound.isoformat() for bound in (start_dt, end_dt))
    jsonl_path = f"data/data_json/tmp_{file_name}_{sanitize_filename(start_str)}_to_{sanitize_filename(end_str)}.jsonl"

    fetch_kwargs = {
        "index_name": index_name,
        "start_query": start_str,
        "end_query": end_str,
        "event_action": event_action,
        "event_module": event_module,
        "output_file_json": jsonl_path,
        "api_key": API_KEY_ELASTIC,
    }
    fetch_and_save_elastic_data_not_login(**fetch_kwargs)
    return jsonl_path

In [None]:
# Fan the time windows out over 15 worker threads. Every task is submitted
# before any result is awaited, so result_paths follows the order of `ranges`.
with ThreadPoolExecutor(max_workers=15) as executor:
    futures = [executor.submit(run_fetch, *window) for window in ranges]
    result_paths = [future.result() for future in futures]

[2025-05-25T00:00:00] Initial batch: 10000 records
[2025-05-24T00:00:00] Initial batch: 10000 records
[2025-05-15T00:00:00] Initial batch: 10000 records
[2025-05-28T00:00:00] Initial batch: 10000 records
[2025-05-18T00:00:00] Initial batch: 10000 records
[2025-05-22T00:00:00] Initial batch: 10000 records
[2025-05-23T00:00:00] Initial batch: 10000 records
[2025-05-19T00:00:00] Initial batch: 10000 records
[2025-05-26T00:00:00] Initial batch: 10000 records
[2025-05-16T00:00:00] Initial batch: 10000 records
[2025-05-27T00:00:00] Initial batch: 10000 records
[2025-05-21T00:00:00] Initial batch: 10000 records
[2025-05-20T00:00:00] Initial batch: 10000 records
[2025-05-17T00:00:00] Initial batch: 10000 records
[2025-05-29T00:00:00] Initial batch: 10000 records
[2025-05-25T00:00:00] Scroll #1: 10000 records — Total: 20000
[2025-05-24T00:00:00] Scroll #1: 10000 records — Total: 20000
[2025-05-15T00:00:00] Scroll #1: 10000 records — Total: 20000
[2025-05-26T00:00:00] Scroll #1: 10000 records — 