In [None]:
"""
date : 2024-11-25
version : 1.1.0
writer : cwkang
"""

In [1]:
import os
import json
import logging
import argparse
import folium
import warnings
import numpy as np
import pandas as pd
from PIL import Image
from folium import plugins
from ast import literal_eval
from attrdictionary  import AttrDict

from utils import init_logger
from preprocessing import Preprocessing
from analyzer import clustering, metrix_to_sequence
from plot import CongestionPolt, CongestionImagePolt

# Module-level logger for this notebook.
logger = logging.getLogger(__name__)

# Silence every warning category for the whole session.
# NOTE(review): this also hides pandas/numpy deprecation and chained-assignment
# warnings raised by the cells below.
warnings.simplefilter("ignore")

In [2]:
init_logger()

# Parse an empty argument list so that no real command-line flags are read,
# then attach the config location to the resulting namespace by hand.
cli_parser = argparse.ArgumentParser()
cli_args = cli_parser.parse_args([])
cli_args.config_dir = "config"
cli_args.config_file = "config.json"

# Load the JSON config and expose its keys as attributes (args.task, args.raw_data, ...).
config_path = os.path.join(cli_args.config_dir, cli_args.config_file)
with open(config_path, encoding="utf-8") as f:
    args = AttrDict(json.load(f))

In [3]:
%%time
prep = Preprocessing(args)

# Extract the unique SRCMAC values from the preprocessed raw data.
pre_raw_data = prep.get_preprocessed_data(raw_data=args.raw_data, time=True)
srcmac_unique_df = prep.calc_srcmac_unique(pre_raw_data)

# Keyword options shared by every people-count call below.
count_opts = {"cumsum": "00:01:00", "is_save": False}

# Number of SRCMACs per day.
people_cnt, _ = prep.calc_loc_people_counts(
    pre_raw_data, srcmac_unique_df, loc=False, on_time=False, **count_opts
)

# Number of SRCMACs per hour.
h_people_cnt, _ = prep.calc_loc_people_counts(
    pre_raw_data, srcmac_unique_df, loc=False, on_time=True, **count_opts
)

# Number of people staying per location.
loc_srcmac_cnt, loc_people_cnt, srcmac_uni_matched_raw_data = prep.calc_loc_people_counts(
    pre_raw_data, srcmac_unique_df, loc=True, on_time=False, **count_opts
)

# Number of people staying per location and hour.
# Note: loc_srcmac_cnt and srcmac_uni_matched_raw_data from the previous call
# are overwritten here; only loc_people_cnt survives from that call.
loc_srcmac_cnt, loc_h_people_cnt, srcmac_uni_matched_raw_data = prep.calc_loc_people_counts(
    pre_raw_data, srcmac_unique_df, loc=True, on_time=True, **count_opts
)

# Average dwell time per day (disabled).
# spenttime_mean_df, loc_spenttime_mean_df, over_basetime_both_df = prep.calc_loc_people_spenttime(pre_raw_data, srcmac_unique_df, base_time=10, time_diff="00:01:00", loc=False, on_time=False, is_save=False)

# Average dwell time per day, location and hour.
# is_save=True: the captured run log reports a CSV being created by this call.
loc_h_spenttime_mean_df = prep.calc_loc_people_spenttime(
    pre_raw_data,
    srcmac_unique_df,
    base_time=10,
    time_diff="00:01:00",
    loc=True,
    entry_time=True,
    is_save=True,
)

# # Matrix data for plotting (disabled).
# NOTE(review): the plotting cell at the end of the notebook reads cluster_metrix
# when args.is_line_plot is set, but it is only assigned by the disabled lines below.
# srcmac_loc_seq_metrix = prep.create_srcmac_loc_sequence_metrix(over_basetime_both_df)
# cluster_metrix = clustering(srcmac_loc_seq_metrix)
# mtx_seq_df = metrix_to_sequence(prep.zone_tbl, srcmac_loc_seq_metrix)

11/25/2024 16:53:23 - INFO - preprocessing.preprocessing -   gangneung_zoneTable.csv DataFrame Shape : (12, 8)
11/25/2024 16:53:24 - INFO - preprocessing.preprocessing -   gangneung_rawData_20241026.csv DataFrame Shape : (2894868, 4)
11/25/2024 16:53:45 - INFO - preprocessing.preprocessing -   The Processed DataFrame Shape : (2517406, 5)
11/25/2024 16:53:45 - INFO - preprocessing.preprocessing -   SRCMAC Unique Number : (41872, 4)
11/25/2024 16:53:45 - INFO - preprocessing.preprocessing -   People Count by Non-Location : (41872, 5)
11/25/2024 16:53:45 - INFO - preprocessing.preprocessing -   People Count by Non-Location : (12, 2)
11/25/2024 16:53:46 - INFO - preprocessing.preprocessing -   People Count by Location : (37956, 6)
11/25/2024 16:53:46 - INFO - preprocessing.preprocessing -   People Count by Location : (37657, 7)
11/25/2024 16:53:49 - INFO - preprocessing.preprocessing -   v1026_일자별 공간별 시간대별 체류시간.csv has been created!


CPU times: total: 12.7 s
Wall time: 26.4 s


In [None]:
# Dwell time
# Average dwell time per day
# spenttime_mean_df["time_diff"].mean()

# Average dwell time per day and hour
# NOTE(review): loc_spenttime_mean_df is first assigned in a later cell (the
# location/hour_min groupby that also writes ./data/test.csv), and the earlier
# call that would have returned it is commented out. On a fresh kernel run from
# top to bottom this line therefore raises NameError - confirm where this cell
# is meant to sit.
loc_spenttime_mean_df

# Average dwell time per day and location
# Average dwell time per day, hour and location

In [None]:
# Hour bucket (TIME_KST floored to the hour) used for the hourly grouping later on.
pre_raw_data["hour"] = pre_raw_data["TIME_KST"].dt.floor("h")

# Put the rows in chronological order.
pre_raw_data = pre_raw_data.sort_values("TIME_KST").reset_index(drop=True)

# Second-resolution timestamp column: format TIME_KST down to whole seconds as
# text, then parse that text back into datetimes.
time_kst_text = pd.to_datetime(pre_raw_data["TIME_KST"]).dt.strftime("%Y-%m-%d %H:%M:%S")
pre_raw_data["TIME_KST(hh:mm:ss)"] = pd.to_datetime(time_kst_text)

# When a SRCMAC appears more than once within the same second, keep only the
# row with the highest RSSI, then restore chronological order.
strongest_rssi_idx = pre_raw_data.groupby(["TIME_KST(hh:mm:ss)", "SRCMAC"])["RSSI"].idxmax()
pre_raw_data = (
    pre_raw_data.loc[strongest_rssi_idx]
    .sort_values("TIME_KST")
    .reset_index(drop=True)
)

# Restrict to the SRCMACs listed in srcmac_unique_df.
is_unique_srcmac = pre_raw_data["SRCMAC"].isin(srcmac_unique_df["SRCMAC"])
_srcmac_unique_df = pre_raw_data[is_unique_srcmac].reset_index(drop=True)
_srcmac_unique_df

In [None]:
# One group per (SRCMAC, location, hour); reused for both aggregations below.
group_keys = ["SRCMAC", "location", "hour"]
grouped = _srcmac_unique_df.groupby(group_keys)

# First and last sighting inside each group, and the time between them.
h_src_loc_df = grouped["TIME_KST(hh:mm:ss)"].agg(["min", "max"])
h_src_loc_df["time_diff"] = h_src_loc_df["max"] - h_src_loc_df["min"]

# Mean RSSI inside each group, rounded to a whole number.
rssi_df = grouped["RSSI"].agg(["mean"]).round(0)

# Combine both aggregations on their shared index and flatten it into columns.
both_df = h_src_loc_df.merge(rssi_df, left_index=True, right_index=True).reset_index()
both_df

In [None]:
# Drop every SRCMAC whose time_diff, summed over all its rows, reaches 10 hours
# or more. Despite its name, over_basetime_index holds the SRCMACs that stay
# *below* that limit, i.e. the ones that are kept.
time_diff_sum = both_df.groupby("SRCMAC")["time_diff"].sum()
below_limit = time_diff_sum < pd.Timedelta(hours=10)
over_basetime_index = time_diff_sum[below_limit].index

keep_rows = both_df["SRCMAC"].isin(over_basetime_index)
over_basetime_both_df = both_df.loc[keep_rows].reset_index(drop=True)
over_basetime_both_df

In [None]:
# Keep only rows whose time_diff is strictly longer than one minute
# (zero-length and sub-minute stays are dropped).
min_stay = pd.Timedelta("00:01:00")
long_enough = over_basetime_both_df["time_diff"] > min_stay
over_basetime_both_df = over_basetime_both_df.loc[long_enough].reset_index(drop=True)
over_basetime_both_df

In [None]:
# Earliest hour bucket per (SRCMAC, location), stored in a column named "hour_min".
hour_min = (
    over_basetime_both_df
    .groupby(["SRCMAC", "location"])[["hour"]]
    .min()
    .rename(columns={"hour": "hour_min"})
)

# Re-index the rows by (SRCMAC, location) so they share an index with hour_min.
# This rebinds over_basetime_both_df, so the cell is not idempotent.
over_basetime_both_df = over_basetime_both_df.set_index(["SRCMAC", "location"])
over_basetime_both_df

In [None]:
# Attach hour_min to every row through the shared index, then turn the index
# levels back into ordinary columns.
over_basetime_both_df = (
    over_basetime_both_df
    .merge(hour_min, left_index=True, right_index=True)
    .reset_index()
)

In [None]:
# Total (summed) time_diff per (SRCMAC, location, hour_min).
# The variable name says "mean", but the aggregation applied here is a sum.
src_loc_spenttime_mean_df = (
    over_basetime_both_df
    .groupby(["SRCMAC", "location", "hour_min"])[["time_diff"]]
    .sum()
    .reset_index()
)
src_loc_spenttime_mean_df

In [None]:
# Mean of the per-SRCMAC time_diff totals for each (location, hour_min).
loc_spenttime_mean_df = (
    src_loc_spenttime_mean_df
    .groupby(["location", "hour_min"])[["time_diff"]]
    .mean()
    .reset_index()
)

# Write the result to CSV; utf-8-sig adds a BOM, and the row index is written
# as the first column (to_csv default).
loc_spenttime_mean_df.to_csv("./data/test.csv", encoding="utf-8-sig")

In [None]:
# Draw the congestion plots for the configured task.
#
# Each task picks its plot class, the hours to render as heat maps, and any
# extra heat_map options. The drawing itself is shared so the two tasks cannot
# drift apart.
task_known = True

if args.task == "gangneung":
    # Author's note: min: 43, max: 2527 (these values are not passed to heat_map).
    plot = CongestionPolt(prep.zone_tbl, args)
    heatmap_hours = range(9, 21)
    heatmap_options = {}

elif args.task == "kme2024":
    # Author's note: min: 26, max: 794 (passed below as min_value / max_value).
    plot = CongestionImagePolt(prep.zone_tbl, args)
    heatmap_hours = range(10, 19)
    heatmap_options = {"radius": 70, "min_value": 26, "max_value": 794}

else:
    # Previously an unrecognised task fell through without any feedback.
    task_known = False
    logger.warning("No plots are defined for task %r; nothing was drawn.", args.task)

if task_known:
    if args.is_line_plot == True:
        # cluster_metrix is only assigned by code that is currently commented out
        # in the preprocessing cell. Referencing it unguarded raised NameError on
        # a fresh kernel, so report the problem and carry on with the heat maps.
        if "cluster_metrix" in globals():
            # Line plot
            plot.seq_to_seq_flow_polyline(cluster_metrix, n_priority=50)
        else:
            logger.warning(
                "is_line_plot is enabled but 'cluster_metrix' is not defined; "
                "skipping the line plot."
            )

    if args.is_heatmap_plot == True:
        for hour in heatmap_hours:
            # Heat map for this hour
            plot.heat_map(loc_h_people_cnt, hour, is_interpolate=True, **heatmap_options)