In [1]:
# Installation smoke check: the import below raises ImportError when the
# langgraph package is not available, so reaching the print means it loaded.
# StateGraph itself is not used anywhere else in this cell.
from langgraph.graph import StateGraph

print("LangGraph successfully installed!")

LangGraph successfully installed!


In [1]:
import numpy as np 
import pandas as pd 
import geopandas as gpd
import json
from shapely.ops import unary_union
import random
import os
import folium 
from requests.adapters import HTTPAdapter 
from urllib3.util.retry import Retry 
from datetime import timedelta
from shapely.geometry import Point 
from ortools.linear_solver import pywraplp 
from tqdm import tqdm 
import pickle
import warnings; warnings.filterwarnings('ignore')

In [2]:
print(os.getcwd())  # working directory the notebook is currently running in

/Users/jung-eunjoo/Desktop/은주/연구/AI_agent/Seongnam_Scenario_1


In [3]:
# Switch to the project directory so the relative data paths defined in the
# following cells resolve.
# The location can be overridden with the SEONGNAM_PROJECT_DIR environment
# variable; when it is unset or empty, the original hard-coded directory is
# used, so existing runs behave exactly as before.
project_dir = (
    os.environ.get("SEONGNAM_PROJECT_DIR")
    or "/Users/jung-eunjoo/Desktop/은주/연구/AI_agent/Seongnam_Scenario_1"
)
os.chdir(project_dir)

print("변경 후 작업 디렉토리:", os.getcwd())

변경 후 작업 디렉토리: /Users/jung-eunjoo/Desktop/은주/연구/AI_agent/Seongnam_Scenario_1


In [4]:
# Input file locations, relative to the current working directory.
# boundary_path: GeoJSON file (presumably the Seongnam boundary, judging by the
#   file name; it is not read in this part of the notebook).
# taxi_path: Excel workbook that is loaded into taxi_df by pd.read_excel.
boundary_path = "data/seongnam_boundary.geojson"
taxi_path = "raw_data/Seongnam-si/Seongnam_Taxi/(SNRI) TIMS자료_영업별 자료_QUERRY_4월.xlsx"

In [5]:
# Load the taxi workbook into a DataFrame (pandas reads the first sheet by default).
# The summary printed later in the notebook reports 920,506 rows for this table.
taxi_df = pd.read_excel(taxi_path)

In [6]:
# NOTE: `df` is an alias of `taxi_df`, not a copy, so every column that is
# added or converted through `df` is also visible through `taxi_df`.
df = taxi_df

# Remove hidden backspace characters and surrounding whitespace from the
# column names BEFORE any column is looked up by name; with a polluted header
# the lookups below would otherwise fail with KeyError. The operation is
# idempotent, so running the same clean-up again later is harmless.
df.columns = [str(c).replace("\x08", "").strip() for c in df.columns]

# 1) call time ("호출시간") = pickup time ("승차시간") - "승차시간(sec)"
# NOTE(review): the duration check later in this notebook finds that
# "승차시간(sec)" equals (dropoff - pickup) for the data, i.e. it looks like the
# ride duration. Subtracting it from the pickup time may therefore not be a
# real call time - confirm the column semantics before relying on "호출시간".
df["승차시간"] = pd.to_datetime(df["승차시간"], errors="coerce")        # unparseable -> NaT
df["승차시간(sec)"] = pd.to_numeric(df["승차시간(sec)"], errors="coerce")  # unparseable -> NaN
# Missing second counts are treated as 0, so for those rows the call time
# equals the pickup time.
df["호출시간"] = df["승차시간"] - pd.to_timedelta(df["승차시간(sec)"].fillna(0), unit="s")


In [9]:
# Make sure the call-time column is datetime; values that cannot be parsed become NaT.
df["호출시간"] = pd.to_datetime(df["호출시간"], errors="coerce")

# Night window under study; both endpoints are part of the window.
start = pd.Timestamp("2024-04-18 23:00:00")
end = pd.Timestamp("2024-04-19 02:00:00")

# Independent copy of the rows whose call time lies inside [start, end].
# NaT call times compare False on both sides and are therefore excluded.
night_df = df.loc[df["호출시간"].ge(start) & df["호출시간"].le(end)].copy()
print("rows in window:", len(night_df))

rows in window: 4840


In [10]:
# --- (Required) clean hidden characters / whitespace in the column names
#     (they occasionally sneak in from the Excel export) ---
# Every label is converted to str, backspace characters are removed and
# leading/trailing whitespace is trimmed.
df.columns = df.columns.map(lambda name: str(name).replace("\x08", "").strip())

# --- (Required) build taxi_type: private ("개인") = 0, corporate/general ("법인"/"일반") = 1 ---
def map_taxi_type(v):
    """Translate a raw taxi category value into a numeric code.

    Parameters
    ----------
    v : any scalar
        Raw category value. It is converted to ``str``, trimmed, lower-cased
        and stripped of spaces before matching.

    Returns
    -------
    int
        0 when the normalised text contains "개인" (private taxi);
        1 otherwise. That covers "법인"/"일반" (corporate/general) as well as
        missing or unrecognised values, which default to corporate.
    """
    if pd.isna(v):
        # Missing value: nothing to match, fall back to the corporate default.
        return 1
    label = str(v).strip().lower().replace(" ", "")
    return 0 if "개인" in label else 1

# Encode every row's "구분" value with map_taxi_type and store the result as a
# nullable integer column.
df["taxi_type"] = (
    df["구분"]
    .apply(map_taxi_type)
    .astype("Int64")
)

In [13]:
# Assumes the taxi_type coding private = 0, corporate = 1.
# Give every vehicle a single consistent label: the most frequent taxi_type
# observed across all of its rows.
def _dominant_taxi_type(labels):
    """Return the most frequent value of `labels` as int, or 1 (corporate) if there is none.

    pandas returns the modes in sorted order, so when several values tie the
    smallest one is chosen.
    """
    modes = labels.mode()
    if modes.empty:
        return 1  # no mode available -> default to corporate (1)
    return int(modes.iat[0])


type_mode = (
    df.dropna(subset=["차량번호"])        # rows without a vehicle number cannot be grouped
    .groupby("차량번호")["taxi_type"]
    .agg(_dominant_taxi_type)
)

# Attach the per-vehicle label to night_df so that each vehicle carries one type only.
night_df["taxi_type_consistent"] = night_df["차량번호"].map(type_mode)

In [15]:
def _night_vehicles(type_code):
    """Set of non-null vehicle numbers in night_df whose consistent type equals `type_code`."""
    numbers = night_df.loc[night_df["taxi_type_consistent"] == type_code, "차량번호"]
    return set(numbers.dropna().unique())


corp_set = _night_vehicles(1)
priv_set = _night_vehicles(0)
# A vehicle listed under both types would be double counted; this is expected to be empty.
overlap = corp_set.intersection(priv_set)

print("overlap vehicles (should be 0):", len(overlap))

# Final distinct vehicle counts (vehicles present in both sets are removed from each side).
corp_unique = len(corp_set.difference(overlap))
priv_unique = len(priv_set.difference(overlap))
total_unique = len(corp_set.union(priv_set))

print("심야 데이터셋 관측 (2024-04-18 23시~02시, 고유 차량 기준)")
print(f"법인택시 활성 대수: {corp_unique}")
print(f"개인택시 활성 대수: {priv_unique}")
print(f"총합: {total_unique}")

overlap vehicles (should be 0): 0
심야 데이터셋 관측 (2024-04-18 23시~02시, 고유 차량 기준)
법인택시 활성 대수: 538
개인택시 활성 대수: 425
총합: 963


In [16]:
# Compare the elapsed time between pickup and dropoff with the "승차시간(sec)" column.
df["승차시간"] = pd.to_datetime(df["승차시간"], errors="coerce")
df["하차시간"] = pd.to_datetime(df["하차시간"], errors="coerce")

# Elapsed seconds derived from the two timestamps.
dur_from_times = df["하차시간"].sub(df["승차시간"]).dt.total_seconds()
# Seconds as recorded in the table (unparseable values become NaN).
sec_col = pd.to_numeric(df["승차시간(sec)"], errors="coerce")

# Absolute disagreement per row; the median ignores NaN entries.
diff = dur_from_times.sub(sec_col).abs()
print("median |(하차-승차) - 승차시간(sec)|:", np.nanmedian(diff.to_numpy()))

median |(하차-승차) - 승차시간(sec)|: 0.0


In [17]:
# Night window applied to the pickup timestamp this time; both ends are included.
start = pd.Timestamp("2024-04-18 23:00:00")
end = pd.Timestamp("2024-04-19 02:00:00")
mask_pickup = df["승차시간"].ge(start) & df["승차시간"].le(end)

# Consistent taxi_type per vehicle: first (smallest) mode of its rows, or 1 when no mode exists.
type_mode = (
    df.dropna(subset=["차량번호"])
    .groupby("차량번호")["taxi_type"]
    .agg(lambda labels: int(next(iter(labels.mode()), 1)))
)

# Rows picked up inside the window, labelled with the per-vehicle type.
tmp = df.loc[mask_pickup].copy()
tmp["taxi_type_consistent"] = tmp["차량번호"].map(type_mode)

# Distinct vehicles per type (1 = corporate, 0 = private).
corp = tmp.loc[tmp["taxi_type_consistent"].eq(1), "차량번호"].nunique()
priv = tmp.loc[tmp["taxi_type_consistent"].eq(0), "차량번호"].nunique()
print("픽업기준 활성:", corp, priv, corp + priv)

픽업기준 활성: 540 441 981


In [19]:
import numpy as np
import pandas as pd

# Purpose: verify over the whole table that "승차시간(sec)" matches the elapsed
# time between pickup ("승차시간") and dropoff ("하차시간").

# 1) Type coercion (unparseable values become NaT / NaN)
taxi_df["승차시간"] = pd.to_datetime(taxi_df["승차시간"], errors="coerce")
taxi_df["하차시간"] = pd.to_datetime(taxi_df["하차시간"], errors="coerce")
sec_col = pd.to_numeric(taxi_df["승차시간(sec)"], errors="coerce")

# 2) Elapsed time in seconds computed from the timestamps
dur = (taxi_df["하차시간"] - taxi_df["승차시간"]).dt.total_seconds()

# 3) Comparison metrics
abs_diff = (dur - sec_col).abs()
# Only rows where both durations exist can be compared. All ratios and
# statistics below use this subset, so rows with a missing value are neither
# counted as mismatches nor allowed to break the aggregation on empty input.
valid_diff = abs_diff.dropna()
has_valid = len(valid_diff) > 0

summary = {
    "rows_total": len(taxi_df),
    "rows_valid_both": int((dur.notna() & sec_col.notna()).sum()),
    "exact_match_count": int((valid_diff == 0).sum()),
    "within_1s": float((valid_diff <= 1).mean()) if has_valid else np.nan,
    "within_5s": float((valid_diff <= 5).mean()) if has_valid else np.nan,
    "median_abs_diff": float(valid_diff.median()) if has_valid else np.nan,
    "mean_abs_diff": float(valid_diff.mean()) if has_valid else np.nan,
    "max_abs_diff": float(valid_diff.max()) if has_valid else np.nan,
    "negative_duration_count": int((dur < 0).sum()),
    "nan_duration_count": int(dur.isna().sum()),
    "nan_sec_col_count": int(sec_col.isna().sum()),
}
print(summary)

# 4) Inspect a few disagreeing rows (difference larger than 1 second)
mismatch = taxi_df.loc[abs_diff > 1, ["승차시간", "하차시간", "승차시간(sec)"]].copy()
print("mismatch_rows(>1s):", len(mismatch))
display(mismatch.head(10))

{'rows_total': 920506, 'rows_valid_both': 920506, 'exact_match_count': 920506, 'within_1s': 1.0, 'within_5s': 1.0, 'median_abs_diff': 0.0, 'mean_abs_diff': 0.0, 'max_abs_diff': 0.0, 'negative_duration_count': 0, 'nan_duration_count': 0, 'nan_sec_col_count': 0}
mismatch_rows(>1s): 0


Unnamed: 0,승차시간,하차시간,승차시간(sec)


In [20]:
# List the columns of taxi_df. Because `df` was bound to the same object
# (df = taxi_df), the columns added through `df` ("호출시간", "taxi_type") show up here too.
taxi_df.columns


Index(['구분', '차량번호', '운송사업자 등록번호', '결제일시', '승차요금', '호출요금', '기타요금', '할증여부',
       '결제구분(구분불가/현금/카드)', '승차시간', '승차X좌표', '승차Y좌표', '승차법정동코드', '하차시간',
       '하차X좌표', '하차Y좌표', '하차법정동코드', '승차거리(m)', '승차전빈차거리(m)', '승차시간(sec)',
       '결제연월일', '호출시간', 'taxi_type'],
      dtype='object')