In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the reference timetable and the observed departure records
standard_df = pd.read_csv('gStation_standard.csv')
real_df = pd.read_csv('real_data.csv')

# Weekday codes; each one is matched against the timetable's column names
days = ['MON', 'TUE', 'WED', 'THU', 'FRI']

# Build one frame per weekday: keep the columns whose name contains the
# weekday code, then drop the weekday prefix from the three known columns
standard_by_day = {}
for day in days:
    day_cols = list(filter(lambda name: day in name, standard_df.columns))
    prefix_free_names = {
        f'{day}_FIXED': 'FIXED',
        f'{day}_SCHOOL_DEPART': 'SCHOOL_DEPART',
        f'{day}_STATION_DEPART': 'STATION_DEPART',
    }
    day_frame = standard_df[day_cols]
    standard_by_day[day] = day_frame.rename(columns=prefix_free_names)

In [3]:
# Turn the 'HH:MM' strings into datetime.time objects
parsed_depart = pd.to_datetime(real_df['DEPART_TIME'], format='%H:%M')
real_df['DEPART_TIME'] = parsed_depart.dt.time
for day in days:
    day_frame = standard_by_day[day]
    # Same element-wise conversion for both departure columns, school first
    for depart_col in ('SCHOOL_DEPART', 'STATION_DEPART'):
        day_frame[depart_col] = day_frame[depart_col].apply(
            lambda x: pd.to_datetime(x, format='%H:%M').time()
        )

# Partition a timetable into time slots
def define_time_slots(df):
    """Group the rows of ``df`` into consecutive time slots.

    Rows are scanned in order. A row whose ``'FIXED'`` value is truthy
    always forms a slot of its own; each maximal run of non-fixed rows
    between fixed rows forms one slot.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'FIXED'`` column.

    Returns
    -------
    list of list of int
        Slots in row order. Each slot holds 0-based *positional* row
        indices, suitable for ``df.iloc``. For a frame with the default
        RangeIndex these coincide with the index labels.
    """
    time_slots = []
    current_slot = []
    # Enumerate positions instead of using the iterrows() label: the slots are
    # consumed with .iloc, so labels would be wrong for a non-default index.
    for pos, (_, row) in enumerate(df.iterrows()):
        if row['FIXED']:
            # Close the pending run of non-fixed rows before the fixed row
            if current_slot:
                time_slots.append(current_slot)
            time_slots.append([pos])  # a FIXED row is a slot on its own
            current_slot = []
        else:
            current_slot.append(pos)
    # Flush the trailing run of non-fixed rows, if any
    if current_slot:
        time_slots.append(current_slot)
    return time_slots

In [4]:
# Compute the slot partition of every weekday's timetable
time_slots_by_day = {}
for day in days:
    time_slots_by_day[day] = define_time_slots(standard_by_day[day])

# Convert a time of day into a minute count
def time_to_minutes(t):
    """Return ``t`` expressed as minutes, computed from ``t.hour`` and ``t.minute``."""
    hours, minutes = t.hour, t.minute
    return hours * 60 + minutes

# Feature construction
def prepare_features(df, time_slots, day):
    """Build one feature row and two targets for every row listed in ``time_slots``.

    Parameters
    ----------
    df : pandas.DataFrame
        Timetable with ``'SCHOOL_DEPART'`` and ``'STATION_DEPART'`` columns
        whose values are accepted by ``time_to_minutes``.
    time_slots : list of list of int
        Positional row indices (used with ``df.iloc``) grouped into slots.
    day
        Stored unchanged in the ``'day'`` feature of every row.

    Returns
    -------
    tuple
        ``(features, targets_school, targets_station, indices)`` where
        ``features`` is a DataFrame with the columns ``day``,
        ``slot_start_minutes``, ``order_in_slot``, ``slot_size`` and
        ``relative_position``; the two targets are NumPy arrays holding the
        school / station departure in minutes; ``indices`` lists the
        positional row index of each feature row, in slot order.
    """
    features = []
    targets_school = []
    targets_station = []
    indices = []

    for slot in time_slots:
        slot_size = len(slot)
        # The slot starts at the earlier of the first row's two departures
        first_row = df.iloc[slot[0]]
        start_time = min(time_to_minutes(first_row['SCHOOL_DEPART']),
                         time_to_minutes(first_row['STATION_DEPART']))

        for i, idx in enumerate(slot):
            row = df.iloc[idx]
            features.append({
                'day': day,
                'slot_start_minutes': start_time,
                'order_in_slot': i,
                'slot_size': slot_size,
                # A single-row slot has no meaningful relative position
                'relative_position': i / slot_size if slot_size > 1 else 0
            })
            targets_school.append(time_to_minutes(row['SCHOOL_DEPART']))
            targets_station.append(time_to_minutes(row['STATION_DEPART']))
            indices.append(idx)

    return pd.DataFrame(features), np.array(targets_school), np.array(targets_station), indices

In [5]:
# Prepare features, targets and row indices for each weekday
features_by_day, targets_school_by_day = {}, {}
targets_station_by_day, indices_by_day = {}, {}
for day in days:
    day_frame = standard_by_day[day]
    day_slots = time_slots_by_day[day]
    features, targets_school, targets_station, indices = prepare_features(day_frame, day_slots, day)
    features_by_day[day], indices_by_day[day] = features, indices
    targets_school_by_day[day], targets_station_by_day[day] = targets_school, targets_station

# Convert the observed departures into training rows
real_features = []
real_targets_school = []
real_targets_station = []

# Slot start minutes depend only on the weekday's timetable, so they are
# computed once per weekday (on first use) instead of once per observed row.
slot_starts_by_day = {}

for _, row in real_df.iterrows():
    day = row['DAY']
    depart_time = time_to_minutes(row['DEPART_TIME'])
    # 'SCH' rows are compared against the school departures, all others
    # against the station departures
    departs_from_school = row['DEPART_AT'] == 'SCH'
    depart_col = 'SCHOOL_DEPART' if departs_from_school else 'STATION_DEPART'

    day_frame = standard_by_day[day]
    day_slots = time_slots_by_day[day]
    if day not in slot_starts_by_day:
        slot_starts_by_day[day] = [
            min(time_to_minutes(day_frame.iloc[slot[0]]['SCHOOL_DEPART']),
                time_to_minutes(day_frame.iloc[slot[0]]['STATION_DEPART']))
            for slot in day_slots
        ]
    slot_starts = slot_starts_by_day[day]

    # Pick the slot whose start is nearest to the observed departure
    # (the first such slot wins on ties)
    closest_slot_start = min(slot_starts, key=lambda x: abs(x - depart_time))
    slot_idx = slot_starts.index(closest_slot_start)
    slot = day_slots[slot_idx]
    slot_size = len(slot)

    # Position inside the slot whose scheduled departure is nearest to the
    # observed one. Computed BEFORE the dict is built: the previous code read
    # feature['order_in_slot'] inside the dict literal being assigned to
    # `feature`, which raised NameError on the first row and otherwise reused
    # the previous row's value.
    order_in_slot = min(
        range(slot_size),
        key=lambda i: abs(time_to_minutes(day_frame.iloc[slot[i]][depart_col]) - depart_time)
    )

    feature = {
        'day': day,
        'slot_start_minutes': closest_slot_start,
        'order_in_slot': order_in_slot,
        'slot_size': slot_size,
        'relative_position': order_in_slot / slot_size if slot_size > 1 else 0
    }
    real_features.append(feature)

    # Each observation provides exactly one target; the other stays NaN
    if departs_from_school:
        real_targets_school.append(depart_time)
        real_targets_station.append(np.nan)
    else:
        real_targets_station.append(depart_time)
        real_targets_school.append(np.nan)

In [6]:
# Assemble the training table: features plus both (partly missing) targets
real_features_df = pd.DataFrame(real_features)
real_features_df['target_school'] = real_targets_school
real_features_df['target_station'] = real_targets_station

# Encode the weekday strings as integers
day_mapping = {'MON': 0, 'TUE': 1, 'WED': 2, 'THU': 3, 'FRI': 4}
real_features_df['day'] = real_features_df['day'].map(day_mapping)

# Training sets: for each target keep only the rows where it is present
feature_columns = ['day', 'slot_start_minutes', 'order_in_slot', 'slot_size', 'relative_position']
school_rows = real_features_df.dropna(subset=['target_school'])
station_rows = real_features_df.dropna(subset=['target_station'])
X_school, y_school = school_rows[feature_columns], school_rows['target_school']
X_station, y_station = station_rows[feature_columns], station_rows['target_station']

# Fit one regressor per target; a model with no training rows stays unfitted
rf_school = RandomForestRegressor(n_estimators=100, random_state=42)
rf_station = RandomForestRegressor(n_estimators=100, random_state=42)
for model, X_train, y_train in ((rf_school, X_school, y_school),
                                (rf_station, X_station, y_station)):
    if len(X_train) > 0:
        model.fit(X_train, y_train)

# Convert a minute count back into a time of day
def minutes_to_time(minutes):
    """Return the ``datetime.time`` for ``minutes``.

    The value is split into whole hours and leftover minutes; both are
    truncated to ``int`` and parsed by ``pd.Timestamp`` as ``'HH:MM'``.
    """
    whole_hours, leftover = divmod(minutes, 60)
    hours = int(whole_hours)
    mins = int(leftover)
    stamp = pd.Timestamp(f'{hours:02d}:{mins:02d}')
    return stamp.time()