In [20]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')  # 경고 메시지 억제

In [21]:
# Read the standard timetable and the observed departures from the working directory
standard_df = pd.read_csv('gStation_standard.csv')
real_df = pd.read_csv('real_data.csv')

# Preview the first rows of each frame, one heading per frame
for heading, frame in (
    ("gStation_standard.csv head:", standard_df),
    ("\nreal_data.csv head:", real_df),
):
    print(heading)
    print(frame.head())

gStation_standard.csv head:
   MON_FIXED MON_SCHOOL_DEPART MON_STATION_DEPART  TUE_FIXED  \
0      False              7:55               8:10      False   
1       True              8:00               8:15       True   
2      False              8:02               8:17      False   
3      False              8:04               8:19      False   
4       True              8:05               8:20       True   

  TUE_SCHOOL_DEPART TUE_STATION_DEPART  WED_FIXED WED_SCHOOL_DEPART  \
0              7:55               8:10      False              7:55   
1              8:00               8:15       True              8:00   
2              8:02               8:17      False              8:02   
3              8:04               8:19      False              8:04   
4              8:05               8:20       True              8:05   

  WED_STATION_DEPART  THU_FIXED THU_SCHOOL_DEPART THU_STATION_DEPART  \
0               8:10      False              7:55               8:10   
1               

In [22]:
# Weekday prefixes used by the timetable column names
days = ['MON', 'TUE', 'WED', 'THU', 'FRI']

# Split standard_df into one frame per weekday and drop the weekday prefix
# from the three known columns so every per-day frame shares the same schema.
standard_by_day = {}
for day in days:
    prefix = f'{day}_'
    # Match on the exact "<DAY>_" prefix rather than on a substring, so a
    # column that merely contains the weekday letters is not picked up.
    day_cols = [col for col in standard_df.columns if col.startswith(prefix)]
    standard_by_day[day] = standard_df[day_cols].rename(columns={
        f'{day}_FIXED': 'FIXED',
        f'{day}_SCHOOL_DEPART': 'SCHOOL_DEPART',
        f'{day}_STATION_DEPART': 'STATION_DEPART'
    })

# Parse the observed departure times ('H:MM' strings) into datetime.time objects.
# NOTE(review): this overwrites real_df['DEPART_TIME'] in place, so re-running
# this cell without reloading real_df will likely fail on the already-parsed
# values -- confirm and reload the CSV before re-running.
real_df['DEPART_TIME'] = pd.to_datetime(real_df['DEPART_TIME'], format='%H:%M').dt.time

# Parse the standard departure columns the same way (they become datetime.time
# objects, not strings), using the same vectorized conversion as above.
for day in days:
    for depart_col in ('SCHOOL_DEPART', 'STATION_DEPART'):
        standard_by_day[day][depart_col] = pd.to_datetime(
            standard_by_day[day][depart_col], format='%H:%M'
        ).dt.time

In [23]:
# Partition the rows of a per-day timetable into time slots
def define_time_slots(df):
    """Group row positions of ``df`` into time slots based on the 'FIXED' column.

    Every row whose 'FIXED' value is truthy forms a slot of its own; each run
    of consecutive rows whose 'FIXED' value is falsy forms one slot.

    Args:
        df: DataFrame with a 'FIXED' column.

    Returns:
        A list of slots in row order. Each slot is a list of 0-based row
        positions (suitable for ``df.iloc``), independent of the index labels
        of ``df``.
    """
    time_slots = []
    current_slot = []
    # Enumerate the column values directly: this yields positions (which is
    # what callers pass to .iloc) and avoids building a Series per row.
    for position, is_fixed in enumerate(df['FIXED'].tolist()):
        if is_fixed:
            # Close the pending run of non-fixed rows before the fixed row.
            if current_slot:
                time_slots.append(current_slot)
                current_slot = []
            time_slots.append([position])  # a FIXED row is always its own slot
        else:
            current_slot.append(position)
    if current_slot:
        time_slots.append(current_slot)
    return time_slots

# Compute the slot partition of every weekday's timetable
time_slots_by_day = {}
for day in days:
    time_slots_by_day[day] = define_time_slots(standard_by_day[day])

# Print Monday's slots as an example
print("MON의 시간대:", time_slots_by_day['MON'])

MON의 시간대: [[0], [1], [2, 3], [4], [5, 6], [7], [8], [9], [10], [11], [12, 13], [14], [15], [16], [17], [18], [19], [20, 21], [22], [23, 24, 25], [26], [27], [28], [29]]


In [24]:
# Convert a time-of-day value to minutes
def time_to_minutes(t):
    """Return ``t`` as whole minutes since midnight.

    ``t`` must expose ``hour`` and ``minute`` attributes; no other attribute
    of ``t`` is read.
    """
    hours, minutes = t.hour, t.minute
    return 60 * hours + minutes

# Build model features for one weekday
def prepare_features(df, time_slots, day):
    """Build one feature row per timetable row, plus the standard departure minutes.

    Args:
        df: per-day timetable with 'SCHOOL_DEPART' and 'STATION_DEPART'
            columns whose values are accepted by ``time_to_minutes``.
        time_slots: list of slots; each slot is a list of row positions into
            ``df`` (they are used with ``df.iloc``).
        day: value stored unchanged in the 'day' feature column.

    Returns:
        A 4-tuple ``(features, targets_school, targets_station, indices)``:
        a DataFrame with columns 'day', 'slot_start_minutes', 'order_in_slot'
        and 'slot_size'; two numpy arrays with the school / station departure
        minutes; and the list of row positions. All four are aligned row by
        row and follow the order of ``time_slots``.
    """
    feature_columns = ['day', 'slot_start_minutes', 'order_in_slot', 'slot_size']
    features = []
    targets_school = []
    targets_station = []
    indices = []

    for slot in time_slots:
        if not slot:
            continue  # an empty slot has no first row and contributes nothing

        # The slot starts at the earlier of the two departures of its first row.
        first_row = df.iloc[slot[0]]
        start_time = min(time_to_minutes(first_row['SCHOOL_DEPART']),
                         time_to_minutes(first_row['STATION_DEPART']))
        slot_size = len(slot)

        for order, idx in enumerate(slot):
            row = df.iloc[idx]
            features.append({
                'day': day,
                'slot_start_minutes': start_time,
                'order_in_slot': order,
                'slot_size': slot_size
            })
            targets_school.append(time_to_minutes(row['SCHOOL_DEPART']))
            targets_station.append(time_to_minutes(row['STATION_DEPART']))
            indices.append(idx)

    # Passing the column list keeps the schema stable even when no row was produced.
    features_df = pd.DataFrame(features, columns=feature_columns)
    return features_df, np.array(targets_school), np.array(targets_station), indices

# Run the feature builder once per weekday, then fan its four outputs out
# into one dictionary per output, each keyed by weekday.
prepared_by_day = {
    day: prepare_features(standard_by_day[day], time_slots_by_day[day], day)
    for day in days
}
features_by_day = {day: prepared[0] for day, prepared in prepared_by_day.items()}
targets_school_by_day = {day: prepared[1] for day, prepared in prepared_by_day.items()}
targets_station_by_day = {day: prepared[2] for day, prepared in prepared_by_day.items()}
indices_by_day = {day: prepared[3] for day, prepared in prepared_by_day.items()}

In [26]:
# One Random Forest per departure point (school / station)
rf_school = RandomForestRegressor(n_estimators=100, random_state=42)
rf_station = RandomForestRegressor(n_estimators=100, random_state=42)

# Feature columns, in the order the models are trained on
feature_columns = ['day', 'slot_start_minutes', 'order_in_slot', 'slot_size']

# Turn every observed departure into one training row
real_features = []
real_targets_school = []
real_targets_station = []

# Slot start minutes depend only on the weekday, so compute them the first
# time a weekday is seen instead of once per observed departure.
slot_starts_by_day = {}

for _, row in real_df.iterrows():
    day = row['DAY']
    depart_time = time_to_minutes(row['DEPART_TIME'])
    day_df = standard_by_day[day]
    day_slots = time_slots_by_day[day]

    if day not in slot_starts_by_day:
        slot_starts_by_day[day] = [
            min(time_to_minutes(day_df.iloc[s[0]]['SCHOOL_DEPART']),
                time_to_minutes(day_df.iloc[s[0]]['STATION_DEPART']))
            for s in day_slots
        ]
    slot_starts = slot_starts_by_day[day]

    # First slot whose start is closest to the observed departure.
    slot_idx = min(range(len(slot_starts)), key=lambda k: abs(slot_starts[k] - depart_time))
    closest_slot_start = slot_starts[slot_idx]
    matched_slot = day_slots[slot_idx]
    slot_size = len(matched_slot)

    # Compare against the standard column of the point the bus departed from.
    from_school = row['DEPART_AT'] == 'SCH'
    depart_col = 'SCHOOL_DEPART' if from_school else 'STATION_DEPART'

    # Position inside the slot of the standard departure closest to the observation.
    order_in_slot = min(
        range(slot_size),
        key=lambda i: abs(time_to_minutes(day_df.iloc[matched_slot[i]][depart_col]) - depart_time)
    )

    real_features.append({
        'day': day,
        'slot_start_minutes': closest_slot_start,
        'order_in_slot': order_in_slot,
        'slot_size': slot_size
    })
    # Each observation is a target for exactly one model; the other gets NaN
    # so the row is dropped from that model's training set below.
    if from_school:
        real_targets_school.append(depart_time)
        real_targets_station.append(np.nan)
    else:
        real_targets_station.append(depart_time)
        real_targets_school.append(np.nan)

# Assemble the training frame; the explicit column list keeps the schema
# intact even when real_df has no rows.
real_features_df = pd.DataFrame(real_features, columns=feature_columns)

# Attach the targets
real_features_df['target_school'] = real_targets_school
real_features_df['target_station'] = real_targets_station

# Encode the weekday as a number
day_mapping = {'MON': 0, 'TUE': 1, 'WED': 2, 'THU': 3, 'FRI': 4}
real_features_df['day'] = real_features_df['day'].map(day_mapping)

# Split into per-model training sets (rows without a target for that model are dropped)
school_rows = real_features_df.dropna(subset=['target_school'])
station_rows = real_features_df.dropna(subset=['target_station'])
X_school = school_rows[feature_columns]
y_school = school_rows['target_school']
X_station = station_rows[feature_columns]
y_station = station_rows['target_station']

# Fit each model only when it has training rows; a model without rows stays unfitted.
if len(X_school) > 0:
    rf_school.fit(X_school, y_school)
if len(X_station) > 0:
    rf_station.fit(X_station, y_station)

In [29]:
# Convert minutes since midnight to a time object
def minutes_to_time(minutes):
    """Convert minutes since midnight to a ``datetime.time``.

    Fractional values are rounded to the nearest whole minute (halves round
    up) instead of being truncated, so a prediction such as 489.99 maps to
    08:10 rather than 08:09.

    Args:
        minutes: int or float number of minutes since midnight.

    Returns:
        ``datetime.time`` with the hour and minute set (seconds are zero).

    Raises:
        ValueError: if ``minutes`` is NaN or the rounded value falls outside
            00:00-23:59.
    """
    from datetime import time as _time  # local import: keeps the function self-contained

    # Round the total first so that e.g. 479.6 becomes 08:00, not 07:60.
    total = int(np.floor(minutes + 0.5))
    hours, mins = divmod(total, 60)
    return _time(hours, mins)

# Predict departures and write the adjusted times into a copy of the standard timetable
adjusted_standard = standard_df.copy()

# Weekday encoding (the same mapping is applied to the training features)
day_mapping = {'MON': 0, 'TUE': 1, 'WED': 2, 'THU': 3, 'FRI': 4}

# Training fits each model only when it has rows, so a model may be unfitted
# here. RandomForestRegressor gains `estimators_` in fit(); without it,
# predict() would raise NotFittedError.
school_fitted = hasattr(rf_school, 'estimators_')
station_fitted = hasattr(rf_station, 'estimators_')

for day in days:
    df = standard_by_day[day]
    indices = indices_by_day[day]
    if not indices:
        continue  # no rows to adjust for this weekday

    features = features_by_day[day].copy()  # copy so the stored features keep the string weekday
    features['day'] = features['day'].map(day_mapping)

    # Fall back to the standard departure minutes when a model was never fitted,
    # which leaves that departure column unchanged in value.
    if school_fitted:
        pred_school = rf_school.predict(features)
    else:
        pred_school = np.asarray(targets_school_by_day[day], dtype=float)
    if station_fitted:
        pred_station = rf_station.predict(features)
    else:
        pred_station = np.asarray(targets_station_by_day[day], dtype=float)

    # Row position -> position in the prediction arrays, built once per day
    # instead of re-scanning every row for every slot.
    prediction_pos = {row_pos: k for k, row_pos in enumerate(indices)}

    for slot in time_slots_by_day[day]:
        positions = sorted(prediction_pos[row_pos] for row_pos in slot if row_pos in prediction_pos)
        if not positions:
            continue
        slot_indices = [indices[k] for k in positions]
        slot_preds_school = [pred_school[k] for k in positions]
        slot_preds_station = [pred_station[k] for k in positions]

        # Cap predictions at the FIXED departure when the slot contains one.
        # NOTE(review): slots built by define_time_slots put every FIXED row in
        # a slot of its own, and FIXED rows are never written below, so this cap
        # currently has no visible effect -- confirm whether the intent was to
        # cap against the neighbouring FIXED departure instead.
        slot_df = df.iloc[slot]
        fixed_row = slot_df[slot_df['FIXED']]
        if not fixed_row.empty:
            max_time_school = time_to_minutes(fixed_row['SCHOOL_DEPART'].iloc[0])
            max_time_station = time_to_minutes(fixed_row['STATION_DEPART'].iloc[0])
            slot_preds_school = [min(p, max_time_school) for p in slot_preds_school]
            slot_preds_station = [min(p, max_time_station) for p in slot_preds_station]

        # Keep departures inside every slot in ascending order; the number of
        # departures per slot is unchanged.
        slot_preds_school = sorted(slot_preds_school)
        slot_preds_station = sorted(slot_preds_station)

        # Write the results back, leaving FIXED rows untouched
        for idx, school_time, station_time in zip(slot_indices, slot_preds_school, slot_preds_station):
            if df.iloc[idx]['FIXED']:
                continue
            # idx is used as a row position above (df.iloc), so resolve the
            # matching index label before the label-based write.
            row_label = adjusted_standard.index[idx]
            school_t = minutes_to_time(school_time)
            station_t = minutes_to_time(station_time)
            # Store 'H:MM' strings like the untouched rows, so the saved CSV
            # keeps a single time format.
            adjusted_standard.at[row_label, f'{day}_SCHOOL_DEPART'] = f'{school_t.hour}:{school_t.minute:02d}'
            adjusted_standard.at[row_label, f'{day}_STATION_DEPART'] = f'{station_t.hour}:{station_t.minute:02d}'

In [31]:
# Write the adjusted timetable to disk without the index column, then confirm
output_csv = 'grok_gStation.csv'
adjusted_standard.to_csv(output_csv, index=False)
print("조정된 시간표가 'grok_gStation.csv'로 저장되었습니다.")

조정된 시간표가 'grok_gStation.csv'로 저장되었습니다.
