In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

In [2]:
# Mount Google Drive at /content/drive so the files under PATH can be read.
drive.mount('/content/drive')

# Folder that holds the notebook's CSV inputs, and the two files read from it.
PATH = '/content/drive/My Drive/Colab Notebooks/homerun/'
timetable_path = PATH + 'gStation3.csv'
real_data_path = PATH + 'arrival_data2.csv'

def time_to_minutes(time_str):
  """Convert an "H:MM" / "HH:MM" clock string into a total number of minutes.

  Args:
    time_str: Text with exactly one ':' separating an integer hour part from
      an integer minute part, e.g. "8:10" or "24:00".

  Returns:
    int: hour part * 60 + minute part.

  Raises:
    ValueError: If the text does not split into exactly two integer parts.
  """
  hour_part, minute_part = (int(piece) for piece in time_str.split(':'))
  return 60 * hour_part + minute_part

def minutes_to_time(minutes):
  """Format a number of minutes as a zero-padded "HH:MM" string.

  Args:
    minutes: Minute count to format. Integers are formatted as-is. Fractional
      values (for example a mean of several arrival times) are rounded to the
      nearest whole minute first, using Python's built-in round().

  Returns:
    str: "HH:MM". Hours are not wrapped at 24, so 1440 gives "24:00".
  """
  # The `d` format code below rejects floats, so coerce to a plain int first.
  total_minutes = int(round(minutes))
  hours, mins = divmod(total_minutes, 60)
  return f"{hours:02d}:{mins:02d}"

Mounted at /content/drive


In [15]:
# 1. Load the data
# Read both CSV inputs; the two reads are independent of each other.
timetable = pd.read_csv(timetable_path)
real_data = pd.read_csv(real_data_path)

# Print the first rows of each frame, timetable first.
for _header, _frame in (
    ("\ntimetable data:\n", timetable),
    ("\narrival data:\n", real_data),
):
  print(_header, _frame.head())


timetable data:
    FIXED MON_SCHOOL_DEPART MON_STATION_DEPART MON_SCHOOL_ARRIVE  \
0  False              7:55               8:10              8:25   
1   True              8:00               8:15              8:30   
2  False              8:02               8:17              8:32   
3  False              8:03               8:18              8:33   
4   True              8:05               8:20              8:35   

  TUE_SCHOOL_DEPART TUE_STATION_DEPART TUE_SCHOOL_ARRIVE WED_SCHOOL_DEPART  \
0              7:55               8:10              8:25              7:55   
1              8:00               8:15              8:30              8:00   
2              8:02               8:17              8:32              8:02   
3              8:03               8:18              8:33              8:03   
4              8:05               8:20              8:35              8:05   

  WED_STATION_DEPART WED_SCHOOL_ARRIVE THU_SCHOOL_DEPART THU_STATION_DEPART  \
0               8:10           

In [16]:
# 2. Preprocessing
days = ['MON', 'TUE', 'WED', 'THU', 'FRI']

# Convert every clock-time column to integer minutes with time_to_minutes.
# The columns are overwritten in place, so a column that is already numeric
# is skipped: without that guard, running this cell a second time would pass
# integers to time_to_minutes and fail on `.split`.
for day in days:
  for suffix in ('SCHOOL_DEPART', 'STATION_DEPART', 'SCHOOL_ARRIVE'):
    column = f'{day}_{suffix}'
    if not pd.api.types.is_numeric_dtype(timetable[column]):
      timetable[column] = timetable[column].apply(time_to_minutes)

if not pd.api.types.is_numeric_dtype(real_data['ARRIVAL_TIME']):
  real_data['ARRIVAL_TIME'] = real_data['ARRIVAL_TIME'].apply(time_to_minutes)

print("\ntimetable data:\n", timetable.head())
print("\narrival data:\n", real_data.head())


timetable data:
    FIXED  MON_SCHOOL_DEPART  MON_STATION_DEPART  MON_SCHOOL_ARRIVE  \
0  False                475                 490                505   
1   True                480                 495                510   
2  False                482                 497                512   
3  False                483                 498                513   
4   True                485                 500                515   

   TUE_SCHOOL_DEPART  TUE_STATION_DEPART  TUE_SCHOOL_ARRIVE  \
0                475                 490                505   
1                480                 495                510   
2                482                 497                512   
3                483                 498                513   
4                485                 500                515   

   WED_SCHOOL_DEPART  WED_STATION_DEPART  WED_SCHOOL_ARRIVE  \
0                475                 490                505   
1                480                 495                

# {DAY}_SCHOOL_ARRIVE 데이터는 도착 시각을 안내할 때 전혀 쓰지 않으므로 필요 없을 수도?

In [28]:
# Build the training records
training_data = []

# Time ranges in which a shuttle never runs ("HH:MM", both ends inclusive),
# converted to minutes with time_to_minutes. Arrival records that fall inside
# any of these ranges are treated as bogus and dropped below.
NO_SHUTTLE_TIME = [
    [time_to_minutes(start), time_to_minutes(end)]
    for start, end in [
        ["00:00", "07:40"],
        ["08:30", "08:40"],
        ["09:30", "09:40"],
        ["10:30", "11:45"],
        ["12:30", "12:45"],
        ["13:30", "13:45"],
        ["14:30", "14:45"],
        ["18:45", "19:05"],
        ["19:45", "24:00"],
    ]
]

# Half-width of the inclusive window around a planned departure inside which
# an observed ARRIVAL_TIME is matched to that departure. Same unit as the
# time columns, which the preprocessing cell converts to minutes.
MATCH_WINDOW = 15

# Timetable rows whose FIXED flag is False. This selection does not depend on
# the day, so it is computed once instead of once per day.
false_rows = timetable[~timetable['FIXED']]

for day in days:

  # Observed arrivals recorded for this day.
  day_arrivals = real_data[real_data['DAY'] == day]

  # Drop the bogus records that fall inside a no-shuttle range.
  for start, end in NO_SHUTTLE_TIME:
    day_arrivals = day_arrivals[~day_arrivals['ARRIVAL_TIME'].between(start, end)]

  # Split by DEPART_AT once per day rather than once per timetable row.
  # 'STA' / 'SCH' presumably mean station / school -- TODO confirm.
  station_arrivals = day_arrivals[day_arrivals['DEPART_AT'] == 'STA']
  school_arrivals = day_arrivals[day_arrivals['DEPART_AT'] == 'SCH']

  for _, row in false_rows.iterrows():
    planned_school_depart = row[f'{day}_SCHOOL_DEPART']
    planned_station_depart = row[f'{day}_STATION_DEPART']
    planned_school_arrive = row[f'{day}_SCHOOL_ARRIVE']

    # Observed arrivals within +/- MATCH_WINDOW of each planned departure.
    nearby_station = station_arrivals[
        station_arrivals['ARRIVAL_TIME'].between(
            planned_station_depart - MATCH_WINDOW,
            planned_station_depart + MATCH_WINDOW)
    ]
    nearby_school = school_arrivals[
        school_arrivals['ARRIVAL_TIME'].between(
            planned_school_depart - MATCH_WINDOW,
            planned_school_depart + MATCH_WINDOW)
    ]

    # A timetable row with no matching observation yields no training record.
    if len(nearby_station) == 0 and len(nearby_school) == 0:
      continue

    # Mean offset of the observed arrivals from the plan; 0 when nothing matched.
    mean_station_diff = (
        nearby_station['ARRIVAL_TIME'].mean() - planned_station_depart
        if len(nearby_station) > 0 else 0
    )
    mean_school_diff = (
        nearby_school['ARRIVAL_TIME'].mean() - planned_school_depart
        if len(nearby_school) > 0 else 0
    )

    features = {
        'day': day,
        'planned_school_depart': planned_school_depart,
        'planned_station_depart': planned_station_depart,
        'planned_school_arrive': planned_school_arrive,
        'num_nearby_station': len(nearby_station),
        'num_nearby_school': len(nearby_school),
        'mean_station_diff': mean_station_diff,
        'mean_school_diff': mean_school_diff,
        # The sample std needs at least two observations; fall back to 0.
        'std_station_diff': nearby_station['ARRIVAL_TIME'].std() if len(nearby_station) > 1 else 0,
        'std_school_diff': nearby_school['ARRIVAL_TIME'].std() if len(nearby_school) > 1 else 0,
        'planned_travel_time': planned_station_depart - planned_school_depart
    }

    # Station observations take priority; school observations are the fallback.
    # NOTE(review): the target is therefore always identical to either the
    # 'mean_station_diff' or the 'mean_school_diff' feature of the same
    # record, so a model trained on all of these columns sees its own answer
    # (target leakage). Drop those two columns from the model inputs or
    # derive the target from held-out observations.
    if len(nearby_station) > 0:
      target = mean_station_diff
    else:
      target = mean_school_diff

    training_data.append({**features, 'target': target})

In [37]:
# Show the first five generated training records.
training_data[:5]

[{'day': 'MON',
  'planned_school_depart': 475,
  'planned_station_depart': 490,
  'planned_school_arrive': 505,
  'num_nearby_station': 2,
  'num_nearby_school': 0,
  'mean_station_diff': 6.0,
  'mean_school_diff': 0,
  'std_station_diff': 1.4142135623730951,
  'std_school_diff': 0,
  'planned_travel_time': 15,
  'target': 6.0},
 {'day': 'MON',
  'planned_school_depart': 482,
  'planned_station_depart': 497,
  'planned_school_arrive': 512,
  'num_nearby_station': 2,
  'num_nearby_school': 0,
  'mean_station_diff': -1.0,
  'mean_school_diff': 0,
  'std_station_diff': 1.4142135623730951,
  'std_school_diff': 0,
  'planned_travel_time': 15,
  'target': -1.0},
 {'day': 'MON',
  'planned_school_depart': 483,
  'planned_station_depart': 498,
  'planned_school_arrive': 513,
  'num_nearby_station': 2,
  'num_nearby_school': 0,
  'mean_station_diff': -2.0,
  'mean_school_diff': 0,
  'std_station_diff': 1.4142135623730951,
  'std_school_diff': 0,
  'planned_travel_time': 15,
  'target': -2.0},
