In [31]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from google.colab import drive

In [33]:
# 1. Load the data
# Mount Google Drive so the CSV files under it can be read.
drive.mount('/content/drive')

# Directory holding the input CSVs; the individual file paths are built from it.
base_path = '/content/drive/My Drive/Colab Notebooks/homerun/'
gstation_path = base_path + 'gStation2.csv'
arrival_data_path = base_path + 'arrival_data.csv'

# Read both tables (schedule first, then arrival records) into DataFrames.
gstation_df, arrival_df = (
    pd.read_csv(csv_path) for csv_path in (gstation_path, arrival_data_path)
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
# 2. Preprocess arrival_data
# Convert ARRIVAL_TIME text of the form "H:MM" into a MINUTES column
# (hours * 60 + minutes); any parts after the second ':' are ignored.
# The guard is "not numeric" rather than `dtype == 'object'`: text columns can
# also be loaded with a pandas string dtype, in which case the old check
# skipped the conversion and MINUTES was never created for the later cells.
arrival_time = arrival_df['ARRIVAL_TIME']
if not pd.api.types.is_numeric_dtype(arrival_time):
    # Vectorized split: column 0 holds the hours, column 1 the minutes.
    time_parts = arrival_time.str.split(':', expand=True)
    arrival_df['MINUTES'] = time_parts[0].astype(int) * 60 + time_parts[1].astype(int)

# One-hot encode the day and the departure location
arrival_df_encoded = pd.get_dummies(arrival_df, columns=['DAY', 'DEPART_AT'])

In [42]:
# 3. Separate features and target
# MINUTES is the regression target; ARRIVAL_TIME and MINUTES are both left
# out of the feature matrix.
y = arrival_df_encoded['MINUTES']
X = arrival_df_encoded.drop(columns=['ARRIVAL_TIME', 'MINUTES'])

# Keep the feature column names for building prediction rows later
feature_columns = list(X.columns)

In [36]:
# 4. Train/test split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [37]:
# 5. Train the random forest model
# Hyperparameters are collected in one place; random_state fixes the forest's randomness.
forest_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
}
rf_model = RandomForestRegressor(**forest_params)
rf_model.fit(X_train, y_train)

In [38]:
# 6. Evaluate the model
# Score the held-out rows. Note that the last reported line is the square
# root of the MSE (i.e. the RMSE), expressed in minutes.
y_pred = rf_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(
    f"Mean Squared Error: {mse:.2f}",
    f"Mean Absolute Error: {mae:.2f}",
    f"Average prediction error: {np.sqrt(mse):.2f} minutes",
    sep="\n",
)

Mean Squared Error: 51713.99
Mean Absolute Error: 212.82
Average prediction error: 227.41 minutes


In [43]:
# 7. Build the new schedule
# For every weekday, predict one time with the trained model and write it into
# that day's <DAY>_STATION_DEPART column of a copy of the station schedule.
#
# NOTE(review): the only inputs set for the model here are the day and
# departure-location dummies; every other feature column stays 0. The school
# departure time itself is not a model input, so all rows of a given day
# receive the same predicted time. The prediction is therefore made once per
# day instead of once per row (identical result, far fewer predict() calls).
# TODO: confirm whether arrival_data should carry a departure-time feature.
days = ['MON', 'TUE', 'WED', 'THU', 'FRI']
new_schedule = gstation_df.copy()

for day in days:
    day_feature = f'DAY_{day}'

    # Fail early with a clear message: assigning a column that is absent from
    # feature_columns would silently append it and break predict() later.
    missing = [col for col in (day_feature, 'DEPART_AT_SCH') if col not in feature_columns]
    if missing:
        raise ValueError(f"Model was not trained with feature column(s): {missing}")

    # Single all-zero numeric feature row in the training column order;
    # every other dummy (including DEPART_AT_STA, if present) stays 0.
    pred_data = pd.DataFrame(0, index=[0], columns=feature_columns)
    pred_data[day_feature] = 1
    pred_data['DEPART_AT_SCH'] = 1

    # Predict (minutes, truncated to an integer) and format as HH:MM
    predicted_minutes = int(rf_model.predict(pred_data)[0])
    predicted_hours, predicted_mins = divmod(predicted_minutes, 60)
    predicted_time = f"{predicted_hours:02d}:{predicted_mins:02d}"

    # Update only the rows that actually have a school departure for this day;
    # rows with a missing departure time are left untouched.
    has_departure = gstation_df[f'{day}_SCHOOL_DEPART'].notna()
    new_schedule.loc[has_departure, f'{day}_STATION_DEPART'] = predicted_time

In [40]:
# 8. Save the new schedule
# Written next to the input CSVs, without the DataFrame index.
output_path = base_path + 'new_schedule.csv'
new_schedule.to_csv(output_path, index=False)
print("새로운 스케줄이 생성되어 저장되었습니다!")

# 9. Preview the new schedule (first five rows)
schedule_preview = new_schedule.head()
print("\n새로운 스케줄의 처음 5행:")
print(schedule_preview)

새로운 스케줄이 생성되어 저장되었습니다!

새로운 스케줄의 처음 5행:
  MON_SCHOOL_DEPART MON_STATION_DEPART TUE_SCHOOL_DEPART TUE_STATION_DEPART  \
0              7:45              12:53              7:45              12:41   
1              8:00              12:53              8:00              12:41   
2              8:02              12:53              8:02              12:41   
3              8:03              12:53              8:03              12:41   
4              8:05              12:53              8:05              12:41   

  WED_SCHOOL_DEPART WED_STATION_DEPART THU_SCHOOL_DEPART THU_STATION_DEPART  \
0              7:45              12:40              7:45              12:41   
1              8:00              12:40              8:00              12:41   
2              8:02              12:40              8:02              12:41   
3              8:03              12:40              8:03              12:41   
4              8:05              12:40              8:05              12:41   

  FRI_SCHO