In [1]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import holidays

# Plot defaults: use the Malgun Gothic font family for figure text, and turn off
# the Unicode minus glyph so negative tick labels are drawn with a plain hyphen.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

In [2]:
# Load the merged subway / weather training table (the file is cp949-encoded)
# and preview its first rows.
df1 = pd.read_csv(
    'train_subway_merged_ext_data.csv',
    encoding='cp949',
)
df1.head()

Unnamed: 0,tm,line,station_number,station_name,direction,stn,ta,wd,ws,rn_day,rn_hr1,hm,si,ta_chi,congestion,entrances,transfer_count
0,2021010100,1,150,서울역,상선,419,-9.6,291.1,3.3,0.0,0.0,-99.0,-99.0,-12.6,0,9,3
1,2021010101,1,150,서울역,상선,419,-9.7,284.6,2.0,0.0,0.0,-99.0,-99.0,-9.8,0,9,3
2,2021010105,1,150,서울역,상선,419,-9.3,124.7,2.4,0.0,0.0,-99.0,-99.0,-10.3,1,9,3
3,2021010106,1,150,서울역,상선,419,-9.3,126.2,1.7,0.0,0.0,-99.0,-99.0,-10.1,2,9,3
4,2021010107,1,150,서울역,상선,419,-9.1,145.7,1.3,0.0,0.0,-99.0,-99.0,-9.7,3,9,3


In [3]:
# Weather columns in which -99 is used as a "no reading" sentinel.
missing_cols = ['ta', 'wd', 'ws', 'rn_day', 'rn_hr1', 'hm', 'si', 'ta_chi']

# Turn the sentinels into NaN: -99 in every weather column, and additionally
# -9.9 in the 'wd' column.
df1[missing_cols] = df1[missing_cols].replace({-99.0: np.nan})
df1['wd'] = df1['wd'].replace({-9.9: np.nan})
print(df1.isna().sum())

# Columns that must be present for a row to be kept. 'hm' and 'si' are left
# out of this list, so rows missing only those two values survive the drop.
valid_cols = [col for col in missing_cols if col not in ('hm', 'si')]

# Drop every row that lacks a value in any of the required columns, then
# show the remaining per-column NaN counts.
df1 = df1.dropna(subset=valid_cols)
df1.isna().sum()

tm                      0
line                    0
station_number          0
station_name            0
direction               0
stn                     0
ta                 216468
wd                 287270
ws                 230786
rn_day             351574
rn_hr1             360796
hm                 844594
si                6064242
ta_chi                352
congestion              0
entrances               0
transfer_count          0
dtype: int64


tm                      0
line                    0
station_number          0
station_name            0
direction               0
stn                     0
ta                      0
wd                      0
ws                      0
rn_day                  0
rn_hr1                  0
hm                 626728
si                5896072
ta_chi                  0
congestion              0
entrances               0
transfer_count          0
dtype: int64

In [None]:
# Parse the YYYYMMDDHH stamp into a real timestamp, then derive calendar
# features from it through a single shared .dt accessor.
df1['tm'] = pd.to_datetime(df1['tm'], format='%Y%m%d%H')
tm_parts = df1['tm'].dt
df1['hour'] = tm_parts.hour
df1['month'] = tm_parts.month
df1['date'] = tm_parts.date
df1['weekday'] = tm_parts.weekday  # pandas convention: Monday = 0 ... Sunday = 6

# Korean holiday calendar built for every year that occurs in the data.
kr_holidays = holidays.KR(years=tm_parts.year.unique())

# Flag rows whose calendar date appears in the holiday calendar.
df1['is_holiday'] = df1['date'].isin(kr_holidays)

# 3. Classify each row's day type
def classify_day(row):
    """Return the day-type label for a single row.

    Args:
        row: Mapping-like object (for example a DataFrame row) that exposes
            the ``is_holiday`` and ``weekday`` entries.

    Returns:
        '공휴일' (public holiday) when ``is_holiday`` is truthy; otherwise
        '일요일' (Sunday) for weekday 6, '토요일' (Saturday) for weekday 5,
        and '평일' (ordinary weekday) for any other value.
    """
    # The holiday flag outranks the weekend labels, so it is tested first and
    # 'weekday' is only read when the row is not a holiday.
    if row['is_holiday']:
        return '공휴일'
    weekend_labels = {6: '일요일', 5: '토요일'}
    return weekend_labels.get(row['weekday'], '평일')

# Label every row as public holiday / Sunday / Saturday / weekday in one
# vectorised pass instead of a Python-level call per row. np.select uses the
# first condition that matches, so the holiday flag outranks the weekend
# checks, and rows matching no condition receive the weekday label.
weekday_codes = df1['weekday'].to_numpy()
df1['day_type'] = np.select(
    [
        df1['is_holiday'].to_numpy(dtype=bool),
        weekday_codes == 6,
        weekday_codes == 5,
    ],
    ['공휴일', '일요일', '토요일'],
    default='평일',
)

# Row count per day type.
df1['day_type'].value_counts()

In [None]:
# Work on an independent deep copy so that df1 itself is left unmodified.
df2 = df1.copy(deep=True)

# Linear-interpolation helper, applied to one station's rows at a time
def linear_interpolate_per_station(group):
    """Return the group's rows ordered by 'tm' with 'hm' linearly interpolated.

    Args:
        group: DataFrame holding at least the 'tm' and 'hm' columns.

    Returns:
        A new DataFrame sorted by 'tm' whose 'hm' column has been passed
        through ``interpolate(method='linear')``; the input frame is not
        modified.
    """
    return (
        group
        .sort_values('tm')
        .assign(hm=lambda ordered: ordered['hm'].interpolate(method='linear'))
    )

# Interpolate humidity station by station. group_keys=False keeps the station
# name from being added as an extra index level, and the index is rebuilt
# afterwards so rows are numbered 0..n-1 again.
df2 = (
    df2.groupby('station_name', group_keys=False)
    .apply(linear_interpolate_per_station)
    .reset_index(drop=True)
)

# Fill whatever interpolation left empty using values from the SAME station
# first. A plain frame-wide forward fill would copy the last humidity of one
# station into the leading gap of the next station's rows.
df2['hm'] = (
    df2.groupby('station_name')['hm']
    .transform(lambda station_hm: station_hm.ffill().bfill())
)

# Last resort for stations that have no humidity reading at all: borrow from
# neighbouring rows so the column ends up without NaN. Uses .ffill()/.bfill()
# because fillna(method=...) is deprecated in current pandas.
df2['hm'] = df2['hm'].ffill().bfill()

# Report how many missing humidity values remain.
print("최종 결측치 수:", df2['hm'].isna().sum())