# In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import os

# 1. Load the merged per-year accident dataset.
#    "utf-8-sig" decodes UTF-8 and strips a leading byte-order mark if present.
file_path = "data/merged_data/Accident_by_year_merged_hc.csv"
df = pd.read_csv(file_path, encoding="utf-8-sig")

# 2. Make a working copy; filled values are written to df_filled while df
#    keeps the data exactly as loaded.
df_filled = df.copy()

# -------------------------------
# Stage 1: regression-fill missing '거주인구' (resident population) for 2005-2007
# -------------------------------
target_years = [2005, 2006, 2007]
target_column = "거주인구"

# One linear trend (year -> population) is fitted per district ('자치구'),
# using every row of that district that already has a population value.
for gu in df['자치구'].unique():
    gu_data = df_filled[df_filled['자치구'] == gu]

    # Both 0 and NaN mark a missing population. The same mask drives the
    # training set and the rows to fill, so a NaN in the target years is
    # filled just like a 0 instead of being silently left behind.
    is_missing = gu_data[target_column].isna() | (gu_data[target_column] == 0)
    train = gu_data[~is_missing]
    predict = gu_data[is_missing & gu_data['연도'].isin(target_years)]

    # Nothing to learn from, or nothing to fill, for this district.
    if train.empty or predict.empty:
        continue

    model = LinearRegression()
    model.fit(train[['연도']], train[target_column])

    # Predictions are rounded to whole numbers before being written back;
    # the array is in the same row order as predict.index.
    preds = np.round(model.predict(predict[['연도']])).astype("int64")
    df_filled.loc[predict.index, target_column] = preds

# -------------------------------
# Stage 2: regression-fill the remaining columns for 2024-2025
# -------------------------------
target_columns = [col for col in df.columns if col not in ['연도', '자치구', '거주인구']]

# '대중교통_승객수' is dropped right before the result is saved, so fitting
# models for it would be wasted work.
discarded_columns = {'대중교통_승객수'}

# Every model uses the same two predictors: year and resident population.
feature_columns = ['연도', '거주인구']

# The district subset is taken once per district and reused for all targets.
# This is safe because each target only reads its own column plus the two
# feature columns, and the feature columns are never written in this stage.
for gu in df['자치구'].unique():
    gu_df = df_filled[df_filled['자치구'] == gu]
    has_population = gu_df['거주인구'].notna()
    # NOTE(review): a population of 0 still passes this filter and is used as
    # a feature value; confirm no 0 placeholders remain outside 2005-2007.

    for target in target_columns:
        if target in discarded_columns:
            continue

        # 0 and NaN both mark a missing value for the target column.
        is_missing = gu_df[target].isna() | (gu_df[target] == 0)

        # '교통사고 발생건수' (accident count) is only filled for 2025; every
        # other column is filled for both 2024 and 2025.
        fill_years = [2025] if target == '교통사고 발생건수' else [2024, 2025]

        train_df = gu_df[has_population & ~is_missing]
        predict_df = gu_df[has_population & is_missing & gu_df['연도'].isin(fill_years)]

        # Nothing to learn from, or nothing to fill, for this district/column.
        if train_df.empty or predict_df.empty:
            continue

        model = LinearRegression()
        model.fit(train_df[feature_columns], train_df[target])

        # Clamp negative predictions to 0, then round to whole numbers; the
        # array is in the same row order as predict_df.index.
        preds = np.clip(model.predict(predict_df[feature_columns]), 0, None)
        df_filled.loc[predict_df.index, target] = np.round(preds).astype("int64")

# Drop the public-transit ridership column; errors="ignore" makes this a
# no-op when the column is not present.
df_filled = df_filled.drop(columns=['대중교통_승객수'], errors="ignore")

# Write the final table, creating the output directory first if needed.
save_path = "data/regression_data/regression_final.csv"
output_dir = os.path.dirname(save_path)
os.makedirs(output_dir, exist_ok=True)
df_filled.to_csv(save_path, index=False, encoding="utf-8-sig")
