In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


In [3]:
# Load the base table, parse the '연월' column as datetimes and derive the
# calendar year, which is the only regressor used for the gap filling below.
df = pd.read_csv("data/merged_data/Basic_model.csv")
df['연월'] = pd.to_datetime(df['연월'])
df['연도'] = df['연월'].dt.year

# Columns kept in the exported table (the helper column '연도' is not exported).
basic_columns = [
    '구', '연월', '거주인구', '발생건수', '사망자수', '부상자수',
    '평균기온', '평균최고기온', '극점최고기온', '평균최저기온', '극점최저기온',
    '강수량', '평균습도', '최저습도', '해면기압', '이슬점온도',
    '평균운량', '일조시간', '최심신적설', '평균풍속', '최대풍속', '최대순간풍속',
]

filled_rows = []
for district in df['구'].unique():
    district_df = df.loc[df['구'] == district].copy()
    has_population = district_df['거주인구'].notna()
    observed = district_df[has_population]
    gaps = district_df[~has_population]

    # Nothing to learn from, or nothing to fill: keep the district as loaded.
    if observed.empty or gaps.empty:
        filled_rows.append(district_df)
        continue

    # Per-district linear trend of population over the year, evaluated on the
    # rows whose population is missing and rounded to whole numbers.
    trend = LinearRegression().fit(observed[['연도']], observed['거주인구'])
    estimate = trend.predict(gaps[['연도']]).round(0)
    district_df.loc[gaps.index, '거주인구'] = pd.Series(estimate, index=gaps.index).astype("Int64")
    filled_rows.append(district_df)

# Rows come back grouped by district; the original row labels are kept.
df_filled = pd.concat(filled_rows)

# Keep only rows dated strictly before 2023-12-01 and only the exported columns.
before_cutoff = df_filled['연월'] < '2023-12-01'
df_basic = df_filled.loc[before_cutoff, basic_columns]

df_basic.to_csv("data/regression_data/Basic_model_preprocessing.csv", encoding='utf-8-sig', index=False)


In [7]:
# Forecast every column in `target_cols` for the years in `future_years`, fitting
# one linear model per (target, district) on year and resident population.
# The updated frame and a per-model fit log are written to CSV at the end.
df = pd.read_csv("data/merged_data/merged_result_with_metrics.csv", encoding="utf-8-sig")

target_cols = [
    '기타_부상자수','기타_사망자수','길가장자리구역통행중_부상자수','길가장자리구역통행중_사망자수',
    '보도통행중_부상자수','보도통행중_사망자수','차도 통행중_부상자수','차도 통행중_사망자수',
    '횡단중_부상자수','횡단중_사망자수','뺑소니_발생건수','뺑소니_부상자수','뺑소니_사망자수',
    '자전거_발생건수','자전거_사망자수','자전거_부상자수','차대사람_사상자수','차대차_사상자수',
    '차량단독_사상자수','차대사람_발생건수','차대차_발생건수','차량단독_발생건수',
    '인구 10만명당 부상자수 (명)_x','인구 10만명당 사망자수 (명)_x','자동차 1만대당 발생건수 (건)_x',
    '발생건수','부상자수','사망자수'
]

# Targets whose names describe a rate (per 100,000 people / per 10,000 vehicles).
# Truncating these to whole numbers, as the count targets are, would throw away
# the fractional part of the forecast, so they keep decimals instead.
rate_cols = {
    '인구 10만명당 부상자수 (명)_x', '인구 10만명당 사망자수 (명)_x', '자동차 1만대당 발생건수 (건)_x'
}
# NOTE(review): 2 decimals is an assumption -- align with the precision of the
# historical values in the source file.
rate_decimals = 2

future_years = [2024, 2025, 2026]
log_rows = []

# Districts without a 2026 row get one cloned from their 2025 row(s), provided
# the first 2025 row has a population. Only the year is changed, so the 2026
# population equals the 2025 one. The clones are collected and appended with a
# single concat instead of re-copying the whole frame once per district.
rows_2026 = []
for gu in df["구"].unique():
    in_gu = df["구"] == gu
    if (in_gu & (df["연도"] == 2026)).any():
        continue
    row_2025 = df[in_gu & (df["연도"] == 2025)]
    if not row_2025.empty and not pd.isna(row_2025["거주인구"].values[0]):
        row_2026 = row_2025.copy()
        row_2026["연도"] = 2026
        rows_2026.append(row_2026)

if rows_2026:
    df = pd.concat([df] + rows_2026, ignore_index=True)

for target in target_cols:
    for gu in df["구"].unique():
        df_gu = df[df["구"] == gu]
        # Fit only on rows up to 2023 that have both the target and the population.
        train = df_gu[df_gu["연도"] <= 2023].dropna(subset=[target, "거주인구"])

        # Fewer than 5 usable rows: skip this (target, district) pair entirely.
        if train.shape[0] < 5:
            continue

        X_train = train[["연도", "거주인구"]]
        y_train = train[target]

        model = LinearRegression().fit(X_train, y_train)
        # In-sample R^2: scored on the same rows the model was fitted on.
        score = model.score(X_train, y_train)

        # Future rows without a population cannot be predicted and are skipped.
        test = df_gu[df_gu["연도"].isin(future_years)].dropna(subset=["거주인구"])
        if not test.empty:
            # One batched prediction per district; negative values are clamped to 0.
            y_pred = np.clip(model.predict(test[["연도", "거주인구"]]), 0, None)
            if target in rate_cols:
                y_pred = np.round(y_pred, rate_decimals)
            else:
                # Count targets keep the previous rule: drop the fractional part.
                y_pred = np.floor(y_pred)
            # Write by row label so every row receives its own prediction, even
            # if several rows share the same district and year.
            df.loc[test.index, target] = y_pred

        log_rows.append({
            "구": gu,
            "항목": target,
            "모델": "선형회귀",
            "R2": round(score, 4),
            "학습데이터수": train.shape[0]
        })

df = df.sort_values(by=["구", "연도"]).reset_index(drop=True)

df.to_csv("data/regression_data/new_model.csv", index=False, encoding="utf-8-sig")

df_log = pd.DataFrame(log_rows)
df_log.to_csv("data/regression_data/모델_성능_로그.csv", index=False, encoding="utf-8-sig")


In [8]:
# Forecast the "화재_소계" column per district for the years in `future_years`
# with a linear model on year and resident population, then save the updated
# frame and a per-district fit log.
df = pd.read_csv("data/regression_data/new_model.csv", encoding="utf-8-sig")

log_rows = []
target = "화재_소계"
future_years = [2024, 2025, 2026]

for gu in df["구"].unique():
    df_gu = df[df["구"] == gu]
    # Fit only on rows up to 2023 that have both the target and the population.
    train = df_gu[df_gu["연도"] <= 2023].dropna(subset=[target, "거주인구"])

    # Fewer than 5 usable rows: leave this district untouched and unlogged.
    if train.shape[0] < 5:
        continue

    X_train = train[["연도", "거주인구"]]
    y_train = train[target]

    model = LinearRegression().fit(X_train, y_train)
    # In-sample R^2: scored on the same rows the model was fitted on.
    score = model.score(X_train, y_train)

    # Future rows without a population cannot be predicted and are skipped.
    test = df_gu[df_gu["연도"].isin(future_years)].dropna(subset=["거주인구"])
    if not test.empty:
        # One batched prediction instead of a one-row frame per iteration.
        # Negatives are clamped to 0 and the fractional part is dropped, which
        # is the same rule as the former int(max(0, y)).
        y_pred = np.floor(np.clip(model.predict(test[["연도", "거주인구"]]), 0, None))
        # Write by row label so every row receives its own prediction, even if
        # several rows share the same district and year.
        df.loc[test.index, target] = y_pred

    log_rows.append({
        "구": gu,
        "항목": target,
        "모델": "선형회귀",
        "R2": round(score, 4),
        "학습데이터수": train.shape[0]
    })

df = df.sort_values(by=["구", "연도"]).reset_index(drop=True)
df.to_csv("data/regression_data/new_model_with_fire.csv", index=False, encoding="utf-8-sig")

df_log = pd.DataFrame(log_rows)
df_log.to_csv("data/regression_data/화재_소계_예측_성능로그.csv", index=False, encoding="utf-8-sig")

In [11]:
# Forecast the columns in `weather_cols` per district for the years in
# `future_years`, fitting one linear model per (target, district) on year and
# resident population using the 2013-2023 rows, then save the updated frame and
# a fit log.
df = pd.read_csv("data/regression_data/new_model_with_fire.csv", encoding="utf-8-sig")

weather_cols = [
    "눈_발생건수", "맑음_발생건수", "비_발생건수", "흐림_발생건수",
    "눈_사상자수", "맑음_사상자수", "비_사상자수", "흐림_사상자수"
]

future_years = [2024, 2025, 2026]
log_rows = []

for target in weather_cols:
    for gu in df["구"].unique():
        df_gu = df[df["구"] == gu]
        # Training window is 2013-2023 inclusive; rows missing the target or
        # the population are dropped.
        in_window = (df_gu["연도"] >= 2013) & (df_gu["연도"] <= 2023)
        train = df_gu[in_window].dropna(subset=[target, "거주인구"])

        # Fewer than 5 usable rows: skip this (target, district) pair entirely.
        if train.shape[0] < 5:
            continue

        X_train = train[["연도", "거주인구"]]
        y_train = train[target]

        model = LinearRegression().fit(X_train, y_train)
        # In-sample R^2: scored on the same rows the model was fitted on.
        score = model.score(X_train, y_train)

        # Future rows without a population cannot be predicted and are skipped.
        test = df_gu[df_gu["연도"].isin(future_years)].dropna(subset=["거주인구"])
        if not test.empty:
            # One batched prediction instead of a one-row frame per iteration.
            # Negatives are clamped to 0 and the fractional part is dropped,
            # which is the same rule as the former int(max(0, y)).
            y_pred = np.floor(np.clip(model.predict(test[["연도", "거주인구"]]), 0, None))
            # Write by row label so every row receives its own prediction, even
            # if several rows share the same district and year.
            df.loc[test.index, target] = y_pred

        log_rows.append({
            "구": gu,
            "항목": target,
            "모델": "선형회귀",
            "R2": round(score, 4),
            "학습데이터수": train.shape[0]
        })

df = df.sort_values(by=["구", "연도"]).reset_index(drop=True)
df.to_csv("data/regression_data/new_model_with_weather.csv", index=False, encoding="utf-8-sig")

df_log = pd.DataFrame(log_rows)
df_log.to_csv("data/regression_data/기상별_사고_예측_성능로그.csv", index=False, encoding="utf-8-sig")

In [12]:
# Forecast the columns in `vehicle_cols` per district for the years in
# `future_years`, fitting one linear model per (target, district) on year and
# resident population using the 2017-2023 rows, then save the final frame and
# a fit log.
df = pd.read_csv("data/regression_data/new_model_with_weather.csv", encoding="utf-8-sig")

vehicle_cols = [
    "승용계_발생건수", "승용계_사상자수",
    "버스계_발생건수", "버스계_사상자수",
    "화물계_발생건수", "화물계_사상자수"
]

future_years = [2024, 2025, 2026]
log_rows = []

for target in vehicle_cols:
    for gu in df["구"].unique():
        df_gu = df[df["구"] == gu]
        # Training window is 2017-2023 inclusive (at most 7 distinct years, so
        # the 5-row minimum below leaves little slack); rows missing the target
        # or the population are dropped.
        in_window = (df_gu["연도"] >= 2017) & (df_gu["연도"] <= 2023)
        train = df_gu[in_window].dropna(subset=[target, "거주인구"])

        # Fewer than 5 usable rows: skip this (target, district) pair entirely.
        if train.shape[0] < 5:
            continue

        X_train = train[["연도", "거주인구"]]
        y_train = train[target]

        model = LinearRegression().fit(X_train, y_train)
        # In-sample R^2: scored on the same rows the model was fitted on.
        score = model.score(X_train, y_train)

        # Future rows without a population cannot be predicted and are skipped.
        test = df_gu[df_gu["연도"].isin(future_years)].dropna(subset=["거주인구"])
        if not test.empty:
            # One batched prediction instead of a one-row frame per iteration.
            # Negatives are clamped to 0 and the fractional part is dropped,
            # which is the same rule as the former int(max(0, y)).
            y_pred = np.floor(np.clip(model.predict(test[["연도", "거주인구"]]), 0, None))
            # Write by row label so every row receives its own prediction, even
            # if several rows share the same district and year.
            df.loc[test.index, target] = y_pred

        log_rows.append({
            "구": gu,
            "항목": target,
            "모델": "선형회귀",
            "R2": round(score, 4),
            "학습데이터수": train.shape[0]
        })

df = df.sort_values(by=["구", "연도"]).reset_index(drop=True)
df.to_csv("data/regression_data/new_model_final.csv", index=False, encoding="utf-8-sig")

df_log = pd.DataFrame(log_rows)
df_log.to_csv("data/regression_data/차종별_예측_성능로그.csv", index=False, encoding="utf-8-sig")
