In [2]:
import pandas as pd
from itertools import product

In [4]:
# Join resident population (거주인구) with subway ridership (지하철_승객_수) on the
# 구 and 연월 key columns. The outer join keeps keys that exist in only one source.
df_resident = pd.read_csv('data/first_processing_data/Resident_population_preprocessing.csv')
df_subway = pd.read_csv('data/first_processing_data/Subway_move_people.csv')

df_merged = df_resident.merge(df_subway, how='outer', on=['구', '연월'])

# utf-8-sig writes a BOM at the start of the CSV.
df_merged.to_csv(
    'data/merged_data/Population_merged.csv',
    index=False,
    encoding='utf-8-sig',
)

print(df_merged.head())

     구       연월      거주인구  지하철_승객_수
0  강남구  2008-01  570959.0       NaN
1  강남구  2008-02  570959.0       NaN
2  강남구  2008-03  570959.0       NaN
3  강남구  2008-04  570755.0       NaN
4  강남구  2008-05  570755.0       NaN


In [6]:
# Replicate the monthly weather table for every 구 and join it onto the
# population/subway frame.
df_people_flow = pd.read_csv('data/merged_data/Population_merged.csv')
df_weather = pd.read_csv('data/first_processing_data/Weather_preprocessing.csv')

# Zero-fill only the measurement columns. Filling the 연월 key as well would turn a
# missing month into the value 0, which defeats the dropna() below and would create
# a bogus join key (and make sorted() fail on mixed int/str values).
# NOTE(review): 0 can be a legitimate weather reading, so zero-filling may hide
# genuinely missing data — confirm this is intended.
df_weather = df_weather.fillna({col: 0 for col in df_weather.columns if col != '연월'})

all_dates = sorted(df_weather['연월'].dropna().unique())
gu_list = sorted(df_people_flow['구'].dropna().unique())

# Cartesian product: one row for every (구, 연월) pair.
gu_month_combinations = pd.DataFrame(list(product(gu_list, all_dates)), columns=['구', '연월'])

# The weather file is keyed by 연월 only, so every 구 receives the same monthly values.
df_weather_expanded = pd.merge(gu_month_combinations, df_weather, on='연월', how='left')
df_merged_all = pd.merge(df_people_flow, df_weather_expanded, on=['구', '연월'], how='outer')

df_merged_all.to_csv('data/merged_data/Population_weather_merged.csv', index=False, encoding='utf-8-sig')

In [8]:
# Left-join the bus dataset onto the population/weather table, then arrange the
# columns so the bus columns sit directly after 지하철_승객_수.
df_merged = pd.read_csv('data/merged_data/Population_weather_merged.csv')
df_bus = pd.read_csv('data/first_processing_data/Bus_accident_total_data_preprocessing.csv')

# Bring the bus file's key column names in line with the other tables.
df_bus = df_bus.rename(columns={'자치구': '구', '년월': '연월'})

weather_cols = [
    '평균기온', '평균최고기온', '극점최고기온', '평균최저기온', '극점최저기온',
    '강수량', '평균습도', '최저습도', '해면기압', '이슬점온도', '평균운량',
    '일조시간', '최심신적설', '평균풍속', '최대풍속', '최대순간풍속',
]

# Push the weather block to the end; all other columns keep their relative order.
non_weather_cols = [name for name in df_merged.columns if name not in weather_cols]
df_reordered = df_merged[non_weather_cols + weather_cols]

df_temp = df_reordered.merge(df_bus, how='left', on=['구', '연월'])

bus_cols = [name for name in df_bus.columns if name not in ('구', '연월')]
subway_idx = df_temp.columns.get_loc('지하철_승객_수') + 1

# Take the bus columns out of the column list, then splice them back in as one
# contiguous run starting right after 지하철_승객_수.
cols = df_temp.columns.tolist()
for col in bus_cols:
    cols.remove(col)
cols[subway_idx:subway_idx] = bus_cols

df_final = df_temp[cols]
df_final.to_csv('data/merged_data/Total_data.csv', index=False, encoding='utf-8-sig')

In [12]:
# Left-join the transportation table onto the combined data and move
# 대중교통_승객_수 so it sits immediately before the first weather column.
df_transportation = pd.read_csv('data/first_processing_data/Transportation_preprocessing_month.csv')
df_merged = pd.read_csv('data/merged_data/Total_data.csv')

# Parse 연월 on both sides so the join compares datetimes rather than raw strings.
df_transportation['연월'] = pd.to_datetime(df_transportation['연월'])
df_merged['연월'] = pd.to_datetime(df_merged['연월'])

df_merged = df_merged.merge(df_transportation, how='left', on=['구', '연월'])

weather_cols = [
    '평균기온', '평균최고기온', '극점최고기온', '평균최저기온', '극점최저기온',
    '강수량', '평균습도', '최저습도', '해면기압', '이슬점온도', '평균운량',
    '일조시간', '최심신적설', '평균풍속', '최대풍속', '최대순간풍속',
]

# Remove the column from its current slot, then re-insert it in front of the
# first weather column.
cols = df_merged.columns.tolist()
cols.remove('대중교통_승객_수')
insert_idx = cols.index(weather_cols[0])
cols[insert_idx:insert_idx] = ['대중교통_승객_수']

df_merged = df_merged[cols]
df_merged.to_csv('data/merged_data/Total_transportation_merged.csv', index=False, encoding='utf-8-sig')

In [14]:
# Drop the 2025-04 rows, blank out the ridership columns for 2021-11 and 2021-12,
# and move 대중교통_승객_수 to sit right after 버스승객수.
df = pd.read_csv('data/merged_data/Total_transportation_merged.csv')

# Unparseable dates become NaT; NaT != Timestamp is True, so such rows are kept
# by the filter below.
df['연월'] = pd.to_datetime(df['연월'], errors='coerce')

# Take an explicit copy: the .loc assignment below would otherwise write into the
# result of a boolean filter, which can raise SettingWithCopyWarning.
df = df[df['연월'] != pd.to_datetime('2025-04')].copy()

# NOTE(review): the reason these two months are blanked is not visible in this
# notebook — presumably the source figures are unreliable; confirm.
target_dates = [pd.to_datetime('2021-11'), pd.to_datetime('2021-12')]
df.loc[df['연월'].isin(target_dates), ['대중교통_승객_수', '지하철_승객_수']] = None

cols = list(df.columns)
cols.remove('대중교통_승객_수')
bus_idx = cols.index('버스승객수') + 1
cols.insert(bus_idx, '대중교통_승객_수')
df = df[cols]

df.to_csv('data/merged_data/Total_transportation_merged_preprocessing.csv', index=False, encoding='utf-8-sig')

In [16]:
# Replace any accident columns already present with the values from the monthly
# traffic-accident file, then fix the final column order.
df_traffic = pd.read_csv("data/first_processing_data/Traffic_accident_month_preprocessing.csv")
df = pd.read_csv('data/merged_data/Total_transportation_merged_preprocessing.csv')

# Both sides need a datetime 연월 for the join keys to compare equal.
df_traffic['연월'] = pd.to_datetime(df_traffic['연월'])
df['연월'] = pd.to_datetime(df['연월'])

key_cols = ['구', '연월']
accident_cols = ['발생건수', '사망자수', '부상자수']

# errors='ignore' lets the drop succeed whether or not df already carries the
# accident columns; the outer join keeps keys found in either table.
df_without_accidents = df.drop(columns=accident_cols, errors='ignore')
df_merged = df_without_accidents.merge(
    df_traffic[key_cols + accident_cols],
    how='outer',
    on=key_cols,
)

final_column_order = [
    '구', '연월', '거주인구', '지하철_승객_수', '버스승객수', '대중교통_승객_수',
    '발생건수', '사망자수', '부상자수',
    '평균기온', '평균최고기온', '극점최고기온', '평균최저기온', '극점최저기온',
    '강수량', '평균습도', '최저습도', '해면기압', '이슬점온도', '평균운량',
    '일조시간', '최심신적설', '평균풍속', '최대풍속', '최대순간풍속',
]

# Selecting by this list also drops any column that is not named in it.
df_merged = df_merged[final_column_order]

df_merged.to_csv('data/merged_data/Total_updated_data.csv', index=False, encoding='utf-8-sig')

In [18]:
# Write the "basic model" subset: keys, resident population, accident counts and
# the weather columns. The ridership columns are not selected.
df = pd.read_csv("data/merged_data/Total_updated_data.csv")

basic_model_cols = [
    '구', '연월', '거주인구',
    '발생건수', '사망자수', '부상자수',
    '평균기온', '평균최고기온', '극점최고기온', '평균최저기온', '극점최저기온',
    '강수량', '평균습도', '최저습도', '해면기압', '이슬점온도',
    '평균운량', '일조시간', '최심신적설', '평균풍속', '최대풍속', '최대순간풍속',
]
df_basic = df[basic_model_cols]

df_basic.to_csv("data/merged_data/Basic_model.csv", index=False, encoding='utf-8-sig')

In [20]:
file1_path = "data/merged_data/Accident_by_year_merged_hc.csv"
file2_path = "data/merged_data/merged_final.csv"
output_path = "data/merged_data/merged_result.csv"

# Load both CSV files.
df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)

# Unify the key column name: 자치구 -> 구. rename() skips labels that are absent,
# so no membership check is needed.
df1 = df1.rename(columns={'자치구': '구'})
df2 = df2.rename(columns={'자치구': '구'})

# Remove the columns of merged_final that are not needed; errors='ignore' skips
# any that are not present.
columns_to_drop = ['거주인구', '버스승객수', '승객수', '발생건수', '부상자수']
df2 = df2.drop(columns=columns_to_drop, errors='ignore')

# Outer-join the two tables on 구 and 연도.
df_merged = df1.merge(df2, how='outer', on=['구', '연도'])

# Remove the column that is not needed in the merged result (raises KeyError if
# it is missing).
df_merged = df_merged.drop(columns=['대중교통_승객수'])

# Save the result.
df_merged.to_csv(output_path, index=False, encoding='utf-8-sig')

print(f"병합 완료. 결과가 '{output_path}'에 저장되었습니다.")


병합 완료. 결과가 'data/merged_data/merged_result.csv'에 저장되었습니다.


In [24]:
# Backfill missing or zero 거주인구 values in the yearly table with the January
# figure of the same year (2005-2007 only) from the monthly basic-model data.
df_result = pd.read_csv("data/merged_data/merged_result.csv")
df_basic = pd.read_csv("data/regression_data/Basic_model_preprocessing.csv")

# Unparseable dates become NaT and therefore never pass the year/month filter.
df_basic['연월'] = pd.to_datetime(df_basic['연월'], errors='coerce')
df_basic['연도'] = df_basic['연월'].dt.year
df_basic['월'] = df_basic['연월'].dt.month

df_filtered = df_basic[
    df_basic['연도'].isin([2005, 2006, 2007]) &
    (df_basic['월'] == 1)
][['구', '연도', '거주인구']]

# The empty left suffix keeps the existing 거주인구 name; the candidate fill value
# arrives as 거주인구_추가.
# NOTE(review): this assumes df_filtered has at most one row per (구, 연도);
# duplicates would multiply rows in the left join — confirm.
df_merged = pd.merge(df_result, df_filtered, on=['구', '연도'], how='left', suffixes=('', '_추가'))

# Vectorised replacement for the former row-wise apply: take the fill value
# wherever the current value is NaN or 0, otherwise keep the current value.
needs_fill = df_merged['거주인구'].isna() | df_merged['거주인구'].eq(0)
df_merged['거주인구'] = df_merged['거주인구'].mask(needs_fill, df_merged['거주인구_추가'])

df_merged = df_merged.drop(columns=['거주인구_추가'])

df_merged.to_csv("data/merged_data/merged_result_updated.csv", encoding='utf-8-sig', index=False)

In [26]:
import pandas as pd
import numpy as np

# Convert zeros to NaN for selected columns within selected year ranges.
# NOTE(review): presumably a 0 in these ranges means "not recorded" rather than a
# true zero — confirm against the data sources.
df = pd.read_csv("data/merged_data/merged_result_updated.csv", encoding='utf-8-sig')

weather_cols = [
    '눈_발생건수', '맑음_발생건수', '비_발생건수', '흐림_발생건수',
    '눈_사상자수', '맑음_사상자수', '비_사상자수', '흐림_사상자수',
]
vehicle_cols = [
    '승용계_발생건수', '승용계_사상자수',
    '버스계_발생건수', '버스계_사상자수',
    '화물계_발생건수', '화물계_사상자수',
]

# Series.between is inclusive on both ends by default.
years_2005_2012 = df['연도'].between(2005, 2012)
df.loc[years_2005_2012, weather_cols] = df.loc[years_2005_2012, weather_cols].replace(0, np.nan)

year_2005 = df['연도'] == 2005
df.loc[year_2005, '화재_소계'] = df.loc[year_2005, '화재_소계'].replace(0, np.nan)

years_2005_2016 = df['연도'].between(2005, 2016)
df.loc[years_2005_2016, vehicle_cols] = df.loc[years_2005_2016, vehicle_cols].replace(0, np.nan)

# For 2024-2025 the replacement is applied to every column of the matching rows.
years_2024_2025 = df['연도'].between(2024, 2025)
df.loc[years_2024_2025] = df.loc[years_2024_2025].replace(0, np.nan)

df.to_csv("data/merged_data/merged_result_updated_last.csv", encoding='utf-8-sig', index=False)

In [28]:
# Reshape the tidy accident metrics to one column per metric and left-join them
# onto the yearly table.
df_metrics = pd.read_csv("data/first_processing_data/Traffic_accidents_multi_metric_tidy.csv", encoding="utf-8-sig")

# pivot_table aggregates with the mean by default, so duplicate
# (year, district, metric) rows are averaged rather than rejected.
df_pivot = (
    pd.pivot_table(
        df_metrics,
        values="value",
        index=["year", "district"],
        columns="metric",
    )
    .reset_index()
    .rename(columns={
        "year": "연도",
        "district": "구",
        "발생건수 (건)": "발생건수",
        "부상자수 (명)": "부상자수",
        "사망자수 (명)": "사망자수",
    })
)

df_merged = pd.read_csv("data/merged_data/merged_result_updated_last.csv", encoding="utf-8-sig")

# Drop the existing 발생건수 (if any) so the pivoted column takes its place without
# a suffix clash.
# NOTE(review): 부상자수 and 사망자수 are not dropped here; if df_merged already has
# them, the merge will emit _x/_y suffixed pairs — confirm this is intended.
df_merged = df_merged.drop(columns=["발생건수"], errors="ignore")

df_combined = df_merged.merge(df_pivot, how="left", on=["연도", "구"])

# Discard the listed columns, including the right-hand (_y) copies of metrics
# that exist on both sides; names that are absent are skipped.
cols_to_drop = [
    "교통사고 발생건수",
    "인구 10만명당 부상자수 (명)_y",
    "인구 10만명당 사망자수 (명)_y",
    "자동차 1만대당 발생건수 (건)_y",
]
df_combined = df_combined.drop(columns=cols_to_drop, errors="ignore")

df_combined.to_csv("data/merged_data/merged_result_with_metrics.csv", index=False, encoding="utf-8-sig")
