In [1]:
import pandas as pd
import re
from io import StringIO
import numpy as np

In [3]:
# Subway ridership per district and date.
# Joins the dong-level subway passenger table to the Seoul dong master table
# (to replace the dong id with dong/district names), then sums the passenger
# count per date and district. Aggregation to months happens in a later cell.

df_do = pd.read_csv("data/raw_data/Seoul_master_information.csv", encoding='cp949')
df_sub = pd.read_csv("data/raw_data/Seoul_subway.csv", encoding='cp949')

PASSENGER_PREFIX = '지하철_승객_수_'

# Lookup table: one row per distinct (dong id, dong name, district name),
# restricted to rows whose province/city column equals '서울'.
df_seoul_trimmed = (
    df_do.loc[df_do['시도_명칭'] == '서울', ['행정동_ID', '행정동_명칭', '자치구_명칭']]
    .drop_duplicates()
)

# Left join: subway rows without a match in the lookup keep NaN names.
df_subway_merged = df_sub.merge(df_seoul_trimmed, on='행정동_ID', how='left')

# Strip the passenger-count prefix from the column names that carry it.
hourly_renames = {
    name: name.replace(PASSENGER_PREFIX, '')
    for name in df_subway_merged.columns
    if name.startswith(PASSENGER_PREFIX)
}

time_cols = [f"{hour:02}시" for hour in range(24)]  # ['00시', '01시', ..., '23시']
cols_order = ['기준_날짜', '자치구_명칭', '행정동_명칭', '지하철_승객_수'] + time_cols

df_subway_cleaned_id = (
    df_subway_merged
    .rename(columns=hourly_renames)
    .dropna(subset=['지하철_승객_수'])   # rows without a total passenger count are discarded
    .drop('행정동_ID', axis=1)
    .loc[:, cols_order]                  # fixed column order; raises KeyError if any is missing
)

# Total passengers per (date, district). groupby drops rows whose key is NaN
# by default, so subway rows that did not match a Seoul dong are not counted.
df_grouped = (
    df_subway_cleaned_id
    .groupby(['기준_날짜', '자치구_명칭'], as_index=False)['지하철_승객_수']
    .sum()
)

df_grouped.to_csv('data/first_processing_data/Seoul_subway_dong.csv', index=False, encoding='utf-8-sig')

In [5]:
# Monthly subway ridership per district.
# Reads the per-date district totals written by the previous cell, derives a
# 'YYYY-MM' key from the date and sums passenger counts per district and month.

df_moving = pd.read_csv('data/first_processing_data/Seoul_subway_dong.csv')

# Dates are parsed with the explicit %Y%m%d format.
parsed_dates = pd.to_datetime(df_moving['기준_날짜'], format='%Y%m%d')
df_moving['기준_날짜'] = parsed_dates
df_moving['연월'] = parsed_dates.dt.to_period('M').astype(str)

monthly_sum = (
    df_moving
    .groupby(['자치구_명칭', '연월'], as_index=False)['지하철_승객_수']
    .sum()
    .rename(columns={'자치구_명칭': '구'})   # district column is called '구' in the output
)
monthly_sum.to_csv('data/first_processing_data/Subway_move_people.csv',index=False,encoding='utf-8-sig')

In [7]:
# Traffic accidents by accident type: drop the railway-crossing ('건널목')
# columns (per the original note they are mostly missing), keep the
# 2005-2023 year columns and reshape to long form.

TARGET_YEARS = tuple(str(year) for year in range(2005, 2024))
BASE_COLS = ['자치구별(2)', '구분별(1)']

df_accident = (
    pd.read_csv("data/raw_data/Traffic_accident_type.csv")
    .drop(columns=['자치구별(1)'])
    .drop(index=1)          # row labelled 1 is discarded
)

# The first remaining row holds the accident-type label of each column;
# every column labelled '건널목' there is removed.
type_labels = df_accident.iloc[0]
df_accident = df_accident.drop(columns=df_accident.columns[type_labels == '건널목'])

# Columns whose name starts with one of the target years.
year_cols = [name for name in df_accident.columns if name.startswith(TARGET_YEARS)]

# Keep identifier + year columns, drop the label row (index 0), give the two
# identifier columns their working names, then melt: one row per
# (district, indicator, source column).
df_melted = (
    df_accident[BASE_COLS + year_cols]
    .drop(index=0)
    .rename(columns={BASE_COLS[0]: "자치구", BASE_COLS[1]: "지표"})
    .melt(id_vars=["자치구", "지표"], var_name="열", value_name="값")
)
# Suffix after the first '.' in a column label -> accident type name.
# A label without any '.' is treated as suffix '0' (the total).
_ACCIDENT_TYPE_BY_SUFFIX = {
    '0': '합계',
    '1': '차대사람',
    '2': '차대차',
    '3': '차량단독',
}


def parse_col(col):
    """Split a column label such as '2005.1' into year and accident type.

    Args:
        col: Column label; it is converted with ``str`` before parsing.

    Returns:
        A two-element ``pd.Series``: ``[year, accident_type]`` where ``year``
        is the text before the first '.', or ``[None, None]`` when the
        suffix is not one of the known codes ('0'-'3').
    """
    pieces = str(col).split('.')
    year = pieces[0]
    suffix = '0' if len(pieces) == 1 else pieces[1]
    accident_type = _ACCIDENT_TYPE_BY_SUFFIX.get(suffix)
    return pd.Series([None, None] if accident_type is None else [year, accident_type])
# Decode every source-column label into (year, accident type); labels that
# parse_col does not recognise come back as (None, None) and are dropped.
decoded_labels = df_melted['열'].apply(parse_col)
df_melted[['연도', '사고유형']] = decoded_labels
df_melted = df_melted.dropna(subset=["연도", "사고유형"])

# Wide form: one row per (district, year, accident type), one column per
# indicator. 'first' keeps the first value found for each cell.
pivot_keys = ["자치구", "연도", "사고유형"]
df_pivot = df_melted.pivot_table(
    index=pivot_keys, columns="지표", values="값", aggfunc="first"
).reset_index()

# Strip the unit suffixes from the indicator column names.
INDICATOR_RENAMES = {
    "발생건수 (건)": "발생건수",
    "사망자수 (명)": "사망자수",
    "부상자수 (명)": "부상자수",
}
df_final = df_pivot.rename(columns=INDICATOR_RENAMES)
df_final.to_csv("data/first_processing_data/Traffic_accident_type_preprocessing.csv", index=False,encoding='utf-8-sig')

In [9]:
# Resident population: the source has one column per quarter. Each quarterly
# value is copied to the three months of that quarter and the table is
# reshaped to long form (district, year-month, population).

with open('data/raw_data/Resident_population.csv', 'r', encoding='utf-8-sig', errors='ignore') as src:
    # The final line of the file is discarded before parsing
    # (presumably a non-data footer — TODO confirm against the raw file).
    lines = src.readlines()[:-1]

df = pd.read_csv(StringIO(''.join(lines)))

unnamed_cols = [name for name in df.columns if 'Unnamed' in name]
df = df.drop(columns=unnamed_cols)

# Quarter digit -> zero-padded month numbers belonging to that quarter.
quarter_to_months = {
    str(q): [f"{m:02d}" for m in range(3 * q - 2, 3 * q + 1)]
    for q in range(1, 5)
}

# Quarter column labels look like '2020. 1/4' (the space is optional).
quarter_header = re.compile(r'^(\d{4})\.\s?(\d)/4$')

expanded_columns = []   # monthly labels 'YYYY-MM', in source-column order
column_mapping = {}     # monthly label -> quarterly source column

for col in df.columns:
    found = quarter_header.match(str(col))
    if found is None:
        continue
    year, quarter = found.groups()
    # Quarter digits outside 1-4 have no entry and contribute nothing.
    for month in quarter_to_months.get(quarter, ()):
        monthly_label = f"{year}-{month}"
        expanded_columns.append(monthly_label)
        column_mapping[monthly_label] = col

# Keep only the rows whose category column equals '계' and expose the
# area column under the name '구'.
df = df[df['구분별'] == '계'].rename(columns={'동별': '구'})

# Side-by-side copy of the quarterly source column for every monthly label.
monthly_series = [df[column_mapping[label]] for label in expanded_columns]
df_monthly = pd.concat([df[['구']], *monthly_series], axis=1)
df_monthly.columns = ['구'] + expanded_columns

df_long = df_monthly.melt(id_vars=['구'], var_name='연월', value_name='거주인구')

# The row labelled '합계' is excluded from the output.
df_long = df_long[df_long['구'] != '합계']
df_long.to_csv('data/first_processing_data/Resident_population_preprocessing.csv', index=False,encoding='utf-8-sig')

In [11]:
# Weather summary: replace the source headers with readable column names,
# normalise the month key to 'YYYY-MM' and make every measurement numeric.

WEATHER_COLUMNS = [
    '연월', '평균기온', '평균최고기온', '극점최고기온', '평균최저기온', '극점최저기온',
    '강수량', '평균습도', '최저습도', '해면기압', '이슬점온도', '평균운량',
    '일조시간', '최심신적설', '평균풍속', '최대풍속', '최대순간풍속'
]

# The first data row is skipped (presumably a second header row — TODO confirm).
df_weather = pd.read_csv('data/raw_data/Weather.csv').iloc[1:].copy()
df_weather.columns = WEATHER_COLUMNS

# Source month labels are parsed as '<YYYY>. <MM>'; a day is appended so the
# value can go through to_datetime, then it is reduced back to 'YYYY-MM'.
month_start = pd.to_datetime(df_weather['연월'].str.strip() + '-01', format='%Y. %m-%d')
df_weather['연월'] = month_start.dt.to_period('M').astype(str)

# Everything except the month key is converted to numbers; values that
# cannot be parsed become NaN.
numeric_cols = df_weather.columns.drop('연월')
df_weather[numeric_cols] = df_weather[numeric_cols].apply(pd.to_numeric, errors='coerce')

# NOTE(review): every NaN, including unparseable values, is written as 0.
# For columns such as temperature or pressure 0 is a real reading — confirm
# this is intended.
df_weather = df_weather.fillna(0)
df_weather.to_csv('data/first_processing_data/Weather_preprocessing.csv',index=False,encoding='utf-8-sig')

In [17]:
# Electric-shock accidents: for the linear regression, zeros in periods
# where no value was recorded are turned into NaN, and the month key is
# parsed into a datetime.

df_electricshock = pd.read_csv('data/merged_data/Disaster_Electric_total_data.csv')

# Month values arrive wrapped as ="YYYY-MM"; strip the wrapper, then parse.
month_text = (
    df_electricshock['년월']
    .str.replace('="', '', regex=False)
    .str.replace('"', '', regex=False)
)
df_electricshock['년월'] = pd.to_datetime(month_text, format="%Y-%m")

# Before 2021-07: a 0 in the disaster-accident columns is treated as missing.
before_2021_07 = df_electricshock['년월'] < pd.to_datetime('2021-07-01')
for column in ('재난사고_사고건수', '재난사고_사상자수'):
    is_zero = df_electricshock[column] == 0
    df_electricshock.loc[before_2021_07 & is_zero, column] = np.nan

# From 2023-01 on: a 0 in the electric-shock casualty column is treated as missing.
from_2023_01 = df_electricshock['년월'] >= pd.to_datetime('2023-01-01')
zero_casualties = df_electricshock['감전사고_사상자수'] == 0
df_electricshock.loc[from_2023_01 & zero_casualties, '감전사고_사상자수'] = np.nan

df_electricshock.to_csv('data/first_processing_data/Electric_shock_accident_preprocessing.csv', index=False, encoding='utf-8-sig')

In [19]:
# Monthly traffic accidents: in the source, zero counts show up as missing
# values (or as '-'), so every missing count is written as 0 and the table is
# reshaped to one row per (district, month).

import pandas as pd

df_raw = pd.read_csv("data/raw_data/Traffic_accident_month_2005.csv")

# Skip the first data row, name the second column '구' (district) and drop
# the first column.
df = df_raw.iloc[1:].copy()
df = df.rename(columns={df.columns[1]: "구"})
df = df.drop(columns=[df.columns[0]])

# The subtotal row ('소계') is not a district.
df = df[df["구"] != "소계"].reset_index(drop=True)


def _to_count(values):
    """Return ``values`` as int64 counts, mapping '-' and missing to 0.

    The same rule is applied to all three indicators. Previously only the
    deaths column tolerated '-', and a NaN in any column crashed the
    ``int(...)`` conversion. Values that are neither numeric, '-' nor
    missing still raise ``ValueError``.
    """
    return values.replace('-', 0).astype(float).fillna(0).astype('int64')


monthly_frames = []

for year in range(2005, 2025):
    for month in range(1, 13):
        # Source columns: '<YYYY>. <MM>' = accidents, '.1' = deaths, '.2' = injuries.
        base = f"{year}. {month:02d}"
        deaths_col, injuries_col = f"{base}.1", f"{base}.2"

        # Months that are not (completely) present in the file are skipped.
        if any(name not in df.columns for name in (base, deaths_col, injuries_col)):
            continue

        monthly_frames.append(pd.DataFrame({
            "구": df["구"],
            "연월": f"{year}-{month:02d}-01",
            "발생건수": _to_count(df[base]),
            "부상자수": _to_count(df[injuries_col]),
            "사망자수": _to_count(df[deaths_col]),
        }))

if not monthly_frames:
    raise ValueError(
        "no '<YYYY>. <MM>' column triplets found in Traffic_accident_month_2005.csv"
    )

df_final = pd.concat(monthly_frames, ignore_index=True)
df_final = df_final.sort_values(by=["구", "연월"]).reset_index(drop=True)

df_final.to_csv("data/first_processing_data/Traffic_accident_month_preprocessing.csv",encoding='utf-8-sig',index=False)

In [23]:
# Bus ridership + traffic accidents (merged file): restore zero death counts
# that were recorded as missing, blank out zero ridership, and normalise the
# month key to 'YYYY-MM'.

df = pd.read_csv('data/merged_data/Bus_Accident_total_data_2017_2025.csv')

# '-' marks a missing death count. Where the injury count is present the
# death count is set to 0; otherwise it stays missing.
df['사망자수'] = df['사망자수'].replace('-', pd.NA)
deaths_missing_only = df['사망자수'].isna() & df['부상자수'].notna()
df.loc[deaths_missing_only, '사망자수'] = 0

# A ridership of exactly 0 is treated as missing (column is optional).
if '버스승객수' in df.columns:
    zero_ridership = df['버스승객수'] == 0
    df.loc[zero_ridership, '버스승객수'] = pd.NA

# Month labels are split on '.' into year and month; the month is
# zero-padded, the parts are joined as 'YYYY-MM' and apostrophes are removed.
ym_parts = df['년월'].astype(str).str.strip().str.split('.', expand=True)
ym_parts.columns = ['year', 'month']   # exactly two parts are expected
year_text = ym_parts['year'].str.strip()
month_text = ym_parts['month'].str.strip().str.zfill(2)
df['년월'] = (year_text + '-' + month_text).str.replace("'", "")

# Accident figures for 2024-01 through 2025-04 are blanked out.
df['년월'] = pd.to_datetime(df['년월'])
window_start = df['년월'] >= '2024-01-01'
window_end = df['년월'] <= '2025-04-01'
df.loc[window_start & window_end, ['발생건수', '사망자수', '부상자수']] = pd.NA
df['년월'] = df['년월'].dt.strftime('%Y-%m')

df.to_csv('data/first_processing_data/Bus_accident_total_data_preprocessing.csv', index=False, encoding='utf-8-sig')