In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Pipeline: load the traffic-volume, enforcement-CCTV and parking-enforcement
# data sets, join them on a shared region key, derive features, train a
# RandomForest classifier and print the regions it scores as high risk.

# 1. Load the raw data sets.
traffic_df = pd.read_excel('2020 서울시 교통량 조사자료.xlsx')  # traffic volume survey
cctv_df = pd.read_csv('서울시 광진구 불법주정차 위반 단속 CCTV 위치정보.csv', encoding='cp949')
illegal_df = pd.read_csv('서울특별시 광진구_주정차단속현황_20250310.csv', encoding='cp949')

# 2. Inspect the column names, then unify the join key.
print("traffic_df columns:", traffic_df.columns)
print("cctv_df columns:", cctv_df.columns)
print("illegal_df columns:", illegal_df.columns)

# Rename each frame's region-like column to the shared key '지역'.
# NOTE(review): this assumes '구분', '고정형CCTV지번주소' and '단속지역' hold
# comparable values (same granularity and spelling) - TODO confirm against
# the data; if they do not, the left merges below match nothing.
traffic_df = traffic_df.rename(columns={'구분': '지역'})
cctv_df = cctv_df.rename(columns={'고정형CCTV지번주소': '지역'})
illegal_df = illegal_df.rename(columns={'단속지역': '지역'})

# Validate the join key BEFORE aggregating: groupby('지역') would otherwise
# raise a generic KeyError first and these descriptive messages would never
# be reached for cctv_df / illegal_df.
for frame_name, frame in (
    ('traffic_df', traffic_df),
    ('cctv_df', cctv_df),
    ('illegal_df', illegal_df),
):
    if '지역' not in frame.columns:
        raise KeyError(f"{frame_name}에 '지역' 열이 없습니다.")

# Count CCTV rows and enforcement rows per region.
cctv_df = cctv_df.groupby('지역').size().reset_index(name='CCTV수')
illegal_df = illegal_df.groupby('지역').size().reset_index(name='불법신고건수')

# 3. Merge the per-region counts onto the traffic data.
data = traffic_df.merge(cctv_df, on='지역', how='left')
data = data.merge(illegal_df, on='지역', how='left')

# A left merge leaves NaN where a region has no matching rows. That means
# "zero records", so fill with 0; otherwise NaN silently skews the mean-based
# label below and leaks into the model features.
count_cols = ['CCTV수', '불법신고건수']
for count_col in count_cols:
    if data[count_col].isna().all():
        print(f"Warning: no '지역' value in traffic_df matched any row for "
              f"'{count_col}'; every count will be 0. Check the join key.")
data[count_cols] = data[count_cols].fillna(0).astype(int)

# 4. Feature engineering.
# Inflow / outflow ratio; the +1 keeps the denominator non-zero.
data['교통량비율'] = data['유입(전체)'] / (data['유출(전체)'] + 1)

feature_cols = ['교통량비율', 'CCTV수']
if '불법주정차_발생여부' not in data.columns:
    # No ground-truth label available: derive one by flagging regions whose
    # enforcement count is above the mean. Because the label is a function of
    # '불법신고건수', that column must NOT also be a feature (label leakage).
    data['불법주정차_발생여부'] = (data['불법신고건수'] > data['불법신고건수'].mean()).astype(int)
else:
    # The label came with the data, so the count is a legitimate feature.
    feature_cols.append('불법신고건수')

features = data[feature_cols]
target = data['불법주정차_발생여부']

if target.nunique() < 2:
    print("Warning: the target contains a single class; "
          "the accuracy reported below is not meaningful.")

# 5. Split the data and train the model.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 6. Evaluate on the held-out split.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 7. Use the predictions: list regions whose predicted risk exceeds 0.7.
if len(model.classes_) > 1:  # predict_proba has a second column only with 2+ classes
    data['예측위험도'] = model.predict_proba(features)[:, 1]
    high_risk = data[data['예측위험도'] > 0.7]
    print(high_risk[['지역', '예측위험도']])
else:
    print("The model predicts only one class. Unable to calculate '예측위험도'.")

traffic_df columns: Index(['구분', '유입(전체)', '유입(도심)', '유입(시계)', '유입(교량)', '유입(간선도시)', '유입(고속)',
       '유출(전체)', '유출(도심)', '유출(시계)', '유출(교량)', '유출(간선도시)', '유출(고속)'],
      dtype='object')
cctv_df columns: Index(['고정형CCTV지번주소', '위도', '경도', '자치구', '단속지점명', '현장구분'], dtype='object')
illegal_df columns: Index(['단속일시', '과태료', '단속지역', '단속장소', '위반내용', '단속구분'], dtype='object')
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

The model predicts only one class. Unable to calculate '예측위험도'.
