## 횡단보도 관리번호별 사건 개수 컬럼 추가

In [1]:
import pandas as pd

In [39]:
# Load the source workbook and show which columns it provides.
source_path = 'dj_09_filtered_xy.xlsx'
df = pd.read_excel(source_path)
column_index = df.columns
print(column_index)

Index(['acc_x', 'acc_y', '시도명', '시군구명', '도로명', '소재지도로명주소', '횡단보도관리번호',
       '횡단보도종류', '자전거횡단도겸용여부', '고원식적용여부', 'grsXCrd', 'grsYCrd', '차로수', '횡단보도폭',
       '횡단보도연장', '보행자신호등유무', '보행자작동신호기유무', '음향신호기설치여부', '녹색신호시간', '적색신호시간',
       '교통섬유무', '보도턱낮춤여부', '점자블록유무', '집중조명시설유무', '관리기관명', '관리기관전화번호',
       '데이터기준일자', '제공기관코드', '제공기관명'],
      dtype='object')


In [None]:
# Tally how many rows carry each crosswalk management number.
crosswalk_ids = df['횡단보도관리번호']
count_by_id = crosswalk_ids.value_counts()
print(count_by_id)

In [None]:
# Look up each row's crosswalk ID in the tally and store the result as a new
# '사건수' column.
# NOTE(review): every row sharing an ID receives the same count, so an ID that
# occurs k times yields k rows with 사건수 == k — confirm that later analyses
# intend row-level rather than one-row-per-crosswalk data.
row_ids = df['횡단보도관리번호']
df['사건수'] = row_ids.map(count_by_id)

In [None]:
# Report the number of missing values in the two columns used for correlation.
missing_counts = df[['녹색신호시간', '사건수']].isna().sum()
print(missing_counts)

In [None]:
# Show the column index again to confirm the new column is present.
current_columns = df.columns
print(current_columns)

In [2]:
# Persist the frame, including the added '사건수' column, for the analysis below.
# `df` must already exist in the running kernel (execute the cells above first).
# index=False keeps the positional index out of the sheet; with the default it
# is written as an extra unnamed first column that reappears on read_excel.
df.to_excel('dongjak_with_crosswalk_accident_count.xlsx', index=False)

NameError: name 'df' is not defined

## 상관계수 확인(피어슨, 스피어만)

In [3]:
# Reload the workbook that contains the per-crosswalk accident count column.
counted_path = 'dongjak_with_crosswalk_accident_count.xlsx'
df2 = pd.read_excel(counted_path)

In [4]:
from scipy.stats import pearsonr, spearmanr

In [5]:
# Keep only the two columns under study, restricted to rows where both are
# present.
corr_cols = ['녹색신호시간', '사건수']
df_valid = df2.dropna(subset=corr_cols)[corr_cols]

In [41]:
# Confirm which columns survived the selection.
valid_columns = df_valid.columns
print(valid_columns)

Index(['녹색신호시간', '사건수'], dtype='object')


In [13]:
# Print the dtype of each correlation input, accident count first.
for col_name in ('사건수', '녹색신호시간'):
    print(df_valid[col_name].dtype)

int64
float64


In [14]:
# Compute the Pearson and Spearman correlation (with p-values) between green
# signal time and accident count.
green_time = df_valid['녹색신호시간']
accident_count = df_valid['사건수']
pearsonr_corr, p_value1 = pearsonr(green_time, accident_count)
spearmanr_corr, p_value2 = spearmanr(green_time, accident_count)

In [15]:
# Inspect both columns: first the number of distinct values in each, then the
# first ten entries of each.
inspected_cols = ('녹색신호시간', '사건수')
for col_name in inspected_cols:
    print(df_valid[col_name].nunique())
for col_name in inspected_cols:
    print(df_valid[col_name].head(10))

25
6
0     33.0
2     27.0
5     31.0
7     32.0
25    28.0
26    30.0
27    25.0
30    30.0
34    27.0
40    26.0
Name: 녹색신호시간, dtype: float64
0     2
2     2
5     1
7     1
25    3
26    1
27    1
30    1
34    3
40    3
Name: 사건수, dtype: int64


In [16]:
# Format each coefficient to two decimals and its p-value to three, then print
# one result per line.
pearson_line = f"피어슨 상관계수: {pearsonr_corr:.2f} (p={p_value1:.3f})"
spearman_line = f"스피어만 상관계수: {spearmanr_corr:.2f} (p={p_value2:.3f})"
print(pearson_line, spearman_line, sep="\n")

피어슨 상관계수: -0.27 (p=0.002)
스피어만 상관계수: -0.21 (p=0.019)


## 가설에 대한 다중회귀분석

In [30]:
%pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.5-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Using cached patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Downloading statsmodels-0.14.5-cp313-cp313-macosx_11_0_arm64.whl (9.7 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.7/9.7 MB 24.5 MB/s eta 0:00:00
Using cached patsy-1.0.1-py2.py3-none-any.whl (232 kB)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-1.0.1 statsmodels-0.14.5

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


In [32]:
import statsmodels.api as sm

In [65]:
# Print the dtype of every candidate regressor, in the order they are used.
candidate_cols = ('녹색신호시간', '차로수', '횡단보도폭', '보행자신호등유무', '집중조명시설유무')
for col_name in candidate_cols:
    print(df2[col_name].dtypes)

float64
int64
float64
object
object


In [66]:
# Select the regressor columns and discard rows with a missing value in any of
# them.
regressor_cols = ['녹색신호시간', '차로수', '횡단보도폭', '보행자신호등유무', '집중조명시설유무']
df_valid2 = df2.loc[:, regressor_cols].dropna()

In [70]:
# Build the design matrix X and the target y from one jointly filtered frame.
# The target used to be read from df_valid, which is filtered on a different
# column set than df_valid2; the two only lined up when both filters happened
# to keep exactly the same rows. Taking features and target from the same rows
# guarantees that X and y share one index.
feature_cols = ['녹색신호시간', '차로수', '횡단보도폭', '보행자신호등유무', '집중조명시설유무']
reg_rows = df2.loc[df_valid2.index, feature_cols + ['사건수']].dropna()

X = reg_rows[feature_cols]
# One-hot encode the categorical columns '보행자신호등유무' and '집중조명시설유무',
# dropping the first level of each to avoid perfect collinearity with the constant.
# NOTE(review): the recorded summary reports Df Model = 4, which suggests one of
# the two categorical columns had a single level and produced no dummy — confirm.
X = pd.get_dummies(X, drop_first=True)
X = X.astype(float)  # dummies may still be True/False; convert to numeric
X = sm.add_constant(X)  # add the intercept term
y = reg_rows['사건수']

In [71]:
# Fit an ordinary least squares regression of y on X and print the full
# summary table.
ols_spec = sm.OLS(y, X)
model = ols_spec.fit()
summary_table = model.summary()
print(summary_table)

                            OLS Regression Results                            
Dep. Variable:                    사건수   R-squared:                       0.157
Model:                            OLS   Adj. R-squared:                  0.129
Method:                 Least Squares   F-statistic:                     5.667
Date:                Wed, 16 Jul 2025   Prob (F-statistic):           0.000323
Time:                        17:59:19   Log-Likelihood:                -259.37
No. Observations:                 127   AIC:                             528.7
Df Residuals:                     122   BIC:                             543.0
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.0503      0.903      4.484      0.0