In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

data = pd.read_csv('dataset/in_and_out.csv', encoding='cp949')
data

Unnamed: 0,사용월,호선명,지하철역,04시-05시 승차인원,04시-05시 하차인원,05시-06시 승차인원,05시-06시 하차인원,06시-07시 승차인원,06시-07시 하차인원,07시-08시 승차인원,...,23시-24시 하차인원,00시-01시 승차인원,00시-01시 하차인원,01시-02시 승차인원,01시-02시 하차인원,02시-03시 승차인원,02시-03시 하차인원,03시-04시 승차인원,03시-04시 하차인원,작업일자
0,202504,1호선,동대문,650,26,11355,2005,10238,7181,18411,...,9939,938,3210,3,321,0,0,0,0,20250503
1,202504,1호선,동묘앞,117,3,2844,819,4206,5444,9689,...,4897,120,1996,3,267,0,0,0,0,20250503
2,202504,1호선,서울역,712,48,10654,9668,29272,64058,97455,...,29252,5212,8018,10,262,0,0,0,0,20250503
3,202504,1호선,시청,124,5,2267,5900,3888,28761,7841,...,3763,606,1015,0,15,0,0,0,0,20250503
4,202504,1호선,신설동,381,18,9393,2330,10420,9020,24157,...,9416,455,3039,0,0,1,2,0,0,20250503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14857,202305,중앙선,원덕,95,0,394,6,740,182,885,...,328,3,80,0,0,0,0,0,0,20230603
14858,202305,중앙선,중랑,153,5,6886,593,10126,2329,24074,...,5311,34,486,0,0,0,0,0,0,20230603
14859,202305,중앙선,지평,0,0,0,0,4,2,384,...,0,0,0,0,0,0,0,0,0,20230603
14860,202305,중앙선,팔당,0,0,259,78,726,728,1658,...,202,0,0,0,0,0,0,0,0,20230603


In [4]:
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from statsmodels.stats.contingency_tables import Table2x2

# 데이터 불러오기

# 시간대 컬럼 추출
time_slots = [col.split(' ')[0] for col in data.columns if '승차인원' in col]
time_slots = sorted(list(set(time_slots)), key=lambda x: int(x[:2]))

results = []

for i in range(3, len(time_slots)-1):
    prev_slots = time_slots[i-3:i]
    next_slot = time_slots[i+1]
    
    # 이전 3개 시간대 승차/하차 합계
    data['prev_sum'] = data[[f'{slot} 승차인원' for slot in prev_slots]].sum(axis=1)
    data['next'] = data[f'{next_slot} 승차인원']
    
    # 이진 분류 예시: 기준값(중앙값) 이상/미만
    threshold = data['prev_sum'].median()
    data['prev_binary'] = (data['prev_sum'] >= threshold).astype(int)
    data['next_binary'] = (data['next'] >= data['next'].median()).astype(int)
    
    # 2x2 교차표
    table = pd.crosstab(data['prev_binary'], data['next_binary'])
    
    # 오즈비, 신뢰구간, p-value 계산
    if table.shape == (2,2):
        oddsratio, pvalue = fisher_exact(table)
        ci = Table2x2(table.values).oddsratio_confint()
        results.append({
            '기준시간대': time_slots[i],
            '오즈비': oddsratio,
            '신뢰구간': ci,
            'p-value': round(pvalue, 2)
        })

result_df = pd.DataFrame(results)
print(result_df)


      기준시간대         오즈비                                      신뢰구간  p-value
0   03시-04시    1.417855  (1.3292167227304972, 1.5124033266343573)      0.0
1   05시-06시    4.778401    (4.458513353423438, 5.121239711976297)      0.0
2   06시-07시   34.543561  (31.532073635456985, 37.842662978726246)      0.0
3   07시-08시   37.344379  (34.045174325942774, 40.963299681825774)      0.0
4   08시-09시   37.805649   (34.45864625368403, 41.477749298400035)      0.0
5   09시-10시   27.313436  (25.023258965478757, 29.813215379382807)      0.0
6   10시-11시   21.158613  (19.454775640633862, 23.011672343177384)      0.0
7   11시-12시   35.418643     (32.3177319154325, 38.81708907842876)      0.0
8   12시-13시   79.470302      (71.422272488572, 88.42520180193652)      0.0
9   13시-14시  108.833044   (97.12553140924794, 121.95177980577184)      0.0
10  14시-15시  144.885568   (128.3960443633674, 163.49279201114317)      0.0
11  15시-16시  151.420079   (134.0345785808157, 171.06063735085672)      0.0
12  16시-17시   86.877934  