### 1. 데이터 불러오기

In [39]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [40]:
# CSV 파일 가져오기
# 가져온 CSV 파일의 출처는 보고서 내에 작성하였습니다. / 1학기 프로젝트와 같은 CSV를 활용했습니다.

daily_traffic = pd.read_csv('daily_traffic.csv', encoding='cp949')
daily_traffic

daily_traffic = daily_traffic.transpose()
daily_traffic

Unnamed: 0,0,1,2,3,4,5,6,7,8
권역,수도권본부,강원본부,대전충남본부,전북본부,광주전남본부,대구경북본부,부산경남본부,충북본부,전국
입출구,출구,출구,출구,출구,출구,출구,출구,출구,출구
2020.12.06,1078145,178826,273571,105799,227531,323017,536274,129682,2852845
2020.12.07,1423350,197219,308932,113400,231053,360733,643109,154886,3432682
2020.12.08,1405607,194636,298639,107446,225886,355062,628480,148861,3364617
...,...,...,...,...,...,...,...,...,...
2022.06.12,1236052,209919,319281,122368,253680,396703,624024,162065,3324092
2022.06.13,1494344,221271,343045,121501,240760,424691,680155,174276,3700043
2022.06.14,1494851,211687,327413,112034,226741,393165,618857,165773,3550521
2022.06.15,1458354,206979,323828,112161,238745,410879,673152,165167,3589265


In [41]:
precipitation = pd.read_csv('precipitation.csv', encoding='cp949')
precipitation

Unnamed: 0,지점번호,지점명,일시,강수량(mm),1시간최다강수량(mm),1시간최다강수량시각,Unnamed: 6
0,108,서울,2020-01-01,0.1,,,
1,108,서울,2020-01-02,,,,
2,108,서울,2020-01-03,,,,
3,108,서울,2020-01-04,,,,
4,108,서울,2020-01-05,,,,
...,...,...,...,...,...,...,...
894,108,서울,2022-06-13,18.0,15.5,4:00,
895,108,서울,2022-06-14,0.0,0.0,,
896,108,서울,2022-06-15,18.5,5.8,1:44,
897,108,서울,2022-06-16,0.7,0.3,0:35,


### 2. 데이터 전처리

In [42]:
print('일일 교통량 데이터의 결측치\n', daily_traffic.isna().sum(), end='\n\n')
print('강수량 데이터의 결측치\n', precipitation.isna().sum(), end='\n\n')

일일 교통량 데이터의 결측치
 0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64

강수량 데이터의 결측치
 지점번호              0
지점명               0
일시                0
강수량(mm)         531
1시간최다강수량(mm)    669
1시간최다강수량시각      744
Unnamed: 6      750
dtype: int64



In [43]:
# 강수량 데이터의 빈 곳을 모두 0으로 채워줌 --> 로지스틱 회귀에서 1 또는 0이 타겟 데이터가 되어야 하기 때문

precipitation = precipitation.fillna(0)
print('강수량 데이터의 결측치\n', precipitation.isna().sum(), end='\n\n')

강수량 데이터의 결측치
 지점번호            0
지점명             0
일시              0
강수량(mm)         0
1시간최다강수량(mm)    0
1시간최다강수량시각      0
Unnamed: 6      0
dtype: int64



In [44]:
# 강수량 데이터만 필요하기 때문에, 나머지 칼럼들을 드롭해줌

precipitation = precipitation.rename(columns={'일시':'날짜'})
precipitation = precipitation.drop(columns=['지점번호', '지점명', '1시간최다강수량(mm)', '1시간최다강수량시각', 'Unnamed: 6'])

for i in range(len(precipitation)):
  precipitation['날짜'][i] = precipitation['날짜'][i].replace('-', '')
  
precipitation

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  precipitation['날짜'][i] = precipitation['날짜'][i].replace('-', '')


Unnamed: 0,날짜,강수량(mm)
0,20200101,0.1
1,20200102,0.0
2,20200103,0.0
3,20200104,0.0
4,20200105,0.0
...,...,...
894,20220613,18.0
895,20220614,0.0
896,20220615,18.5
897,20220616,0.7


In [45]:
# 로지스틱 회귀 분석을 사용하기 위해, 강수가 있으면 1 / 없으면 0으로 변경시킴

precipitation.loc[(precipitation['강수량(mm)'] >= 0.1), '강수 여부'] = '1'
precipitation.loc[(precipitation['강수량(mm)'] < 0.1), '강수 여부'] = '0'

precipitation = precipitation[precipitation['날짜'].between('20201206','20220616')]

precipitation

Unnamed: 0,날짜,강수량(mm),강수 여부
340,20201206,0.0,0
341,20201207,0.0,0
342,20201208,0.0,0
343,20201209,0.0,0
344,20201210,0.0,0
...,...,...,...
893,20220612,0.0,0
894,20220613,18.0,1
895,20220614,0.0,0
896,20220615,18.5,1


In [46]:
daily_traffic = daily_traffic.drop(labels=['권역', '입출구'], axis=0)
daily_traffic = daily_traffic.drop(columns=list(range(0,8)), axis=1)
daily_traffic = daily_traffic.rename(columns={8:'전국'})

daily_traffic

Unnamed: 0,전국
2020.12.06,2852845
2020.12.07,3432682
2020.12.08,3364617
2020.12.09,3384849
2020.12.10,3421378
...,...
2022.06.12,3324092
2022.06.13,3700043
2022.06.14,3550521
2022.06.15,3589265


In [47]:
daily_traffic = daily_traffic.reset_index()
daily_traffic = daily_traffic.rename(columns={'index':'날짜'})

for i in range(len(daily_traffic)):
  daily_traffic['날짜'][i] = daily_traffic['날짜'][i].replace('.', '')
  daily_traffic['전국'][i] = daily_traffic['전국'][i].replace(',', '')
daily_traffic

Unnamed: 0,날짜,전국
0,20201206,2852845
1,20201207,3432682
2,20201208,3364617
3,20201209,3384849
4,20201210,3421378
...,...,...
553,20220612,3324092
554,20220613,3700043
555,20220614,3550521
556,20220615,3589265


In [48]:
result = pd.merge(daily_traffic, precipitation)
result 

Unnamed: 0,날짜,전국,강수량(mm),강수 여부
0,20201206,2852845,0.0,0
1,20201207,3432682,0.0,0
2,20201208,3364617,0.0,0
3,20201209,3384849,0.0,0
4,20201210,3421378,0.0,0
...,...,...,...,...
553,20220612,3324092,0.0,0
554,20220613,3700043,18.0,1
555,20220614,3550521,0.0,0
556,20220615,3589265,18.5,1


In [67]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 558 entries, 0 to 557
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   날짜       558 non-null    object 
 1   전국       558 non-null    object 
 2   강수량(mm)  558 non-null    float64
 3   강수 여부    558 non-null    object 
dtypes: float64(1), object(3)
memory usage: 21.8+ KB


### 3. 로지스틱 회귀 분석

In [75]:
# 데이터셋 분리
from sklearn.model_selection import train_test_split

x = result['전국']
y = result['강수 여부']

In [76]:
x = x.to_numpy()
x = x.reshape(-1, 1)
print(x.shape)

y = y.to_numpy()
y = y.reshape(-1, 1)
print(y.shape)

x_train, x_target, y_train, y_target = train_test_split(x, y, random_state=42)

(558, 1)
(558, 1)


In [77]:
from sklearn.linear_model import LogisticRegression

# 로지스틱 회귀 모델 학습
model = LogisticRegression(penalty = 'l2')
model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


In [78]:
from sklearn.metrics import accuracy_score

# 로지스틱 모델 학습 성능 비교
y_pred = model.predict(x_target)

# 정확도 측정
accuracy_score(y_pred, y_target)

0.7428571428571429