In [23]:
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import matplotlib.dates as mdates
from matplotlib import rcParams
from statsmodels.tsa.stattools import acf,adfuller
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from warnings import filterwarnings
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

plt.rcParams['font.family'] ='Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

url = 'https://apis.data.go.kr/1480523/WaterQualityService/getWaterMeasuringList'
key = 'a6TS+sDQ+cPdv92zVxHz2EI1fV2sWy5kXdu9x/+QasyDSIU3105j6dts2lS2dnW7Y/YFXSw32QdOqyagGTCI8Q=='

In [24]:
# 2. 주소에 있던 파라미터 그대로 설정
params = {
    'serviceKey': key,
    'pageNo': '1',
    'numOfRows': '3000',
    'resultType': 'json',
    'ptNoList': '2022A30,2022A10', # 측정소 코드
    'wmyrList': '2021,2022,2023,2024,2025',      # 연도
    'wmodList': '01,02,03,04,05,06,07,08,09,10,11,12'        # 월(01, 02, 03월)
}

try:
    # 3. API 요청
    response = requests.get(url, params=params, verify=True) # https 대응
    
    if response.status_code == 200:
        data = response.json()
        
        # 4. 데이터 추출 (구조: getWaterMeasuringList -> item)
        items = data.get('getWaterMeasuringList', {}).get('item', [])
        
        if items:
            df = pd.DataFrame(items)
            # df.set_index('ROWNO',inplace=True)

            # 5. 주요 수질 항목 한글 이름으로 변경 (보기 좋게)
            rename_map = {
                'PT_NM': '총량지점명',
                'WMCYMD': '일자',
                'ITEM_TEMP': '수온', # 단위 : ℃
                'ITEM_PH': '수소이온농도(ph)',
                'ITEM_EC': '전기전도도(EC)',    # 단위 : μS/㎝
                'ITEM_DOC': '용존산소(DO)', # 단위 : ㎎/L
                'ITEM_BOD': 'BOD',  # 단위 : ㎎/L
                'ITEM_COD': 'COD',  # 단위 : ㎎/L
                'ITEM_SS': '부유물질',  # 단위 : ㎎/L
                'ITEM_TN': '총질소(T-N)',   # 단위 : ㎎/L
                'ITEM_TP': '총인(T-P)', # 단위 : ㎎/L
                'ITEM_TOC': '총유기탄소(TOC)',  # 단위 : ㎎/L
                'ITEM_AMNT': '유량',    # 단위 : ㎥/s
                'ITEM_CLOA': '클로로필-a'
            }
            
            # 2. 필요한 컬럼만 추출하여 새 DF 생성
            waterDF = df[list(rename_map.keys())].copy()
            waterDF.rename(columns=rename_map, inplace=True)

            # 3. 데이터 타입 숫자형으로 변환 (연산 가능하게)
            num_cols = ['수온', '수소이온농도(ph)', '전기전도도(EC)', '용존산소(DO)', 'BOD', 'COD', '부유물질', '총질소(T-N)', '총인(T-P)', '총유기탄소(TOC)', '유량','클로로필-a']
            waterDF[num_cols] = waterDF[num_cols].apply(pd.to_numeric, errors='coerce')

            # # 4. 날짜 데이터 형식 변환 (YYYYMMDD -> datetime)
            waterDF['일자'] = pd.to_datetime(waterDF['일자'], errors='coerce')

            print("--- 분석 준비 완료: 핵심 수질 지표 ---")
            print(waterDF.head())
        else:
            print("응답은 성공했으나 데이터가 없습니다.")
    else:
        print(f"API 요청 실패: {response.status_code}")

except Exception as e:
    print(f"오류 발생: {e}")

--- 분석 준비 완료: 핵심 수질 지표 ---
  총량지점명         일자   수온  수소이온농도(ph)  전기전도도(EC)  용존산소(DO)  BOD  COD  부유물질  \
0    물금 2021-01-04  4.6         7.5      437.0      13.1  1.7  5.1   2.4   
1    물금 2021-01-11  2.8         7.3      461.0      14.3  2.0  5.4   4.0   
2    물금 2021-01-26  5.1         7.2      470.0      15.6  2.1  6.0   7.6   
3    물금 2021-01-18  3.1         7.3      463.0      15.2  2.2  5.9   6.4   
4    물금 2021-02-16  6.6         8.4      451.0      12.1  2.8  6.6  12.4   

   총질소(T-N)  총인(T-P)  총유기탄소(TOC)      유량  클로로필-a  
0     3.223    0.025         3.8  29.555    12.4  
1     3.268    0.031         4.0  28.599    19.4  
2     3.416    0.026         4.6  74.897    47.3  
3     3.327    0.032         4.4  47.190    33.2  
4     3.489    0.027         5.0  72.735    50.4  


In [25]:
# 필수: 지점+일자 정렬
waterDF = waterDF.sort_values(['총량지점명', '일자']).reset_index(drop=True)

# 같은 지점/같은 날짜 중복이 있으면 평균으로 1개로 정리(필요시)
waterDF = (waterDF.groupby(['총량지점명','일자'], as_index=False).mean(numeric_only=True))

# 다시 정렬
waterDF = waterDF.sort_values(['총량지점명', '일자']).reset_index(drop=True)

waterDF


Unnamed: 0,총량지점명,일자,수온,수소이온농도(ph),전기전도도(EC),용존산소(DO),BOD,COD,부유물질,총질소(T-N),총인(T-P),총유기탄소(TOC),유량,클로로필-a
0,금곡,2021-01-18,3.3,7.2,449.0,15.2,2.0,5.7,5.6,3.194,0.028,4.2,47.810,30.2
1,금곡,2021-01-26,4.9,7.2,460.0,16.0,2.5,6.3,7.2,3.473,0.025,4.7,76.170,44.9
2,금곡,2021-02-01,5.1,7.8,477.0,15.5,2.5,6.3,9.2,3.289,0.027,5.0,27.880,59.9
3,금곡,2021-02-16,6.4,7.9,462.0,11.9,3.4,6.9,13.2,3.566,0.027,5.2,73.770,53.5
4,금곡,2021-02-22,7.0,8.1,458.0,13.5,2.7,6.6,8.8,3.515,0.033,4.8,24.350,50.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,물금,2025-09-23,26.5,7.7,315.0,8.0,1.8,7.1,7.6,1.948,0.057,3.8,357.228,17.3
402,물금,2025-09-29,25.7,7.5,280.0,7.9,1.6,7.1,5.2,1.820,0.073,3.7,439.794,30.2
403,물금,2025-10-13,23.5,8.3,290.0,8.9,1.7,6.6,4.8,1.916,0.042,3.2,443.528,38.1
404,물금,2025-10-21,20.1,7.7,181.0,9.0,1.2,6.3,4.8,3.067,0.086,4.6,447.369,21.4


In [26]:
H = 7  # 예측 시점(일) : 7일 뒤

waterDF['chl_a_t7'] = waterDF.groupby('총량지점명')['클로로필-a'].shift(-H)


In [27]:
base_cols = [
    '수온','용존산소(DO)','BOD','COD','총질소(T-N)','총인(T-P)','유량'
]

# lag 피처
lags = [1, 3, 7]
for col in base_cols:
    for l in lags:
        waterDF[f'{col}_lag{l}'] = waterDF.groupby('총량지점명')[col].shift(l)

# rolling 피처(7일)
win = 7
for col in base_cols + ['클로로필-a']:
    g = waterDF.groupby('총량지점명')[col]
    waterDF[f'{col}_r{win}_mean'] = g.transform(lambda s: s.rolling(win).mean())
    waterDF[f'{col}_r{win}_max']  = g.transform(lambda s: s.rolling(win).max())

# 최근 7일 변화량(트렌드)
waterDF['클로로필-a_delta7'] = waterDF.groupby('총량지점명')['클로로필-a'].diff(7)
waterDF['총인(T-P)_delta7']  = waterDF.groupby('총량지점명')['총인(T-P)'].diff(7)
waterDF['수온_delta7']       = waterDF.groupby('총량지점명')['수온'].diff(7)

In [35]:
target = 'chl_a_t7'

feature_cols = []
# lag/rolling로 만들어진 컬럼 자동 수집
for c in waterDF.columns:
    if ('_lag' in c) or ('_r7_' in c) or ('_delta7' in c):
        feature_cols.append(c)

# 지점도 피처로 쓰려면 더미로 넣는 방식(통합 모델용)
modelDF = waterDF.copy()
modelDF = pd.get_dummies(modelDF, columns=['총량지점명'], drop_first=True)

# 더미 컬럼 추가
site_cols = [c for c in modelDF.columns if c.startswith('총량지점명_')]
feature_cols = feature_cols + site_cols

# 학습용만 추출
modelDF = modelDF.dropna(subset=[target] + feature_cols).copy()

X = modelDF[feature_cols]
y = modelDF[target]


In [29]:
split_date = pd.to_datetime('2023-01-01')

train_idx = modelDF['일자'] < split_date
test_idx  = modelDF['일자'] >= split_date

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [30]:
lr_model = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])

lr_model.fit(X_train, y_train)
pred = lr_model.predict(X_test)

print("LinearRegression")
print("R²:", r2_score(y_test, pred))
print("MAE:", mean_absolute_error(y_test, pred))

LinearRegression
R²: -0.7148866366563571
MAE: 15.578083605778682


In [31]:
rf = RandomForestRegressor(
    n_estimators=400,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)

print("\nRandomForestRegressor")
print("R²:", r2_score(y_test, pred_rf))
print("MAE:", mean_absolute_error(y_test, pred_rf)) #실제 오차 크기 (얼마나 틀리는지)


RandomForestRegressor
R²: -0.29378680927631495
MAE: 13.84985074626865


In [32]:
imp = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(imp.head(15))


수온_delta7           0.097612
총질소(T-N)_r7_mean    0.095197
유량_r7_mean          0.067371
클로로필-a_r7_mean      0.059071
총질소(T-N)_lag1       0.044531
COD_r7_max          0.042015
수온_lag1             0.036554
COD_lag1            0.036070
용존산소(DO)_lag1       0.029136
총질소(T-N)_lag3       0.028821
총인(T-P)_delta7      0.028334
수온_r7_max           0.027894
총인(T-P)_r7_mean     0.026633
COD_lag3            0.023405
BOD_lag7            0.019074
dtype: float64
