In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from datetime import datetime
from scipy import stats
import io
import sys
import xgboost as xgb # XGBoost 라이브러리
from sklearn.model_selection import train_test_split # 데이터 분할
from sklearn.metrics import mean_squared_error, r2_score # 회귀 모델 평가 지표
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler # 스케일링 (선택적)
from sklearn.pipeline import Pipeline # 전처리 및 모델링 파이프라인
from sklearn.impute import SimpleImputer # 결측치 처리를 위한 Imputer (XGBoost 자체 기능 대신 사용 예시)
import os

plt.rcParams['font.family'] = 'Malgun Gothic'

In [12]:
gdf = gpd.read_file("C:/Users/이정재/OneDrive/바탕 화면/공공데이터/practical_project/csv/point_1.shp")
df = pd.read_csv('C:/Users/이정재/OneDrive/바탕 화면/공공데이터/practical_project/csv/test_eng.csv')
df_1 = pd.read_csv('C:/Users/이정재/OneDrive/바탕 화면/공공데이터/practical_project/csv/test.csv',encoding='cp949')

In [13]:
df

Unnamed: 0,farm_id,individual_id,milk_yield,milking_start_time,milking_end_time,milking_session,conductivity,blood_flow,temperature,milk_fat,milk_protein,air_flow,milking_duration,age,measurement_date,daily_milk_yield_per_individual,milking_days_per_individual
0,20264,20121004020089,10,2021-09-01 02:44:00,2021-09-01 02:57:00,1,6.3,0,38.3,5.5,3.3,1.0,13.0,8,1,27,26
1,20264,20121004020089,9,2021-09-01 09:59:00,2021-09-01 10:09:00,2,6.5,0,38.0,5.2,3.3,1.2,10.0,8,1,27,26
2,20264,20121004020089,8,2021-09-01 16:24:00,2021-09-01 16:35:00,3,6.4,0,38.5,5.3,3.3,1.3,11.0,8,1,27,26
3,20264,20121004020089,11,2021-09-02 01:19:00,2021-09-02 01:31:00,1,6.3,0,39.0,4.9,3.3,1.2,12.0,8,2,30,26
4,20264,20121004020089,9,2021-09-02 08:12:00,2021-09-02 08:19:00,2,6.5,0,39.0,5.2,3.3,2.2,7.0,8,2,30,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21653,21133,20191016010101,14,2021-09-25 10:23:00,2021-09-25 10:31:00,2,7.5,0,38.6,3.6,3.2,2.2,8.0,1,25,44,13
21654,21133,20191016010101,15,2021-09-25 19:22:00,2021-09-25 19:32:00,3,7.6,0,39.2,3.4,3.3,2.1,10.0,1,25,44,13
21655,21133,20191016010101,16,2021-09-26 06:41:00,2021-09-26 06:51:00,1,7.6,0,38.7,3.5,3.2,2.0,10.0,1,26,42,13
21656,21133,20191016010101,12,2021-09-26 14:15:00,2021-09-26 14:23:00,2,7.4,0,39.4,3.5,3.2,2.0,8.0,1,26,42,13


In [14]:
df["P/F ratio"] = df["milk_protein"]/ df["milk_fat"]
df["effectiveness"] = df["milk_yield"] / df["milking_duration"]
df["conductivity_per_individual"] = df.groupby("individual_id")["conductivity"].transform("mean")

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error, r2_score

# 1) 데이터 준비
y = df['milk_yield']
# drop target & 불필요 컬럼
feature_cols = ['farm_id','individual_id', 'milking_session',
    'conductivity','blood_flow','temperature','milk_fat','milk_protein',
    'air_flow','milking_duration','age',
    'milking_days_per_individual'
]
X = df[feature_cols].copy()
# 2) 범주형 인코딩: farm_id, individual_id
cat_feats = ['farm_id','individual_id']
te = TargetEncoder(cols=cat_feats)
X = te.fit_transform(X, y)
# 3) Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
# 4) 개별 모델 정의
rf  = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
xgb = XGBRegressor(n_estimators=200, random_state=42, n_jobs=-1)
lgb = LGBMRegressor(n_estimators=200, random_state=42, n_jobs=-1)
# 5) 개별 모델 학습
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)
lgb.fit(X_train, y_train)
# 6) Voting 앙상블
voting = VotingRegressor([('rf', rf), ('xgb', xgb), ('lgb', lgb)])
voting.fit(X_train, y_train)
# 7) Stacking 앙상블 (메타모델: Ridge)
stack = StackingRegressor(
    estimators=[('rf', rf), ('xgb', xgb), ('lgb', lgb)],
    final_estimator=Ridge(),
    cv=5,
    n_jobs=-1
)
stack.fit(X_train, y_train)
# (8) 평가 예시
for name, m in [('RF', rf), ('XGB', xgb), ('LGBM', lgb),
                ('Voting', voting), ('Stacking', stack)]:
    rmse = root_mean_squared_error(y_test, m.predict(X_test))
    r2 = r2_score(y_test, m.predict(X_test))
    print(f"{name} RMSE: {rmse:.3f}, r2: {r2:.3f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000960 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 587
[LightGBM] [Info] Number of data points in the train set: 17326, number of used features: 11
[LightGBM] [Info] Start training from score 11.720362
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000420 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 587
[LightGBM] [Info] Number of data points in the train set: 17326, number of used features: 11
[LightGBM] [Info] Start training from score 11.720362


UnicodeEncodeError: 'ascii' codec can't encode characters in position 18-20: ordinal not in range(128)

In [None]:
# 특성(X)과 타겟(y) 분리
# '착유량'이 예측하려는 타겟 변수
X = df[["individual_id","air_flow","P/F ratio","milking_days_per_individual","measurement_date","milking_session","farm_id","effectiveness","conductivity_per_individual"]]
y = df['milk_yield']
# 학습 세트와 검증 세트 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"X_train : {X_train.shape}")
print(f"X_test : {X_test.shape}")
print("-" * 40)
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 6, 8, 10],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],  # L1 정규화
    'reg_lambda': [0, 0.1, 0.5, 1.0]  # L2 정규화
}
# --- XGBoost 모델 생성 및 학습 ---
xgb_model = xgb.XGBRegressor(random_state=42)
random_search = RandomizedSearchCV(
    xgb_model, param_grid, n_iter=50,
    cv=5, scoring='r2', random_state=42, n_jobs=-1
)
random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_
# 검증 세트에 대한 예측
y_pred = random_search.predict(X_test)
# 성능 지표 계산
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"\n--- 모델 평가 결과 ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")
print("-" * 40)

X_train : (17326, 9)
X_test : (4332, 9)
----------------------------------------


UnicodeEncodeError: 'ascii' codec can't encode characters in position 18-20: ordinal not in range(128)

In [None]:
df_1

Unnamed: 0,농장아이디,개체번호,착유량,착유시작일시,착유종료일시,착유회차,전도도,혈액흐름,온도,유지방,유단백,공기흐름,착유시간,나이,측정일,개체별 일일착유량,개체별 착유일수
0,20278,20130816010079,16,2021-09-01 06:52:00,2021-09-01 07:04:00,1,7.1,0,39.9,4.1,3.3,1.5,12.0,8,1,33,26
1,20278,20130816010079,17,2021-09-01 17:02:00,2021-09-01 17:11:00,2,6.8,0,40.2,4.5,3.2,2.1,9.0,8,1,33,26
2,20278,20130816010079,14,2021-09-02 01:41:00,2021-09-02 01:51:00,1,6.8,0,39.9,4.8,3.1,1.9,10.0,8,2,35,26
3,20278,20130816010079,10,2021-09-02 07:28:00,2021-09-02 07:36:00,2,6.8,0,39.6,5.0,3.1,1.7,8.0,8,2,35,26
4,20278,20130816010079,11,2021-09-02 14:33:00,2021-09-02 14:45:00,3,6.8,0,40.0,4.7,3.2,1.3,12.0,8,2,35,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21653,20264,20191027020116,9,2021-09-25 01:29:00,2021-09-25 01:35:00,1,6.2,0,38.8,3.3,3.3,3.7,6.0,1,25,29,19
21654,20264,20191027020116,9,2021-09-25 09:15:00,2021-09-25 09:21:00,2,6.4,0,39.2,2.8,3.3,3.7,6.0,1,25,29,19
21655,20264,20191027020116,11,2021-09-25 19:28:00,2021-09-25 19:35:00,3,6.4,0,39.5,2.5,3.4,3.9,7.0,1,25,29,19
21656,20264,20191027020116,13,2021-09-26 08:22:00,2021-09-26 08:29:00,1,6.2,0,39.4,2.8,3.4,4.0,7.0,1,26,26,19
