In [None]:
import os
from datetime import datetime
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb  # XGBoost library
from scipy import stats
from sklearn.impute import SimpleImputer  # imputer for missing values (example alternative to XGBoost's built-in handling)
from sklearn.metrics import mean_squared_error, r2_score  # regression evaluation metrics
from sklearn.model_selection import RandomizedSearchCV  # randomized hyper-parameter search
from sklearn.model_selection import train_test_split  # train/test split
from sklearn.pipeline import Pipeline  # preprocessing + modelling pipeline
from sklearn.preprocessing import StandardScaler  # scaling (optional)

# Plot/display configuration: use the 'Malgun Gothic' font family for matplotlib
# text and lift pandas' row/column display limits so frames are never truncated.
plt.rc('font', family='Malgun Gothic')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# 파일 열기

In [2]:
# Directory that holds the project's input files. It defaults to the original
# local folder, and can be redirected on another machine by setting the
# PRACTICAL_PROJECT_DATA_DIR environment variable instead of editing two paths.
DATA_DIR = Path(os.environ.get(
    "PRACTICAL_PROJECT_DATA_DIR",
    "C:/Users/이정재/OneDrive/바탕 화면/공공데이터/practical_project/csv",
))

# Shapefile read into a GeoDataFrame.
gdf = gpd.read_file(DATA_DIR / "point_1.shp")

# Tabular records; the CSV is cp949-encoded.
df = pd.read_csv(DATA_DIR / "test.csv", encoding='cp949')

In [3]:
# Display the loaded records for a first visual check.
df

Unnamed: 0,농장아이디,개체번호,착유량,착유시작일시,착유종료일시,착유회차,전도도,혈액흐름,온도,유지방,유단백,공기흐름,착유시간,나이,측정일,개체별 일일착유량,개체별 착유일수
0,20278,20130816010079,16,2021-09-01 06:52:00,2021-09-01 07:04:00,1,7.1,0,39.9,4.1,3.3,1.5,12.0,8,1,33,26
1,20278,20130816010079,17,2021-09-01 17:02:00,2021-09-01 17:11:00,2,6.8,0,40.2,4.5,3.2,2.1,9.0,8,1,33,26
2,20278,20130816010079,14,2021-09-02 01:41:00,2021-09-02 01:51:00,1,6.8,0,39.9,4.8,3.1,1.9,10.0,8,2,35,26
3,20278,20130816010079,10,2021-09-02 07:28:00,2021-09-02 07:36:00,2,6.8,0,39.6,5.0,3.1,1.7,8.0,8,2,35,26
4,20278,20130816010079,11,2021-09-02 14:33:00,2021-09-02 14:45:00,3,6.8,0,40.0,4.7,3.2,1.3,12.0,8,2,35,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21653,20264,20191027020116,9,2021-09-25 01:29:00,2021-09-25 01:35:00,1,6.2,0,38.8,3.3,3.3,3.7,6.0,1,25,29,19
21654,20264,20191027020116,9,2021-09-25 09:15:00,2021-09-25 09:21:00,2,6.4,0,39.2,2.8,3.3,3.7,6.0,1,25,29,19
21655,20264,20191027020116,11,2021-09-25 19:28:00,2021-09-25 19:35:00,3,6.4,0,39.5,2.5,3.4,3.9,7.0,1,25,29,19
21656,20264,20191027020116,13,2021-09-26 08:22:00,2021-09-26 08:29:00,1,6.2,0,39.4,2.8,3.4,4.0,7.0,1,26,26,19


# 착유량 이상치 탐지

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the 착유량 column.
df['착유량'].describe()

count    21658.000000
mean        11.699464
std          3.647795
min          0.000000
25%          9.000000
50%         11.000000
75%         14.000000
max         34.000000
Name: 착유량, dtype: float64

In [5]:
# Outlier flag (IQR method)
def iqr_outlier_check(x, k=1.5):
    """Flag the values of a Series that fall outside its IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` where ``IQR = Q3 - Q1``.
    Values exactly on a fence are treated as inliers (``between`` is inclusive).

    Parameters
    ----------
    x : pandas.Series
        Numeric values to check.
    k : float, default 1.5
        Fence multiplier applied to the IQR.

    Returns
    -------
    pandas.Series of bool
        True where the value is an outlier, aligned with ``x``.
        Missing values are reported as False (not an outlier).
    """
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)

    iqr = q3 - q1

    lower = q1 - k * iqr
    upper = q3 + k * iqr

    # between() is False for NaN, so a bare negation would flag every missing
    # value as an outlier; require the value to be present as well.
    return x.notna() & ~x.between(lower, upper)

# Call the detector on the whole column. Passing it to Series.transform makes
# pandas try the function element-wise first and only reach the whole-Series
# call through its internal exception fallback; the direct call gives the same
# boolean column without relying on that.
df['이상치여부(착유량)'] = iqr_outlier_check(df['착유량'])
df

Unnamed: 0,농장아이디,개체번호,착유량,착유시작일시,착유종료일시,착유회차,전도도,혈액흐름,온도,유지방,유단백,공기흐름,착유시간,나이,측정일,개체별 일일착유량,개체별 착유일수,이상치여부(착유량)
0,20278,20130816010079,16,2021-09-01 06:52:00,2021-09-01 07:04:00,1,7.1,0,39.9,4.1,3.3,1.5,12.0,8,1,33,26,False
1,20278,20130816010079,17,2021-09-01 17:02:00,2021-09-01 17:11:00,2,6.8,0,40.2,4.5,3.2,2.1,9.0,8,1,33,26,False
2,20278,20130816010079,14,2021-09-02 01:41:00,2021-09-02 01:51:00,1,6.8,0,39.9,4.8,3.1,1.9,10.0,8,2,35,26,False
3,20278,20130816010079,10,2021-09-02 07:28:00,2021-09-02 07:36:00,2,6.8,0,39.6,5.0,3.1,1.7,8.0,8,2,35,26,False
4,20278,20130816010079,11,2021-09-02 14:33:00,2021-09-02 14:45:00,3,6.8,0,40.0,4.7,3.2,1.3,12.0,8,2,35,26,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21653,20264,20191027020116,9,2021-09-25 01:29:00,2021-09-25 01:35:00,1,6.2,0,38.8,3.3,3.3,3.7,6.0,1,25,29,19,False
21654,20264,20191027020116,9,2021-09-25 09:15:00,2021-09-25 09:21:00,2,6.4,0,39.2,2.8,3.3,3.7,6.0,1,25,29,19,False
21655,20264,20191027020116,11,2021-09-25 19:28:00,2021-09-25 19:35:00,3,6.4,0,39.5,2.5,3.4,3.9,7.0,1,25,29,19,False
21656,20264,20191027020116,13,2021-09-26 08:22:00,2021-09-26 08:29:00,1,6.2,0,39.4,2.8,3.4,4.0,7.0,1,26,26,19,False


In [6]:
# Count rows per outlier flag for 착유량 (keys come out sorted: False, then True).
trouble = (
    df
    .groupby('이상치여부(착유량)')
    .size()
    .reset_index(name='갯수')
)
trouble

Unnamed: 0,이상치여부(착유량),갯수
0,False,21343
1,True,315


In [7]:
# 2. DataFrame restricted to the rows flagged as 착유량 outliers.
milk_outliers_df = df.loc[df['이상치여부(착유량)'].eq(True)]
milk_outliers_df

Unnamed: 0,농장아이디,개체번호,착유량,착유시작일시,착유종료일시,착유회차,전도도,혈액흐름,온도,유지방,유단백,공기흐름,착유시간,나이,측정일,개체별 일일착유량,개체별 착유일수,이상치여부(착유량)
15,20278,20130816010079,22,2021-09-08 04:16:00,2021-09-08 04:27:00,1,7.1,0,39.9,4.3,3.1,2.5,11.0,8,8,40,26,True
22,20278,20130816010079,23,2021-09-12 07:18:00,2021-09-12 07:28:00,1,7.0,0,40.2,4.3,3.2,2.8,10.0,8,12,23,26,True
28,20278,20130816010079,24,2021-09-16 06:15:00,2021-09-16 06:26:00,1,7.3,0,39.4,4.6,3.2,2.5,11.0,8,16,24,26,True
29,20278,20130816010079,27,2021-09-17 06:20:00,2021-09-17 06:33:00,1,7.2,0,39.7,4.8,3.1,2.3,13.0,8,17,37,26,True
41,20278,20130816010079,22,2021-09-23 06:03:00,2021-09-23 06:15:00,1,7.3,0,39.4,4.6,3.2,2.0,12.0,8,23,35,26,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19344,20264,20171015020066,24,2021-09-21 18:09:00,2021-09-21 18:22:00,1,7.2,0,39.8,4.4,3.1,2.5,13.0,3,21,24,26,True
20390,20264,20180802020063,23,2021-09-03 17:54:00,2021-09-03 18:04:00,2,7.2,0,39.9,2.9,3.4,3.6,10.0,3,3,36,26,True
20405,20264,20180802020063,23,2021-09-08 12:03:00,2021-09-08 12:13:00,1,7.3,0,39.9,3.0,3.4,4.2,10.0,3,8,38,26,True
20547,20264,20180821020070,22,2021-09-20 09:07:00,2021-09-20 09:22:00,1,7.4,0,39.4,3.1,3.3,1.8,15.0,3,20,43,26,True


In [8]:
# Frequency of each distinct 착유량 value among the flagged rows,
# ordered by value, with tidy column names.
out = (
    milk_outliers_df
    .groupby('착유량')
    .size()
    .reset_index(name='빈도')
)
out

Unnamed: 0,착유량,빈도
0,0,7
1,1,27
2,22,120
3,23,64
4,24,33
5,25,27
6,26,17
7,27,10
8,28,7
9,29,1


# 착유시간 이상치 탐지

In [9]:
# Re-display the frame before analysing 착유시간.
df

Unnamed: 0,농장아이디,개체번호,착유량,착유시작일시,착유종료일시,착유회차,전도도,혈액흐름,온도,유지방,유단백,공기흐름,착유시간,나이,측정일,개체별 일일착유량,개체별 착유일수,이상치여부(착유량)
0,20278,20130816010079,16,2021-09-01 06:52:00,2021-09-01 07:04:00,1,7.1,0,39.9,4.1,3.3,1.5,12.0,8,1,33,26,False
1,20278,20130816010079,17,2021-09-01 17:02:00,2021-09-01 17:11:00,2,6.8,0,40.2,4.5,3.2,2.1,9.0,8,1,33,26,False
2,20278,20130816010079,14,2021-09-02 01:41:00,2021-09-02 01:51:00,1,6.8,0,39.9,4.8,3.1,1.9,10.0,8,2,35,26,False
3,20278,20130816010079,10,2021-09-02 07:28:00,2021-09-02 07:36:00,2,6.8,0,39.6,5.0,3.1,1.7,8.0,8,2,35,26,False
4,20278,20130816010079,11,2021-09-02 14:33:00,2021-09-02 14:45:00,3,6.8,0,40.0,4.7,3.2,1.3,12.0,8,2,35,26,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21653,20264,20191027020116,9,2021-09-25 01:29:00,2021-09-25 01:35:00,1,6.2,0,38.8,3.3,3.3,3.7,6.0,1,25,29,19,False
21654,20264,20191027020116,9,2021-09-25 09:15:00,2021-09-25 09:21:00,2,6.4,0,39.2,2.8,3.3,3.7,6.0,1,25,29,19,False
21655,20264,20191027020116,11,2021-09-25 19:28:00,2021-09-25 19:35:00,3,6.4,0,39.5,2.5,3.4,3.9,7.0,1,25,29,19,False
21656,20264,20191027020116,13,2021-09-26 08:22:00,2021-09-26 08:29:00,1,6.2,0,39.4,2.8,3.4,4.0,7.0,1,26,26,19,False


In [10]:
# Outlier flag (IQR method)
# NOTE(review): this applies the same IQR-fence rule as `iqr_outlier_check`
# defined earlier in the notebook; consider keeping a single shared helper.
def iqr_time_outlier_check(x, k=1.5):
    """Flag the values of a Series that fall outside its IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` where ``IQR = Q3 - Q1``.
    Values exactly on a fence are treated as inliers (``between`` is inclusive).

    Parameters
    ----------
    x : pandas.Series
        Numeric values to check.
    k : float, default 1.5
        Fence multiplier applied to the IQR.

    Returns
    -------
    pandas.Series of bool
        True where the value is an outlier, aligned with ``x``.
        Missing values are reported as False (not an outlier).
    """
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)

    iqr = q3 - q1

    lower = q1 - k * iqr
    upper = q3 + k * iqr

    # between() is False for NaN, so a bare negation would flag every missing
    # value as an outlier; require the value to be present as well.
    return x.notna() & ~x.between(lower, upper)

# Call the detector on the whole column. Passing it to Series.transform makes
# pandas try the function element-wise first and only reach the whole-Series
# call through its internal exception fallback; the direct call gives the same
# boolean column without relying on that.
df['이상치여부(착유시간)'] = iqr_time_outlier_check(df['착유시간'])
df

Unnamed: 0,농장아이디,개체번호,착유량,착유시작일시,착유종료일시,착유회차,전도도,혈액흐름,온도,유지방,유단백,공기흐름,착유시간,나이,측정일,개체별 일일착유량,개체별 착유일수,이상치여부(착유량),이상치여부(착유시간)
0,20278,20130816010079,16,2021-09-01 06:52:00,2021-09-01 07:04:00,1,7.1,0,39.9,4.1,3.3,1.5,12.0,8,1,33,26,False,False
1,20278,20130816010079,17,2021-09-01 17:02:00,2021-09-01 17:11:00,2,6.8,0,40.2,4.5,3.2,2.1,9.0,8,1,33,26,False,False
2,20278,20130816010079,14,2021-09-02 01:41:00,2021-09-02 01:51:00,1,6.8,0,39.9,4.8,3.1,1.9,10.0,8,2,35,26,False,False
3,20278,20130816010079,10,2021-09-02 07:28:00,2021-09-02 07:36:00,2,6.8,0,39.6,5.0,3.1,1.7,8.0,8,2,35,26,False,False
4,20278,20130816010079,11,2021-09-02 14:33:00,2021-09-02 14:45:00,3,6.8,0,40.0,4.7,3.2,1.3,12.0,8,2,35,26,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21653,20264,20191027020116,9,2021-09-25 01:29:00,2021-09-25 01:35:00,1,6.2,0,38.8,3.3,3.3,3.7,6.0,1,25,29,19,False,False
21654,20264,20191027020116,9,2021-09-25 09:15:00,2021-09-25 09:21:00,2,6.4,0,39.2,2.8,3.3,3.7,6.0,1,25,29,19,False,False
21655,20264,20191027020116,11,2021-09-25 19:28:00,2021-09-25 19:35:00,3,6.4,0,39.5,2.5,3.4,3.9,7.0,1,25,29,19,False,False
21656,20264,20191027020116,13,2021-09-26 08:22:00,2021-09-26 08:29:00,1,6.2,0,39.4,2.8,3.4,4.0,7.0,1,26,26,19,False,False


In [None]:
# Count rows per outlier flag for 착유시간 (keys come out sorted: False, then True).
time_trouble = (
    df
    .groupby('이상치여부(착유시간)')
    .size()
    .reset_index(name='갯수')
)
time_trouble

Unnamed: 0,이상치여부(착유시간),갯수
0,False,21154
1,True,504


In [12]:
# DataFrame restricted to the rows flagged as 착유시간 outliers.
time_outlier_df = df.loc[df['이상치여부(착유시간)'].eq(True)]
time_outlier_df

Unnamed: 0,농장아이디,개체번호,착유량,착유시작일시,착유종료일시,착유회차,전도도,혈액흐름,온도,유지방,유단백,공기흐름,착유시간,나이,측정일,개체별 일일착유량,개체별 착유일수,이상치여부(착유량),이상치여부(착유시간)
29,20278,20130816010079,27,2021-09-17 06:20:00,2021-09-17 06:33:00,1,7.2,0,39.7,4.8,3.1,2.3,13.0,8,17,37,26,True,True
33,20278,20130816010079,19,2021-09-19 06:28:00,2021-09-19 06:41:00,1,7.4,0,40.0,4.4,3.1,1.8,13.0,8,19,31,26,False,True
35,20278,20130816010079,20,2021-09-20 06:24:00,2021-09-20 06:38:00,1,7.3,0,39.7,4.6,3.2,1.7,14.0,8,20,33,26,False,True
38,20278,20130816010079,13,2021-09-21 15:52:00,2021-09-21 16:06:00,2,7.1,0,40.3,5.6,3.2,1.2,14.0,8,21,32,26,False,True
45,20278,20130816010079,20,2021-09-25 06:15:00,2021-09-25 06:28:00,1,7.0,0,39.4,5.1,3.2,1.7,13.0,8,25,32,26,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21277,20264,20190712020091,14,2021-09-21 20:04:00,2021-09-21 20:17:00,3,7.1,0,39.4,2.1,3.6,1.8,13.0,2,21,42,26,False,True
21278,20264,20190712020091,15,2021-09-22 04:57:00,2021-09-22 05:10:00,1,7.0,0,39.2,2.8,3.4,1.8,13.0,2,22,42,26,False,True
21279,20264,20190712020091,16,2021-09-22 14:19:00,2021-09-22 14:32:00,2,6.9,0,39.4,2.2,3.5,1.8,13.0,2,22,42,26,False,True
21523,20264,20190828020077,14,2021-09-01 08:46:00,2021-09-01 08:59:00,1,6.9,0,39.6,2.3,3.5,1.4,13.0,2,1,28,26,False,True


In [14]:
# Summary statistics of 착유시간 restricted to the flagged rows.
time_outlier_df['착유시간'].describe()

count    504.000000
mean      14.380952
std        3.678117
min       13.000000
25%       13.000000
50%       13.000000
75%       15.000000
max       80.000000
Name: 착유시간, dtype: float64

# 모델링

In [None]:
# Engineered columns.
# A zero denominator (유지방 == 0 or 착유시간 == 0) makes the division produce
# +/-inf; map those to NaN so they are handled as missing values instead of
# being passed on as infinite numbers.

# Protein-to-fat ratio of the milk.
df["P/F ratio"] = (df["유단백"] / df["유지방"]).replace([np.inf, -np.inf], np.nan)

# Yield per unit of milking time.
# NOTE: this column is computed from 착유량 itself, so it must not be used as an
# input feature of a model whose target is 착유량.
df["착유효율"] = (df["착유량"] / df["착유시간"]).replace([np.inf, -np.inf], np.nan)

# Mean 전도도 of each animal (개체번호), broadcast back to every row of that animal.
df["개체별 전도도율"] = df.groupby("개체번호")["전도도"].transform("mean")

In [None]:
# Split features (X) from the target (y); '착유량' is the value being predicted.
# '착유효율' is deliberately NOT a feature: it is computed as 착유량 / 착유시간,
# i.e. from the target itself, so including it would leak the answer into the
# inputs and inflate the reported scores.
feature_cols = [
    "개체번호", "공기흐름", "P/F ratio", "개체별 착유일수",
    "측정일", "착유회차", "농장아이디", "개체별 전도도율",
]
X = df[feature_cols]
y = df['착유량']

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"학습 데이터 X_train 형태: {X_train.shape}")
print(f"검증 데이터 X_test 형태: {X_test.shape}")
print("-" * 40)

# Candidate hyper-parameter values sampled by the randomized search.
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 6, 8, 10],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],  # L1 regularization
    'reg_lambda': [0, 0.1, 0.5, 1.0]  # L2 regularization
}

# --- Build the XGBoost model and tune it ---
# 50 sampled configurations, each scored by 5-fold cross-validated R^2 on the
# training split only.
xgb_model = xgb.XGBRegressor(random_state=42)
random_search = RandomizedSearchCV(
    xgb_model, param_grid, n_iter=50,
    cv=5, scoring='r2', random_state=42, n_jobs=-1
)
random_search.fit(X_train, y_train)

# Best configuration, refit on the full training split by the search.
best_model = random_search.best_estimator_

# Predict on the held-out set with the selected model.
y_pred = best_model.predict(X_test)

# Evaluation metrics on the held-out set.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print("\n--- 모델 평가 결과 ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")
print("-" * 40)