### **📊2Cycle**

In [1]:
import pandas as pd

# Load agricultural_yield_train.csv; the path is relative to the notebook's working directory.
a_df = pd.read_csv("./datasets/agricultural_yield_train.csv")

In [2]:
# Preprocess on a copy so the frame loaded into a_df is kept as-is.
pre_a2_df = a_df.copy()

In [3]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature columns (every column except the last, which is taken
# as the target) so each one can be screened on a common z-score scale.
std = StandardScaler()
features, targets = pre_a2_df.iloc[:, :-1], pre_a2_df.iloc[:, -1]

std_features = std.fit_transform(features)
# fit_transform returns a bare array, so the row labels and column names are
# restored explicitly. The row labels of std_a_df are used afterwards to select
# rows from pre_a2_df, so they must match pre_a2_df's index rather than rely on
# it happening to be the default 0..n-1 range.
std_a_df = pd.DataFrame(std_features, index=features.index, columns=features.columns)
std_a_df

Unnamed: 0,Soil_Quality,Seed_Variety,Fertilizer_Amount_kg_per_hectare,Sunny_Days,Rainfall_mm,Irrigation_Schedule
0,1.490574,0.651538,-0.379691,-0.534440,-0.562263,-0.911955
1,1.210921,-1.534831,1.478208,-0.944175,0.170481,0.882073
2,-0.760181,0.651538,-0.518473,-0.260358,-0.801688,1.330580
3,1.054793,0.651538,-1.031443,1.350364,0.472624,0.882073
4,0.470857,0.651538,0.665692,-1.691294,-0.657619,0.433566
...,...,...,...,...,...,...
15995,0.157907,0.651538,-0.291644,0.319436,-1.805051,0.882073
15996,1.723213,0.651538,1.488357,2.371729,1.466779,-0.463448
15997,-0.501131,0.651538,-0.766464,0.238554,0.138373,1.330580
15998,0.717026,0.651538,1.009705,1.505851,1.943933,0.433566


In [4]:
# Remove outliers from std_a_df: keep only the rows whose standardized value
# lies within [-1.96, 1.96] (inclusive) in every column.
in_range = pd.Series(True, index=std_a_df.index)
for column in std_a_df.columns:
    in_range &= std_a_df[column].between(-1.96, 1.96)
std_a_df = std_a_df[in_range]
std_a_df

Unnamed: 0,Soil_Quality,Seed_Variety,Fertilizer_Amount_kg_per_hectare,Sunny_Days,Rainfall_mm,Irrigation_Schedule
0,1.490574,0.651538,-0.379691,-0.534440,-0.562263,-0.911955
1,1.210921,-1.534831,1.478208,-0.944175,0.170481,0.882073
2,-0.760181,0.651538,-0.518473,-0.260358,-0.801688,1.330580
3,1.054793,0.651538,-1.031443,1.350364,0.472624,0.882073
4,0.470857,0.651538,0.665692,-1.691294,-0.657619,0.433566
...,...,...,...,...,...,...
15994,-1.551899,-1.534831,-0.037217,-1.082106,1.371297,0.882073
15995,0.157907,0.651538,-0.291644,0.319436,-1.805051,0.882073
15997,-0.501131,0.651538,-0.766464,0.238554,0.138373,1.330580
15998,0.717026,0.651538,1.009705,1.505851,1.943933,0.433566


In [5]:
# Apply the outlier removal to the original (unscaled) data frame by keeping
# only the row labels that survived the filter on std_a_df.
kept_rows = std_a_df.index
pre_a2_df = pre_a2_df.loc[kept_rows]
pre_a2_df

Unnamed: 0,Soil_Quality,Seed_Variety,Fertilizer_Amount_kg_per_hectare,Sunny_Days,Rainfall_mm,Irrigation_Schedule,Yield_kg_per_hectare
0,96.415657,1,147.853040,94.593926,444.267569,3,683.759119
1,92.352626,0,281.565396,90.504644,517.585491,7,678.714861
2,63.714785,1,137.864940,97.329340,420.310945,8,934.691975
3,90.084256,1,100.946659,113.404828,547.817646,7,905.842541
4,81.600341,1,223.088908,83.048176,434.726333,6,897.584665
...,...,...,...,...,...,...,...
15994,52.212040,0,172.500785,89.128052,637.738300,7,551.288654
15995,77.053550,1,154.189768,103.115855,319.915079,7,956.781949
15997,67.478487,1,120.017122,102.308627,514.372747,8,932.829561
15998,85.176890,1,247.847389,114.956634,695.035801,6,889.738438


In [6]:
import numpy as np
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score

def get_evaluation(y_test, prediction):
    """Print regression metrics comparing predictions with the true targets.

    Prints MSE, RMSE, MSLE, RMSLE and R2 on a single line, each formatted
    with four decimal places.

    Args:
        y_test: True target values.
        prediction: Predicted values to compare against ``y_test``.

    Returns:
        None. The metrics are only printed.
    """
    MSE = mean_squared_error(y_test, prediction)
    RMSE = np.sqrt(MSE)
    # NOTE(review): scikit-learn documents mean_squared_log_error as rejecting
    # negative values - confirm targets and predictions are non-negative.
    MSLE = mean_squared_log_error(y_test, prediction)
    # Reuse the MSLE computed above instead of evaluating the metric a second time.
    RMSLE = np.sqrt(MSLE)
    R2 = r2_score(y_test, prediction)
    print('MSE: {:.4f}, RMSE: {:.4f}, MSLE: {:.4f}, RMSLE: {:.4f}, R2: {:.4f}'\
          .format(MSE, RMSE, MSLE, RMSLE, R2))

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Baseline: plain linear regression on the outlier-filtered, unscaled features.
features = pre_a2_df.iloc[:, :-1]
targets = pre_a2_df.iloc[:, -1]

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=124,
)

l_r = LinearRegression()
l_r.fit(X_train, y_train)

In [8]:
# Predict on the held-out split with the fitted l_r and print its metrics.
prediction = l_r.predict(X_test)
get_evaluation(y_test, prediction)

MSE: 2532.9262, RMSE: 50.3282, MSLE: 0.0072, RMSLE: 0.0848, R2: 0.9318


In [9]:
from sklearn.preprocessing import PolynomialFeatures

# Linear regression again, this time on a degree-2 polynomial expansion of the features.
features = pre_a2_df.iloc[:, :-1]
targets = pre_a2_df.iloc[:, -1]

poly_expander = PolynomialFeatures(degree=2)
poly_features = poly_expander.fit_transform(features)

# NOTE(review): this split uses random_state=0 while the other visible splits
# use 124, so the held-out rows differ and the scores are not strictly
# comparable across cells - confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    poly_features,
    targets,
    test_size=0.2,
    random_state=0,
)

l_r = LinearRegression()
l_r.fit(X_train, y_train)

In [10]:
# Predict on the current held-out split with the current l_r and print its metrics.
# In cell order, l_r and X_test here come from the polynomial-feature cell just above.
prediction = l_r.predict(X_test)
get_evaluation(y_test, prediction)

MSE: 2508.3455, RMSE: 50.0834, MSLE: 0.0068, RMSLE: 0.0824, R2: 0.9307


In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

# Compare several tree-based regressors on one shared train/test split.
features = pre_a2_df.iloc[:, :-1]
targets = pre_a2_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=124,
)

# The scikit-learn estimators are seeded explicitly; the XGBoost and LightGBM
# estimators are created without an explicit seed.
dt_r = DecisionTreeRegressor(random_state=124)
rf_r = RandomForestRegressor(random_state=124, n_estimators=1000)
gb_r = GradientBoostingRegressor(random_state=124)
xgb_r = XGBRegressor()
lgb_r = LGBMRegressor(n_estimators=100)

models = [dt_r, rf_r, gb_r, xgb_r, lgb_r]

# Fit each model, then print its class name followed by its test-set metrics.
for model in models:
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    print(type(model).__name__)
    get_evaluation(y_test, prediction)

DecisionTreeRegressor
MSE: 5684.6010, RMSE: 75.3963, MSLE: 0.0160, RMSLE: 0.1266, R2: 0.8470
RandomForestRegressor
MSE: 3005.1370, RMSE: 54.8191, MSLE: 0.0087, RMSLE: 0.0935, R2: 0.9191
GradientBoostingRegressor
MSE: 2696.3186, RMSE: 51.9261, MSLE: 0.0078, RMSLE: 0.0886, R2: 0.9274
XGBRegressor
MSE: 2978.5480, RMSE: 54.5761, MSLE: 0.0086, RMSLE: 0.0925, R2: 0.9198
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000259 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1032
[LightGBM] [Info] Number of data points in the train set: 11119, number of used features: 6
[LightGBM] [Info] Start training from score 706.786014
LGBMRegressor
MSE: 2718.2718, RMSE: 52.1370, MSLE: 0.0079, RMSLE: 0.0890, R2: 0.9268


### **📝2Cycle 결과**