In [141]:
import pandas as pd
import numpy as np

# Load the train/test feature and label CSVs from the Datamanim `redwine` folder.
# The generator reads the files in the order listed, one per target name.
DATA_ROOT = 'https://raw.githubusercontent.com/Datamanim/datarepo/main/redwine'

x_train, y_train, x_test, y_test = (
    pd.read_csv(f'{DATA_ROOT}/{stem}.csv')
    for stem in ('x_train', 'y_train', 'x_test', 'y_test')
)

In [142]:
# Columns removed from the feature frames before modelling.
drop_columns = ['ID']

# `drop` returns new frames, so the raw x_train / x_test stay untouched.
x_train_drop, x_test_drop = (
    features.drop(columns=drop_columns) for features in (x_train, x_test)
)

In [143]:
# Outlier clipping (IQR rule) - left disabled: the r2 score got lower with it
# def clipping(df):
#     for col in df.columns:
#         q1 = df[col].quantile(0.25)
#         q3 = df[col].quantile(0.75)
#         iqr = q3 - q1
#         lower = q1 - 1.5 * iqr
#         upper = q3 + 1.5 * iqr
#         df[col] = df[col].clip(lower, upper)
#
# clipping(x_train_drop)
# clipping(x_test_drop)

In [144]:
# Log scaling
def log_scale(df):
    """Replace every column of ``df`` with ``log(1 + x)`` of its values.

    The frame is modified in place and nothing is returned, so calling this
    again on the same frame applies the transform a second time.
    """
    column_names = list(df.columns)
    for column_name in column_names:
        log_values = np.log1p(df[column_name])
        df[column_name] = log_values

# Apply the same in-place log transform to the training and test features.
for feature_frame in (x_train_drop, x_test_drop):
    log_scale(feature_frame)

In [145]:
# Pull the `quality` column out of each label frame to use as the target.
target_column = 'quality'
y_train_t, y_test_t = (labels[target_column] for labels in (y_train, y_test))

In [146]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the training data for validation; the split is seeded and
# stratified on the quality values.
X_t, X_v, y_t, y_v = train_test_split(
    x_train_drop,
    y_train_t,
    test_size=0.33,
    random_state=42,
    stratify=y_train_t,
)

In [147]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor; only the seed is set, every other hyperparameter is
# left at the library default.
rfr = RandomForestRegressor(random_state=23)
# Fit on the training part of the split only (X_v / y_v are kept for validation).
rfr.fit(X_t, y_t)

In [148]:
# Predictions for the fitting part (ptl) and the held-out validation part (pvl).
ptl, pvl = map(rfr.predict, (X_t, X_v))

In [149]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

def report_metrics(label, y_true, y_pred):
    """Print MSE, MAE, MAPE, RMSE and R2 for one data split.

    Parameters
    ----------
    label : str
        Prefix printed at the start of every line (e.g. 'train').
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Model predictions aligned with ``y_true``.
    """
    # MSE is computed once and reused for the RMSE line.
    mse = mean_squared_error(y_true, y_pred)
    print(f'{label} mse: {mse}')
    print(f'{label} mae: {mean_absolute_error(y_true, y_pred)}')
    print(f'{label} mape: {mean_absolute_percentage_error(y_true, y_pred)}')
    print(f'{label} rmse: {np.sqrt(mse)}')
    print(f'{label} r2 score: {r2_score(y_true, y_pred)}')


report_metrics('train', y_t, ptl)
print()  # blank line between the two reports
report_metrics('validation', y_v, pvl)

train mse: 0.05564848130841122
train mae: 0.17183411214953268
train mape: 0.03154380841121495
train rmse: 0.235899303323285
train r2 score: 0.9151128120894929

validation mse: 0.32841773049645395
validation mae: 0.4227186761229314
validation mape: 0.07772016773612518
validation rmse: 0.5730774210317956
validation r2 score: 0.49813428900845513


In [150]:
# Score the fitted model on the separate test set (x_test_drop / y_test_t).
# The lines are labelled "test" so they cannot be mistaken for the
# validation-split metrics, which come from X_v / y_v.
p_test_l = rfr.predict(x_test_drop)

# MSE is computed once and reused for the RMSE line.
test_mse = mean_squared_error(y_test_t, p_test_l)
print(f'test mse: {test_mse}')
print(f'test mae: {mean_absolute_error(y_test_t, p_test_l)}')
print(f'test mape: {mean_absolute_percentage_error(y_test_t, p_test_l)}')
print(f'test rmse: {np.sqrt(test_mse)}')
print(f'test r2 score: {r2_score(y_test_t, p_test_l)}')

validation mse: 0.2291015625
validation mae: 0.31171875
validation mape: 0.06097831101190476
validation rmse: 0.4786455499636448
validation r2 score: 0.6217105263157895
