<a href="https://colab.research.google.com/github/keonu4230/keon/blob/main/predicted_rating2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from catboost import CatBoostRegressor

In [2]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [36]:
# 1. Load the data
# The file is decoded as 'utf-8-sig', so a leading UTF-8 BOM (if any) is dropped.
source_csv = "preprocessed_SeriesOn.csv"
data = pd.read_csv(source_csv, encoding='utf-8-sig')

# Report the (rows, columns) shape of the raw frame
print("원본 데이터 크기:", data.shape)

원본 데이터 크기: (29185, 24)


In [37]:
# Data preprocessing
# Build the feature frame by removing the 'rating' and 'like' columns
# ('rating' is used as the target further down).
# A later cell writes a 'Predicted_Rating' column back into `data`; it is
# dropped here as well when present, so that re-running this cell after the
# prediction step cannot feed the model's own predictions back in as a feature.
# `errors='ignore'` applies only to that optional column: a missing 'rating'
# or 'like' column still raises KeyError exactly as before.
X = (
    data
    .drop(columns=['rating', 'like'])
    .drop(columns=['Predicted_Rating'], errors='ignore')
)

In [38]:
# Encode categorical variables (label encoding)
# One LabelEncoder is created per object-dtype column and kept in
# `label_encoders`, keyed by column name, so the mapping stays available later.
categorical_columns = X.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, encoder in label_encoders.items():
    # Values are cast to str before fitting, so every entry in the column
    # (whatever its original Python type) is encoded as a string label.
    X[col] = encoder.fit_transform(X[col].astype(str))

# Target variable: the 'rating' column, selected by X's index labels
y = data['rating'].loc[X.index]

In [39]:
# Print the (rows, columns) shape of the feature frame X
print("최종 데이터 크기:", X.shape)

최종 데이터 크기: (29185, 22)


In [40]:
# Split the data into training and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each split
for label, part in (("학습 데이터 크기:", X_train), ("테스트 데이터 크기:", X_test)):
    print(label, part.shape)

학습 데이터 크기: (23348, 22)
테스트 데이터 크기: (5837, 22)


In [44]:
# Hyperparameter tuning and model evaluation helper
def tune_and_evaluate(model, params, X_train, y_train, X_test, y_test,
                      cv=3, scoring='neg_mean_squared_error', n_jobs=None, verbose=1):
    """Grid-search `model` over `params`, then score the best estimator on the test split.

    Parameters
    ----------
    model : estimator
        Regressor handed to ``GridSearchCV``.
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Data the grid search is fitted on.
    X_test, y_test
        Data the best estimator is evaluated on.
    cv : int, default 3
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : str, default 'neg_mean_squared_error'
        Scoring setting forwarded to ``GridSearchCV``; used to rank candidates.
    n_jobs : int or None, default None
        Forwarded to ``GridSearchCV``.
    verbose : int, default 1
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_model, rmse, mae, r2)`` where ``best_model`` is
        ``grid.best_estimator_`` and the three metrics are computed from its
        predictions on ``X_test`` against ``y_test``.
    """
    grid = GridSearchCV(model, params, cv=cv, scoring=scoring, n_jobs=n_jobs, verbose=verbose)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # All reported metrics come from the test split, not from the CV folds.
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return best_model, rmse, mae, r2

# Models and their hyperparameter grids
def _boosting_param_grid():
    """Return a fresh copy of the grid that both boosting models are searched over."""
    return {
        'n_estimators': [100, 500, 1000],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }


# Each entry is (display name, unfitted estimator, parameter grid).
models = [
    ('XGBoost', xgb.XGBRegressor(random_state=42), _boosting_param_grid()),
    ('CatBoost', CatBoostRegressor(verbose=False, random_state=42), _boosting_param_grid()),
]

# Tune every candidate and keep the one with the lowest RMSE
# NOTE(review): the RMSE compared here is the one tune_and_evaluate computes on
# X_test / y_test, so the winner is chosen on the test split and the RMSE
# printed at the end is not an independent estimate.
best_model, best_model_name, best_score = None, None, float('inf')

for name, model, params in models:
    print(f"Training {name} model...")
    tuned_model, rmse, mae, r2 = tune_and_evaluate(model, params, X_train, y_train, X_test, y_test)
    print(f'{name} RMSE: {rmse}, MAE: {mae}, R²: {r2}')

    # Only a strictly lower RMSE replaces the current best (ties keep the earlier model).
    if not rmse < best_score:
        continue
    best_model, best_model_name, best_score = tuned_model, name, rmse

print(f'Best model: {best_model_name} with RMSE: {best_score}')

Training XGBoost model...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
XGBoost RMSE: 2.045897821476823, MAE: 1.364774619235418, R²: 0.6129623671816327
Training CatBoost model...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
CatBoost RMSE: 2.1789641148380507, MAE: 1.5376945540839495, R²: 0.5609788218134586
Best model: XGBoost with RMSE: 2.045897821476823


In [45]:
# Predict ratings for the whole dataset: X holds every row, so this includes
# the rows the model was trained on as well as the held-out test rows.
y_pred_all = best_model.predict(X)

# Add the predictions to the original DataFrame as a new column
# NOTE(review): the following cell repeats these two statements verbatim
# before printing and saving; one of the two copies is likely redundant.
data['Predicted_Rating'] = y_pred_all

In [46]:
# Predict ratings for the whole dataset and attach them to the original frame.
# X holds every row, so training rows are predicted too.
# NOTE(review): these two statements repeat the preceding cell verbatim; one of
# the two copies is likely redundant.
y_pred_all = best_model.predict(X)
data['Predicted_Rating'] = y_pred_all

# Show the actual rating next to the predicted one
rating_columns = ['rating', 'Predicted_Rating']
print(data[rating_columns])

# Save the full frame (including the new column) without the index
data.to_csv('predicted_ratings2.csv', index=False)

       rating  Predicted_Rating
0        6.33          6.312947
1       10.00          9.936676
2        8.89          8.849211
3        8.89          8.207392
4       10.00          9.716569
...       ...               ...
29180    8.26          8.351068
29181    8.26          8.299956
29182    6.66          6.899862
29183    6.66          6.522031
29184    4.45          4.758372

[29185 rows x 2 columns]


In [47]:
# Read the saved file back and display the actual vs. predicted rating columns
df = pd.read_csv("predicted_ratings2.csv", encoding='utf-8-sig')

df.loc[:, ['rating', 'Predicted_Rating']]

Unnamed: 0,rating,Predicted_Rating
0,6.33,6.312947
1,10.00,9.936676
2,8.89,8.849211
3,8.89,8.207392
4,10.00,9.716569
...,...,...
29180,8.26,8.351069
29181,8.26,8.299956
29182,6.66,6.899862
29183,6.66,6.522031
