# Random Forest Price Prediction (Nicolas)


In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

lego = pd.read_csv('lego_data_clean_translated.csv')

lego = lego.drop(columns=['toy_name', 'toy_name_en']) 
lego = pd.get_dummies(lego, columns=['colection'], drop_first=True)

lego['llavero'] = lego['llavero'].astype(int)
lego['original'] = lego['original'].astype(int)
lego['adult'] = lego['adult'].astype(int)

X = lego.drop(columns=['price', 'price_usd'])  
y = lego['price']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())


model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


print("Mean Squared Error:", mse)
print("R-squared:", r2)


Mean Squared Error: 701973.1789321447
R-squared: 0.8665953253669112


# Gradient Boosting Price Predictor (Preston)

In [64]:
from catboost import CatBoostRegressor

In [65]:
cat_model = CatBoostRegressor(iterations=5000, learning_rate=0.001, depth=10, l2_leaf_reg=3, bagging_temperature=0.8, early_stopping_rounds=50)
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=50)

0:	learn: 2368.6276605	test: 2293.3660173	best: 2293.3660173 (0)	total: 2.37ms	remaining: 11.9s
50:	learn: 2294.1364022	test: 2213.5496732	best: 2213.5496732 (50)	total: 266ms	remaining: 25.8s
100:	learn: 2223.1960439	test: 2137.1925407	best: 2137.1925407 (100)	total: 521ms	remaining: 25.3s
150:	learn: 2155.7399513	test: 2064.9792951	best: 2064.9792951 (150)	total: 789ms	remaining: 25.3s
200:	learn: 2091.8395030	test: 1997.6052427	best: 1997.6052427 (200)	total: 1.04s	remaining: 24.8s
250:	learn: 2029.7371178	test: 1931.5555025	best: 1931.5555025 (250)	total: 1.29s	remaining: 24.5s
300:	learn: 1969.7826954	test: 1867.8314522	best: 1867.8314522 (300)	total: 1.54s	remaining: 24.1s
350:	learn: 1912.6034907	test: 1807.1659556	best: 1807.1659556 (350)	total: 1.79s	remaining: 23.7s
400:	learn: 1858.6983758	test: 1750.2696600	best: 1750.2696600 (400)	total: 2.04s	remaining: 23.4s
450:	learn: 1806.3140702	test: 1695.1326392	best: 1695.1326392 (450)	total: 2.29s	remaining: 23.1s
500:	learn: 175

<catboost.core.CatBoostRegressor at 0x21cb84247a0>

In [62]:
y_pred_grad = cat_model.predict(X_test)

In [63]:
rmse_grad = mean_squared_error(y_test, y_pred_grad , squared=False)
mse_grad = mean_squared_error(y_test, y_pred_grad )
r2_grad = r2_score(y_test, y_pred_grad)

print("Root Mean Squared Error:", rmse_grad)
print("Mean Squared Error:", mse_grad)
print("R-squared:", r2_grad)

Root Mean Squared Error: 813.69213905135
Mean Squared Error: 662094.8971539614
R-squared: 0.8741738901400524
