# The Price is Right

## Finishing off with Random Forests, XGBoost & Ensemble

In [0]:
!pip install xgboost

In [0]:
# imports

import os
import re
import math
import json
from tqdm import tqdm
import random
from dotenv import load_dotenv
from huggingface_hub import login
import numpy as np
import pickle
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb
from items import Item
from testing import Tester
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import xgboost as xgb

In [0]:
# CONSTANTS

# Path handed to chromadb.PersistentClient further down; the training data lives in this store
DB = "products_vectorstore"

In [0]:
# environment

# Load variables from .env, then make sure both keys exist in the process
# environment, falling back to a placeholder when one is missing.
load_dotenv(override=True)
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ.setdefault('HF_TOKEN', 'your-key-if-not-using-env')

In [0]:
# Load in the test pickle file:

# NOTE: pickle.load can execute arbitrary code while deserializing, so only open a
# test.pkl that you produced yourself or obtained from a source you trust.
with open('test.pkl', 'rb') as file:
    test = pickle.load(file)
    
# training data is already in Chroma

In [0]:
# Open the on-disk Chroma store located at DB and get a handle on its 'products' collection
client = chromadb.PersistentClient(path=DB)
collection = client.get_or_create_collection('products')

In [0]:
# Pull every record out of the collection in a single call, then split it into
# the pieces used for training: embedding vectors, raw documents and prices.
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
documents = result['documents']
vectors = np.array(result['embeddings'])
prices = [entry['price'] for entry in result['metadatas']]

# Random Forest

We will now train a Random Forest model.

Can you spot the difference from what we did in Week 6? In Week 6 we used a word2vec model to form the vectors; this time we'll use the vectors we already have in Chroma, which came from the SentenceTransformer model.

In [0]:
# This next line takes an hour on my M1 Mac!

# Inputs are the embedding vectors read from Chroma; targets are the matching prices.
# random_state=42 pins the forest's randomness; n_jobs=-1 trains on every CPU core.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(vectors, prices)

`n_jobs=-1` means the model is trained using every available CPU core.

In [0]:
# Save the fitted Random Forest to a file so it can be reloaded without retraining

joblib.dump(rf_model, 'random_forest_model.pkl')

In [0]:
# Load it back in again (from the file written by the previous cell)

rf_model = joblib.load('random_forest_model.pkl')

# XGBoost Model

In [0]:
# Wrap the Chroma vectors and their prices in XGBoost's DMatrix container
train_dmatrix = xgb.DMatrix(vectors, label=prices)

# Settings for the booster: squared-error regression with depth-6 trees
params = dict(
    objective="reg:squarederror",
    max_depth=6,
    learning_rate=0.1,
    nthread=-1,
    verbosity=1,
    subsample=0.8,
)

# Fit 100 boosting rounds on the training matrix
model = xgb.train(params, train_dmatrix, num_boost_round=100)

In [0]:
# Save the trained XGBoost model to a file
joblib.dump(model,'xg_boost_model.pkl')

In [0]:
# Load it back in again; note the reloaded copy is bound to xgb_model, not model
xgb_model = joblib.load('xg_boost_model.pkl')

# Agents

In [0]:
from agents.specialist_agent import SpecialistAgent
from agents.frontier_agent import FrontierAgent
from agents.random_forest_agent import RandomForestAgent
from agents.xg_boost_agent import XGBoostAgent

In [0]:
# One agent per pricing approach; FrontierAgent is the only one handed the Chroma collection
specialist = SpecialistAgent()
frontier = FrontierAgent(collection)
random_forest = RandomForestAgent()
xg_boost = XGBoostAgent()

In [0]:
def description(item):
    """Extract the product description from ``item.prompt``.

    The description is the text that follows the "to the nearest dollar?" question
    and precedes the "Price is $" line. Raises IndexError when the question
    marker is absent from the prompt.
    """
    after_question = item.prompt.split("to the nearest dollar?\n\n")[1]
    before_price = after_question.split("\n\nPrice is $")[0]
    return before_price

In [0]:
def rf(item):
    """Price ``item`` with the Random Forest agent, based on its extracted description."""
    text = description(item)
    return random_forest.price(text)

In [0]:
# Run the Tester harness on the Random Forest pricer, using the items loaded from test.pkl
Tester.test(rf, test)

In [0]:
def xg_b(item):
    """Price ``item`` with the XGBoost agent, based on its extracted description."""
    text = description(item)
    return xg_boost.price(text)

In [0]:
# Quick check: price the first test item with the XGBoost agent
xg_b(test[0])

In [0]:
# Run the Tester harness on the XGBoost pricer, using the items loaded from test.pkl
Tester.test(xg_b, test)

# Moving towards the ensemble model

In [0]:
# Free-text product description used below to try each agent directly
product = "Quadcast HyperX condenser mic for high quality audio for podcasting"

In [0]:
# Print each agent's estimate for the same product, one per line, in the order:
# specialist, frontier, random forest, XGBoost
print(specialist.price(product))
print(frontier.price(product))
print(random_forest.price(product))
print(xg_boost.price(product))

In [0]:
# Collect every agent's estimate, plus the actual price, for a 250-item slice of the test set.
# NOTE(review): `prices` is rebound here. It previously held the training prices read from
# Chroma, so the Random Forest / XGBoost training cells must not be re-run after this cell
# without first re-running the cell that reads the collection.
specialists = []
frontiers = []
random_forests = []
xg_boosts = []
prices = []

for item in tqdm(test[1000:1250]):
    # Every agent prices the same extracted description
    text = description(item)
    specialists.append(specialist.price(text))
    frontiers.append(frontier.price(text))
    random_forests.append(random_forest.price(text))
    xg_boosts.append(xg_boost.price(text))
    prices.append(item.price)

In [0]:
# Lowest and highest of the four agent estimates for each item
mins = [min(estimates) for estimates in zip(specialists, frontiers, random_forests, xg_boosts)]
maxes = [max(estimates) for estimates in zip(specialists, frontiers, random_forests, xg_boosts)]

# One column per agent, followed by the two summary columns
X = pd.DataFrame(dict(
    Specialist=specialists,
    Frontier=frontiers,
    RandomForest=random_forests,
    XGBoost=xg_boosts,
    Min=mins,
    Max=maxes,
))

# Actual prices as a Series: the target the regression is fitted against
y = pd.Series(prices)

In [0]:
# Train a Linear Regression - current
np.random.seed(42)

# fit() hands back the estimator itself, so construction and fitting are chained
lr = LinearRegression().fit(X, y)

feature_columns = list(X.columns)

# Report the weight given to each input column, then the intercept
for feature, coef in zip(feature_columns, lr.coef_):
    print(f"{feature}: {coef:.2f}")
print(f"Intercept={lr.intercept_:.2f}")

In [0]:
# Train a Linear Regression - old vals w/o xg
# NOTE(review): this cell repeats the fit performed in the cell above. It looks like it is
# kept only for outputs recorded before the XGBoost column existed; if re-run, it fits on
# the current X (XGBoost included) and rebinds `lr`. TODO: confirm and remove if redundant.
np.random.seed(42)

lr = LinearRegression()
lr.fit(X, y)

feature_columns = X.columns.tolist()

# Print one "name: weight" line per input column, then the intercept
for feature, coef in zip(feature_columns, lr.coef_):
    print(f"{feature}: {coef:.2f}")
print(f"Intercept={lr.intercept_:.2f}")

In [0]:
# Save the fitted linear regression to a file
# (presumably this is the file EnsembleAgent loads; TODO confirm against agents/ensemble_agent.py)
joblib.dump(lr, 'ensemble_model.pkl')

In [0]:
from agents.ensemble_agent import EnsembleAgent
# Like FrontierAgent, the ensemble agent is constructed with the Chroma collection
ensemble = EnsembleAgent(collection)

In [0]:
ensemble.price(product) #old val - NOTE(review): same call as the next cell; apparently kept for an earlier recorded output, TODO confirm

In [0]:
# Price the sample product with the ensemble agent
ensemble.price(product)

In [0]:
def ensemble_pricer(item):
    """Price ``item`` with the ensemble agent, never returning less than 0."""
    estimate = ensemble.price(description(item))
    # The ensemble can produce a negative value; report 0 in that case
    return estimate if estimate > 0 else 0

In [0]:
Tester.test(ensemble_pricer, test) #old - NOTE(review): same call as the next cell; apparently kept for an earlier recorded output, TODO confirm

In [0]:
# Run the Tester harness on the ensemble pricer, using the items loaded from test.pkl
Tester.test(ensemble_pricer, test)