In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from pricer.evaluator import evaluate
from pricer.items import Item
import joblib

In [2]:
username = "kshitijchaudhary"
dataset = f"{username}/items_lite"

train, val, test = Item.from_hub(dataset)

print(f"Loaded {len(train):,} training items, {len(val):,} validation items, {len(test):,} test items")

Loaded 20,000 training items, 1,000 validation items, 1,000 test items


In [3]:
def random_pricer(item):
    return random.randrange(1, 1000)

In [4]:
random.seed(42)
evaluate(random_pricer, test)

  0%|          | 0/200 [00:00<?, ?it/s]

[91m$436 [92m$1 [92m$29 [91m$690 [91m$252 [92m$21 [91m$85 [93m$72 [91m$719 [91m$225 [92m$20 [91m$380 [91m$894 [91m$505 [92m$11 [91m$572 [91m$354 [92m$17 [91m$179 [92m$23 [91m$90 [91m$115 [91m$433 [91m$442 [91m$304 [93m$122 [91m$291 [91m$714 [91m$567 [91m$639 [91m$539 [91m$370 [93m$66 [91m$380 [91m$489 [91m$534 [91m$769 [91m$835 [91m$207 [91m$740 [91m$626 [91m$84 [91m$680 [91m$178 [91m$129 [91m$260 [91m$142 [91m$189 [91m$836 [91m$580 [91m$310 [92m$25 [91m$380 [91m$270 [93m$47 [91m$234 [91m$861 [91m$313 [91m$417 [91m$259 [91m$591 [92m$33 [91m$657 [91m$361 [92m$79 [92m$38 [91m$757 [91m$500 [91m$263 [92m$5 [91m$534 [91m$284 [91m$570 [91m$625 [91m$584 [91m$871 [91m$759 [91m$361 [91m$575 [91m$178 [91m$602 [93m$60 [92m$17 [91m$579 [91m$207 [91m$732 [91m$115 [91m$224 [91m$756 [91m$193 [91m$866 [92m$9 [91m$370 [91m$250 [91m$456 [91m$423 [91m$821 [91m$217 [93m$103 [93m$195 [91m$264 [91m$98 [91m

In [7]:
training_prices = [item.price for item in train]
avg_price = sum(training_prices) / len(training_prices)
print(avg_price)

140.347293000021


In [11]:
def average_training_price(item):
    return avg_price

In [13]:
evaluate(average_training_price, train)

  0%|          | 0/200 [00:00<?, ?it/s]

[93m$76 [93m$61 [91m$100 [91m$309 [93m$60 [93m$50 [92m$35 [93m$66 [91m$100 [91m$115 [91m$144 [93m$61 [91m$590 [91m$100 [93m$55 [93m$60 [91m$110 [93m$65 [93m$52 [93m$80 [91m$113 [91m$132 [91m$260 [91m$122 [93m$59 [91m$97 [93m$56 [91m$93 [91m$180 [91m$125 [93m$56 [91m$90 [91m$315 [91m$604 [91m$101 [91m$126 [92m$20 [93m$79 [93m$60 [91m$111 [91m$105 [91m$580 [91m$105 [92m$19 [91m$106 [91m$169 [91m$82 [93m$50 [91m$89 [93m$70 [91m$119 [91m$100 [93m$41 [93m$58 [91m$96 [91m$120 [91m$111 [91m$261 [92m$30 [91m$113 [93m$43 [93m$72 [91m$370 [93m$40 [93m$75 [93m$45 [92m$15 [91m$89 [91m$97 [92m$4 [91m$120 [91m$107 [92m$20 [91m$190 [91m$678 [92m$18 [93m$53 [91m$117 [91m$104 [92m$30 [91m$104 [91m$124 [91m$116 [91m$112 [91m$117 [91m$107 [91m$120 [92m$38 [91m$80 [93m$57 [91m$92 [92m$2 [93m$60 [93m$60 [93m$51 [92m$11 [91m$96 [93m$62 [91m$94 [93m$71 [93m$41 [92m$4 [92m$30 [91m$122 [91m$102 [92m$31 [9

In [16]:
def get_features(item):
    return {
        "weight": item.weight,
        "weight_unknown": 1 if item.weight ==0 else 0,
        "text_length": len(item.summary)
    }

In [25]:
def list_to_dataframe(items):
    features = [get_features(item) for item in train]
    df = pd.DataFrame(features)
    df['price'] = [item.price for item in train]
    return df

train_df = list_to_dataframe(train)
test_df = list_to_dataframe(test)

In [26]:
train_df

Unnamed: 0,weight,weight_unknown,text_length,price
0,1.500000,0,308,64.30
1,0.485017,0,312,79.00
2,0.520000,0,359,240.00
3,2.270000,0,480,449.00
4,0.525000,0,428,79.99
...,...,...,...,...
19995,15.000000,0,371,82.72
19996,0.287500,0,315,149.00
19997,0.000625,0,378,50.56
19998,0.136250,0,366,35.00


In [27]:
random.seed(42)

feature_columns = ['weight', 'weight_unknown', 'text_length']

X_train = train_df[feature_columns]
y_train = train_df['price']
X_test = test_df[feature_columns]
y_test = test_df['price']

model = LinearRegression()
model.fit(X_train, y_train)

for feature, coeff in zip(feature_columns, model.coef_):
    print(f"{feature}: {coeff}")
print(f"Intercept: {model.intercept_}")

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Square Error is: {mse}")
print(f"R-Squared score is: {r2}")



weight: 3.5676099219759356
weight_unknown: 20.49967303604297
text_length: 0.20714449042916366
Intercept: 39.75504757399429
Mean Square Error is: 21774.673894264604
R-Squared score is: 0.1582639698804249


In [63]:
def linear_regression(item):
    features = [get_features(item)]
    df = pd.DataFrame(features)
    print(df)
    return model.predict(df)[0]

linear_regression(test[0])

   weight  weight_unknown  text_length
0     2.0               0          468


np.float64(143.83388893879476)

In [36]:
evaluate(linear_regression, test)

  0%|          | 0/200 [00:00<?, ?it/s]

[93m$75 [93m$51 [93m$74 [93m$49 [93m$60 [92m$37 [92m$13 [93m$48 [91m$93 [91m$195 [91m$577 [91m$231 [91m$103 [93m$80 [92m$25 [93m$78 [93m$50 [93m$56 [93m$43 [92m$10 [92m$17 [92m$13 [93m$73 [92m$22 [91m$190 [91m$328 [91m$349 [91m$103 [93m$46 [92m$19 [91m$107 [91m$94 [93m$42 [92m$9 [91m$89 [91m$664 [93m$67 [93m$44 [93m$83 [93m$77 [93m$70 [93m$46 [93m$78 [93m$73 [91m$97 [91m$92 [91m$104 [91m$92 [92m$31 [93m$55 [91m$85 [92m$2 [91m$344 [92m$7 [93m$69 [92m$24 [91m$104 [91m$88 [92m$39 [91m$122 [91m$108 [92m$36 [92m$23 [92m$6 [93m$238 [92m$29 [91m$99 [91m$267 [92m$4 [91m$94 [91m$83 [91m$117 [91m$166 [91m$88 [93m$71 [93m$65 [92m$6 [91m$93 [91m$104 [91m$87 [93m$75 [91m$88 [91m$86 [92m$11 [91m$81 [93m$72 [93m$77 [91m$157 [93m$74 [93m$54 [91m$93 [93m$52 [91m$92 [91m$81 [91m$89 [91m$108 [91m$94 [92m$12 [91m$123 [91m$415 [92m$39 [92m$15 [93m$71 [92m$30 [91m$115 [92m$5 [91m$85 [91m$301 [91m$99 