In [2]:
import json
import random
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
import tensorflow
from sklearn.linear_model import * 
from tqdm import tqdm

In [8]:
# `null` is no longer needed for loading (json maps JSON null to None); the name is
# kept defined in case other cells still rely on it.
null = None

# The whole dataset is stored as a single JSON document on the first line of the file.
# Parse it with json instead of eval: eval executes arbitrary code found in the file
# and chokes on JSON literals such as true/false. The `with` block closes the handle.
with open('datasets/130k_wine_data.json') as wine_file:
    data = json.loads(wine_file.readline())

In [9]:
# Keep only the reviews whose price is truthy (drops records with a null price).
data_w_price = list(filter(lambda d: d['price'], data))

In [10]:
# Shuffle in place with a dedicated, seeded generator so the train/valid/test split
# below is the same on every Restart & Run All. The shuffle mutates the list, so
# re-running this cell alone shuffles the already-shuffled data again.
random.Random(0).shuffle(data_w_price)

In [11]:
# Partition the shuffled records: first 80,000 for training, the next 20,000 for
# validation, and everything after index 100,000 for testing.
train, valid, test = (
    data_w_price[:80000],
    data_w_price[80000:100000],
    data_w_price[100000:],
)

 Data Exploration
 ===

Data Fields:
---
1. `Points`: the number of points WineEnthusiast rated the wine on a scale of 1-100 (though they say they only post reviews for wines that score >=80)
2. `Title`: the title of the wine review, which often contains the vintage if you're interested in extracting that feature
3. `Variety`: the type of grapes used to make the wine (e.g., Pinot Noir)
4. `Description`: a few sentences from a sommelier describing the wine's taste, smell, look, feel, etc.
5. `Country`: the country that the wine is from
6. `Province`: the province or state that the wine is from
7. `Region 1`: the wine growing area in a province or state (e.g., Napa)
8. `Region 2`: sometimes there are more specific regions specified within a wine growing area (e.g., Rutherford inside the Napa Valley), but this value can sometimes be blank
9. `Winery`: the winery that made the wine
10. `Designation`: the vineyard within the winery where the grapes that made the wine are from
11. `Price`: the cost for a bottle of the wine
12. `Taster Name`: name of the person who tasted and reviewed the wine
13. `Taster Twitter Handle`: Twitter handle for the person who tasted and reviewed the wine

In [12]:
# Report how many distinct values each field takes across the full dataset.
# Field names are taken from the first record.
for k in data[0]:
    unique_values = {d[k] for d in data}
    print(k + ': ' + str(len(unique_values)))

points: 21
province: 426
variety: 708
description: 119955
region_2: 18
taster_name: 20
country: 44
title: 118840
taster_twitter_handle: 16
region_1: 1230
designation: 37980
winery: 16757
price: 391


In [13]:
one_hot_features = ['country', 'designation', 'province', 'region_1', 'region_2', 'variety', 'winery']

# Bins are stored as sorted lists rather than sets: the iteration order of a set of
# strings can change between interpreter sessions (string hash randomization), which
# would silently permute the feature columns from one kernel run to the next.
# Missing values (None) sort first; everything else is ordered by its string form.
one_hot_bins = {
    k: sorted({d[k] for d in data}, key=lambda v: (v is not None, str(v)))
    for k in one_hot_features
}

def featurize(data, keys):
    """One-hot encode the categorical fields named in `keys` for every record.

    Args:
        data: iterable of record dicts; each record must contain every encoded key.
        keys: field names to encode. Names that are not listed in
            `one_hot_features` are ignored.

    Returns:
        A list with one flat list of 0/1 ints per record. The per-field blocks are
        concatenated in the order given by `keys`, and within a block the positions
        follow the order of `one_hot_bins[field]`. A value that is not present in
        the bins produces an all-zero block. If none of `keys` is a one-hot
        feature, every vector is empty.
    """
    active_keys = [k for k in keys if k in one_hot_features]

    # Value -> column position per field, built once per call so each record costs
    # one dict lookup per field instead of one comparison per bin.
    positions = {
        k: {b: i for i, b in enumerate(one_hot_bins[k])}
        for k in active_keys
    }

    def feat(datum, keys):
        feature_vector = []
        for k in keys:
            block = [0] * len(one_hot_bins[k])
            hit = positions[k].get(datum[k])
            if hit is not None:
                block[hit] = 1
            feature_vector.extend(block)
        return feature_vector

    return [feat(datum, active_keys) for datum in tqdm(data)]

In [14]:
# Training inputs: one-hot encoding of grape variety and winery.
x = featurize(train, ['variety', 'winery'])
# Training targets: the bottle price of each training record.
y = list(map(lambda d: d['price'], train))

100%|██████████| 80000/80000 [08:20<00:00, 159.71it/s]


In [15]:
# Create the linear regression model that is fitted on the one-hot features below.
# `LinearRegression` is brought in by the star import from sklearn.linear_model.
reg = LinearRegression()

In [None]:
# Fit on the plain Python lists: `x` holds the one-hot feature rows for the training
# records and `y` their prices.
reg.fit(x, y)

In [None]:
# Encode the held-out records with the same fields (and order) used for training.
# The original line was missing the opening quote on 'variety', a syntax error.
test_x = featurize(test, ['variety', 'winery'])

In [None]:
# predict expects a 2-D input of shape (n_samples, n_features); wrap the single
# feature row in a list so it is treated as one sample rather than a 1-D array.
reg.predict([test_x[0]])

In [None]:
# Ground-truth prices for the held-out records, aligned with the rows of test_x.
test_y = list(map(lambda d: d['price'], test))

In [None]:
# Compare the predicted and actual price for one held-out record. The feature row is
# wrapped in a list because predict expects a 2-D (n_samples, n_features) input.
print(reg.predict([test_x[4]]))
print(test_y[4])

In [None]:
# Display one raw held-out record for manual inspection.
test[3]

In [19]:
# Convert each feature row to an ndarray (with a progress bar), then combine the
# rows into a single array for scikit-learn.
X = np.array(list(map(np.array, tqdm(x))))

100%|██████████| 80000/80000 [02:12<00:00, 604.64it/s]


In [20]:
# Check the matrix dimensions: one row per training record, one column per one-hot bin.
X.shape

(80000, 17465)

In [21]:
# Convert the price targets to a NumPy array; note this rebinds `y` from a list to an array.
y = np.array(y)

In [22]:
# Fit the linear regression on the NumPy feature matrix and price targets.
reg.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)