# Project Wine Reviews

In [1]:
# importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
% matplotlib inline

pd.options.display.max_seq_items = 500
pd.set_option('max_rows', 500)
pd.set_option('max_columns', 500)

In [2]:
# importing data
wine = pd.read_csv('winemag-data_first150k.csv')

In [3]:
wine = wine.dropna(subset=['price'])

In [4]:
wine['points_bins'] = pd.qcut(wine['points'], q=5, 
                              labels=['very low','low', 'medium', 'high', 'very high'])

In [5]:
wine_price_cap = wine[wine.price < 200] 
print(len(wine_price_cap))

136368


In [6]:
wine_price_cap.head(1)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,points_bins
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,very high


In [7]:
wine_grouped = wine.groupby(['country'])
print(wine_price_cap.country.value_counts()[:10])

US             62021
Italy          18641
France         14375
Spain           8102
Chile           5753
Argentina       5580
Australia       4855
Portugal        4156
New Zealand     3070
Austria         2480
Name: country, dtype: int64


In [8]:
wine_country_cap = wine_price_cap[
    wine_price_cap.country.isin(wine_price_cap.country.value_counts()[:12].index)]
wine_country_cap.head(1)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,points_bins
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,very high


## Step 2: Decision Trees - Regression

In [9]:
# import Sklean modules
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [10]:
# convert country column into dummy varables
wine_country_dummy = pd.get_dummies(wine_country_cap['country'])
wine_country_cap_dummy = pd.concat([wine_country_cap, wine_country_dummy], axis=1)
wine_country_cap_dummy.head(1)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,points_bins,Argentina,Australia,Austria,Chile,France,Germany,Italy,New Zealand,Portugal,South Africa,Spain,US
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,Tinta de Toro,Bodega Carmen Rodríguez,very high,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
train, validate, test = np.split(wine_country_cap_dummy.sample(frac=1),
                                 [int(0.7*len(wine_country_cap_dummy)), 
                                  int(0.85*len(wine_country_cap_dummy))])
print(len(train),len(validate),len(test))

93494 20035 20035


In [12]:
X_train = train[['price','Argentina','Australia','Austria','Chile',
                            'France','Germany','Italy','New Zealand','Portugal',
                            'South Africa','Spain','US']]
X_train = np.array(X_train)
y_train = train['points']
y_train = np.array(y_train)

In [13]:
X_validate = validate[['price','Argentina','Australia','Austria','Chile',
                            'France','Germany','Italy','New Zealand','Portugal',
                            'South Africa','Spain','US']]
X_validate = np.array(X_validate)
y_validate = validate['points']
y_validate = np.array(y_validate)

In [14]:
X_test = test[['price','Argentina','Australia','Austria','Chile',
                            'France','Germany','Italy','New Zealand','Portugal',
                            'South Africa','Spain','US']]
X_test = np.array(X_test)
y_test = test['points']
y_test = np.array(y_test)

In [15]:
# build model - fit - predict optimal model based on DT_validation output
model = DecisionTreeRegressor(min_samples_leaf=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_validate)
y_pred_test = model.predict(X_test)

In [16]:
# MAE evaluation
y_training_pred = model.predict(X_train)
print('MAE over training set:    ', mean_absolute_error(y_training_pred, y_train))
print('MAE over validate set:    ', mean_absolute_error(y_pred, y_validate))
print('----------------------------')
# MSE evaluation
print('MSE over training set:    ', mean_squared_error(y_training_pred, y_train))
print('MSE over vaildate set:    ', mean_squared_error(y_pred, y_validate))
print('----------------------------')
# R squared evaluation
print('R^2 over training set:    ', model.score(X_train, y_train))
print('R^2 over validate set:    ', model.score(X_validate, y_validate))

MAE over training set:     1.97161999052
MAE over validate set:     1.96374793333
----------------------------
MSE over training set:     6.21021215344
MSE over vaildate set:     6.12482288364
----------------------------
R^2 over training set:     0.389536355583
R^2 over validate set:     0.38934802596


In [17]:
# Top 10 features (words that influence the review points)
sorted(zip(model.feature_importances_, ['price','Argentina','Australia','Austria','Chile',
                                        'France','Germany','Italy','New Zealand','Portugal',
                                        'South Africa','Spain','US']),reverse=True)

[(0.94118373463876959, 'price'),
 (0.020947059077981559, 'US'),
 (0.010062749850896465, 'Portugal'),
 (0.0062696535975796872, 'Spain'),
 (0.005250478572854273, 'Argentina'),
 (0.0045995334406191291, 'Austria'),
 (0.0031710365963955457, 'Italy'),
 (0.0018020518467852544, 'France'),
 (0.0015676144308618722, 'Germany'),
 (0.0014656782068593853, 'Australia'),
 (0.0013805888796878463, 'Chile'),
 (0.0012873074641332352, 'New Zealand'),
 (0.0010125133965760904, 'South Africa')]

### Count Vectorizer

In [18]:
from sklearn.svm import LinearSVR

In [19]:
# Step 1: Count Vectorization
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
count_vectorizer = CountVectorizer(stop_words='english', min_df=20)
X_train = count_vectorizer.fit_transform(train['description'])
X_validate = count_vectorizer.transform(validate['description'])
X_test = count_vectorizer.transform(test['description'])

In [21]:
print(X_train.shape)
print(type(X_train))

(93494, 5488)
<class 'scipy.sparse.csr.csr_matrix'>


In [22]:
# build model - fit - predict with optimal sample parameter
model = LinearSVR(C=0.1)
model.fit(X_train, y_train)
y_pred2 = model.predict(X_validate)
y_pred_test2 = model.predict(X_test)

In [23]:
print(model.score(X_train, y_train))
print(model.get_params)

0.732886538468
<bound method BaseEstimator.get_params of LinearSVR(C=0.1, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0)>


In [24]:
# MAE evaluation
y_training_pred = model.predict(X_train)
print('MAE over training set:    ', mean_absolute_error(y_training_pred, y_train))
print('MAE over validate set:    ', mean_absolute_error(y_pred2, y_validate))
print('----------------------------')
# MSE evaluation
print('MSE over training set:    ', mean_squared_error(y_training_pred, y_train))
print('MSE over vaildate set:    ', mean_squared_error(y_pred2, y_validate))
print('----------------------------')
# R squared evaluation
print('R^2 over training set:    ', model.score(X_train, y_train))
print('R^2 over validate set:    ', model.score(X_validate, y_validate))

MAE over training set:     1.23931403478
MAE over validate set:     1.3334056202
----------------------------
MSE over training set:     2.71733014787
MSE over vaildate set:     2.97333707105
----------------------------
R^2 over training set:     0.732886538468
R^2 over validate set:     0.703554831476


In [25]:
sorted(zip(model.coef_, count_vectorizer.get_feature_names()),reverse=True)[:10]

[(1.4103955771903898, '2025'),
 (1.3470780402431501, 'gorgeous'),
 (1.328747805255311, 'goldeneye'),
 (1.326735599589314, 'wow'),
 (1.3247913276552603, 'stunningly'),
 (1.2967389938447833, 'auslese'),
 (1.2818850777221646, 'rancio'),
 (1.2753252328004105, '2023'),
 (1.2575827419638976, 'barolo'),
 (1.2572608436191748, 'brunello')]

## Combining Models

In [26]:
from sklearn.metrics import r2_score

In [27]:
def r2_max():
    r2 = []
    for w in np.arange(0,1,0.1):
        y_combined = (1-w) * y_pred + w * y_pred2
        score = r2_score(y_validate, y_combined)
        r2.append((w,score))
    return max(r2, key=lambda pair:pair[1])

In [28]:
w = r2_max()[0]

In [29]:
y_combined = (1-w) * y_pred + w * y_pred2

In [30]:
# MAE evaluation
print('MAE over validate set:    ', mean_absolute_error(y_combined, y_validate))

MAE over validate set:     1.28881765685


In [31]:
r2_score(y_validate, y_combined)

0.72956657002529524

## Predictions over Test Set 

In [35]:
y_combined = (1-w) * y_pred_test + w * y_pred_test2
# MAE evaluation
print('MAE over test set:    ', mean_absolute_error(y_combined, y_test))
print('MSE over test set:    ', mean_squared_error(y_combined, y_test))
print('R^2 over test set:    ', r2_score(y_test, y_combined))

MAE over test set:     1.29059404912
MSE over test set:     2.71376507102
R^2 over test set:     0.732454713314
