In [79]:
# Import initial dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from xgboost.sklearn import XGBClassifier  
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm
%matplotlib inline

In [7]:
# Load the master wine table from CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="../data/master_wine_table.csv")

In [8]:
# Preview the first five rows of the raw table
df.head(n=5)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,title,variety,winery,freq,c_freq,red,rose,white,sparkling,flavor_categories_taste_notes
0,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2466.0,4856.0,1.0,0.0,0.0,0.0,"light, fruity"
1,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,13272.0,52329.0,1.0,0.0,0.0,0.0,"medium bodied, balanced"
2,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Napa,Virginie Boone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature,9472.0,52329.0,1.0,0.0,0.0,0.0,"light, fruity"
3,US,"Slightly reduced, this wine offers a chalky, t...",,87,34.0,California,Alexander Valley,Sonoma,Virginie Boone,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini,9472.0,52329.0,1.0,0.0,0.0,0.0,"medium bodied, balanced"
4,Argentina,"Baked plum, molasses, balsamic vinegar and che...",Felix,87,30.0,Other,Cafayate,,Michael Schachner,Felix Lavaque 2010 Felix Malbec (Cafayate),Malbec,Felix Lavaque,2652.0,3502.0,1.0,0.0,0.0,0.0,"medium bodied, balanced"


In [26]:
 # Names of the columns that contain at least one missing value
 df.loc[:, df.isna().any()].columns.tolist()

['designation', 'region_1', 'region_2', 'taster_name']

In [27]:
# Number of rows in the raw table
df.shape[0]

109662

In [28]:
# Keep only the reviews that name a taster. Despite the variable name, missing
# values in other columns are NOT removed here; only 'taster_name' is checked.
has_taster = df['taster_name'].notna()
df_no_nans = df[has_taster]
df_no_nans.shape[0]

86944

In [51]:
# Create feature DF
feature_df = df_no_nans[['points','country','price', 'red', 'rose', 'white', 'sparkling',
       'taster_name', 'freq', 'c_freq']]

In [52]:
# One-hot encode the two categorical predictors ('country' and 'taster_name')
# into indicator columns; the numeric columns pass through unchanged.
# Work on a copy so feature_df itself is left untouched.
data = feature_df.copy()

data_binary_encoded = pd.get_dummies(
    data=data,
    columns=["country", "taster_name"],
)
data_binary_encoded.head(n=5)

Unnamed: 0,points,price,red,rose,white,sparkling,freq,c_freq,country_Argentina,country_Australia,...,taster_name_Kerin O’Keefe,taster_name_Lauren Buzzeo,taster_name_Matt Kettmann,taster_name_Michael Schachner,taster_name_Mike DeSimone,taster_name_Paul Gregutt,taster_name_Roger Voss,taster_name_Sean P. Sullivan,taster_name_Susan Kostrzewa,taster_name_Virginie Boone
0,87,15.0,1.0,0.0,0.0,0.0,2466.0,4856.0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,87,65.0,1.0,0.0,0.0,0.0,13272.0,52329.0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,87,19.0,1.0,0.0,0.0,0.0,9472.0,52329.0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,87,34.0,1.0,0.0,0.0,0.0,9472.0,52329.0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,87,30.0,1.0,0.0,0.0,0.0,2652.0,3502.0,1,0,...,0,0,0,1,0,0,0,0,0,0


In [53]:
# Show the dtype of every column in the encoded frame
data_binary_encoded.dtypes

points                              int64
price                             float64
red                               float64
rose                              float64
white                             float64
sparkling                         float64
freq                              float64
c_freq                            float64
country_Argentina                   uint8
country_Australia                   uint8
country_Austria                     uint8
country_Brazil                      uint8
country_Bulgaria                    uint8
country_Canada                      uint8
country_Chile                       uint8
country_England                     uint8
country_France                      uint8
country_Germany                     uint8
country_Greece                      uint8
country_Hungary                     uint8
country_Israel                      uint8
country_Italy                       uint8
country_Lebanon                     uint8
country_Mexico                    

In [54]:
# Show the column labels of the encoded frame
data_binary_encoded.columns

Index(['points', 'price', 'red', 'rose', 'white', 'sparkling', 'freq',
       'c_freq', 'country_Argentina', 'country_Australia', 'country_Austria',
       'country_Brazil', 'country_Bulgaria', 'country_Canada', 'country_Chile',
       'country_England', 'country_France', 'country_Germany',
       'country_Greece', 'country_Hungary', 'country_Israel', 'country_Italy',
       'country_Lebanon', 'country_Mexico', 'country_Moldova',
       'country_Morocco', 'country_New Zealand', 'country_Portugal',
       'country_Romania', 'country_Slovenia', 'country_South Africa',
       'country_Spain', 'country_Turkey', 'country_US', 'country_Uruguay',
       'taster_name_Alexander Peartree', 'taster_name_Anna Lee C. Iijima',
       'taster_name_Anne Krebiehl MW', 'taster_name_Carrie Dykes',
       'taster_name_Christina Pickard', 'taster_name_Fiona Adams',
       'taster_name_Jeff Jenssen', 'taster_name_Jim Gordon',
       'taster_name_Joe Czerwinski', 'taster_name_Kerin O’Keefe',
       'taster_n

In [68]:
# Shorten the Anne Krebiehl dummy column label to 'taster_name_Anne Krebiehl'.
#
# DataFrame.rename() silently ignores mapping keys that match no label, and a
# plain dict keyed on 'taster_name_Anne Krebiehl MW' left the label unchanged
# in an earlier run of this notebook. A callable mapper that compares labels
# after whitespace normalization is used instead.
# NOTE(review): presumably the real label has a non-breaking space before
# "MW", which is why the literal key did not match -- TODO confirm.
def _shorten_krebiehl_label(label):
    """Return the shortened Anne Krebiehl label, or `label` unchanged otherwise."""
    # str.split() with no argument splits on any Unicode whitespace, so after
    # re-joining, ordinary and non-breaking spaces compare equal.
    if " ".join(label.split()) == 'taster_name_Anne Krebiehl MW':
        return 'taster_name_Anne Krebiehl'
    return label


data_binary_encoded2 = data_binary_encoded.rename(columns=_shorten_krebiehl_label)

In [72]:
# Show the column labels of the renamed frame to check whether the rename took effect
data_binary_encoded2.columns

Index(['points', 'price', 'red', 'rose', 'white', 'sparkling', 'freq',
       'c_freq', 'country_Argentina', 'country_Australia', 'country_Austria',
       'country_Brazil', 'country_Bulgaria', 'country_Canada', 'country_Chile',
       'country_England', 'country_France', 'country_Germany',
       'country_Greece', 'country_Hungary', 'country_Israel', 'country_Italy',
       'country_Lebanon', 'country_Mexico', 'country_Moldova',
       'country_Morocco', 'country_New Zealand', 'country_Portugal',
       'country_Romania', 'country_Slovenia', 'country_South Africa',
       'country_Spain', 'country_Turkey', 'country_US', 'country_Uruguay',
       'taster_name_Alexander Peartree', 'taster_name_Anna Lee C. Iijima',
       'taster_name_Anne Krebiehl MW', 'taster_name_Carrie Dykes',
       'taster_name_Christina Pickard', 'taster_name_Fiona Adams',
       'taster_name_Jeff Jenssen', 'taster_name_Jim Gordon',
       'taster_name_Joe Czerwinski', 'taster_name_Kerin O’Keefe',
       'taster_n

In [76]:
# Build the predictor matrix X and the target vector Y for the regression.
#
# The predictors are every encoded column except the target ('points') and the
# Anne Krebiehl taster dummy. This is the same selection, in the same order, as
# the hand-typed list of 52 labels it replaces. Selecting by rule keeps the
# cell working when countries or tasters appear in, or vanish from, the data.
# The Krebiehl column is matched by prefix so the match does not depend on the
# exact whitespace in the rest of its label.
# NOTE(review): every country_* dummy is kept, so together they may be
# collinear with the model intercept -- confirm whether a reference country
# should be dropped before interpreting individual coefficients.
predictor_cols = [
    col
    for col in data_binary_encoded.columns
    if col != 'points' and not col.startswith('taster_name_Anne Krebiehl')
]
X = data_binary_encoded[predictor_cols]
Y = data_binary_encoded['points']

In [80]:
# Fit a linear regression of points (Y) on the encoded predictors (X);
# the fitted estimator is the cell's last expression, so its repr is displayed.
regr = linear_model.LinearRegression()
regr.fit(X=X, y=Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [81]:
# Print the fitted intercept followed by the raw coefficient array
for heading, fitted_value in (
    ('Intercept: \n', regr.intercept_),
    ('Coefficients: \n', regr.coef_),
):
    print(heading, fitted_value)

Intercept: 
 88.2720018987989
Coefficients: 
 [ 2.42543941e-02  4.66536892e-01 -7.78889903e-01  5.57900455e-02
  2.56562956e-01  1.63165881e-05  4.67260866e-06 -3.83752873e-01
  1.21291148e+00  1.68236949e+00 -2.31953987e+00 -8.64349746e-01
 -2.74564844e-02 -4.94189504e-01  1.70943763e+00  1.02821464e+00
  1.59789997e+00  1.67829572e+00 -3.06286549e-01  4.54506805e-01
  6.97868001e-01 -7.68285304e-01 -1.97049151e+00 -1.28490104e+00
 -1.36593306e-01  1.10486408e+00  1.15014778e+00 -2.11637431e+00
 -1.06906370e+00  2.04289348e+00  1.72411082e-01 -1.01186939e+00
 -9.59329575e-01 -8.19337002e-01 -2.73417596e+00 -9.10246527e-01
 -2.16997042e+00 -1.42560510e+00 -2.00937926e+00  2.18997123e-01
 -1.83060757e-02 -2.05985549e+00 -1.59289287e+00 -3.06459636e+00
  1.09695006e+00 -2.25981987e+00 -2.56715267e-01  2.42365315e-01
 -2.07968068e+00 -6.41952651e-02 -3.82665414e+00  6.26302553e-02]


In [82]:
# Label each fitted coefficient with the predictor column it belongs to
coeff_df = pd.DataFrame(
    data=regr.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
coeff_df

Unnamed: 0,Coefficient
price,0.024254
red,0.466537
rose,-0.77889
white,0.05579
sparkling,0.256563
freq,1.6e-05
c_freq,5e-06
country_Argentina,-0.383753
country_Australia,1.212911
country_Austria,1.682369
