# Machine Learning 

In [1]:
import pandas as pd
import numpy as np
import nltk

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier


import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import *

## Wine Reviews

In [2]:
# Load the review dump and keep only its first 10,000 rows.
wine_reviews = pd.read_json('wine-reviews/winemag-data-130k-v2.json').head(10000)

# Preprocess
# Candidate vintages, 2016 down to 1975 (descending).
common_years = range(2016, 1974, -1)
def getYear(row):
    """Extract a vintage year from a review row's title.

    The title in ``row['title']`` is tokenized with ``nltk.word_tokenize`` and
    the tokens are scanned in order.

    Returns:
        The first token that parses as an integer and is a member of
        ``common_years``; 0 when no token qualifies.
    """
    for token in nltk.word_tokenize(row['title']):
        try:
            year = int(token)
        except ValueError:
            # Not an integer token; keep scanning. Only ValueError is caught so
            # that unrelated failures are no longer silently swallowed.
            continue
        if year in common_years:
            return year
    return 0
    
# Attach the parsed vintage and keep only rows where one was found (getYear yields 0 otherwise).
wine_reviews['year'] = wine_reviews.apply(getYear, axis=1)
has_year = np.isfinite(wine_reviews['year']) & (wine_reviews['year'] > 0)
wine_reviews = wine_reviews[has_year]

# Lower-cased word tokens of every description.
wine_reviews['tokens'] = wine_reviews.apply(
    lambda row: nltk.word_tokenize(row['description'].lower()),
    axis=1,
)

# Drop rows whose price or points value is not finite.
has_price_and_points = np.isfinite(wine_reviews['price']) & np.isfinite(wine_reviews['points'])
wine_reviews = wine_reviews[has_price_and_points]

In [3]:
# Preview the first rows of the preprocessed review table.
wine_reviews.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,year,tokens
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011,"[this, is, ripe, and, fruity, ,, a, wine, that..."
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013,"[tart, and, snappy, ,, the, flavors, of, lime,..."
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013,"[pineapple, rind, ,, lemon, pith, and, orange,..."
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012,"[much, like, the, regular, bottling, from, 201..."
5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,2011,"[blackberry, and, raspberry, aromas, show, a, ..."


In [4]:
# One value -> column-index lookup per categorical feature, appended in the order listed.
maps = []
for feature in ['designation', 'province', 'winery', 'variety', 'year']:
    unique = wine_reviews[feature].unique()
    maps.append({value: position for position, value in enumerate(unique)})

# A single lookup shared by both region columns; indices follow first appearance
# when the two columns are read row by row.
region_map = {}
for region in wine_reviews[['region_1', 'region_2']].values.ravel():
    # len(region_map) is evaluated before insertion, so it is the next free index.
    region_map.setdefault(region, len(region_map))
maps.append(region_map)

In [5]:
# Vocabulary lookup: each distinct token gets the next integer index the first time it is seen.
token_map = {}
for review_tokens in wine_reviews['tokens'].values.flatten():
    for token in review_tokens:
        if token in token_map:
            continue
        token_map[token] = len(token_map)
maps.append(token_map)

In [10]:
fields = ['designation', 'province', 'winery', 'variety','year', 'region_1']
matrices = []
# fields[i] is paired with maps[i]; zip stops after the six fields listed here.
for field, m in zip(fields, maps):
    vals = wine_reviews[field].values

    # Indicator matrix: one row per review, one column per distinct value of the field.
    matrix = np.zeros((wine_reviews.shape[0], len(m)))
    for row_idx, val in enumerate(vals):
        matrix[row_idx, m[val]] = 1

    # Truncated SVD keeping a number of components equal to 10% of the column count.
    k = int(.1 * matrix.shape[1])
    U, S, V_tran = svds(matrix, k=k)
    V = V_tran.T
    print (U.shape)

    matrices.append(U)

(10000, 531)
(10000, 23)
(10000, 532)
(10000, 33)
(10000, 2)
(10000, 71)


In [13]:
# Indicator matrix over the description vocabulary: entry (row, col) is 1 when
# the review's token list contains the token mapped to that column.
m = token_map
vals = wine_reviews['tokens'].values.flatten()
matrix = np.zeros((wine_reviews.shape[0], len(m)))
for row_idx, review_tokens in enumerate(vals):
    for token in review_tokens:
        matrix[row_idx, m[token]] = 1

# Truncated SVD keeping a number of components equal to 5% of the vocabulary size.
k = int(.05 * matrix.shape[1])
U, S, V_tran = svds(matrix, k=k)
V = V_tran.T


In [56]:
# Assemble the final feature matrix: the per-field SVD blocks already in
# `matrices`, then the token SVD block `U`, then the raw price as one column.
# NOTE: both appends mutate `matrices`, so re-running this cell without first
# rebuilding `matrices` adds duplicate columns.
matrices.append(U)
# reshape(-1, 1) lets numpy infer the row count. `wine_reviews` is filtered
# after head(10000), so a hard-coded 10000 fails whenever any row was dropped.
matrices.append(wine_reviews['price'].values.reshape(-1, 1))
data = np.concatenate(matrices, axis=1)


In [57]:
# Report (rows, columns) of the assembled feature matrix.
print(data.shape)

(10000, 1958)


In [58]:
# Features are the `data` array; the target is each review's `points` value.
X = data
y = wine_reviews['points'].values
# Hold out 33% of the rows for testing, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [59]:
# Fit an L2-penalised logistic regression on the training split; being a
# classifier, it treats each distinct `points` value in y_train as its own class.
logistic = LogisticRegression(penalty='l2')
logistic.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [60]:
# Score the fitted model on the held-out split.
logistic.score(X_test, y_test)

0.18848484848484848

## Wine Composition Data

In [22]:
# Preprocess
# Load both composition tables and tag every row with its colour as two indicator columns.
red_comp = pd.read_csv('wine-quality/wineQualityReds.csv', index_col=0)
red_comp['red'] = 1
red_comp['white'] = 0
white_comp = pd.read_csv('wine-quality/wineQualityWhites.csv', index_col=0)
white_comp['red'] = 0
white_comp['white'] = 1
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the rows the same
# way and, like append did, keeps each file's own row labels.
comp = pd.concat([red_comp, white_comp])

# Normalize combined data structure
# Min-max scale every column except the two trailing colour indicators.
data = comp.iloc[:,0:-2]
data_norm = (data - data.min()) / (data.max() - data.min())
composition = data_norm
composition['red'] = comp['red']
composition['white'] = comp['white']
# Overwrite 'quality' with the unscaled values from `comp` so the label keeps its original scale.
composition['quality'] = comp['quality']

# Swap the last and third-from-last column names so that 'quality' becomes the final column.
cols = composition.columns.tolist()
cols[-1], cols[-3] = cols[-3], cols[-1]
composition = composition[cols]

composition.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,white,red,quality
1,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,0,1,5
2,0.330579,0.533333,0.0,0.030675,0.147841,0.083333,0.140553,0.186813,0.372093,0.258427,0.26087,0,1,5
3,0.330579,0.453333,0.024096,0.026074,0.137874,0.048611,0.110599,0.190669,0.418605,0.241573,0.26087,0,1,5
4,0.61157,0.133333,0.337349,0.019939,0.109635,0.055556,0.124424,0.209948,0.341085,0.202247,0.26087,0,1,6
5,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,0,1,5


### Split into Training and Testing Samples

In [23]:
# Features: every column except the last; target: the last column of `composition`.
X = composition.iloc[:,0:-1]
y = composition.iloc[:,-1]
# Hold out 33% of the rows for testing, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Baseline K-NN

In [24]:
# Sweep the neighbour count from 1 to 19 and report the test-split score for each.
for n_neighbors in range(1, 20):
    neighbors = KNeighborsClassifier(n_neighbors=n_neighbors)
    neighbors.fit(X_train, y_train)
    print ("Using %d Nearest Neighbors : Test Score is %f" % (n_neighbors, neighbors.score(X_test, y_test)))

Using 1 Nearest Neighbors : Test Score is 0.605594
Using 2 Nearest Neighbors : Test Score is 0.532867
Using 3 Nearest Neighbors : Test Score is 0.538462
Using 4 Nearest Neighbors : Test Score is 0.535664
Using 5 Nearest Neighbors : Test Score is 0.553380
Using 6 Nearest Neighbors : Test Score is 0.550583
Using 7 Nearest Neighbors : Test Score is 0.555711
Using 8 Nearest Neighbors : Test Score is 0.554779
Using 9 Nearest Neighbors : Test Score is 0.550583
Using 10 Nearest Neighbors : Test Score is 0.548252
Using 11 Nearest Neighbors : Test Score is 0.546853
Using 12 Nearest Neighbors : Test Score is 0.534732
Using 13 Nearest Neighbors : Test Score is 0.538928
Using 14 Nearest Neighbors : Test Score is 0.531469
Using 15 Nearest Neighbors : Test Score is 0.535198
Using 16 Nearest Neighbors : Test Score is 0.533800
Using 17 Nearest Neighbors : Test Score is 0.531002
Using 18 Nearest Neighbors : Test Score is 0.534266
Using 19 Nearest Neighbors : Test Score is 0.538462


### Random Forests Approach

In [25]:
# Random forest with Gini splits, fitted on the training split and scored on the test split.
forest = RandomForestClassifier(
    criterion='gini',
    max_depth=100,
    max_features=12,
    random_state=0,
)
forest.fit(X_train, y_train)
forest_accuracy = forest.score(X_test, y_test)
print ('Forest Accuracy: %f \n' % forest_accuracy)

Forest Accuracy: 0.634965 



In [26]:
# Pair each importance with the column name at the same position in `composition`.
cols = composition.columns.tolist()
for name, weight in zip(cols, forest.feature_importances_):
    print ("Feature: %s is weighted at %f" % (name, weight))

Feature: fixed.acidity is weighted at 0.076307
Feature: volatile.acidity is weighted at 0.110024
Feature: citric.acid is weighted at 0.071525
Feature: residual.sugar is weighted at 0.082213
Feature: chlorides is weighted at 0.082387
Feature: free.sulfur.dioxide is weighted at 0.085802
Feature: total.sulfur.dioxide is weighted at 0.095711
Feature: density is weighted at 0.077123
Feature: pH is weighted at 0.087605
Feature: sulphates is weighted at 0.087070
Feature: alcohol is weighted at 0.141901
Feature: white is weighted at 0.001738
Feature: red is weighted at 0.000593


We are still not getting very good results, so let's try to simplify the problem:

1. Predicting an exact score out of 10 is difficult.
    * It assumes that there is a meaningful difference between a wine rated 5 and another rated 6.
    * This might not be true.
    * A marginal difference in taste should not matter to a beginner.

2. This means that we can simplify the labels into two buckets:
    * Good to great wine, rated 6-9
    * Bad to OK wine, rated 3-5
          
        

In [27]:
def simplifyClassificationBinary(row):
    """Collapse a quality score into a binary label.

    Returns 1 when ``row['quality']`` is 6 or higher, and -1 otherwise.
    """
    return 1 if row['quality'] >= 6 else -1

# Label every row as good (1) or not (-1) and preview the result.
composition['class'] = composition.apply(simplifyClassificationBinary, axis=1)
composition.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,white,red,quality,class
1,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,0,1,5,-1
2,0.330579,0.533333,0.0,0.030675,0.147841,0.083333,0.140553,0.186813,0.372093,0.258427,0.26087,0,1,5,-1
3,0.330579,0.453333,0.024096,0.026074,0.137874,0.048611,0.110599,0.190669,0.418605,0.241573,0.26087,0,1,5,-1
4,0.61157,0.133333,0.337349,0.019939,0.109635,0.055556,0.124424,0.209948,0.341085,0.202247,0.26087,0,1,6,1
5,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,0,1,5,-1


In [28]:
# Class balance. The mean of a boolean mask is the fraction of rows that match,
# so the shares follow the frame's actual row count instead of a hard-coded total.
good_share = (composition['class'] == 1).mean()
bad_share = (composition['class'] == -1).mean()
print('We have %f of the data as good wine' % good_share)
print('We have %f of the data as bad wine' % bad_share)


We have 0.633061 of the data as good wine
We have 0.366939 of the data as bad wine


In [29]:
# Features: every column except the last two; target: the last column of `composition`.
X = composition.iloc[:, :-2]
y = composition.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Sweep the neighbour count from 1 to 19 and report test-split accuracy for each.
for n_neighbors in range(1, 20):
    neighbors = KNeighborsClassifier(n_neighbors=n_neighbors)
    neighbors.fit(X_train, y_train)
    print ("Using %d Nearest Neighbors : Test Accuracy is %f" % (n_neighbors, neighbors.score(X_test, y_test)))

Using 1 Nearest Neighbors : Test Accuracy is 0.761772
Using 2 Nearest Neighbors : Test Accuracy is 0.700233
Using 3 Nearest Neighbors : Test Accuracy is 0.741259
Using 4 Nearest Neighbors : Test Accuracy is 0.720280
Using 5 Nearest Neighbors : Test Accuracy is 0.749184
Using 6 Nearest Neighbors : Test Accuracy is 0.737995
Using 7 Nearest Neighbors : Test Accuracy is 0.749184
Using 8 Nearest Neighbors : Test Accuracy is 0.748252
Using 9 Nearest Neighbors : Test Accuracy is 0.755711
Using 10 Nearest Neighbors : Test Accuracy is 0.741725
Using 11 Nearest Neighbors : Test Accuracy is 0.750117
Using 12 Nearest Neighbors : Test Accuracy is 0.739860
Using 13 Nearest Neighbors : Test Accuracy is 0.746387
Using 14 Nearest Neighbors : Test Accuracy is 0.737529
Using 15 Nearest Neighbors : Test Accuracy is 0.751049
Using 16 Nearest Neighbors : Test Accuracy is 0.741259
Using 17 Nearest Neighbors : Test Accuracy is 0.745455
Using 18 Nearest Neighbors : Test Accuracy is 0.738462
Using 19 Nearest Ne

In [30]:
# Random forest with entropy splits, fitted on the training split and scored on the test split.
forest = RandomForestClassifier(
    criterion='entropy',
    max_depth=100,
    max_features=2,
    random_state=0,
)
forest.fit(X_train, y_train)
forest_accuracy = forest.score(X_test, y_test)
print ('Forest Accuracy: %f \n' % forest_accuracy)

# Pair each importance with the column name at the same position in `composition`.
cols = composition.columns.tolist()
for name, weight in zip(cols, forest.feature_importances_):
    print ("Feature: %s is weighted at %f" % (name, weight))

Forest Accuracy: 0.807925 

Feature: fixed.acidity is weighted at 0.062797
Feature: volatile.acidity is weighted at 0.110059
Feature: citric.acid is weighted at 0.086491
Feature: residual.sugar is weighted at 0.082673
Feature: chlorides is weighted at 0.089271
Feature: free.sulfur.dioxide is weighted at 0.079578
Feature: total.sulfur.dioxide is weighted at 0.084125
Feature: density is weighted at 0.107958
Feature: pH is weighted at 0.077060
Feature: sulphates is weighted at 0.088018
Feature: alcohol is weighted at 0.122911
Feature: white is weighted at 0.006512
Feature: red is weighted at 0.002548


Now we get 80% accuracy! That is good, but consumers may still not put their faith in an application that can only tell them whether a wine is good or bad 80% of the time. To try to exploit the robustness of decision trees, we'll combine the random forest, a gradient-boosted tree, and our KNN classifier in a weighted linear combination of their predicted class probabilities.

In [31]:
# Gradient-boosted trees fitted on the training split.
# NOTE(review): recent scikit-learn releases appear to have renamed criterion='mse'
# to 'squared_error' -- confirm against the installed version before re-running.
boostedTree = GradientBoostingClassifier(
    criterion='mse',
    max_depth=None,
    max_features=3,
    n_estimators=100,
    random_state=0,
)
boostedTree.fit(X_train, y_train)

# Random forest with entropy splits, fitted on the same training split.
forest = RandomForestClassifier(
    criterion='entropy',
    max_depth=100,
    max_features=2,
    random_state=0,
)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=100, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [32]:
# Single-nearest-neighbour classifier fitted on the training split.
neighbors = KNeighborsClassifier(n_neighbors=1)
neighbors.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [33]:
# Weighted sum of the three models' class probabilities; the boosted tree counts double.
pred = (
    forest.predict_proba(X_test)
    + 2 * boostedTree.predict_proba(X_test)
    + neighbors.predict_proba(X_test)
)
# Label 1 when column 1 outweighs column 0, otherwise -1 (ties included).
# Assumes every model orders its probability columns as (-1, 1) -- verify against `classes_`.
y_pred = np.where(pred[:, 1] > pred[:, 0], 1.0, -1.0)
accuracy_score(y_test,y_pred)

0.8135198135198135