# LOGISTIC REGRESSION (BLIND TASTING - PREDICT VARIETY)

### 1. Import Libraries

In [1]:
import re

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None 

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm

from scipy.sparse import hstack
from scipy.stats import pearsonr

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.svm import LinearSVC
from sklearn import metrics, svm
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import utils
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

### 2. Load Data and Preprocess It

In [2]:

# df = pd.read_csv("../data/winemag-data-130k-v2.csv", nrows=50000,index_col=0)
df = pd.read_csv("../data/winemag-data_first150k.csv", nrows=50000, index_col=0)

In [3]:
# Drop columns
df = df.drop(['designation', 'province', 'region_1', 'region_2', 'winery'], axis = 1)

# Test and remove duplicates
df[df.duplicated('description',keep=False)].head()
df = df.drop_duplicates('description')

In [4]:
# Test for null values
df.apply(lambda x: sum(x.isnull()),axis=0)

country           2
description       0
points            0
price          4102
variety           0
dtype: int64

In [5]:
# Fill null prices with mean value
df['price'].fillna(df['price'].mean(), inplace=True)
df.apply(lambda x: sum(x.isnull()),axis=0) 

country        2
description    0
points         0
price          0
variety        0
dtype: int64

In [6]:
# Drop those 2 rows without country value filled in
df.dropna(axis='rows',inplace=True)
df.apply(lambda x: sum(x.isnull()),axis=0) 

country        0
description    0
points         0
price          0
variety        0
dtype: int64

In [7]:
# Preprocess 
#   - Lower each word
#   - Strip and split
#   - Remove stopwords
#   - Stem step (bring each word to it's base form)
#   - Join again

stopword_list = stopwords.words('english')
ps = PorterStemmer()
for i in range(0,len(df['description'])):
    try:
        description = re.sub('[^a-zA-Z]',' ',df['description'][i])
        description = description.lower().strip()
        description_words = description.split()
        description_words = [word for word in description_words if not word in stopword_list]
        description_words = [ps.stem(word) for word in description_words]
        df['description'][i] = ' '.join(description_words)
    except:
        pass

In [8]:
# Drop wine types with less than 200 observations

df = df.groupby('variety').filter(lambda x: len(x) >100)
df = df.groupby('variety').filter(lambda x: len(x) >200)

### 3. Prepare Training and Test Sets

In [9]:
# X = Input data
# y = Output data (labels)
X = df.drop(['country','points', 'variety'], axis = 1)
y = df.variety

# Split X and y into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(28404, 2) (9469, 2) (28404,) (9469,)


In [12]:
# Get varieties as unique values, sorted

output = set()
for x in df.variety:
    x = x.lower()
    x = x.split()
    for y in x:
        output.add(y)

variety_list = sorted(output)
variety_list

# Remove extra characters and stopwords
extras = ['',' ',""," ",'.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', 'cab',"%"]
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
stop.update(variety_list)
stop.update(extras)

In [13]:
# Vectorize input and output data

vect = CountVectorizer(stop_words = stop)
X_train_dtm = vect.fit_transform(X_train.description)
price = X_train.price.values[:,None]
X_train_dtm = hstack((X_train_dtm, price))

X_test_dtm = vect.transform(X_test.description)
price_test = X_test.price.values[:,None]
X_test_dtm = hstack((X_test_dtm, price_test))

  'stop_words.' % sorted(inconsistent))


In [14]:
%%time

# Let's use Logistic Regression and extract the model
models = {}
for z in wine:
    model = LogisticRegression()
    y = y_train == z
    model.fit(X_train_dtm, y)
    models[z] = model

testing_probs = pd.DataFrame(columns = wine)



Wall time: 27.8 s


In [15]:
# Predict and view accuracy
for variety in wine:
    testing_probs[variety] = models[variety].predict_proba(X_test_dtm)[:,1]


predicted_wine = testing_probs.idxmax(axis=1)

comparison = pd.DataFrame({'actual':y_test.values, 'predicted':predicted_wine.values})   
  

print('Accuracy Score:',accuracy_score(comparison.actual, comparison.predicted)*100,"%")

# Display some predictions
comparison

Accuracy Score: 58.93969796176999 %


Unnamed: 0,actual,predicted
0,Riesling,Riesling
1,White Blend,White Blend
2,Pinot Noir,Pinot Noir
3,Pinot Noir,Pinot Noir
4,Riesling,Riesling
...,...,...
9464,Syrah,Bordeaux-style Red Blend
9465,Merlot,Bordeaux-style Red Blend
9466,Riesling,Riesling
9467,Zinfandel,Cabernet Sauvignon
