# Predicting the Origin of Wine - ML 101 with Examples

#### Step 1: Import Packages

In [1]:
import sys
import scipy
import numpy
import matplotlib
import pandas
import sklearn
import pandas as pd
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import datasets, svm, metrics
from sklearn.neural_network import MLPClassifier
from random import randint


#### Step 2: Pull in Data

Pull in the data from UCI - a free repository of ML data sets:

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wineNames = ["Cultivator", "Alcohol", "Malic_Acid", "Ash",
                 "Alcalinity_of_Ash", "Magnesium", "Total_phenols",
                 "Falvanoids", "Nonflavanoid_phenols", "Proanthocyanins",
                 "Color_intensity", "Hue", "OD280", "Proline"]
wine = pd.read_csv(url, names = wineNames)

#### Step 3: Construct Trainin and Test Sets

Separate data into training and test sets - very important!

In [3]:
X = wine.drop('Cultivator', axis = 1)
y = wine['Cultivator']

X_train, X_test, y_train, y_test = train_test_split(X, y)

#### Step 4: Normalize Data

Normalize data to have 0 mean and unit variance

In [4]:
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

So what does our data look like?

In [5]:
print(X.loc[0:20, 'Alcohol':'Magnesium'])

    Alcohol  Malic_Acid   Ash  Alcalinity_of_Ash  Magnesium
0     14.23        1.71  2.43               15.6        127
1     13.20        1.78  2.14               11.2        100
2     13.16        2.36  2.67               18.6        101
3     14.37        1.95  2.50               16.8        113
4     13.24        2.59  2.87               21.0        118
5     14.20        1.76  2.45               15.2        112
6     14.39        1.87  2.45               14.6         96
7     14.06        2.15  2.61               17.6        121
8     14.83        1.64  2.17               14.0         97
9     13.86        1.35  2.27               16.0         98
10    14.10        2.16  2.30               18.0        105
11    14.12        1.48  2.32               16.8         95
12    13.75        1.73  2.41               16.0         89
13    14.75        1.73  2.39               11.4         91
14    14.38        1.87  2.38               12.0        102
15    13.63        1.81  2.70           

# Ok - let's learn!
&nbsp;

#### Algo 1: Logistic Regression

In [6]:
lr = LogisticRegression()

lr.fit(X_train, y_train)

predictions = lr.predict(X_test)

predictions_train = lr.predict(X_train)

print("Accuracy of Logistic Regression model on Training Set: " + str(accuracy_score(y_train, predictions_train)))
print("Accuracy of Logistic Regression model on Test Set: " + str(accuracy_score(y_test, predictions)))
print()
print()
print(classification_report(y_test, predictions))


Accuracy of Logistic Regression model on Training Set: 1.0
Accuracy of Logistic Regression model on Test Set: 0.977777777778


             precision    recall  f1-score   support

          1       1.00      1.00      1.00        16
          2       1.00      0.92      0.96        12
          3       0.94      1.00      0.97        17

avg / total       0.98      0.98      0.98        45



#### Algo 2: Neural Network

In [9]:
mlp = MLPClassifier(hidden_layer_sizes = (50, 50), max_iter = 500)

mlp.fit(X_train, y_train)

predictions = mlp.predict(X_test)

predictions_train = mlp.predict(X_train)

print("Accuracy of Neural Network on Training Set: " + str(accuracy_score(y_train, predictions_train)))
print("Accuracy of Neural Network on Test Set: " + str(accuracy_score(y_test, predictions)))
print()
print()
print(classification_report(y_test, predictions))


Accuracy of Neural Network on Training Set: 1.0
Accuracy of Neural Network on Test Set: 1.0


             precision    recall  f1-score   support

          1       1.00      1.00      1.00        16
          2       1.00      1.00      1.00        12
          3       1.00      1.00      1.00        17

avg / total       1.00      1.00      1.00        45

