In [1]:
import requests
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
http_request = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv')

In [3]:
http_text = http_request.text.split("\n")
http_text

['"fixed acidity";"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"',
 '7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6',
 '6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9.5;6',
 '8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;10.1;6',
 '7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4;9.9;6',
 '7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4;9.9;6',
 '8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;10.1;6',
 '6.2;0.32;0.16;7;0.045;30;136;0.9949;3.18;0.47;9.6;6',
 '7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6',
 '6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9.5;6',
 '8.1;0.22;0.43;1.5;0.044;28;129;0.9938;3.22;0.45;11;6',
 '8.1;0.27;0.41;1.45;0.033;11;63;0.9908;2.99;0.56;12;5',
 '8.6;0.23;0.4;4.2;0.035;17;109;0.9947;3.14;0.53;9.7;5',
 '7.9;0.18;0.37;1.2;0.04;16;75;0.992;3.18;0.63;10.8;5',
 '6.6;0.16;0.4;1.5;0.044;48;143;0.9912;3.54;0.52;12.4;7',
 '8.3;0.42;0.62;19.25;0.04;41;172;1.0002;2.

In [5]:
data_array = []
for lines in http_text:
    data_array.append(lines.split(";"))
col_names = []
for col in data_array[0]:
    col_names.append(col.strip('"'))

data_frame = pd.DataFrame(data=data_array[1:],columns=col_names).apply(pd.to_numeric)

In [5]:
data_frame

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.270,0.36,20.70,0.045,45.0,170.0,1.00100,3.00,0.45,8.800000,6.0
1,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.500000,6.0
2,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.100000,6.0
3,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.900000,6.0
4,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.900000,6.0
5,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.100000,6.0
6,6.2,0.320,0.16,7.00,0.045,30.0,136.0,0.99490,3.18,0.47,9.600000,6.0
7,7.0,0.270,0.36,20.70,0.045,45.0,170.0,1.00100,3.00,0.45,8.800000,6.0
8,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.500000,6.0
9,8.1,0.220,0.43,1.50,0.044,28.0,129.0,0.99380,3.22,0.45,11.000000,6.0


In [7]:
numrecs = data_frame.shape[0]
print(numrecs)
data_frame = data_frame.ix[0:numrecs-2,:]
data_frame

4899


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.270,0.36,20.70,0.045,45.0,170.0,1.00100,3.00,0.45,8.800000,6.0
1,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.500000,6.0
2,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.100000,6.0
3,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.900000,6.0
4,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.900000,6.0
5,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.100000,6.0
6,6.2,0.320,0.16,7.00,0.045,30.0,136.0,0.99490,3.18,0.47,9.600000,6.0
7,7.0,0.270,0.36,20.70,0.045,45.0,170.0,1.00100,3.00,0.45,8.800000,6.0
8,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.500000,6.0
9,8.1,0.220,0.43,1.50,0.044,28.0,129.0,0.99380,3.22,0.45,11.000000,6.0


In [10]:
label = data_frame["quality"]
predictors = data_frame[col_names[:-1]]


scaler_model = MinMaxScaler().fit_transform(predictors.values)
bins = np.linspace(0, 1, 10)
print(bins)
digitized = np.digitize(scaler_model, bins)


[ 0.          0.11111111  0.22222222  0.33333333  0.44444444  0.55555556
  0.66666667  0.77777778  0.88888889  1.        ]


In [43]:
clf = svm.SVC()
clf.fit(X=digitized,y=label.values)  

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [44]:
score_knn = cross_val_score(clf, digitized, label.values, cv=4)
print("Cross Validation score : " + str(score_knn))
print("Cross Validation Mean score : " + str(score_knn.mean()))

Cross Validation score : [ 0.48655257  0.50816993  0.53676471  0.52825553]
Cross Validation Mean score : 0.514935684004


In [49]:
from sklearn.metrics import confusion_matrix
confusion_matrix(label.values, clf.predict(X=digitized))

array([[   2,    0,    6,   12,    0,    0,    0],
       [   0,   11,   89,   61,    2,    0,    0],
       [   0,    0,  869,  582,    6,    0,    0],
       [   0,    1,  316, 1785,   95,    0,    0],
       [   0,    0,   36,  579,  264,    0,    0],
       [   0,    0,    7,  120,   48,    0,    0],
       [   0,    0,    0,    2,    3,    0,    0]])