In [1]:
# pandas is used for loading and exploring the dataset
import pandas as pd

In [2]:
# Load the red-wine quality CSV (hosted on GitHub) into a DataFrame.
csv_url = "https://raw.githubusercontent.com/aniruddhachoudhury/Red-Wine-Quality/master/winequality-red.csv"
data = pd.read_csv(csv_url)

In [3]:
# List the column labels; seeing them quoted makes stray leading/trailing
# spaces in a column name easy to spot.
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [4]:
# Summarise each column's dtype and non-null count, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max), transposed so
# that each feature is a row and each statistic is a column.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
volatile acidity,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
citric acid,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
residual sugar,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
chlorides,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
density,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
pH,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01
sulphates,1599.0,0.658149,0.169507,0.33,0.55,0.62,0.73,2.0


In [6]:
# Count the missing (NaN/None) entries in every column.
data.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [7]:
# `quality` is the target we want to predict. Its distinct values form a
# small set of integer grades, so this is a classification problem.
data["quality"].unique()

array([5, 6, 7, 4, 8, 3])

In [8]:
# Count how many wines fall into each quality grade; unlike unique(), this
# also shows how unevenly the classes are represented.
data['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [9]:
# Build the feature matrix: every column except the target `quality`.
# (The train/test split has to happen before any scaling is applied.)
X = data.drop(columns="quality")
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [10]:
# Target vector: the `quality` column that the models will learn to predict.
y = data["quality"]
y.head()

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 33% of the rows for testing (test_size=0.33). Fixing random_state
# seeds the shuffle, so every run reproduces exactly the same split.
# NOTE(review): the quality classes are heavily imbalanced (see the
# value_counts output earlier); consider stratify=y so that both splits
# keep the class proportions — confirm before changing, results will shift.
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.33, random_state =42)

In [13]:
# Peek at the held-out features; rows keep their original index labels.
X_test.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
803,7.7,0.56,0.08,2.5,0.114,14.0,46.0,0.9971,3.24,0.66,9.6
124,7.8,0.5,0.17,1.6,0.082,21.0,102.0,0.996,3.39,0.48,9.5
350,10.7,0.67,0.22,2.7,0.107,17.0,34.0,1.0004,3.28,0.98,9.9
682,8.5,0.46,0.31,2.25,0.078,32.0,58.0,0.998,3.33,0.54,9.8
1326,6.7,0.46,0.24,1.7,0.077,18.0,34.0,0.9948,3.39,0.6,10.6


In [14]:
# data.describe().T shows the features are on very different scales (values around 15 vs. around 0.1), so they need scaling
from sklearn.preprocessing import StandardScaler 
# RobustScaler could be used instead of StandardScaler; it is less sensitive to outliers

In [22]:
# Create the scaler object (an instance of the StandardScaler class).
scaler = StandardScaler()

In [23]:
# Learn the per-feature mean and standard deviation from the training data only.
scaler.fit(X_train)

In [24]:
# Show the per-feature means learned from X_train (one value per feature column).
print(scaler.mean_)

[ 8.30345472  0.53246499  0.26933707  2.54691877  0.08772736 15.91223156
 46.76330532  0.99677933  3.31453782  0.65881419 10.41521942]


In [25]:
# Standardise X_train using the fitted statistics. The result is a new array;
# it is only displayed here, not stored, and X_train itself is not modified.
scaler.transform(X_train)

array([[ 2.40069523, -1.03103722,  1.12742595, ..., -1.26096312,
         0.52726134, -0.01431863],
       [-0.93967131,  1.22920403, -1.32502245, ...,  1.52622836,
        -0.28225704,  2.24363201],
       [-0.99827424,  0.55113165, -1.37611513, ..., -0.74241587,
        -1.20742091, -0.86105011],
       ...,
       [-0.6466567 ,  0.49462562, -1.06955908, ...,  1.26695473,
        -0.68701624, -0.86105011],
       [-0.23643625, -1.87862768,  0.4121285 , ...,  0.03540501,
         0.81637505,  1.39690052],
       [-1.46709761, -1.3700734 , -0.04770558, ...,  0.48913386,
        -0.68701624,  2.90220094]])

In [26]:
# Confirm that X_train still holds the original, unscaled values.
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
548,12.4,0.35,0.49,2.6,0.079,27.0,69.0,0.9994,3.12,0.75,10.4
355,6.7,0.75,0.01,2.4,0.078,17.0,32.0,0.9955,3.55,0.61,12.8
1296,6.6,0.63,0.0,4.3,0.093,51.0,77.5,0.99558,3.2,0.45,9.5
209,11.0,0.3,0.58,2.1,0.054,7.0,19.0,0.998,3.31,0.88,10.5
140,8.4,0.745,0.11,1.9,0.09,16.0,63.0,0.9965,3.19,0.82,9.6


In [38]:
# fit_transform performs the fit and the transform in a single call.
# The scaled training features are stored for model fitting, and the scaler
# remains fitted on X_train for later use on the test set.
X_train_transform = scaler.fit_transform(X_train)
X_train_transform

array([[ 2.40069523, -1.03103722,  1.12742595, ..., -1.26096312,
         0.52726134, -0.01431863],
       [-0.93967131,  1.22920403, -1.32502245, ...,  1.52622836,
        -0.28225704,  2.24363201],
       [-0.99827424,  0.55113165, -1.37611513, ..., -0.74241587,
        -1.20742091, -0.86105011],
       ...,
       [-0.6466567 ,  0.49462562, -1.06955908, ...,  1.26695473,
        -0.68701624, -0.86105011],
       [-0.23643625, -1.87862768,  0.4121285 , ...,  0.03540501,
         0.81637505,  1.39690052],
       [-1.46709761, -1.3700734 , -0.04770558, ...,  0.48913386,
        -0.68701624,  2.90220094]])

In [30]:
# Training labels; their index labels line up with the rows of X_train.
y_train

548     6
355     6
1296    5
209     7
140     5
       ..
1130    6
1294    6
860     5
1459    7
1126    6
Name: quality, Length: 1071, dtype: int64

In [41]:
# Support vector classifier, created with its default hyperparameters.
from sklearn.svm import SVC
model = SVC()

In [42]:
# Train the SVC on the scaled training features and their quality labels.
model.fit(X_train_transform,y_train)

In [43]:
# Accuracy on the same data the model was trained on. This is an optimistic
# figure; the test-set accuracy computed later is the one that matters.
model.score(X_train_transform,y_train)

0.6778711484593838

In [48]:
# Scale the test features with the scaler that was fitted on X_train.
# Only transform is called (never fit) on test data: re-fitting would leak
# test-set statistics into preprocessing and make the evaluation unreliable.
X_test_transform = scaler.transform(X_test)

In [52]:
# Predict a quality label for every row of the scaled test set.
y_predict = model.predict(X_test_transform)

In [53]:
# Fraction of test wines whose predicted quality matches the true quality (SVC).
from sklearn.metrics import accuracy_score
accuracy_score(y_true=y_test, y_pred=y_predict)

0.5984848484848485

In [55]:
#compare with logistic regression for multi-class classification
from sklearn.linear_model import LogisticRegression

In [56]:
# Logistic regression classifier, created with its default settings.
model2 = LogisticRegression()

In [57]:
# Fit on the same scaled training data as the SVC so the two models are comparable.
model2.fit(X_train_transform, y_train)

In [60]:
# Predict test-set quality labels with the logistic regression model.
y_predict2 = model2.predict(X_test_transform)

In [61]:
# Test-set accuracy of the logistic regression model, for comparison with the SVC.
accuracy_score(y_true=y_test, y_pred=y_predict2)

0.571969696969697