## Importing Libraries

In [1]:
import pandas as pd 
import numpy  as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression


In [2]:
df = pd.read_csv("winequality_red.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## Descriptive Statistics

In [3]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64
chlorides               1599 non-null float64
free sulfur dioxide     1599 non-null float64
total sulfur dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


## Assigning the Target and features for training and test data

In [5]:
X = df[['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','sulphates','pH','alcohol']]
y = df[['quality']]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
clf.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [8]:
clf.score(X_test, y_test)

0.351388533251739

## Logistic regression

In [9]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='liblinear',multi_class='ovr')
clf.fit(X_train, y_train.values.ravel()) 


#.values will give the values in an array. (shape: (n,1)
#.ravel will convert that array shape to (n, )

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

## Support Vector Machine

In [10]:
from sklearn.svm import SVC

In [11]:
svm = SVC(gamma='auto')
svm.fit(X_train, y_train.values.ravel())
svm.score(X_test, y_test)

0.5604166666666667

In [12]:
clf.score(X_test, y_test)

0.5604166666666667

## Decision Tree

In [13]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
model.fit(X, y)
model.score(X,y)


1.0

## Random Forest 

In [14]:
 from sklearn.ensemble import RandomForestClassifier

In [15]:
rf = RandomForestClassifier(n_estimators=40)
rf.fit(X_train, y_train.values.ravel())
rf.score(X_test, y_test)

0.6604166666666667

## Cross Validation 

In [16]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [17]:
from sklearn.model_selection import cross_val_score

In [18]:
cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X, y.values.ravel(),cv=3)

array([0.51962617, 0.58161351, 0.55743879])

In [19]:
cross_val_score(LinearRegression(), X, y,cv=3)

array([0.27041861, 0.3549773 , 0.30932782])

In [20]:
cross_val_score(SVC(gamma='auto'), X, y.values.ravel(),cv=3)

array([0.51028037, 0.49155722, 0.47457627])

In [21]:
cross_val_score(tree.DecisionTreeClassifier(),X,y,cv=3)

array([0.45233645, 0.42776735, 0.47269303])

In [22]:
cross_val_score(RandomForestClassifier(n_estimators=40), X, y.values.ravel(),cv=3)

array([0.52523364, 0.56472795, 0.58380414])

# Accuracies of all Algorithms

In [23]:
Linear_score = cross_val_score(LinearRegression(), X, y,cv=3)
np.average(Linear_score)

0.3115745778431927

In [24]:
Logistic_score =cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), X, y.values.ravel(),cv=3)
np.average(Logistic_score)

0.5528928237980021

In [25]:
SVC_score = cross_val_score(SVC(gamma='auto'), X, y.values.ravel(),cv=3)
np.mean(SVC_score)

0.49213795609425226

In [26]:
Decision_score = cross_val_score(tree.DecisionTreeClassifier(),X,y,cv=3)
np.mean(Decision_score)

0.4596714380389568

In [27]:
Random_score =cross_val_score(RandomForestClassifier(n_estimators=40), X, y.values.ravel(),cv=3)
np.mean(Random_score)

0.5510543043060148

## Scores 

**Linear Regression** = `31.15 %`

**Logistic Regression** = **`55.28 %`**

**SVM** = `49.21 %`

**Decision Tree** = `45.59 %`

**Random Forest** = `55.10 %`


**Random Forest** and **Logistic Regression** produce nearly same results , **Logistic Regression** has a slightly upper hand 


